crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm has been flagged as potentially problematic; see the package registry's advisory page for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
|
@@ -0,0 +1,723 @@
|
|
|
1
|
+
---
|
|
2
|
+
############################################################
|
|
3
|
+
metrics:
|
|
4
|
+
- name: num_perplexity_tokens
|
|
5
|
+
display_name: '# tokens'
|
|
6
|
+
description: Average number of tokens in the predicted output (for language modeling, the input too).
|
|
7
|
+
- name: num_bytes
|
|
8
|
+
display_name: '# bytes'
|
|
9
|
+
description: Average number of bytes in the predicted output (for language modeling, the input too).
|
|
10
|
+
- name: num_references
|
|
11
|
+
display_name: '# ref'
|
|
12
|
+
description: Number of references.
|
|
13
|
+
- name: num_train_trials
|
|
14
|
+
display_name: '# trials'
|
|
15
|
+
description: Number of trials, where in each trial we choose an independent, random set of training instances.
|
|
16
|
+
- name: estimated_num_tokens_cost
|
|
17
|
+
display_name: 'cost'
|
|
18
|
+
description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
|
|
19
|
+
- name: num_prompt_tokens
|
|
20
|
+
display_name: '# prompt tokens'
|
|
21
|
+
description: Number of tokens in the prompt.
|
|
22
|
+
- name: num_prompt_characters
|
|
23
|
+
display_name: '# prompt chars'
|
|
24
|
+
description: Number of characters in the prompt.
|
|
25
|
+
- name: num_completion_tokens
|
|
26
|
+
display_name: '# completion tokens'
|
|
27
|
+
description: Actual number of completion tokens (over all completions).
|
|
28
|
+
- name: num_output_tokens
|
|
29
|
+
display_name: '# output tokens'
|
|
30
|
+
description: Actual number of output tokens.
|
|
31
|
+
- name: max_num_output_tokens
|
|
32
|
+
display_name: 'Max output tokens'
|
|
33
|
+
description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
|
|
34
|
+
- name: num_requests
|
|
35
|
+
display_name: '# requests'
|
|
36
|
+
description: Number of distinct API requests.
|
|
37
|
+
- name: num_instances
|
|
38
|
+
display_name: '# eval'
|
|
39
|
+
description: Number of evaluation instances.
|
|
40
|
+
- name: num_train_instances
|
|
41
|
+
display_name: '# train'
|
|
42
|
+
description: Number of training instances (e.g., in-context examples).
|
|
43
|
+
- name: prompt_truncated
|
|
44
|
+
display_name: truncated
|
|
45
|
+
description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
|
|
46
|
+
- name: finish_reason_length
|
|
47
|
+
display_name: finish b/c length
|
|
48
|
+
description: Fraction of instances where the output was terminated because of the max tokens limit.
|
|
49
|
+
- name: finish_reason_stop
|
|
50
|
+
display_name: finish b/c stop
|
|
51
|
+
description: Fraction of instances where the output was terminated because of the stop sequences.
|
|
52
|
+
- name: finish_reason_endoftext
|
|
53
|
+
display_name: finish b/c endoftext
|
|
54
|
+
description: Fraction of instances where the output was terminated because the end of text token was generated.
|
|
55
|
+
- name: finish_reason_unknown
|
|
56
|
+
display_name: finish b/c unknown
|
|
57
|
+
description: Fraction of instances where the output was terminated for unknown reasons.
|
|
58
|
+
- name: num_completions
|
|
59
|
+
display_name: '# completions'
|
|
60
|
+
description: Number of completions.
|
|
61
|
+
- name: predicted_index
|
|
62
|
+
display_name: Predicted index
|
|
63
|
+
description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
|
|
64
|
+
- name: exact_match
|
|
65
|
+
display_name: Exact match
|
|
66
|
+
short_display_name: EM
|
|
67
|
+
description: Fraction of instances that the predicted output matches a correct reference exactly.
|
|
68
|
+
lower_is_better: false
|
|
69
|
+
- name: chr_f_plus_plus
|
|
70
|
+
display_name: ChrF++
|
|
71
|
+
description: Character n-gram F-score with word n-gram order (ChrF++) [(Popovic, 2015)](https://aclanthology.org/W15-3049/). Code can be found [here](https://github.com/mjpost/sacrebleu).
|
|
72
|
+
lower_is_better: false
|
|
73
|
+
- name: squad_exact_match_score
|
|
74
|
+
display_name: SQuAD exact match
|
|
75
|
+
description: SQuAD exact match score [(Rajpurkar, 2016)](https://aclanthology.org/D16-1264). Code can be found [here](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
|
|
76
|
+
lower_is_better: false
|
|
77
|
+
- name: squad_f1_score
|
|
78
|
+
display_name: SQuAD macro-averaged F1 score
|
|
79
|
+
description: SQuAD macro-averaged F1 score [(Rajpurkar, 2016)](https://aclanthology.org/D16-1264). Code can be found [here](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py)
|
|
80
|
+
lower_is_better: false
|
|
81
|
+
- name: classification_macro_f1
|
|
82
|
+
display_name: Macro F1 score
|
|
83
|
+
description: Macro F1 score
|
|
84
|
+
lower_is_better: false
|
|
85
|
+
- name: inference_runtime
|
|
86
|
+
display_name: Observed inference runtime (s)
|
|
87
|
+
short_display_name: Observed inference time (s)
|
|
88
|
+
lower_is_better: true
|
|
89
|
+
description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
############################################################
|
|
93
|
+
perturbations: []
|
|
94
|
+
|
|
95
|
+
############################################################
|
|
96
|
+
metric_groups:
|
|
97
|
+
- name: accuracy
|
|
98
|
+
display_name: Accuracy
|
|
99
|
+
aggregation_strategies:
|
|
100
|
+
- mean
|
|
101
|
+
metrics:
|
|
102
|
+
- name: ${main_name}
|
|
103
|
+
split: ${main_split}
|
|
104
|
+
|
|
105
|
+
- name: efficiency
|
|
106
|
+
display_name: Efficiency
|
|
107
|
+
aggregation_strategies:
|
|
108
|
+
- mean
|
|
109
|
+
metrics:
|
|
110
|
+
- name: inference_runtime
|
|
111
|
+
split: ${main_split}
|
|
112
|
+
|
|
113
|
+
- name: general_information
|
|
114
|
+
display_name: General information
|
|
115
|
+
metrics:
|
|
116
|
+
- name: num_instances
|
|
117
|
+
split: ${main_split}
|
|
118
|
+
- name: num_train_instances
|
|
119
|
+
split: ${main_split}
|
|
120
|
+
- name: prompt_truncated
|
|
121
|
+
split: ${main_split}
|
|
122
|
+
- name: num_prompt_tokens
|
|
123
|
+
split: ${main_split}
|
|
124
|
+
- name: num_output_tokens
|
|
125
|
+
split: ${main_split}
|
|
126
|
+
|
|
127
|
+
############################################################
|
|
128
|
+
|
|
129
|
+
run_groups:
|
|
130
|
+
- name: seahelm_nlu
|
|
131
|
+
display_name: SEA-HELM natural language understanding (NLU)
|
|
132
|
+
description: SEA-HELM natural language understanding (NLU) scenarios
|
|
133
|
+
category: SEA-HELM scenarios
|
|
134
|
+
subgroups:
|
|
135
|
+
- tydiqa
|
|
136
|
+
- xquad_vi
|
|
137
|
+
- xquad_th
|
|
138
|
+
- indicqa
|
|
139
|
+
- nusax
|
|
140
|
+
- uitvsfc
|
|
141
|
+
- wisesight
|
|
142
|
+
- indicsentiment
|
|
143
|
+
- mlhsd
|
|
144
|
+
- vihsd
|
|
145
|
+
- thaitoxicitytweets
|
|
146
|
+
|
|
147
|
+
- name: seahelm_nlg
|
|
148
|
+
display_name: SEA-HELM natural language generation (NLG)
|
|
149
|
+
description: SEA-HELM natural language generation (NLG) scenarios
|
|
150
|
+
category: SEA-HELM scenarios
|
|
151
|
+
subgroups:
|
|
152
|
+
- flores_id_en
|
|
153
|
+
- flores_vi_en
|
|
154
|
+
- flores_th_en
|
|
155
|
+
- flores_ta_en
|
|
156
|
+
- flores_en_id
|
|
157
|
+
- flores_en_vi
|
|
158
|
+
- flores_en_th
|
|
159
|
+
- flores_en_ta
|
|
160
|
+
|
|
161
|
+
- name: seahelm_nlr
|
|
162
|
+
display_name: SEA-HELM natural language reasoning (NLR)
|
|
163
|
+
description: SEA-HELM natural language reasoning (NLR) scenarios
|
|
164
|
+
category: SEA-HELM scenarios
|
|
165
|
+
subgroups:
|
|
166
|
+
- indonli
|
|
167
|
+
- xnli_vi
|
|
168
|
+
- xnli_th
|
|
169
|
+
- indicxnli
|
|
170
|
+
- xcopa_id
|
|
171
|
+
- xcopa_vi
|
|
172
|
+
- xcopa_th
|
|
173
|
+
- xcopa_ta
|
|
174
|
+
|
|
175
|
+
- name: seahelm_lindsea
|
|
176
|
+
display_name: SEA-HELM Linguistic Diagnostics for Southeast Asian Languages (LINDSEA)
|
|
177
|
+
description: SEA-HELM Linguistic Diagnostics for Southeast Asian Languages (LINDSEA) scenarios
|
|
178
|
+
category: SEA-HELM scenarios
|
|
179
|
+
subgroups:
|
|
180
|
+
- lindsea_syntax_minimal_pairs_id
|
|
181
|
+
- lindsea_pragmatics_presuppositions_id
|
|
182
|
+
- lindsea_pragmatics_scalar_implicatures_id
|
|
183
|
+
|
|
184
|
+
- name: tydiqa
|
|
185
|
+
display_name: TyDiQA
|
|
186
|
+
description: >
|
|
187
|
+
TyDiQA [(Clark, 2020)](https://aclanthology.org/2020.tacl-1.30) is an open-book question answering dataset for 11 typologically-diverse languages. The questions are written by people who want to know the answer, but do not know the answer yet,
|
|
188
|
+
and the data is collected directly in each language without the use of translation.
|
|
189
|
+
metric_groups:
|
|
190
|
+
- accuracy
|
|
191
|
+
- efficiency
|
|
192
|
+
- general_information
|
|
193
|
+
environment:
|
|
194
|
+
main_name: squad_f1_score
|
|
195
|
+
main_split: test
|
|
196
|
+
taxonomy:
|
|
197
|
+
task: question answering
|
|
198
|
+
what: questions by human annotators about Wikipedia articles
|
|
199
|
+
who: "human annotators"
|
|
200
|
+
when: "?"
|
|
201
|
+
language: Indonesian
|
|
202
|
+
|
|
203
|
+
- name: xquad_vi
|
|
204
|
+
display_name: XQuAD (Vietnamese)
|
|
205
|
+
description: >
|
|
206
|
+
XQuAD [(Artetxe, 2019)](https://arxiv.org/abs/1910.11856) is an open-book question answering dataset that is parallel across 10 languages. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations.
|
|
207
|
+
metric_groups:
|
|
208
|
+
- accuracy
|
|
209
|
+
- efficiency
|
|
210
|
+
- general_information
|
|
211
|
+
environment:
|
|
212
|
+
main_name: squad_f1_score
|
|
213
|
+
main_split: test
|
|
214
|
+
taxonomy:
|
|
215
|
+
task: question answering
|
|
216
|
+
what: questions by crowdworkers about Wikipedia articles translated from English to Thai and Vietnamese
|
|
217
|
+
who: "?"
|
|
218
|
+
when: "?"
|
|
219
|
+
language: Vietnamese
|
|
220
|
+
|
|
221
|
+
- name: xquad_th
|
|
222
|
+
display_name: XQuAD (Thai)
|
|
223
|
+
description: >
|
|
224
|
+
XQuAD [(Artetxe, 2019)](https://arxiv.org/abs/1910.11856) is an open-book question answering dataset that is parallel across 10 languages. The dataset consists of a subset of 240 paragraphs and 1190 question-answer pairs from the development set of SQuAD v1.1 (Rajpurkar et al., 2016) together with their professional translations.
|
|
225
|
+
metric_groups:
|
|
226
|
+
- accuracy
|
|
227
|
+
- efficiency
|
|
228
|
+
- general_information
|
|
229
|
+
environment:
|
|
230
|
+
main_name: squad_f1_score
|
|
231
|
+
main_split: test
|
|
232
|
+
taxonomy:
|
|
233
|
+
task: question answering
|
|
234
|
+
what: questions by crowdworkers about Wikipedia articles translated from English to Thai and Vietnamese
|
|
235
|
+
who: "?"
|
|
236
|
+
when: "?"
|
|
237
|
+
language: Thai
|
|
238
|
+
|
|
239
|
+
- name: indicqa
|
|
240
|
+
display_name: IndicQA
|
|
241
|
+
description: >
|
|
242
|
+
IndicQA [(Doddapaneni, 2023)](https://aclanthology.org/2023.acl-long.693) is an open-book question answering dataset for 11 Indic languages. Answers to questions are to be extracted from the text provided. The data is taken from Wikipedia articles across various domains and questions and answers were manually created by native speakers.
|
|
243
|
+
metric_groups:
|
|
244
|
+
- accuracy
|
|
245
|
+
- efficiency
|
|
246
|
+
- general_information
|
|
247
|
+
environment:
|
|
248
|
+
main_name: squad_f1_score
|
|
249
|
+
main_split: test
|
|
250
|
+
taxonomy:
|
|
251
|
+
task: question answering
|
|
252
|
+
what: questions about Wikipedia articles translated by native speakers from English to Tamil
|
|
253
|
+
who: "?"
|
|
254
|
+
when: "?"
|
|
255
|
+
language: Tamil
|
|
256
|
+
|
|
257
|
+
- name: nusax
|
|
258
|
+
display_name: NusaX
|
|
259
|
+
description: >
|
|
260
|
+
NusaX [(Winata, 2023)](https://aclanthology.org/2023.eacl-main.57) is an Indonesian sentiment analysis dataset. The data consists of comments and reviews from various online platforms.
|
|
261
|
+
metric_groups:
|
|
262
|
+
- accuracy
|
|
263
|
+
- efficiency
|
|
264
|
+
- general_information
|
|
265
|
+
environment:
|
|
266
|
+
main_name: classification_macro_f1
|
|
267
|
+
main_split: test
|
|
268
|
+
taxonomy:
|
|
269
|
+
task: sentiment analysis
|
|
270
|
+
what: online comments and reviews
|
|
271
|
+
who: "internet users"
|
|
272
|
+
when: "?"
|
|
273
|
+
language: Indonesian
|
|
274
|
+
|
|
275
|
+
- name: uitvsfc
|
|
276
|
+
display_name: UIT-VSFC
|
|
277
|
+
description: >
|
|
278
|
+
UIT-VSFC [(Nguyen, 2018)](https://ieeexplore.ieee.org/document/8573337) is a Vietnamese sentiment analysis dataset. The data consists of student feedback obtained from end-of-semester surveys at a Vietnamese university.
|
|
279
|
+
metric_groups:
|
|
280
|
+
- accuracy
|
|
281
|
+
- efficiency
|
|
282
|
+
- general_information
|
|
283
|
+
environment:
|
|
284
|
+
main_name: classification_macro_f1
|
|
285
|
+
main_split: test
|
|
286
|
+
taxonomy:
|
|
287
|
+
task: sentiment analysis
|
|
288
|
+
what: university student end-of-semester survey responses
|
|
289
|
+
who: "university students"
|
|
290
|
+
when: "?"
|
|
291
|
+
language: Vietnamese
|
|
292
|
+
|
|
293
|
+
- name: wisesight
|
|
294
|
+
display_name: Wisesight
|
|
295
|
+
description: >
|
|
296
|
+
Wisesight [(Suriyawongkul, 2019)](https://doi.org/10.5281/zenodo.3457447) is a Thai sentiment analysis scenario. The data consists of social media messages regarding consumer products and services.
|
|
297
|
+
metric_groups:
|
|
298
|
+
- accuracy
|
|
299
|
+
- efficiency
|
|
300
|
+
- general_information
|
|
301
|
+
environment:
|
|
302
|
+
main_name: classification_macro_f1
|
|
303
|
+
main_split: test
|
|
304
|
+
taxonomy:
|
|
305
|
+
task: sentiment analysis
|
|
306
|
+
what: social media messages regarding consumer products and services
|
|
307
|
+
who: "social media users"
|
|
308
|
+
when: "?"
|
|
309
|
+
language: Thai
|
|
310
|
+
|
|
311
|
+
- name: indicsentiment
|
|
312
|
+
display_name: IndicSentiment
|
|
313
|
+
description: >
|
|
314
|
+
IndicSentiment is a Tamil sentiment analysis dataset that comes from IndicXTREME [(Doddapaneni, 2022)](https://aclanthology.org/2023.acl-long.693/), and consists of product reviews that were written by annotators. Labels are positive or negative.
|
|
315
|
+
metric_groups:
|
|
316
|
+
- accuracy
|
|
317
|
+
- efficiency
|
|
318
|
+
- general_information
|
|
319
|
+
environment:
|
|
320
|
+
main_name: classification_macro_f1
|
|
321
|
+
main_split: test
|
|
322
|
+
taxonomy:
|
|
323
|
+
task: sentiment analysis
|
|
324
|
+
what: product reviews
|
|
325
|
+
who: "human annotators"
|
|
326
|
+
when: "?"
|
|
327
|
+
language: Tamil
|
|
328
|
+
|
|
329
|
+
- name: mlhsd
|
|
330
|
+
display_name: MLHSD
|
|
331
|
+
description: >
|
|
332
|
+
MLHSD [(Ibrohim, 2019)](https://aclanthology.org/W19-3506) is an Indonesian toxicity detection dataset obtained from tweets on Twitter.
|
|
333
|
+
metric_groups:
|
|
334
|
+
- accuracy
|
|
335
|
+
- efficiency
|
|
336
|
+
- general_information
|
|
337
|
+
environment:
|
|
338
|
+
main_name: classification_macro_f1
|
|
339
|
+
main_split: test
|
|
340
|
+
taxonomy:
|
|
341
|
+
task: toxicity detection/classification
|
|
342
|
+
what: tweets
|
|
343
|
+
who: "Twitter users"
|
|
344
|
+
when: "?"
|
|
345
|
+
language: Indonesian
|
|
346
|
+
|
|
347
|
+
- name: vihsd
|
|
348
|
+
display_name: ViHSD
|
|
349
|
+
description: >
|
|
350
|
+
ViHSD [(Luu, 2021)](https://link.springer.com/chapter/10.1007/978-3-030-79457-6_35) is a Vietnamese toxicity detection dataset obtained from comments on Facebook, YouTube, Instagram, and TikTok.
|
|
351
|
+
metric_groups:
|
|
352
|
+
- accuracy
|
|
353
|
+
- efficiency
|
|
354
|
+
- general_information
|
|
355
|
+
environment:
|
|
356
|
+
main_name: classification_macro_f1
|
|
357
|
+
main_split: test
|
|
358
|
+
taxonomy:
|
|
359
|
+
task: toxicity detection/classification
|
|
360
|
+
what: social media comments
|
|
361
|
+
who: "Social media users"
|
|
362
|
+
when: "?"
|
|
363
|
+
language: Vietnamese
|
|
364
|
+
|
|
365
|
+
- name: thaitoxicitytweets
|
|
366
|
+
display_name: Thai Toxicity Tweets
|
|
367
|
+
description: >
|
|
368
|
+
Thai Toxicity Tweets [(Sirihattasak, 2018)](http://www.lrec-conf.org/workshops/lrec2018/W32/pdf/1_W32.pdf) is a Thai toxicity detection dataset obtained from tweets on Twitter.
|
|
369
|
+
metric_groups:
|
|
370
|
+
- accuracy
|
|
371
|
+
- efficiency
|
|
372
|
+
- general_information
|
|
373
|
+
environment:
|
|
374
|
+
main_name: classification_macro_f1
|
|
375
|
+
main_split: test
|
|
376
|
+
taxonomy:
|
|
377
|
+
task: toxicity detection/classification
|
|
378
|
+
what: tweets
|
|
379
|
+
who: "Twitter users"
|
|
380
|
+
when: ""
|
|
381
|
+
language: Thai
|
|
382
|
+
|
|
383
|
+
- name: flores_en_id
|
|
384
|
+
display_name: Flores
|
|
385
|
+
description: >
|
|
386
|
+
Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
|
|
387
|
+
metric_groups:
|
|
388
|
+
- accuracy
|
|
389
|
+
- efficiency
|
|
390
|
+
- general_information
|
|
391
|
+
environment:
|
|
392
|
+
main_name: chr_f_plus_plus
|
|
393
|
+
main_split: test
|
|
394
|
+
taxonomy:
|
|
395
|
+
task: machine translation
|
|
396
|
+
what: translations from professional human translators
|
|
397
|
+
who: "professional human translators"
|
|
398
|
+
when: "?"
|
|
399
|
+
language: English, Indonesian, Tamil, Thai, Vietnamese
|
|
400
|
+
|
|
401
|
+
- name: flores_en_vi
|
|
402
|
+
display_name: Flores
|
|
403
|
+
description: >
|
|
404
|
+
Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
|
|
405
|
+
metric_groups:
|
|
406
|
+
- accuracy
|
|
407
|
+
- efficiency
|
|
408
|
+
- general_information
|
|
409
|
+
environment:
|
|
410
|
+
main_name: chr_f_plus_plus
|
|
411
|
+
main_split: test
|
|
412
|
+
taxonomy:
|
|
413
|
+
task: machine translation
|
|
414
|
+
what: translations from professional human translators
|
|
415
|
+
who: "professional human translators"
|
|
416
|
+
when: "?"
|
|
417
|
+
language: English, Indonesian, Tamil, Thai, Vietnamese
|
|
418
|
+
|
|
419
|
+
- name: flores_en_th
|
|
420
|
+
display_name: Flores
|
|
421
|
+
description: >
|
|
422
|
+
Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
|
|
423
|
+
metric_groups:
|
|
424
|
+
- accuracy
|
|
425
|
+
- efficiency
|
|
426
|
+
- general_information
|
|
427
|
+
environment:
|
|
428
|
+
main_name: chr_f_plus_plus
|
|
429
|
+
main_split: test
|
|
430
|
+
taxonomy:
|
|
431
|
+
task: machine translation
|
|
432
|
+
what: translations from professional human translators
|
|
433
|
+
who: "professional human translators"
|
|
434
|
+
when: "?"
|
|
435
|
+
language: English, Indonesian, Tamil, Thai, Vietnamese
|
|
436
|
+
|
|
437
|
+
- name: flores_en_ta
|
|
438
|
+
display_name: Flores
|
|
439
|
+
description: >
|
|
440
|
+
Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
|
|
441
|
+
metric_groups:
|
|
442
|
+
- accuracy
|
|
443
|
+
- efficiency
|
|
444
|
+
- general_information
|
|
445
|
+
environment:
|
|
446
|
+
main_name: chr_f_plus_plus
|
|
447
|
+
main_split: test
|
|
448
|
+
taxonomy:
|
|
449
|
+
task: machine translation
|
|
450
|
+
what: translations from professional human translators
|
|
451
|
+
who: "professional human translators"
|
|
452
|
+
when: "?"
|
|
453
|
+
language: English, Indonesian, Tamil, Thai, Vietnamese
|
|
454
|
+
|
|
455
|
+
- name: flores_id_en
|
|
456
|
+
display_name: Flores
|
|
457
|
+
description: >
|
|
458
|
+
Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
|
|
459
|
+
metric_groups:
|
|
460
|
+
- accuracy
|
|
461
|
+
- efficiency
|
|
462
|
+
- general_information
|
|
463
|
+
environment:
|
|
464
|
+
main_name: chr_f_plus_plus
|
|
465
|
+
main_split: test
|
|
466
|
+
taxonomy:
|
|
467
|
+
task: machine translation
|
|
468
|
+
what: translations from professional human translators
|
|
469
|
+
who: "professional human translators"
|
|
470
|
+
when: "?"
|
|
471
|
+
language: English, Indonesian, Tamil, Thai, Vietnamese
|
|
472
|
+
|
|
473
|
+
- name: flores_vi_en
|
|
474
|
+
display_name: Flores
|
|
475
|
+
description: >
|
|
476
|
+
Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
|
|
477
|
+
metric_groups:
|
|
478
|
+
- accuracy
|
|
479
|
+
- efficiency
|
|
480
|
+
- general_information
|
|
481
|
+
environment:
|
|
482
|
+
main_name: chr_f_plus_plus
|
|
483
|
+
main_split: test
|
|
484
|
+
taxonomy:
|
|
485
|
+
task: machine translation
|
|
486
|
+
what: translations from professional human translators
|
|
487
|
+
who: "professional human translators"
|
|
488
|
+
when: "?"
|
|
489
|
+
language: English, Indonesian, Tamil, Thai, Vietnamese
|
|
490
|
+
|
|
491
|
+
- name: flores_th_en
|
|
492
|
+
display_name: Flores
|
|
493
|
+
description: >
|
|
494
|
+
Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
|
|
495
|
+
metric_groups:
|
|
496
|
+
- accuracy
|
|
497
|
+
- efficiency
|
|
498
|
+
- general_information
|
|
499
|
+
environment:
|
|
500
|
+
main_name: chr_f_plus_plus
|
|
501
|
+
main_split: test
|
|
502
|
+
taxonomy:
|
|
503
|
+
task: machine translation
|
|
504
|
+
what: translations from professional human translators
|
|
505
|
+
who: "professional human translators"
|
|
506
|
+
when: "?"
|
|
507
|
+
language: English, Indonesian, Tamil, Thai, Vietnamese
|
|
508
|
+
|
|
509
|
+
- name: flores_ta_en
|
|
510
|
+
display_name: Flores
|
|
511
|
+
description: >
|
|
512
|
+
Flores [(NLLB Team, 2022)](https://research.facebook.com/publications/no-language-left-behind/) was created with professional human translators who translate the FLORES source dataset into the target languages and a separate group of independent translation reviewers who perform quality assessments of the human translations and provide translation feedback to the translators.
|
|
513
|
+
metric_groups:
|
|
514
|
+
- accuracy
|
|
515
|
+
- efficiency
|
|
516
|
+
- general_information
|
|
517
|
+
environment:
|
|
518
|
+
main_name: chr_f_plus_plus
|
|
519
|
+
main_split: test
|
|
520
|
+
taxonomy:
|
|
521
|
+
task: machine translation
|
|
522
|
+
what: translations from professional human translators
|
|
523
|
+
who: "professional human translators"
|
|
524
|
+
when: "?"
|
|
525
|
+
language: English, Indonesian, Tamil, Thai, Vietnamese
|
|
526
|
+
|
|
527
|
+
- name: indonli
|
|
528
|
+
display_name: IndoNLI
|
|
529
|
+
description: >
|
|
530
|
+
IndoNLI [(Mahendra, 2021)](https://aclanthology.org/2021.emnlp-main.821) is a natural language inference dataset obtained from Wikipedia, news, and web articles that incorporates various linguistic phenomena such as numerical reasoning, structural changes, idioms, or temporal and spatial reasoning.
|
|
531
|
+
metric_groups:
|
|
532
|
+
- accuracy
|
|
533
|
+
- efficiency
|
|
534
|
+
- general_information
|
|
535
|
+
environment:
|
|
536
|
+
main_name: exact_match
|
|
537
|
+
main_split: test
|
|
538
|
+
taxonomy:
|
|
539
|
+
task: natural language inference
|
|
540
|
+
what: Wikipedia, news, and web articles
|
|
541
|
+
who: "?"
|
|
542
|
+
when: "?"
|
|
543
|
+
language: Indonesian
|
|
544
|
+
|
|
545
|
+
- name: xnli_vi
|
|
546
|
+
display_name: XNLI (Vietnamese)
|
|
547
|
+
description: >
|
|
548
|
+
XNLI [(Conneau, 2018)](https://aclanthology.org/D18-1269) is a natural language inference dataset obtained from crowdsourced NLI data then professionally translated across 14 other languages.
|
|
549
|
+
metric_groups:
|
|
550
|
+
- accuracy
|
|
551
|
+
- efficiency
|
|
552
|
+
- general_information
|
|
553
|
+
environment:
|
|
554
|
+
main_name: exact_match
|
|
555
|
+
main_split: test
|
|
556
|
+
taxonomy:
|
|
557
|
+
task: natural language inference
|
|
558
|
+
what: crowdsourced NLI data professionally translated
|
|
559
|
+
who: "?"
|
|
560
|
+
when: "?"
|
|
561
|
+
language: Vietnamese
|
|
562
|
+
|
|
563
|
+
- name: xnli_th
|
|
564
|
+
display_name: XNLI (Thai)
|
|
565
|
+
description: >
|
|
566
|
+
XNLI [(Conneau, 2018)](https://aclanthology.org/D18-1269) is a natural language inference dataset obtained from crowdsourced NLI data then professionally translated across 14 other languages.
|
|
567
|
+
metric_groups:
|
|
568
|
+
- accuracy
|
|
569
|
+
- efficiency
|
|
570
|
+
- general_information
|
|
571
|
+
environment:
|
|
572
|
+
main_name: exact_match
|
|
573
|
+
main_split: test
|
|
574
|
+
taxonomy:
|
|
575
|
+
task: natural language inference
|
|
576
|
+
what: crowdsourced NLI data professionally translated
|
|
577
|
+
who: "?"
|
|
578
|
+
when: "?"
|
|
579
|
+
language: Thai
|
|
580
|
+
|
|
581
|
+
- name: indicxnli
|
|
582
|
+
display_name: IndicXNLI
|
|
583
|
+
description: >
|
|
584
|
+
IndicXNLI is a Tamil natural language inference dataset that comes from IndicXTREME [(Doddapaneni, 2022)](https://aclanthology.org/2023.acl-long.693/), which was automatically translated from XNLI into 11 Indic languages.
|
|
585
|
+
metric_groups:
|
|
586
|
+
- accuracy
|
|
587
|
+
- efficiency
|
|
588
|
+
- general_information
|
|
589
|
+
environment:
|
|
590
|
+
main_name: exact_match
|
|
591
|
+
main_split: test
|
|
592
|
+
taxonomy:
|
|
593
|
+
task: natural language inference
|
|
594
|
+
what: crowdsourced NLI data professionally translated into Tamil
|
|
595
|
+
who: "?"
|
|
596
|
+
when: "?"
|
|
597
|
+
language: Tamil
|
|
598
|
+
|
|
599
|
+
- name: xcopa_id
|
|
600
|
+
display_name: XCOPA (Indonesian)
|
|
601
|
+
description: >
|
|
602
|
+
XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal reasoning dataset, a translation and reannotation of the English COPA. English COPA included questions that directly assess commonsense causal reasoning.
|
|
603
|
+
metric_groups:
|
|
604
|
+
- accuracy
|
|
605
|
+
- efficiency
|
|
606
|
+
- general_information
|
|
607
|
+
environment:
|
|
608
|
+
main_name: exact_match
|
|
609
|
+
main_split: test
|
|
610
|
+
taxonomy:
|
|
611
|
+
task: causal reasoning
|
|
612
|
+
what: commonsense causal reasoning questions translated into Indonesian
|
|
613
|
+
who: "?"
|
|
614
|
+
when: "?"
|
|
615
|
+
language: Indonesian, Tamil, Thai, Vietnamese
|
|
616
|
+
|
|
617
|
+
- name: xcopa_vi
|
|
618
|
+
display_name: XCOPA (Vietnamese)
|
|
619
|
+
description: >
|
|
620
|
+
XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal reasoning dataset, a translation and reannotation of the English COPA. English COPA included questions that directly assess commonsense causal reasoning.
|
|
621
|
+
metric_groups:
|
|
622
|
+
- accuracy
|
|
623
|
+
- efficiency
|
|
624
|
+
- general_information
|
|
625
|
+
environment:
|
|
626
|
+
main_name: exact_match
|
|
627
|
+
main_split: test
|
|
628
|
+
taxonomy:
|
|
629
|
+
task: causal reasoning
|
|
630
|
+
what: commonsense causal reasoning questions translated into Vietnamese
|
|
631
|
+
who: "?"
|
|
632
|
+
when: "?"
|
|
633
|
+
language: Vietnamese
|
|
634
|
+
|
|
635
|
+
- name: xcopa_th
|
|
636
|
+
display_name: XCOPA (Thai)
|
|
637
|
+
description: >
|
|
638
|
+
XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal reasoning dataset, a translation and reannotation of the English COPA. English COPA included questions that directly assess commonsense causal reasoning.
|
|
639
|
+
metric_groups:
|
|
640
|
+
- accuracy
|
|
641
|
+
- efficiency
|
|
642
|
+
- general_information
|
|
643
|
+
environment:
|
|
644
|
+
main_name: exact_match
|
|
645
|
+
main_split: test
|
|
646
|
+
taxonomy:
|
|
647
|
+
task: causal reasoning
|
|
648
|
+
what: commonsense causal reasoning questions translated into Thai
|
|
649
|
+
who: "?"
|
|
650
|
+
when: "?"
|
|
651
|
+
language: Thai
|
|
652
|
+
|
|
653
|
+
- name: xcopa_ta
|
|
654
|
+
display_name: XCOPA (Tamil)
|
|
655
|
+
description: >
|
|
656
|
+
XCOPA [(Ponti, 2020)](https://ducdauge.github.io/files/xcopa.pdf) is a causal reasoning dataset, a translation and reannotation of the English COPA. English COPA included questions that directly assess commonsense causal reasoning.
|
|
657
|
+
metric_groups:
|
|
658
|
+
- accuracy
|
|
659
|
+
- efficiency
|
|
660
|
+
- general_information
|
|
661
|
+
environment:
|
|
662
|
+
main_name: exact_match
|
|
663
|
+
main_split: test
|
|
664
|
+
taxonomy:
|
|
665
|
+
task: causal reasoning
|
|
666
|
+
what: commonsense causal reasoning questions translated into Tamil
|
|
667
|
+
who: "?"
|
|
668
|
+
when: "?"
|
|
669
|
+
language: Tamil
|
|
670
|
+
|
|
671
|
+
- name: lindsea_syntax_minimal_pairs_id
|
|
672
|
+
display_name: LINDSEA Syntax Minimal Pairs
|
|
673
|
+
description: >
|
|
674
|
+
LINDSEA minimal pairs is a linguistic diagnostic for syntax dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving pairs of sentences that differ minimally from each other and contrast in grammatical acceptability.
|
|
675
|
+
metric_groups:
|
|
676
|
+
- accuracy
|
|
677
|
+
- efficiency
|
|
678
|
+
- general_information
|
|
679
|
+
environment:
|
|
680
|
+
main_name: exact_match
|
|
681
|
+
main_split: test
|
|
682
|
+
taxonomy:
|
|
683
|
+
task: minimal pairs
|
|
684
|
+
what: sentence pairs with minimal differences and constrasting grammatical acceptability
|
|
685
|
+
who: "?"
|
|
686
|
+
when: "?"
|
|
687
|
+
language: Indonesian
|
|
688
|
+
|
|
689
|
+
- name: lindsea_pragmatics_presuppositions_id
|
|
690
|
+
display_name: LINDSEA Pragmatics Presuppositions
|
|
691
|
+
description: >
|
|
692
|
+
LINDSEA Pragmatics Presuppositions is a linguistic diagnostic for pragmatics dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving two formats: single and pair sentences. For single sentence questions, the system under test needs to determine if the sentence is true/false. For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn from another sentence.
|
|
693
|
+
metric_groups:
|
|
694
|
+
- accuracy
|
|
695
|
+
- efficiency
|
|
696
|
+
- general_information
|
|
697
|
+
environment:
|
|
698
|
+
main_name: exact_match
|
|
699
|
+
main_split: test
|
|
700
|
+
taxonomy:
|
|
701
|
+
task: pragmatic reasoning
|
|
702
|
+
what: presuppositions
|
|
703
|
+
who: "?"
|
|
704
|
+
when: "?"
|
|
705
|
+
language: Indonesian
|
|
706
|
+
|
|
707
|
+
- name: lindsea_pragmatics_scalar_implicatures_id
|
|
708
|
+
display_name: LINDSEA Pragmatics Scalar Implicatures
|
|
709
|
+
description: >
|
|
710
|
+
LINDSEA Pragmatics Scalar Implicatures is a linguistic diagnostic for pragmatics dataset from BHASA [(Leong, 2023)](https://arxiv.org/abs/2309.06085), involving two formats: single and pair sentences. For single sentence questions, the system under test needs to determine if the sentence is true/false. For pair sentence questions, the system under test needs to determine whether a conclusion can be drawn from another sentence.
|
|
711
|
+
metric_groups:
|
|
712
|
+
- accuracy
|
|
713
|
+
- efficiency
|
|
714
|
+
- general_information
|
|
715
|
+
environment:
|
|
716
|
+
main_name: exact_match
|
|
717
|
+
main_split: test
|
|
718
|
+
taxonomy:
|
|
719
|
+
task: pragmatic reasoning
|
|
720
|
+
what: scalar implicatures
|
|
721
|
+
who: "?"
|
|
722
|
+
when: "?"
|
|
723
|
+
language: Indonesian
|