crfm-helm 0.4.0__py3-none-any.whl → 0.5.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. Click here for more details.
- crfm_helm-0.5.10.dist-info/METADATA +369 -0
- crfm_helm-0.5.10.dist-info/RECORD +1008 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +80 -29
- helm/benchmark/adaptation/adapters/adapter.py +2 -2
- helm/benchmark/adaptation/adapters/adapter_factory.py +39 -28
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +24 -8
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +3 -4
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +2 -1
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +112 -0
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +6 -3
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +3 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +18 -8
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +5 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +46 -22
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +17 -29
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +138 -16
- helm/benchmark/adaptation/common_adapter_specs.py +443 -0
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/adaptation/request_state.py +6 -1
- helm/benchmark/adaptation/scenario_state.py +6 -2
- helm/benchmark/annotation/aci_bench_annotator.py +84 -0
- helm/benchmark/annotation/air_bench_annotator.py +79 -0
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/annotator.py +48 -0
- helm/benchmark/annotation/annotator_factory.py +50 -0
- helm/benchmark/annotation/anthropic_red_team_annotator.py +57 -0
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +258 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +82 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +96 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/financebench_annotator.py +79 -0
- helm/benchmark/annotation/harm_bench_annotator.py +55 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +93 -0
- helm/benchmark/annotation/image2struct/latex_compiler_annotator.py +59 -0
- helm/benchmark/annotation/image2struct/lilypond_compiler_annotator.py +86 -0
- helm/benchmark/annotation/image2struct/webpage_compiler_annotator.py +132 -0
- helm/benchmark/annotation/live_qa_annotator.py +76 -0
- helm/benchmark/annotation/med_dialog_annotator.py +88 -0
- helm/benchmark/annotation/medalign_annotator.py +89 -0
- helm/benchmark/annotation/medi_qa_annotator.py +87 -0
- helm/benchmark/annotation/medication_qa_annotator.py +86 -0
- helm/benchmark/annotation/mental_health_annotator.py +87 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +89 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +89 -0
- helm/benchmark/annotation/model_as_judge.py +309 -0
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +87 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +90 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +50 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +87 -0
- helm/benchmark/annotation/test_annotator_factory.py +26 -0
- helm/benchmark/annotation/test_dummy_annotator.py +44 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +100 -0
- helm/benchmark/annotation_executor.py +144 -0
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/data_augmenter.py +0 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +3 -3
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +26 -4
- helm/benchmark/augmentations/perturbation_description.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +29 -0
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +56 -19
- helm/benchmark/augmentations/translate_perturbation.py +31 -0
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/config_registry.py +7 -1
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +54 -25
- helm/benchmark/huggingface_registration.py +28 -10
- helm/benchmark/metrics/air_bench_metrics.py +3212 -0
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/annotation_metrics.py +108 -0
- helm/benchmark/metrics/basic_metrics.py +437 -667
- helm/benchmark/metrics/bbq_metrics.py +17 -6
- helm/benchmark/metrics/bias_metrics.py +18 -9
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/classification_metrics.py +107 -22
- helm/benchmark/metrics/cleva_accuracy_metrics.py +8 -5
- helm/benchmark/metrics/cleva_harms_metrics.py +12 -11
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/code_metrics_helper.py +11 -3
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +174 -0
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +83 -0
- helm/benchmark/metrics/copyright_metrics.py +5 -5
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +72 -0
- helm/benchmark/metrics/decodingtrust_ood_knowledge_metrics.py +66 -0
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +101 -0
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +202 -0
- helm/benchmark/metrics/disinformation_metrics.py +8 -114
- helm/benchmark/metrics/dry_run_metrics.py +35 -6
- helm/benchmark/metrics/efficiency_metrics.py +287 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +159 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +59 -0
- helm/benchmark/metrics/evaluate_reference_metrics.py +831 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +115 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +48 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +67 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +54 -0
- helm/benchmark/metrics/image_generation/aesthetics_scorer.py +66 -0
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +84 -0
- helm/benchmark/metrics/image_generation/denoised_runtime_metric.py +42 -0
- helm/benchmark/metrics/image_generation/detection_metrics.py +57 -0
- helm/benchmark/metrics/image_generation/detectors/base_detector.py +8 -0
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +178 -0
- helm/benchmark/metrics/image_generation/efficiency_metrics.py +41 -0
- helm/benchmark/metrics/image_generation/fidelity_metrics.py +168 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +63 -0
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +33 -0
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +50 -0
- helm/benchmark/metrics/image_generation/gender_metrics.py +58 -0
- helm/benchmark/metrics/image_generation/image_critique_metrics.py +284 -0
- helm/benchmark/metrics/image_generation/lpips_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/multi_scale_ssim_metrics.py +82 -0
- helm/benchmark/metrics/image_generation/nsfw_detector.py +96 -0
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +103 -0
- helm/benchmark/metrics/image_generation/nudity_metrics.py +38 -0
- helm/benchmark/metrics/image_generation/photorealism_critique_metrics.py +153 -0
- helm/benchmark/metrics/image_generation/psnr_metrics.py +78 -0
- helm/benchmark/metrics/image_generation/q16/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/q16/q16_toxicity_detector.py +90 -0
- helm/benchmark/metrics/image_generation/q16/test_q16.py +20 -0
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +48 -0
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +164 -0
- helm/benchmark/metrics/image_generation/uiqi_metrics.py +92 -0
- helm/benchmark/metrics/image_generation/watermark/__init__.py +0 -0
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +16 -0
- helm/benchmark/metrics/image_generation/watermark/watermark_detector.py +87 -0
- helm/benchmark/metrics/image_generation/watermark_metrics.py +48 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +48 -5
- helm/benchmark/metrics/kpi_edgar_metrics.py +142 -0
- helm/benchmark/metrics/language_modeling_metrics.py +111 -0
- helm/benchmark/metrics/live_qa_metrics.py +35 -0
- helm/benchmark/metrics/llm_jury_metrics.py +58 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/machine_translation_metrics.py +89 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +137 -0
- helm/benchmark/metrics/medec_metrics.py +124 -0
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/metric.py +121 -175
- helm/benchmark/metrics/metric_name.py +0 -1
- helm/benchmark/metrics/metric_service.py +23 -7
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +127 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/omni_math_metrics.py +44 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +5 -6
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/ranking_metrics.py +5 -5
- helm/benchmark/metrics/reference_metric.py +148 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/safety_metrics.py +91 -0
- helm/benchmark/metrics/seahelm_metrics.py +201 -0
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +8 -11
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +150 -11
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +145 -70
- helm/benchmark/metrics/test_disinformation_metrics.py +78 -0
- helm/benchmark/metrics/{test_basic_metrics.py → test_evaluate_reference_metrics.py} +20 -1
- helm/benchmark/metrics/test_metric.py +3 -3
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +11 -3
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +7 -7
- helm/benchmark/metrics/toxicity_metrics.py +37 -7
- helm/benchmark/metrics/toxicity_utils.py +23 -0
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/unitxt_metrics.py +107 -0
- helm/benchmark/metrics/vision_language/__init__.py +0 -0
- helm/benchmark/metrics/vision_language/emd_utils.py +347 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +537 -0
- helm/benchmark/metrics/vision_language/image_utils.py +100 -0
- helm/benchmark/metrics/wildbench_metrics.py +54 -0
- helm/benchmark/model_deployment_registry.py +69 -5
- helm/benchmark/model_metadata_registry.py +58 -2
- helm/benchmark/multi_gpu_runner.py +133 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +51 -20
- helm/benchmark/presentation/run_display.py +51 -12
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +83 -66
- helm/benchmark/presentation/summarize.py +483 -388
- helm/benchmark/presentation/table.py +8 -8
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_contamination.py +2 -2
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/presentation/test_run_entry.py +2 -2
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/presentation/test_summarize.py +148 -6
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +151 -87
- helm/benchmark/run_expander.py +418 -33
- helm/benchmark/run_spec.py +93 -0
- helm/benchmark/run_spec_factory.py +180 -0
- helm/benchmark/run_specs/__init__.py +0 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +58 -0
- helm/benchmark/run_specs/arabic_run_specs.py +197 -0
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/call_center_run_specs.py +201 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1393 -0
- helm/benchmark/run_specs/cleva_run_specs.py +277 -0
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/decodingtrust_run_specs.py +316 -0
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +224 -0
- helm/benchmark/run_specs/finance_run_specs.py +114 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +625 -0
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/instruction_following_run_specs.py +129 -0
- helm/benchmark/run_specs/lite_run_specs.py +307 -0
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +188 -0
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1570 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +191 -0
- helm/benchmark/run_specs/seahelm_run_specs.py +652 -0
- helm/benchmark/run_specs/simple_run_specs.py +104 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +167 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +51 -0
- helm/benchmark/run_specs/vlm_run_specs.py +1057 -0
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +63 -62
- helm/benchmark/runner_config_registry.py +21 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +149 -0
- helm/benchmark/scenarios/air_bench_scenario.py +76 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +27 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +82 -0
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +74 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +70 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +79 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +16 -1
- helm/benchmark/scenarios/banking77_scenario.py +77 -0
- helm/benchmark/scenarios/bbq_scenario.py +17 -2
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +112 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +18 -3
- helm/benchmark/scenarios/boolq_scenario.py +21 -1
- helm/benchmark/scenarios/call_center_scenario.py +84 -0
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +129 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +14 -1
- helm/benchmark/scenarios/clear_scenario.py +180 -0
- helm/benchmark/scenarios/cleva_scenario.py +482 -3
- helm/benchmark/scenarios/code_scenario.py +46 -4
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +33 -1
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +118 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +261 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +148 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +190 -0
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +143 -0
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +98 -0
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +344 -0
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +217 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +571 -0
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +80 -0
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +90 -0
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -3
- helm/benchmark/scenarios/dischargeme_scenario.py +196 -0
- helm/benchmark/scenarios/disinformation_scenario.py +32 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +25 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1541 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +77 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +33 -3
- helm/benchmark/scenarios/entity_matching_scenario.py +26 -2
- helm/benchmark/scenarios/ewok_scenario.py +116 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +139 -0
- helm/benchmark/scenarios/financebench_scenario.py +74 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +115 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +145 -0
- helm/benchmark/scenarios/gpqa_scenario.py +98 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +21 -2
- helm/benchmark/scenarios/gsm_scenario.py +31 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +61 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +70 -0
- helm/benchmark/scenarios/headqa_scenario.py +158 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/ice_scenario.py +28 -4
- helm/benchmark/scenarios/ifeval_scenario.py +71 -0
- helm/benchmark/scenarios/image_generation/__init__.py +0 -0
- helm/benchmark/scenarios/image_generation/common_syntactic_processes_scenario.py +105 -0
- helm/benchmark/scenarios/image_generation/cub200_scenario.py +95 -0
- helm/benchmark/scenarios/image_generation/daily_dalle_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/demographic_stereotypes_scenario.py +82 -0
- helm/benchmark/scenarios/image_generation/detection_scenario.py +83 -0
- helm/benchmark/scenarios/image_generation/draw_bench_scenario.py +74 -0
- helm/benchmark/scenarios/image_generation/i2p_scenario.py +57 -0
- helm/benchmark/scenarios/image_generation/landing_page_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/logos_scenario.py +223 -0
- helm/benchmark/scenarios/image_generation/magazine_cover_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/mental_disorders_scenario.py +46 -0
- helm/benchmark/scenarios/image_generation/mscoco_scenario.py +91 -0
- helm/benchmark/scenarios/image_generation/paint_skills_scenario.py +72 -0
- helm/benchmark/scenarios/image_generation/parti_prompts_scenario.py +94 -0
- helm/benchmark/scenarios/image_generation/radiology_scenario.py +42 -0
- helm/benchmark/scenarios/image_generation/relational_understanding_scenario.py +52 -0
- helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py +124 -0
- helm/benchmark/scenarios/image_generation/winoground_scenario.py +62 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +26 -3
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +111 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +98 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +172 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +149 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +61 -1
- helm/benchmark/scenarios/legal_support_scenario.py +24 -1
- helm/benchmark/scenarios/legalbench_scenario.py +45 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +23 -2
- helm/benchmark/scenarios/lextreme_scenario.py +22 -1
- helm/benchmark/scenarios/live_qa_scenario.py +94 -0
- helm/benchmark/scenarios/lm_entry_scenario.py +185 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +15 -1
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +81 -22
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +56 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +24 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +30 -1
- helm/benchmark/scenarios/medalign_scenario.py +117 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +326 -0
- helm/benchmark/scenarios/medbullets_scenario.py +167 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +149 -0
- helm/benchmark/scenarios/medec_scenario.py +148 -0
- helm/benchmark/scenarios/medhallu_scenario.py +95 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +134 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +96 -0
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +146 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +127 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +121 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +99 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +113 -0
- helm/benchmark/scenarios/mmlu_scenario.py +32 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +31 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +166 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +164 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +297 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +33 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +71 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +33 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +94 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -5
- helm/benchmark/scenarios/pubmed_qa_scenario.py +81 -43
- helm/benchmark/scenarios/quac_scenario.py +24 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +175 -0
- helm/benchmark/scenarios/raft_scenario.py +33 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +128 -0
- helm/benchmark/scenarios/scenario.py +44 -1
- helm/benchmark/scenarios/seahelm_scenario.py +2295 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +97 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +95 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +99 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +98 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +94 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +100 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +98 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +104 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +98 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +44 -0
- helm/benchmark/scenarios/simple_scenarios.py +122 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +109 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +119 -0
- helm/benchmark/scenarios/summarization_scenario.py +48 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +24 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_commonsense_scenario.py +21 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +29 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_financebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_gsm_scenario.py +31 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_legalbench_scenario.py +30 -0
- helm/benchmark/scenarios/test_math_scenario.py +4 -3
- helm/benchmark/scenarios/test_med_qa_scenario.py +30 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_mmlu_scenario.py +33 -0
- helm/benchmark/scenarios/test_narrativeqa_scenario.py +73 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_scenario.py +6 -3
- helm/benchmark/scenarios/test_simple_scenarios.py +50 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +239 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +26 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/unitxt_scenario.py +62 -0
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +4 -2
- helm/benchmark/scenarios/vicuna_scenario.py +22 -2
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +135 -0
- helm/benchmark/scenarios/vision_language/exams_v_scenario.py +104 -0
- helm/benchmark/scenarios/vision_language/fair_face_scenario.py +136 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +94 -0
- helm/benchmark/scenarios/vision_language/heim_human_eval_scenario.py +113 -0
- helm/benchmark/scenarios/vision_language/image2struct/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/chart2csv_scenario.py +55 -0
- helm/benchmark/scenarios/vision_language/image2struct/image2struct_scenario.py +225 -0
- helm/benchmark/scenarios/vision_language/image2struct/latex_scenario.py +21 -0
- helm/benchmark/scenarios/vision_language/image2struct/musicsheet_scenario.py +16 -0
- helm/benchmark/scenarios/vision_language/image2struct/utils_latex.py +339 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/__init__.py +0 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/driver.py +84 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/jekyll_server.py +182 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage/utils.py +31 -0
- helm/benchmark/scenarios/vision_language/image2struct/webpage_scenario.py +256 -0
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mementos_scenario.py +124 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/mme_scenario.py +148 -0
- helm/benchmark/scenarios/vision_language/mmmu_scenario.py +187 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/multipanelvqa_scenario.py +169 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/pope_scenario.py +105 -0
- helm/benchmark/scenarios/vision_language/real_world_qa_scenario.py +57 -0
- helm/benchmark/scenarios/vision_language/seed_bench_scenario.py +131 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +108 -0
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +98 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +4 -5
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +8 -4
- helm/benchmark/scenarios/wikifact_scenario.py +31 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +101 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +33 -2
- helm/benchmark/scenarios/xstest_scenario.py +35 -0
- helm/benchmark/server.py +32 -2
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +78 -50
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +269 -0
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_classic.yaml +259 -1140
- helm/benchmark/static/schema_cleva.yaml +768 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_decodingtrust.yaml +444 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_ewok.yaml +367 -0
- helm/benchmark/static/schema_finance.yaml +191 -0
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_image2struct.yaml +588 -0
- helm/benchmark/static/schema_instruction_following.yaml +161 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/schema_lite.yaml +3 -286
- helm/benchmark/static/schema_long_context.yaml +282 -0
- helm/benchmark/static/schema_medhelm.yaml +1176 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu.yaml +1449 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +283 -0
- helm/benchmark/static/schema_seahelm.yaml +723 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/schema_thai.yaml +244 -0
- helm/benchmark/static/schema_torr.yaml +474 -0
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_unitxt.yaml +370 -0
- helm/benchmark/static/schema_vhelm.yaml +933 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/air-overview-DpBbyagA.png +0 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/heim-logo-BJtQlEbV.png +0 -0
- helm/benchmark/static_build/assets/helm-safety-COfndXuS.png +0 -0
- helm/benchmark/static_build/assets/helmhero-D9TvmJsp.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-CND0EIsy.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-Cu2tphBB.png +0 -0
- helm/benchmark/static_build/assets/overview-BwypNWnk.png +0 -0
- helm/benchmark/static_build/assets/process-flow-DWDJC733.png +0 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/assets/vhelm-aspects-NiDQofvP.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-NxJE4fdA.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-ypCL5Yvq.png +0 -0
- helm/benchmark/static_build/config.js +4 -0
- helm/benchmark/static_build/index.html +19 -0
- helm/benchmark/test_data_preprocessor.py +3 -3
- helm/benchmark/test_run_expander.py +1 -1
- helm/benchmark/window_services/default_window_service.py +3 -45
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -15
- helm/benchmark/window_services/ice_window_service.py +1 -35
- helm/benchmark/window_services/image_generation/__init__.py +0 -0
- helm/benchmark/window_services/image_generation/clip_window_service.py +13 -0
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +9 -0
- helm/benchmark/window_services/image_generation/test_clip_window_service.py +29 -0
- helm/benchmark/window_services/image_generation/test_openai_dalle_window_service.py +30 -0
- helm/benchmark/window_services/local_window_service.py +22 -5
- helm/benchmark/window_services/test_anthropic_window_service.py +5 -4
- helm/benchmark/window_services/test_bloom_window_service.py +5 -4
- helm/benchmark/window_services/test_flan_t5_window_service.py +2 -1
- helm/benchmark/window_services/test_gpt2_window_service.py +9 -4
- helm/benchmark/window_services/test_gpt4_window_service.py +10 -4
- helm/benchmark/window_services/test_gptj_window_service.py +11 -5
- helm/benchmark/window_services/test_gptneox_window_service.py +6 -5
- helm/benchmark/window_services/test_openai_window_service.py +18 -12
- helm/benchmark/window_services/test_opt_window_service.py +6 -5
- helm/benchmark/window_services/test_palmyra_window_service.py +5 -4
- helm/benchmark/window_services/test_t0pp_window_service.py +5 -4
- helm/benchmark/window_services/test_t511b_window_service.py +5 -4
- helm/benchmark/window_services/test_ul2_window_service.py +5 -4
- helm/benchmark/window_services/test_utils.py +6 -6
- helm/benchmark/window_services/test_yalm_window_service.py +5 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -13
- helm/benchmark/window_services/window_service.py +42 -0
- helm/benchmark/window_services/window_service_factory.py +4 -1
- helm/benchmark/window_services/yalm_window_service.py +1 -28
- helm/clients/__init__.py +0 -0
- helm/{proxy/clients → clients}/ai21_client.py +78 -12
- helm/clients/aleph_alpha_client.py +114 -0
- helm/{proxy/clients → clients}/anthropic_client.py +304 -21
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +122 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +199 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +210 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +191 -0
- helm/clients/audio_language/qwen_audiolm_client.py +153 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/audio_language/test.py +62 -0
- helm/{proxy/clients → clients}/auto_client.py +72 -31
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +381 -0
- helm/clients/bedrock_utils.py +105 -0
- helm/{proxy/clients → clients}/client.py +92 -17
- helm/clients/clip_score_client.py +49 -0
- helm/clients/clip_scorers/__init__.py +0 -0
- helm/clients/clip_scorers/base_clip_scorer.py +18 -0
- helm/clients/clip_scorers/clip_scorer.py +50 -0
- helm/clients/clip_scorers/multilingual_clip_scorer.py +50 -0
- helm/{proxy/clients → clients}/cohere_client.py +105 -14
- helm/clients/dspy_client.py +135 -0
- helm/clients/gcs_client.py +82 -0
- helm/{proxy/clients → clients}/google_client.py +8 -6
- helm/clients/google_translate_client.py +35 -0
- helm/clients/grok_client.py +36 -0
- helm/{proxy/clients → clients}/http_model_client.py +8 -8
- helm/{proxy/clients → clients}/huggingface_client.py +157 -86
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +269 -0
- helm/clients/image_generation/__init__.py +0 -0
- helm/clients/image_generation/adobe_vision_client.py +80 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +100 -0
- helm/clients/image_generation/cogview2/__init__.py +0 -0
- helm/clients/image_generation/cogview2/coglm_strategy.py +96 -0
- helm/clients/image_generation/cogview2/coglm_utils.py +82 -0
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +15 -0
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +99 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +254 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_sampling.py +190 -0
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +144 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_model.py +269 -0
- helm/clients/image_generation/cogview2/sr_pipeline/itersr_sampling.py +120 -0
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +42 -0
- helm/clients/image_generation/cogview2_client.py +192 -0
- helm/clients/image_generation/dalle2_client.py +194 -0
- helm/clients/image_generation/dalle3_client.py +108 -0
- helm/clients/image_generation/dalle_mini/__init__.py +3 -0
- helm/clients/image_generation/dalle_mini/data.py +442 -0
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +175 -0
- helm/clients/image_generation/dalle_mini/model/modeling.py +1834 -0
- helm/clients/image_generation/dalle_mini/model/partitions.py +84 -0
- helm/clients/image_generation/dalle_mini/model/processor.py +63 -0
- helm/clients/image_generation/dalle_mini/model/text.py +251 -0
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +9 -0
- helm/clients/image_generation/dalle_mini/model/utils.py +29 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/configuration_vqgan.py +40 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +107 -0
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +610 -0
- helm/clients/image_generation/dalle_mini_client.py +191 -0
- helm/clients/image_generation/deep_floyd_client.py +80 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +250 -0
- helm/clients/image_generation/image_generation_client_utils.py +9 -0
- helm/clients/image_generation/lexica_client.py +88 -0
- helm/clients/image_generation/mindalle/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/__init__.py +216 -0
- helm/clients/image_generation/mindalle/models/stage1/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +312 -0
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +103 -0
- helm/clients/image_generation/mindalle/models/stage2/__init__.py +0 -0
- helm/clients/image_generation/mindalle/models/stage2/layers.py +144 -0
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +268 -0
- helm/clients/image_generation/mindalle/models/tokenizer.py +30 -0
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -0
- helm/clients/image_generation/mindalle/utils/config.py +129 -0
- helm/clients/image_generation/mindalle/utils/sampling.py +149 -0
- helm/clients/image_generation/mindalle/utils/utils.py +89 -0
- helm/clients/image_generation/mindalle_client.py +116 -0
- helm/clients/image_generation/nudity_check_client.py +64 -0
- helm/clients/image_generation/together_image_generation_client.py +113 -0
- helm/{proxy/clients → clients}/lit_gpt_client.py +6 -6
- helm/{proxy/clients → clients}/megatron_client.py +7 -5
- helm/clients/mistral_client.py +180 -0
- helm/clients/moderation_api_client.py +111 -0
- helm/clients/nvidia_nim_client.py +32 -0
- helm/clients/open_lm_client.py +43 -0
- helm/clients/openai_client.py +604 -0
- helm/clients/openai_responses_client.py +200 -0
- helm/clients/openrouter_client.py +31 -0
- helm/{proxy/clients → clients}/palmyra_client.py +31 -14
- helm/{proxy/clients → clients}/perspective_api_client.py +18 -14
- helm/clients/reka_client.py +190 -0
- helm/clients/simple_client.py +64 -0
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +95 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/{proxy/clients → clients}/test_auto_client.py +13 -15
- helm/clients/test_client.py +98 -0
- helm/{proxy/clients → clients}/test_huggingface_client.py +31 -16
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/test_simple_client.py +19 -0
- helm/clients/test_together_client.py +184 -0
- helm/clients/together_client.py +599 -0
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +488 -0
- helm/clients/vision_language/__init__.py +0 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +148 -0
- helm/clients/vision_language/huggingface_vlm_client.py +114 -0
- helm/{proxy/clients → clients}/vision_language/idefics_client.py +61 -51
- helm/clients/vision_language/open_flamingo/__init__.py +2 -0
- helm/clients/vision_language/open_flamingo/src/__init__.py +0 -0
- helm/clients/vision_language/open_flamingo/src/factory.py +147 -0
- helm/clients/vision_language/open_flamingo/src/flamingo.py +337 -0
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +155 -0
- helm/clients/vision_language/open_flamingo/src/helpers.py +267 -0
- helm/clients/vision_language/open_flamingo/src/utils.py +47 -0
- helm/clients/vision_language/open_flamingo_client.py +155 -0
- helm/clients/vision_language/paligemma_client.py +147 -0
- helm/clients/vision_language/palmyra_vision_client.py +101 -0
- helm/clients/vision_language/qwen2_vlm_client.py +189 -0
- helm/clients/vision_language/qwen_vlm_client.py +174 -0
- helm/clients/vllm_client.py +80 -0
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +105 -0
- helm/clients/yi_client.py +28 -0
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +23 -33
- helm/common/cache_backend_config.py +47 -0
- helm/common/clip_score_request.py +41 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +10 -2
- helm/common/file_caches/__init__.py +0 -0
- helm/common/file_caches/file_cache.py +16 -0
- helm/common/file_caches/local_file_cache.py +61 -0
- helm/common/file_caches/test_local_file_cache.py +25 -0
- helm/common/file_upload_request.py +27 -0
- helm/common/general.py +10 -3
- helm/common/hierarchical_logger.py +124 -12
- helm/common/image_generation_parameters.py +25 -0
- helm/common/images_utils.py +60 -5
- helm/common/key_value_store.py +41 -10
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +14 -1
- helm/common/moderations_api_request.py +71 -0
- helm/common/mongo_key_value_store.py +8 -7
- helm/common/multimodal_request_utils.py +57 -0
- helm/common/nudity_check_request.py +29 -0
- helm/common/object_spec.py +23 -8
- helm/common/optional_dependencies.py +1 -1
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +45 -19
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_general.py +10 -0
- helm/common/test_logging.py +94 -0
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +1 -10
- helm/config/model_deployments.yaml +4713 -1005
- helm/config/model_metadata.yaml +4045 -255
- helm/config/tokenizer_configs.yaml +1091 -50
- helm/proxy/accounts.py +31 -4
- helm/proxy/cli.py +6 -4
- helm/proxy/critique/mechanical_turk_critique_importer.py +3 -0
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/critique/model_critique_client.py +40 -10
- helm/proxy/example_queries.py +33 -28
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +82 -18
- helm/proxy/services/remote_service.py +32 -7
- helm/proxy/services/server_service.py +71 -69
- helm/proxy/services/service.py +30 -6
- helm/proxy/services/test_remote_service.py +6 -5
- helm/proxy/services/test_service.py +1 -13
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +61 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +462 -0
- helm/proxy/test_accounts.py +32 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +37 -37
- helm/proxy/token_counters/test_auto_token_counter.py +164 -0
- helm/proxy/token_counters/token_counter.py +3 -5
- helm/tokenizers/__init__.py +0 -0
- helm/tokenizers/ai21_tokenizer.py +52 -0
- helm/{proxy/tokenizers → tokenizers}/aleph_alpha_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/auto_tokenizer.py +9 -12
- helm/{proxy/tokenizers → tokenizers}/caching_tokenizer.py +2 -30
- helm/tokenizers/cohere_tokenizer.py +50 -0
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/{proxy/tokenizers → tokenizers}/http_model_tokenizer.py +4 -4
- helm/{proxy/tokenizers → tokenizers}/huggingface_tokenizer.py +44 -41
- helm/{proxy/tokenizers → tokenizers}/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/simple_tokenizer.py +33 -0
- helm/tokenizers/test_ai21_tokenizer.py +48 -0
- helm/{proxy/tokenizers → tokenizers}/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_huggingface_tokenizer.py +9 -2
- helm/tokenizers/test_simple_tokenizer.py +33 -0
- helm/{proxy/tokenizers → tokenizers}/test_yalm_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tiktoken_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/tokenizer.py +3 -1
- helm/{proxy/tokenizers → tokenizers}/vertexai_tokenizer.py +1 -1
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer.py +8 -6
- helm/tokenizers/yalm_tokenizer_data/__init__.py +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/yalm_tokenizer_data/voc_100b.sp +0 -0
- helm/{proxy/tokenizers → tokenizers}/yalm_tokenizer_data/yalm_tokenizer.py +1 -1
- crfm_helm-0.4.0.dist-info/METADATA +0 -264
- crfm_helm-0.4.0.dist-info/RECORD +0 -397
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/run_specs.py +0 -2762
- helm/benchmark/scenarios/numeracy_scenario.py +0 -784
- helm/benchmark/static/benchmarking.css +0 -156
- helm/benchmark/static/benchmarking.js +0 -1705
- helm/benchmark/static/config.js +0 -3
- helm/benchmark/static/images/helm-logo.png +0 -0
- helm/benchmark/static/images/language-model-helm.png +0 -0
- helm/benchmark/static/images/organizations/ai21.png +0 -0
- helm/benchmark/static/images/organizations/anthropic.png +0 -0
- helm/benchmark/static/images/organizations/bigscience.png +0 -0
- helm/benchmark/static/images/organizations/cohere.png +0 -0
- helm/benchmark/static/images/organizations/eleutherai.png +0 -0
- helm/benchmark/static/images/organizations/google.png +0 -0
- helm/benchmark/static/images/organizations/meta.png +0 -0
- helm/benchmark/static/images/organizations/microsoft.png +0 -0
- helm/benchmark/static/images/organizations/nvidia.png +0 -0
- helm/benchmark/static/images/organizations/openai.png +0 -0
- helm/benchmark/static/images/organizations/together.png +0 -0
- helm/benchmark/static/images/organizations/tsinghua-keg.png +0 -0
- helm/benchmark/static/images/organizations/yandex.png +0 -0
- helm/benchmark/static/images/scenarios-by-metrics.png +0 -0
- helm/benchmark/static/images/taxonomy-scenarios.png +0 -0
- helm/benchmark/static/index.html +0 -68
- helm/benchmark/static/json-urls.js +0 -69
- helm/benchmark/static/plot-captions.js +0 -27
- helm/benchmark/static/utils.js +0 -285
- helm/benchmark/test_model_deployment_definition.py +0 -92
- helm/benchmark/test_model_properties.py +0 -1570
- helm/benchmark/vlm_run_specs.py +0 -97
- helm/benchmark/window_services/ai21_window_service.py +0 -258
- helm/benchmark/window_services/cohere_window_service.py +0 -163
- helm/benchmark/window_services/flan_t5_window_service.py +0 -29
- helm/benchmark/window_services/gpt2_window_service.py +0 -32
- helm/benchmark/window_services/huggingface_window_service.py +0 -60
- helm/benchmark/window_services/t0pp_window_service.py +0 -35
- helm/benchmark/window_services/t511b_window_service.py +0 -30
- helm/benchmark/window_services/test_ai21_window_service.py +0 -163
- helm/benchmark/window_services/test_cohere_window_service.py +0 -74
- helm/benchmark/window_services/test_cohere_window_service_utils.py +0 -8328
- helm/benchmark/window_services/test_ice_window_service.py +0 -326
- helm/benchmark/window_services/test_mt_nlg_window_service.py +0 -48
- helm/benchmark/window_services/ul2_window_service.py +0 -30
- helm/benchmark/window_services/wider_ai21_window_service.py +0 -24
- helm/common/cache_utils.py +0 -14
- helm/proxy/clients/aleph_alpha_client.py +0 -95
- helm/proxy/clients/goose_ai_client.py +0 -99
- helm/proxy/clients/microsoft_client.py +0 -180
- helm/proxy/clients/openai_client.py +0 -206
- helm/proxy/clients/simple_client.py +0 -60
- helm/proxy/clients/test_client.py +0 -49
- helm/proxy/clients/test_together_client.py +0 -97
- helm/proxy/clients/together_client.py +0 -334
- helm/proxy/clients/vertexai_client.py +0 -115
- helm/proxy/token_counters/ai21_token_counter.py +0 -20
- helm/proxy/token_counters/cohere_token_counter.py +0 -13
- helm/proxy/token_counters/free_token_counter.py +0 -12
- helm/proxy/token_counters/gooseai_token_counter.py +0 -24
- helm/proxy/token_counters/openai_token_counter.py +0 -22
- helm/proxy/token_counters/test_ai21_token_counter.py +0 -88
- helm/proxy/token_counters/test_openai_token_counter.py +0 -81
- helm/proxy/tokenizers/ai21_tokenizer.py +0 -60
- helm/proxy/tokenizers/anthropic_tokenizer.py +0 -52
- helm/proxy/tokenizers/cohere_tokenizer.py +0 -83
- helm/proxy/tokenizers/ice_tokenizer.py +0 -30
- helm/proxy/tokenizers/simple_tokenizer.py +0 -32
- helm/proxy/tokenizers/test_ice_tokenizer.py +0 -57
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.4.0.dist-info → crfm_helm-0.5.10.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → annotation}/__init__.py +0 -0
- /helm/{proxy/clients → benchmark/annotation/image2struct}/__init__.py +0 -0
- /helm/{proxy/clients/vision_language → benchmark/metrics/ifeval}/__init__.py +0 -0
- /helm/{proxy/tokenizers → benchmark/metrics/image_generation}/__init__.py +0 -0
- /helm/{proxy/tokenizers/yalm_tokenizer_data → benchmark/metrics/image_generation/detectors}/__init__.py +0 -0
- /helm/benchmark/{static/images/crfm-logo.png → static_build/assets/crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/{static/images/helm-logo-simple.png → static_build/assets/helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/{proxy/clients → clients}/ai21_utils.py +0 -0
- /helm/{proxy/clients → clients}/cohere_utils.py +0 -0
- /helm/{proxy/clients → clients}/lit_gpt_generate.py +0 -0
- /helm/{proxy/clients → clients}/toxicity_classifier_client.py +0 -0
- /helm/{benchmark → proxy}/static/general.js +0 -0
- /helm/{benchmark → proxy}/static/info-icon.png +0 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from typing import List, Set
|
|
2
|
+
import csv
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
from helm.common.general import ensure_file_downloaded
|
|
6
|
+
from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class RelationalUnderstandingScenario(Scenario):
|
|
10
|
+
"""
|
|
11
|
+
From Testing Relational Understanding in Text-Guided Image Generation, based on existing cognitive,
|
|
12
|
+
linguistic, and developmental literature, the authors created a set of 15 relations (8 physical,
|
|
13
|
+
7 agentic) and a set of 12 entities (6 objects, 6 agents). The physical relations were: in, on,
|
|
14
|
+
under, covering, near, occluded by, hanging over, and tied to. The agentic relations were: pushing,
|
|
15
|
+
pulling, touching, hitting, kicking, helping, and hindering. The objects were: box, cylinder,
|
|
16
|
+
blanket, bowl, teacup, and knife. The agents were: man, woman, child, robot, monkey, and iguana.
|
|
17
|
+
|
|
18
|
+
The authors created 5 different prompts for each relation, by randomly sampling two entities five
|
|
19
|
+
times, resulting in 75 distinct basic relation prompts (e.g., a monkey touching an iguana). Withs
|
|
20
|
+
these prompts, the authors showed that DALL-E 2 suffers from a significant lack of commonsense
|
|
21
|
+
reasoning in the form of relational understanding.
|
|
22
|
+
|
|
23
|
+
Paper: https://arxiv.org/abs/2208.00005
|
|
24
|
+
Website: https://osf.io/sm68h
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
name = "relational_understanding"
|
|
28
|
+
description = (
|
|
29
|
+
"Consists of 75 basic relation prompts that tests commonsense reasoning "
|
|
30
|
+
"([paper](https://arxiv.org/abs/2208.00005))."
|
|
31
|
+
)
|
|
32
|
+
tags = ["text-to-image", "reasoning"]
|
|
33
|
+
|
|
34
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
35
|
+
data_path: str = os.path.join(output_path, "choice_data.csv")
|
|
36
|
+
ensure_file_downloaded(source_url="https://osf.io/download/tb3a4", target_path=data_path)
|
|
37
|
+
|
|
38
|
+
instances: List[Instance] = []
|
|
39
|
+
seen_prompts: Set[str] = set()
|
|
40
|
+
with open(data_path) as csv_file:
|
|
41
|
+
csv_reader = csv.reader(csv_file, delimiter=",")
|
|
42
|
+
for i, row in enumerate(csv_reader):
|
|
43
|
+
if i == 0:
|
|
44
|
+
# Skip the header
|
|
45
|
+
continue
|
|
46
|
+
|
|
47
|
+
prompt: str = row[1]
|
|
48
|
+
if prompt not in seen_prompts:
|
|
49
|
+
instances.append(Instance(Input(text=prompt), references=[], split=TEST_SPLIT))
|
|
50
|
+
seen_prompts.add(prompt)
|
|
51
|
+
|
|
52
|
+
return instances
|
helm/benchmark/scenarios/image_generation/time_most_significant_historical_figures_scenario.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TIMEMostSignificantHistoricalFigures(Scenario):
|
|
7
|
+
"""
|
|
8
|
+
People from TIME's "The 100 Most Significant Figures in History" list.
|
|
9
|
+
|
|
10
|
+
https://ideas.time.com/2013/12/10/whos-biggest-the-100-most-significant-figures-in-history/
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
HISTORICAL_FIGURES: List[str] = [
|
|
14
|
+
"Jesus",
|
|
15
|
+
"Napoleon Bonaparte",
|
|
16
|
+
"Muhammad",
|
|
17
|
+
"William Shakespeare",
|
|
18
|
+
"Abraham Lincoln",
|
|
19
|
+
"George Washington",
|
|
20
|
+
"Adolf Hitler",
|
|
21
|
+
"Aristotle",
|
|
22
|
+
"Alexander the Great",
|
|
23
|
+
"Thomas Jefferson",
|
|
24
|
+
"Henry VIII of England",
|
|
25
|
+
"Charles Darwin",
|
|
26
|
+
"Elizabeth I of England",
|
|
27
|
+
"Karl Marx",
|
|
28
|
+
"Julius Caesar",
|
|
29
|
+
"Queen Victoria",
|
|
30
|
+
"Martin Luther",
|
|
31
|
+
"Joseph Stalin",
|
|
32
|
+
"Albert Einstein",
|
|
33
|
+
"Christopher Columbus",
|
|
34
|
+
"Isaac Newton",
|
|
35
|
+
"Charlemagne",
|
|
36
|
+
"Theodore Roosevelt",
|
|
37
|
+
"Wolfgang Amadeus Mozart",
|
|
38
|
+
"Plato",
|
|
39
|
+
"Louis XIV of France",
|
|
40
|
+
"Ludwig van Beethoven",
|
|
41
|
+
"Ulysses S.Grant",
|
|
42
|
+
"Leonardo da Vinci",
|
|
43
|
+
"Augustus",
|
|
44
|
+
"Carl Linnaeus",
|
|
45
|
+
"Ronald Reagan",
|
|
46
|
+
"Charles Dickens",
|
|
47
|
+
"Paul the Apostle",
|
|
48
|
+
"Benjamin Franklin",
|
|
49
|
+
# "George W.Bush",
|
|
50
|
+
"Winston Churchill",
|
|
51
|
+
"Genghis Khan",
|
|
52
|
+
"Charles I of England",
|
|
53
|
+
"Thomas Edison",
|
|
54
|
+
"James I of England",
|
|
55
|
+
"Friedrich Nietzsche",
|
|
56
|
+
"Franklin D.Roosevelt",
|
|
57
|
+
"Sigmund Freud",
|
|
58
|
+
"Alexander Hamilton",
|
|
59
|
+
"Mohandas Karamchand Gandhi",
|
|
60
|
+
"Woodrow Wilson",
|
|
61
|
+
"Johann Sebastian Bach",
|
|
62
|
+
"Galileo Galilei",
|
|
63
|
+
"Oliver Cromwell",
|
|
64
|
+
"James Madison",
|
|
65
|
+
"Gautama Buddha",
|
|
66
|
+
"Mark Twain",
|
|
67
|
+
"Edgar Allan Poe",
|
|
68
|
+
"Joseph Smith, Jr.",
|
|
69
|
+
"Adam Smith",
|
|
70
|
+
"David, King of Israel",
|
|
71
|
+
"George III of the United Kingdom",
|
|
72
|
+
"Immanuel Kant",
|
|
73
|
+
"James Cook",
|
|
74
|
+
"John Adams",
|
|
75
|
+
"Richard Wagner",
|
|
76
|
+
"Pyotr Ilyich Tchaikovsky",
|
|
77
|
+
"Voltaire",
|
|
78
|
+
"Saint Peter",
|
|
79
|
+
"Andrew Jackson",
|
|
80
|
+
"Constantine the Great",
|
|
81
|
+
"Socrates",
|
|
82
|
+
"Elvis Presley",
|
|
83
|
+
"William the Conqueror",
|
|
84
|
+
"John F.Kennedy",
|
|
85
|
+
"Augustine of Hippo",
|
|
86
|
+
"Vincent van Gogh",
|
|
87
|
+
"Nicolaus Copernicus",
|
|
88
|
+
"Vladimir Lenin",
|
|
89
|
+
"Robert E.Lee",
|
|
90
|
+
"Oscar Wilde",
|
|
91
|
+
"Charles II of England",
|
|
92
|
+
"Cicero",
|
|
93
|
+
"Jean-Jacques Rousseau",
|
|
94
|
+
"Francis Bacon",
|
|
95
|
+
"Richard Nixon",
|
|
96
|
+
"Louis XVI of France",
|
|
97
|
+
"Charles V, Holy Roman Emperor",
|
|
98
|
+
"King Arthur",
|
|
99
|
+
"Michelangelo",
|
|
100
|
+
"Philip II of Spain",
|
|
101
|
+
"Johann Wolfgang von Goethe",
|
|
102
|
+
"Ali, founder of Sufism",
|
|
103
|
+
"Thomas Aquinas",
|
|
104
|
+
"Pope John Paul II",
|
|
105
|
+
"René Descartes",
|
|
106
|
+
"Nikola Tesla",
|
|
107
|
+
"Harry S.Truman",
|
|
108
|
+
"Joan of Arc",
|
|
109
|
+
"Dante Alighieri",
|
|
110
|
+
"Otto von Bismarck",
|
|
111
|
+
"Grover Cleveland",
|
|
112
|
+
"John Calvin",
|
|
113
|
+
"John Locke",
|
|
114
|
+
]
|
|
115
|
+
|
|
116
|
+
name = "time_most_significant_historical_figures"
|
|
117
|
+
description = 'People from TIME\'s "The 100 Most Significant Figures in History" list.'
|
|
118
|
+
tags = ["text-to-image", "knowledge"]
|
|
119
|
+
|
|
120
|
+
def get_instances(self, _) -> List[Instance]:
|
|
121
|
+
return [
|
|
122
|
+
Instance(Input(text=historical_figure), references=[], split=TEST_SPLIT)
|
|
123
|
+
for historical_figure in self.HISTORICAL_FIGURES
|
|
124
|
+
]
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
from datasets import load_dataset
|
|
5
|
+
|
|
6
|
+
from helm.common.general import get_file_name
|
|
7
|
+
from helm.common.images_utils import copy_image
|
|
8
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
9
|
+
from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, Output, Reference, CORRECT_TAG, TEST_SPLIT
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class WinogroundScenario(Scenario):
|
|
13
|
+
"""
|
|
14
|
+
Winoground is a novel task and dataset for evaluating the ability of vision and language models
|
|
15
|
+
to conduct visio-linguistic compositional reasoning. Given two images and two captions, the
|
|
16
|
+
goal is to match them correctly—but crucially, both captions contain a completely identical set
|
|
17
|
+
of words/morphemes, only in a different order. The dataset was carefully hand-curated by
|
|
18
|
+
expert annotators and is labeled with a rich set of fine-grained tags to assist in analyzing
|
|
19
|
+
model performance.
|
|
20
|
+
|
|
21
|
+
Users must agree to share their contact information before downloading the dataset from
|
|
22
|
+
Hugging Face. Either agree to the terms and set HUGGING_FACE_ACCESS_TOKEN to an access token
|
|
23
|
+
of a valid Hugging Face account or have the dataset pre-downloaded at the Hugging Face cache
|
|
24
|
+
(default path: ~/.cache/huggingface/datasets).
|
|
25
|
+
|
|
26
|
+
Paper: https://arxiv.org/abs/2204.03162
|
|
27
|
+
Website: https://huggingface.co/datasets/facebook/winoground
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
name = "winoground"
|
|
31
|
+
description = (
|
|
32
|
+
"Winoground is a novel task and dataset for evaluating the ability of vision and language models "
|
|
33
|
+
"to conduct visio-linguistic compositional reasoning "
|
|
34
|
+
"([paper](https://arxiv.org/abs/2204.03162))."
|
|
35
|
+
)
|
|
36
|
+
tags = ["text-to-image", "image-to-text", "visual_reasoning"]
|
|
37
|
+
|
|
38
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
39
|
+
auth_token: str = os.environ.get("HUGGING_FACE_ACCESS_TOKEN", "")
|
|
40
|
+
|
|
41
|
+
instances: List[Instance] = []
|
|
42
|
+
for row in load_dataset("facebook/winoground", split="test", use_auth_token=auth_token):
|
|
43
|
+
# Use the first example of the pair for now (index 0)
|
|
44
|
+
caption: str = row["caption_0"]
|
|
45
|
+
image_path: str = row["image_0"].filename
|
|
46
|
+
|
|
47
|
+
# Create a copy of the image in the benchmark output folder for metrics computation
|
|
48
|
+
image_copy_path: str = os.path.join(output_path, get_file_name(image_path))
|
|
49
|
+
if not os.path.exists(image_copy_path):
|
|
50
|
+
copy_image(image_path, image_copy_path)
|
|
51
|
+
content: MultimediaObject = MultimediaObject(
|
|
52
|
+
[MediaObject(content_type="image/png", location=image_copy_path)]
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
instances.append(
|
|
56
|
+
Instance(
|
|
57
|
+
input=Input(text=caption),
|
|
58
|
+
references=[Reference(Output(multimedia_content=content), tags=[CORRECT_TAG])],
|
|
59
|
+
split=TEST_SPLIT,
|
|
60
|
+
)
|
|
61
|
+
)
|
|
62
|
+
return instances
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from typing import Any, List, Dict
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from datasets import load_dataset
|
|
4
|
+
from helm.common.hierarchical_logger import hlog
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
Scenario,
|
|
7
|
+
Instance,
|
|
8
|
+
Reference,
|
|
9
|
+
TRAIN_SPLIT,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
CORRECT_TAG,
|
|
12
|
+
Input,
|
|
13
|
+
Output,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class IMDB_PTBRScenario(Scenario):
|
|
18
|
+
"""
|
|
19
|
+
The IMDB dataset is a widely-used benchmark dataset for natural language processing (NLP)
|
|
20
|
+
particularly for text classification and sentiment analysis.
|
|
21
|
+
This is a translated version that is meant to evaluate PT-BR models.
|
|
22
|
+
It consists of movie reviews from the Internet Movie Database (IMDB) and
|
|
23
|
+
includes both positive and negative sentiments labeled for supervised learning.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
name = "simple_classification"
|
|
27
|
+
description = "Classify movie reviews between positive or negative."
|
|
28
|
+
tags = ["classification"]
|
|
29
|
+
|
|
30
|
+
def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
|
|
31
|
+
instances: List[Instance] = []
|
|
32
|
+
label_names = {0: "negativo", 1: "positivo"}
|
|
33
|
+
for example in dataset[split]:
|
|
34
|
+
input = Input(text=example["text"])
|
|
35
|
+
# NOTE: For classification scenarios, the reference outputs should be the same
|
|
36
|
+
# for all instances, and should include both correct and incorrect classes.
|
|
37
|
+
# HELM only supports single-label classification. Exactly one reference
|
|
38
|
+
# should have the CORRECT_TAG tag.
|
|
39
|
+
references = [
|
|
40
|
+
Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
|
|
41
|
+
]
|
|
42
|
+
instance = Instance(input=input, references=references, split=split)
|
|
43
|
+
instances.append(instance)
|
|
44
|
+
return instances
|
|
45
|
+
|
|
46
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
47
|
+
instances: List[Instance] = []
|
|
48
|
+
cache_dir = str(Path(output_path) / "data")
|
|
49
|
+
dataset = load_dataset("maritaca-ai/imdb_pt", cache_dir=cache_dir)
|
|
50
|
+
splits: Dict[str, str] = {
|
|
51
|
+
"train": TRAIN_SPLIT,
|
|
52
|
+
"test": TEST_SPLIT,
|
|
53
|
+
}
|
|
54
|
+
for split in splits:
|
|
55
|
+
if split not in splits.keys():
|
|
56
|
+
hlog(f"{split} split doesn't exist, skipping")
|
|
57
|
+
continue
|
|
58
|
+
instances.extend(self.process_dataset(dataset, splits[split]))
|
|
59
|
+
|
|
60
|
+
return instances
|
|
@@ -1,9 +1,20 @@
|
|
|
1
1
|
import os
|
|
2
2
|
from typing import List, Dict, Optional
|
|
3
3
|
|
|
4
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
4
5
|
from helm.common.general import ensure_file_downloaded
|
|
5
|
-
from .scenario import
|
|
6
|
-
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Scenario,
|
|
8
|
+
Instance,
|
|
9
|
+
Reference,
|
|
10
|
+
CORRECT_TAG,
|
|
11
|
+
TRAIN_SPLIT,
|
|
12
|
+
VALID_SPLIT,
|
|
13
|
+
Input,
|
|
14
|
+
Output,
|
|
15
|
+
ScenarioMetadata,
|
|
16
|
+
)
|
|
17
|
+
from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
|
|
7
18
|
|
|
8
19
|
|
|
9
20
|
class IMDBScenario(Scenario):
|
|
@@ -107,7 +118,6 @@ class IMDBScenario(Scenario):
|
|
|
107
118
|
contrast_map = {}
|
|
108
119
|
|
|
109
120
|
for orig_line, contrast_line in zip(orig_and_contrast_inputs[0], orig_and_contrast_inputs[1]):
|
|
110
|
-
|
|
111
121
|
orig_label_name, orig_context = orig_line.strip().split("\t")
|
|
112
122
|
orig_label = label_name_to_id[orig_label_name]
|
|
113
123
|
|
|
@@ -135,3 +145,16 @@ class IMDBScenario(Scenario):
|
|
|
135
145
|
for split in [TRAIN_SPLIT, VALID_SPLIT]:
|
|
136
146
|
instances.extend(self.get_split_instances(target_path, split, contrast_map))
|
|
137
147
|
return instances
|
|
148
|
+
|
|
149
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
150
|
+
return ScenarioMetadata(
|
|
151
|
+
name="imdb",
|
|
152
|
+
display_name="IMDB",
|
|
153
|
+
description="The IMDB benchmark for sentiment analysis in movie review [(Maas et al., "
|
|
154
|
+
"2011)](https://aclanthology.org/P11-1015/).",
|
|
155
|
+
taxonomy=TaxonomyInfo(
|
|
156
|
+
task="sentiment analysis", what="movie reviews", when="?", who="?", language="English"
|
|
157
|
+
),
|
|
158
|
+
main_metric="quasi_exact_match",
|
|
159
|
+
main_split="valid",
|
|
160
|
+
)
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from datasets import load_dataset, Features, Value, Sequence, Dataset
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
8
|
+
from helm.benchmark.scenarios.scenario import (
|
|
9
|
+
Scenario,
|
|
10
|
+
Instance,
|
|
11
|
+
Input,
|
|
12
|
+
Reference,
|
|
13
|
+
Output,
|
|
14
|
+
CORRECT_TAG,
|
|
15
|
+
TEST_SPLIT,
|
|
16
|
+
ScenarioMetadata,
|
|
17
|
+
)
|
|
18
|
+
from helm.common.general import ensure_directory_exists
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class InfiniteBenchEnMCScenario(Scenario):
|
|
22
|
+
"""InfiniteBench En.MC
|
|
23
|
+
|
|
24
|
+
InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
|
|
25
|
+
understand, and reason over long contexts (100k+ tokens). InfiniteBench En.MC is a subset of
|
|
26
|
+
InfiniteBench that requires models to perform multiple-choice question answering on questions that necessitate
|
|
27
|
+
long-range dependency and reasoning, beyond simple short passage retrieval.
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
name = "infinite_bench_en_mc"
|
|
31
|
+
description = "∞Bench En.MC is a multiple-choice question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))" # noqa: E501
|
|
32
|
+
tags = ["question_answering"]
|
|
33
|
+
|
|
34
|
+
def __init__(self, max_num_words: int):
|
|
35
|
+
self.max_num_words = max_num_words
|
|
36
|
+
super().__init__()
|
|
37
|
+
|
|
38
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
39
|
+
# Get InfiniteBench from HuggingFace
|
|
40
|
+
cache_dir = os.path.join(output_path, "data")
|
|
41
|
+
ensure_directory_exists(cache_dir)
|
|
42
|
+
|
|
43
|
+
# Define the features schema
|
|
44
|
+
ft = Features(
|
|
45
|
+
{
|
|
46
|
+
"id": Value("int64"),
|
|
47
|
+
"context": Value("string"),
|
|
48
|
+
"input": Value("string"),
|
|
49
|
+
"answer": Sequence(Value("string")),
|
|
50
|
+
"options": Sequence(Value("string")),
|
|
51
|
+
}
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# Load the dataset with the specified features
|
|
55
|
+
dataset = load_dataset(
|
|
56
|
+
"xinrongzhang2022/InfiniteBench",
|
|
57
|
+
split="longbook_choice_eng",
|
|
58
|
+
features=ft,
|
|
59
|
+
revision="90f0394333616266d9fe85824ceaf505093cbaa5",
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
assert isinstance(dataset, Dataset)
|
|
63
|
+
|
|
64
|
+
def count_words(text: str) -> int:
|
|
65
|
+
return len(re.split(r"\s+", text.strip()))
|
|
66
|
+
|
|
67
|
+
dataset = dataset.filter(
|
|
68
|
+
lambda example: count_words(example["context"])
|
|
69
|
+
+ count_words(example["input"])
|
|
70
|
+
+ sum(count_words(option) for option in example["options"])
|
|
71
|
+
<= self.max_num_words
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
# Read all instances
|
|
75
|
+
instances: List[Instance] = []
|
|
76
|
+
for row in dataset:
|
|
77
|
+
assert len(row["answer"]) == 1
|
|
78
|
+
id = row["id"]
|
|
79
|
+
input = Input(text=row["context"] + "\n\n" + row["input"])
|
|
80
|
+
references = [
|
|
81
|
+
Reference(Output(text=option), tags=[CORRECT_TAG] if option == row["answer"][0] else [])
|
|
82
|
+
for option in row["options"]
|
|
83
|
+
]
|
|
84
|
+
instance = Instance(
|
|
85
|
+
id=id,
|
|
86
|
+
input=input,
|
|
87
|
+
references=references,
|
|
88
|
+
split=TEST_SPLIT,
|
|
89
|
+
)
|
|
90
|
+
instances.append(instance)
|
|
91
|
+
|
|
92
|
+
return instances
|
|
93
|
+
|
|
94
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
95
|
+
return ScenarioMetadata(
|
|
96
|
+
name="infinite_bench_en_mc",
|
|
97
|
+
display_name="∞Bench En.MC",
|
|
98
|
+
description="∞Bench En.MC is a multiple-choice question answering task that requires "
|
|
99
|
+
"locating and processing information within a novel, performing reasoning "
|
|
100
|
+
"through aggregation or filtering to derive answers. ([Zhang et al., "
|
|
101
|
+
"2024](https://arxiv.org/abs/2402.13718))",
|
|
102
|
+
taxonomy=TaxonomyInfo(
|
|
103
|
+
task="multiple-choice question answering",
|
|
104
|
+
what="Novels",
|
|
105
|
+
when="Before 2024",
|
|
106
|
+
who="Novel authors",
|
|
107
|
+
language="English",
|
|
108
|
+
),
|
|
109
|
+
main_metric="exact_match",
|
|
110
|
+
main_split="test",
|
|
111
|
+
)
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from datasets import load_dataset, Features, Value, Sequence, Dataset
|
|
6
|
+
|
|
7
|
+
from helm.benchmark.scenarios.scenario import (
|
|
8
|
+
Scenario,
|
|
9
|
+
Instance,
|
|
10
|
+
Input,
|
|
11
|
+
Reference,
|
|
12
|
+
Output,
|
|
13
|
+
CORRECT_TAG,
|
|
14
|
+
TEST_SPLIT,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.general import ensure_directory_exists
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class InfiniteBenchEnQAScenario(Scenario):
|
|
20
|
+
"""InfiniteBench En.QA
|
|
21
|
+
|
|
22
|
+
InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
|
|
23
|
+
understand, and reason over long contexts (100k+ tokens). InfiniteBench En.QA is a subset of
|
|
24
|
+
InfiniteBench that requires models to perform open-form question answering on questions that necessitate
|
|
25
|
+
long-range dependency and reasoning, beyond simple short passage retrieval.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
name = "infinite_bench_en_qa"
|
|
29
|
+
description = "∞Bench En.QA is an open-ended question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))" # noqa: E501
|
|
30
|
+
tags = ["question_answering"]
|
|
31
|
+
|
|
32
|
+
def __init__(self, max_num_words: int):
|
|
33
|
+
self.max_num_words = max_num_words
|
|
34
|
+
super().__init__()
|
|
35
|
+
|
|
36
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
37
|
+
# Get InfiniteBench from HuggingFace
|
|
38
|
+
cache_dir = os.path.join(output_path, "data")
|
|
39
|
+
ensure_directory_exists(cache_dir)
|
|
40
|
+
|
|
41
|
+
# Define the features schema
|
|
42
|
+
ft = Features(
|
|
43
|
+
{
|
|
44
|
+
"id": Value("int64"),
|
|
45
|
+
"context": Value("string"),
|
|
46
|
+
"input": Value("string"),
|
|
47
|
+
"answer": Sequence(Value("string")),
|
|
48
|
+
"options": Sequence(Value("string")),
|
|
49
|
+
}
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Load the dataset with the specified features
|
|
53
|
+
dataset = load_dataset(
|
|
54
|
+
"xinrongzhang2022/InfiniteBench",
|
|
55
|
+
split="longbook_qa_eng",
|
|
56
|
+
features=ft,
|
|
57
|
+
revision="90f0394333616266d9fe85824ceaf505093cbaa5",
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
assert isinstance(dataset, Dataset)
|
|
61
|
+
|
|
62
|
+
def count_words(text: str) -> int:
|
|
63
|
+
return len(re.split(r"\s+", text.strip()))
|
|
64
|
+
|
|
65
|
+
dataset = dataset.filter(
|
|
66
|
+
lambda example: count_words(example["context"])
|
|
67
|
+
+ count_words(example["input"])
|
|
68
|
+
+ sum(count_words(option) for option in example["options"])
|
|
69
|
+
<= self.max_num_words
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
# Read all instances
|
|
73
|
+
instances: List[Instance] = []
|
|
74
|
+
for row in dataset:
|
|
75
|
+
id = row["id"]
|
|
76
|
+
input = Input(text=row["context"] + "\n\n" + row["input"])
|
|
77
|
+
instance = Instance(
|
|
78
|
+
id=id,
|
|
79
|
+
input=input,
|
|
80
|
+
references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
|
|
81
|
+
split=TEST_SPLIT,
|
|
82
|
+
)
|
|
83
|
+
instances.append(instance)
|
|
84
|
+
|
|
85
|
+
return instances
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from typing import List
|
|
4
|
+
from datasets import load_dataset, Features, Value, Sequence, Dataset
|
|
5
|
+
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
|
|
6
|
+
from helm.benchmark.scenarios.scenario import (
|
|
7
|
+
Scenario,
|
|
8
|
+
Instance,
|
|
9
|
+
Input,
|
|
10
|
+
Reference,
|
|
11
|
+
Output,
|
|
12
|
+
CORRECT_TAG,
|
|
13
|
+
TEST_SPLIT,
|
|
14
|
+
ScenarioMetadata,
|
|
15
|
+
)
|
|
16
|
+
from helm.common.general import ensure_directory_exists
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class InfiniteBenchEnSumScenario(Scenario):
|
|
20
|
+
"""InfiniteBench En.Sum
|
|
21
|
+
|
|
22
|
+
InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
|
|
23
|
+
understand, and reason over super long contexts (100k+ tokens). InfiniteBench En.Sum is a subset of
|
|
24
|
+
InfiniteBench that requires models to generate a concise summary of the novel.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
name = "infinite_bench_en_sum"
|
|
28
|
+
description = "∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))" # noqa: E501
|
|
29
|
+
tags = ["summarization"]
|
|
30
|
+
|
|
31
|
+
def __init__(self, max_num_words: int):
|
|
32
|
+
self.max_num_words = max_num_words
|
|
33
|
+
super().__init__()
|
|
34
|
+
|
|
35
|
+
def get_instances(self, output_path: str) -> List[Instance]:
|
|
36
|
+
# Get InfiniteBench from HuggingFace
|
|
37
|
+
cache_dir = os.path.join(output_path, "data")
|
|
38
|
+
ensure_directory_exists(cache_dir)
|
|
39
|
+
|
|
40
|
+
# Define the features schema
|
|
41
|
+
ft = Features(
|
|
42
|
+
{
|
|
43
|
+
"id": Value("int64"),
|
|
44
|
+
"context": Value("string"),
|
|
45
|
+
"input": Value("string"),
|
|
46
|
+
"answer": Sequence(Value("string")),
|
|
47
|
+
"options": Sequence(Value("string")),
|
|
48
|
+
}
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
# Load the dataset with the specified features
|
|
52
|
+
dataset = load_dataset(
|
|
53
|
+
"xinrongzhang2022/InfiniteBench",
|
|
54
|
+
split="longbook_sum_eng",
|
|
55
|
+
features=ft,
|
|
56
|
+
revision="90f0394333616266d9fe85824ceaf505093cbaa5",
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
assert isinstance(dataset, Dataset)
|
|
60
|
+
|
|
61
|
+
def count_words(text: str) -> int:
|
|
62
|
+
return len(re.split(r"\s+", text.strip()))
|
|
63
|
+
|
|
64
|
+
dataset = dataset.filter(
|
|
65
|
+
lambda example: count_words(example["context"]) + count_words(example["input"]) <= self.max_num_words
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
# Read all instances
|
|
69
|
+
instances: List[Instance] = []
|
|
70
|
+
for row in dataset:
|
|
71
|
+
id = row["id"]
|
|
72
|
+
input = Input(text=row["context"] + "\n\n" + row["input"])
|
|
73
|
+
instance = Instance(
|
|
74
|
+
id=id,
|
|
75
|
+
input=input,
|
|
76
|
+
references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
|
|
77
|
+
split=TEST_SPLIT,
|
|
78
|
+
)
|
|
79
|
+
instances.append(instance)
|
|
80
|
+
|
|
81
|
+
return instances
|
|
82
|
+
|
|
83
|
+
def get_metadata(self) -> ScenarioMetadata:
|
|
84
|
+
return ScenarioMetadata(
|
|
85
|
+
name="infinite_bench_en_sum",
|
|
86
|
+
display_name="∞Bench En.Sum",
|
|
87
|
+
description="∞Bench En.Sum is a summarization task that requires generating a concise "
|
|
88
|
+
"summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))",
|
|
89
|
+
taxonomy=TaxonomyInfo(
|
|
90
|
+
task="multi-hop question answering",
|
|
91
|
+
what="Novels",
|
|
92
|
+
when="Before 2024",
|
|
93
|
+
who="Novel authors",
|
|
94
|
+
language="English",
|
|
95
|
+
),
|
|
96
|
+
main_metric="rouge_l",
|
|
97
|
+
main_split="test",
|
|
98
|
+
)
|
|
@@ -2,8 +2,8 @@ import os
|
|
|
2
2
|
from typing import List
|
|
3
3
|
|
|
4
4
|
from helm.common.general import ensure_file_downloaded
|
|
5
|
-
from .scenario import Instance, TRAIN_SPLIT, TEST_SPLIT
|
|
6
|
-
from .mmlu_scenario import MMLUScenario
|
|
5
|
+
from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, TEST_SPLIT
|
|
6
|
+
from helm.benchmark.scenarios.mmlu_scenario import MMLUScenario
|
|
7
7
|
|
|
8
8
|
|
|
9
9
|
class InteractiveQAMMLUScenario(MMLUScenario):
|