crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/vision_language/blink_scenario.py
@@ -0,0 +1,140 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class BlinkScenario(Scenario):
+    """
+    BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans “within a blink”,
+    but pose significant challenges for VLMs.
+
+    Website: https://zeyofu.github.io/blink/
+
+    @article{fu2024blink,
+        title={BLINK: Multimodal Large Language Models Can See but Not Perceive},
+        author={Fu, Xingyu and Hu, Yushi and Li, Bangzheng and Feng, Yu and Wang, Haoyu and Lin, Xudong and Roth,
+                Dan and Smith, Noah A and Ma, Wei-Chiu and Krishna, Ranjay},
+        journal={arXiv preprint arXiv:2404.12390},
+        year={2024}
+    }
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "BLINK-Benchmark/BLINK"
+
+    VALID_CATEGORIES: List[str] = [
+        "Art_Style",
+        "Counting",
+        "Forensic_Detection",
+        "Functional_Correspondence",
+        "IQ_Test",
+        "Jigsaw",
+        "Multi-view_Reasoning",
+        "Object_Localization",
+        "Relative_Depth",
+        "Relative_Reflectance",
+        "Semantic_Correspondence",
+        "Spatial_Relation",
+        "Visual_Correspondence",
+        "Visual_Similarity",
+    ]
+
+    name = "blink"
+    description = (
+        "BLINK is a benchmark containing 14 visual perception tasks that can be solved by humans within a blink, "
+        "but pose significant challenges for VLMs. ([Fu, 2024](https://arxiv.org/abs/2404.12390))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def __init__(self, category: str):
+        super().__init__()
+
+        if category not in self.VALID_CATEGORIES:
+            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
+        self._category: str = category
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        def save_image(image) -> str:
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+            return local_image_path
+
+        def get_image_header(image_index: int) -> str:
+            if image_index == 1:
+                return "First image:"
+            elif image_index == 2:
+                return "Second image:"
+            elif image_index == 3:
+                return "Third image:"
+            elif image_index == 4:
+                return "Fourth image:"
+            else:
+                raise ValueError(f"Invalid image index: {image_index}")
+
+        instances: List[Instance] = []
+        for row in tqdm(
+            load_dataset(self.HUGGINGFACE_DATASET_NAME, self._category, split="val", cache_dir=output_path)
+        ):
+            # Save the image(s) to disk
+            has_multiple_images: bool = row["image_2"] is not None
+            content: List[MediaObject] = []
+
+            if has_multiple_images:
+                # An example can have up to 4 images
+                for i in range(1, 5):
+                    image_i = row[f"image_{i}"]
+                    if image_i is None:
+                        break
+
+                    # Before each image, include a header text that indicates which number image it is.
+                    # Some prompts refer to specific image numbers within the question, e.g.,
+                    # "Given three similar but different images, take the first image as reference.
+                    # Can you tell which one of the latter two images is most similar to the first one?
+                    # Select from the following choices. (A) the second image (B) the third image"
+                    image_path: str = save_image(image_i)
+                    content.extend(
+                        [
+                            MediaObject(text=get_image_header(i), content_type="text/plain"),
+                            MediaObject(location=image_path, content_type="image/jpeg"),
+                        ]
+                    )
+            else:
+                image1 = row["image_1"]
+                image1_path: str = save_image(image1)
+                content.append(MediaObject(location=image1_path, content_type="image/jpeg"))
+
+            # Add the prompt that has both the question and the answer choices
+            prompt: str = row["prompt"]
+            # Replace (A), (B), (C), (D) with \nA. \nB. \nC. \nD. since we are just expecting the letter answer
+            prompt = prompt.replace("(A)", "\nA.").replace("(B)", "\nB.").replace("(C)", "\nC.").replace("(D)", "\nD.")
+            content.append(MediaObject(text=prompt, content_type="text/plain"))
+
+            # The answer has the correct letter choices surrounded by parentheses
+            paren_letter_answer: str = row["answer"]
+            assert (
+                paren_letter_answer[0] == "(" and paren_letter_answer[-1] == ")"
+            ), f"Unexpected answer format: {paren_letter_answer}"
+            letter_answer: str = paren_letter_answer[1]
+            references: List[Reference] = [
+                Reference(output=Output(text=letter_answer), tags=[CORRECT_TAG]),
+            ]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
+            )
+
+        return instances
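For readers skimming the hunk above, the prompt and answer handling in BlinkScenario is the one non-obvious step; below is a minimal standalone sketch of that logic. The prompt and answer strings are invented for illustration and are not taken from the dataset or from this diff.

# Illustration of the option reformatting and answer parsing used in BlinkScenario above.
prompt = "Which of the latter two images matches the reference? (A) the second image (B) the third image"
# "(A)"/"(B)"/... become newline-prefixed letters, since only the bare letter is expected from the model.
prompt = prompt.replace("(A)", "\nA.").replace("(B)", "\nB.").replace("(C)", "\nC.").replace("(D)", "\nD.")

answer = "(B)"  # BLINK answers wrap the correct letter in parentheses
assert answer[0] == "(" and answer[-1] == ")"
letter_answer = answer[1]  # "B", stored as the single correct Reference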
helm/benchmark/scenarios/vision_language/mm_star_scenario.py
@@ -0,0 +1,95 @@
+from typing import List
+import os
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.images_utils import generate_hash
+
+
+class MMStarScenario(Scenario):
+    """
+    MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples meticulously
+    selected by humans. MMStar is designed to benchmark 6 core capabilities and 18 detailed axes, aiming to evaluate
+    the multi-modal capacities of LVLMs with a carefully balanced and purified selection of samples. The samples
+    are first roughly selected from current benchmarks with an automated pipeline, strict human review is then
+    involved to ensure each selected sample exhibits visual dependency, minimal data leakage, and requires advanced
+    multi-modal capabilities for the solution.
+
+    Website: https://mmstar-benchmark.github.io/
+
+    @article{chen2024we,
+        title={Are We on the Right Way for Evaluating Large Vision-Language Models?},
+        author={Chen, Lin and Li, Jinsong and Dong, Xiaoyi and Zhang, Pan and Zang, Yuhang and Chen, Zehui and Duan,
+                Haodong and Wang, Jiaqi and Qiao, Yu and Lin, Dahua and others},
+        journal={arXiv preprint arXiv:2403.20330},
+        year={2024}
+    }
+    """
+
+    HUGGINGFACE_DATASET_NAME: str = "Lin-Chen/MMStar"
+
+    VALID_CATEGORIES: List[str] = [
+        "coarse perception",
+        "fine-grained perception",
+        "instance reasoning",
+        "logical reasoning",
+        "math",
+        "science technology",
+    ]
+
+    name = "mm_star"
+    description = (
+        "MM-STAR is an elite vision-indispensable multi-modal benchmark comprising 1,500 challenge samples "
+        "meticulously selected by humans."
+        "([Chen, 2024](https://arxiv.org/abs/2403.20330))."
+    )
+    tags = ["vision-language", "knowledge", "reasoning"]
+
+    def __init__(self, category: str):
+        super().__init__()
+
+        category = category.replace("_", " ")
+        if category not in self.VALID_CATEGORIES:
+            raise ValueError(f"Invalid category: {category}. Valid categories are: {self.VALID_CATEGORIES}")
+        if category == "science technology":
+            category = "science & technology"
+
+        self._category: str = category
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+
+        for row in tqdm(load_dataset(self.HUGGINGFACE_DATASET_NAME, split="val", cache_dir=output_path)):
+            # Filter by category
+            category: str = row["category"]
+            if category != self._category:
+                continue
+
+            # Save the image to disk
+            image = row["image"]
+            image_file_name: str = generate_hash(image) + ".jpg"
+            local_image_path: str = os.path.join(output_path, image_file_name)
+            if not os.path.exists(local_image_path):
+                image.save(local_image_path)
+
+            content: List[MediaObject] = [
+                MediaObject(location=local_image_path, content_type="image/jpeg"),
+                MediaObject(text=row["question"], content_type="text/plain"),
+            ]
+            references: List[Reference] = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instances.append(
+                Instance(Input(multimedia_content=MultimediaObject(content)), references=references, split=VALID_SPLIT)
+            )
+
+        return instances
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py
@@ -0,0 +1,75 @@
+from collections import defaultdict
+from typing import List
+import json
+import os
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Instance,
+    Input,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.media_object import MediaObject, MultimediaObject
+from helm.common.general import ensure_file_downloaded
+
+
+class MSRVTTScenario(Scenario):
+    """
+    A large-scale video benchmark for video understanding, especially the emerging task of translating video to text.
+    This is achieved by collecting 257 popular queries from a commercial video search engine, with 118 videos for
+    each query. In its current version, MSR-VTT provides 10K web video clips with 41.2 hours and 200K clip-sentence
+    pairs in total, covering the most comprehensive categories and diverse visual content, and representing the
+    largest dataset in terms of sentence and vocabulary. Each clip is annotated with about 20 natural sentences
+    by 1,327 AMT workers.
+
+    Website link: https://cove.thecvf.com/datasets/839
+
+    Citation:
+    MSR-VTT: A Large Video Description Dataset for Bridging Video and Language Jun Xu, Tao Mei, Ting Yao, Yong Rui
+    CVPR 2016
+    """
+
+    DOWNLOAD_URL: str = "https://www.robots.ox.ac.uk/~maxbain/frozen-in-time/data/MSRVTT.zip"
+
+    name = "msr_vtt"
+    description = "Video captioning dataset with 10K web video clips and 200K clip-sentence pairs."
+    tags = ["vision-language", "video", "captioning"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the dataset
+        target_path: str = os.path.join(output_path, "data")
+        ensure_file_downloaded(
+            source_url=self.DOWNLOAD_URL,
+            target_path=target_path,
+            unpack=True,
+        )
+
+        annotation_path: str = os.path.join(target_path, "annotation", "MSR_VTT.json")
+        with open(annotation_path, "r") as f:
+            annotations = json.load(f)["annotations"]
+
+        video_path_to_annotations: dict[str, set[str]] = defaultdict(set)
+        for annotation in annotations:
+            video_id: str = annotation["image_id"]
+            video_path: str = os.path.join(target_path, "videos", "all", f"{video_id}.mp4")
+            assert os.path.exists(video_path), f"Video does not exist at path: {video_path}"
+            video_path_to_annotations[video_path].add(annotation["caption"])
+
+        instances: List[Instance] = []
+        for video_path, captions in video_path_to_annotations.items():
+            content: List[MediaObject] = [
+                MediaObject(location=video_path, content_type="video/mp4"),
+            ]
+            references: List[Reference] = [Reference(Output(text=caption), tags=[CORRECT_TAG]) for caption in captions]
+            instances.append(
+                Instance(
+                    Input(multimedia_content=MultimediaObject(content)),
+                    references=references,
+                    split=TEST_SPLIT,
+                )
+            )
+
+        return instances
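The defaultdict grouping above is what collapses MSR-VTT's flat annotation list into one multi-reference instance per video; a toy illustration follows. The records are invented, shaped like the entries under "annotations" in MSR_VTT.json, and this snippet is not part of the diff.

from collections import defaultdict

annotations = [
    {"image_id": "video1", "caption": "a person rides a bike"},
    {"image_id": "video1", "caption": "someone cycles down a road"},
    {"image_id": "video2", "caption": "a chef chops vegetables"},
]

# Group all captions for the same clip, so each video yields one instance with several correct references.
video_to_captions: dict = defaultdict(set)
for annotation in annotations:
    video_to_captions[annotation["image_id"]].add(annotation["caption"])

assert len(video_to_captions["video1"]) == 2  # two gold captions for the same clip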
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py
@@ -0,0 +1,88 @@
+import os
+from typing import List
+
+from datasets import DatasetDict, load_dataset
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    Input,
+    Instance,
+    Output,
+    Reference,
+    Scenario,
+)
+from helm.common.general import ensure_directory_exists
+from helm.common.media_object import MediaObject, MultimediaObject
+
+
+class VQARadScenario(Scenario):
+    """
+    VQARad scenario: Processes a visual question answering dataset with radiology images.
+
+    Each record in the dataset has:
+    - image
+    - question
+    - answer
+
+    The output is formatted as:
+    "Answer: <answer>"
+    """
+
+    HUGGING_FACE_DATASET_PATH: str = "flaviagiammarino/vqa-rad"
+
+    name = "vqa_rad"
+    description = "Visual question answering with radiology images."
+    tags = [
+        "vision-language",
+        "visual question answering",
+        "reasoning",
+        "medical",
+        "radiology",
+    ]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset: DatasetDict = load_dataset(self.HUGGING_FACE_DATASET_PATH)
+
+        splits = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"}
+        instances: List[Instance] = []
+        # Iterate over the splits
+        for (
+            helm_split_name,
+            dataset_split_name,
+        ) in splits.items():
+            split_path: str = os.path.join(output_path, dataset_split_name)
+            ensure_directory_exists(split_path)
+
+            split_data = dataset[dataset_split_name]
+
+            for index, example in enumerate(split_data):
+                question = example["question"]
+                image = example["image"]
+                answer = example["answer"]
+
+                # Convert PIL image to MediaObject
+                image_path = os.path.join(split_path, f"{index}.jpg")
+                image.save(image_path)
+
+                content = [
+                    MediaObject(location=image_path, content_type="image/jpeg"),
+                    MediaObject(text=question, content_type="text/plain"),
+                ]
+
+                # Format the final answer
+                instances.append(
+                    Instance(
+                        input=Input(multimedia_content=MultimediaObject(content)),
+                        references=[
+                            Reference(
+                                Output(text=answer),
+                                tags=[CORRECT_TAG],
+                            )
+                        ],
+                        split=helm_split_name,
+                    )
+                )
+
+        return instances
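As a rough orientation for how these new scenario classes are exercised, here is a hypothetical standalone sketch using the VQARadScenario added above; the output directory name is arbitrary, the snippet is not part of the diff, and in normal use HELM drives scenarios through run specs rather than direct calls.

# Hypothetical direct use of VQARadScenario; the same pattern applies to the other
# new scenario classes in this diff (a Scenario subclass plus get_instances(output_path)).
from helm.benchmark.scenarios.vision_language.vqa_rad_scenario import VQARadScenario

scenario = VQARadScenario()
instances = scenario.get_instances(output_path="vqa_rad_data")  # downloads the HF dataset and saves images locally
print(len(instances))
print(instances[0].references[0].output.text)  # gold answer for the first example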
helm/benchmark/scenarios/wikifact_scenario.py
@@ -4,7 +4,17 @@ import json
 
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, flatten_list
 from helm.common.hierarchical_logger import hlog
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 PID_TO_NAME = {
     "P136": "genre",
helm/benchmark/scenarios/wikitext_103_scenario.py
@@ -4,7 +4,7 @@ from typing import List
 
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
 
 
 class Wikitext103Scenario(Scenario):
@@ -0,0 +1,83 @@
+import datasets
+import os
+from typing import List
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    Input,
+)
+from helm.common.general import ensure_directory_exists
+
+
+SUBSETS = ["v2"]
+REFERENCE_MODELS = ["gpt-4-turbo-2024-04-09", "claude-3-haiku-20240307", "Llama-2-70b-chat-hf"]
+
+
+class WildBenchScenario(Scenario):
+    """WildBench: Benchmarking LLMs with Challenging Tasks from Real Users in the Wild
+
+    WildBench is a benchmark for evaluating large language models (LLMs) on challenging tasks
+    that are more representative of real-world applications. The examples are collected from
+    real users by the AI2 WildChat project."""
+
+    name = "wildbench"
+    description = "Benchmarking LLMs with Challenging Tasks from Real Users in the Wild"
+    tags = ["instruction following"]
+
+    def __init__(self, subset: str, use_model_outputs: bool = False):
+        super().__init__()
+        assert subset in SUBSETS, "Unknown subset: {}".format(subset)
+        self.subset = subset
+        self.use_model_outputs = use_model_outputs
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get WildBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "allenai/WildBench",
+            self.subset,
+            cache_dir=cache_dir,
+            split="test",
+            revision="7c05c1b4550282b2ed6a2e6ac5db069f1e07df5c",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+        if self.use_model_outputs:
+            baseline_outputs = {
+                f"{model}": datasets.load_dataset(
+                    "allenai/WildBench-V2-Model-Outputs",
+                    model,
+                    cache_dir=cache_dir,
+                    split="train",
+                    revision="d6755bc68220df853c0825a733430f73f5af2501",
+                )
+                for model in REFERENCE_MODELS
+            }
+            assert all(isinstance(baseline_output, datasets.Dataset) for baseline_output in baseline_outputs.values())
+
+        # Read all instances
+        instances: List[Instance] = []
+        for idx, row in enumerate(dataset):
+            input = Input(
+                messages=[
+                    {"role": message["role"], "content": message["content"]} for message in row["conversation_input"]
+                ]
+            )
+            extra_data = {
+                "checklist": row["checklist"],
+            }
+            if self.use_model_outputs:
+                extra_data["baseline_outputs"] = {
+                    model: baseline_outputs[model][idx]["output"][0] for model in REFERENCE_MODELS
+                }
+            instance = Instance(
+                input=input,
+                references=[],
+                split=TEST_SPLIT,
+                extra_data=extra_data,
+            )
+            instances.append(instance)
+
+        return instances
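A short sketch of exercising the new WildBench scenario directly, assuming the module path helm.benchmark.scenarios.wildbench_scenario (not shown in this hunk); it needs network access to HuggingFace:

    import tempfile

    # Assumed module path for the class defined above.
    from helm.benchmark.scenarios.wildbench_scenario import WildBenchScenario

    scenario = WildBenchScenario(subset="v2", use_model_outputs=False)
    with tempfile.TemporaryDirectory() as output_path:
        instances = scenario.get_instances(output_path)
        first = instances[0]
        print(first.input.messages[0]["role"])  # chat-style input, typically "user"
        print(first.extra_data["checklist"])    # per-instance grading checklist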
@@ -0,0 +1,78 @@
+import csv
+import os
+from typing import Dict, List
+
+from helm.common.general import ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+
+
+class Winogrande_Afr_Scenario(Scenario):
+    """
+    https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages
+    """
+
+    name = "winogrande_afr"
+    description = "Winogrande (S) translated into 11 African low-resource languages"
+    tags = ["knowledge", "multiple_choice", "low_resource_languages"]
+
+    def __init__(self, lang: str = "af"):
+        super().__init__()
+        self.lang: str = lang
+
+    def download_winogrande_afr(self, path: str):
+        ensure_file_downloaded(
+            source_url="https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/raw/refs/heads/main/data/evaluation_benchmarks_afr_release.zip",  # noqa: E501
+            target_path=path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+
+    def process_csv(self, csv_path: str, split: str, pseudo_split: str) -> List[Instance]:
+        # Match naming in Winogrande
+        if pseudo_split == "val":
+            pseudo_split = "train_s"
+        instances: List[Instance] = []
+        hlog(f"Reading {csv_path}")
+        with open(csv_path) as f:
+            reader = csv.reader(f, delimiter=",")
+            next(reader, None)  # skip the header
+            for row in reader:
+                if row[-1] != pseudo_split:  # ensure correct split is taken
+                    continue
+                question, answers, correct_choice = row[-5], row[-4:-2], row[-2]
+                answers_dict = dict(zip(["1", "2"], answers))
+                correct_answer: str = answers_dict[correct_choice]
+
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+                instance = Instance(
+                    input=Input(text=question),
+                    references=list(map(answer_to_reference, answers)),
+                    split=split,
+                )
+                instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        desired_dir = "winogrande_s"
+        data_path: str = os.path.join(output_path, desired_dir)
+        self.download_winogrande_afr(data_path)
+
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "val": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            csv_path: str = os.path.join(data_path, desired_dir, f"winogrande_{self.lang}.csv")
+            if not os.path.exists(csv_path):
+                hlog(f"{csv_path} doesn't exist, skipping")
+                continue
+            instances.extend(self.process_csv(csv_path, splits[split], split))
+
+        return instances
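The column slicing in process_csv above can be checked against a hypothetical row (values are illustrative, not taken from the real CSV): the question sits five columns from the end, followed by the two answer options, the correct choice ("1" or "2"), and the pseudo-split label:

    # Hypothetical row matching the slicing in process_csv:
    # ..., question, option1, option2, correct_choice, pseudo_split
    row = ["id-001", "The trophy doesn't fit in the suitcase because the _ is too big.",
           "trophy", "suitcase", "1", "train_s"]
    question, answers, correct_choice = row[-5], row[-4:-2], row[-2]
    assert answers == ["trophy", "suitcase"] and correct_choice == "1"
    # dict(zip(["1", "2"], answers)) then maps choice "1" to "trophy", the correct answer.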
@@ -1,7 +1,17 @@
 from typing import List, Any
 from datasets import load_dataset
 from helm.common.hierarchical_logger import htrack_block
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 MAX_TRAIN_INSTANCES = 20_000  # This is arbitrary, but 20,000 training examples should be enough.
@@ -61,7 +71,9 @@ class WMT14Scenario(Scenario):
     def get_instances(self, output_path: str) -> List[Instance]:
         with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
             subset_name = f"{self.source_language if self.source_language!='en' else self.target_language}-en"
-            hf_dataset: Any = load_dataset(
+            hf_dataset: Any = load_dataset(
+                "wmt14", subset_name, trust_remote_code=True, revision="b199e406369ec1b7634206d3ded5ba45de2fe696"
+            )
         splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
 
         instances: List[Instance] = []
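For reference, the subset_name expression above always resolves to an xx-en pair regardless of translation direction; a small self-contained check (language codes are examples):

    def wmt14_subset(source_language: str, target_language: str) -> str:
        # Mirrors the expression used in get_instances above.
        return f"{source_language if source_language != 'en' else target_language}-en"

    assert wmt14_subset("fr", "en") == "fr-en"  # French -> English
    assert wmt14_subset("en", "de") == "de-en"  # English -> German uses the same HuggingFace subset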
@@ -1,7 +1,7 @@
 from typing import List
 from datasets import load_dataset
 
-from .scenario import Scenario, Instance, Input, CORRECT_TAG, TEST_SPLIT, Reference, Output
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, CORRECT_TAG, TEST_SPLIT, Reference, Output
 
 
 class XSTestScenario(Scenario):
helm/benchmark/server.py CHANGED
@@ -9,7 +9,7 @@ import json
 from os import path
 import urllib
 
-from bottle import Bottle, static_file, HTTPResponse
+from bottle import Bottle, static_file, HTTPResponse, response
 import yaml
 
 from helm.benchmark.presentation.schema import SCHEMA_CLASSIC_YAML_FILENAME
@@ -21,15 +21,18 @@ app = Bottle()
 
 @app.get("/config.js")
 def serve_config():
+    response.content_type = "application/javascript; charset=UTF-8"
     if app.config["helm.release"]:
         return (
             f'window.BENCHMARK_OUTPUT_BASE_URL = "{app.config["helm.outputurl"]}";\n'
             f'window.RELEASE = "{app.config["helm.release"]}";\n'
+            f'window.PROJECT_ID = "{app.config["helm.project"]}";\n'
         )
     else:
         return (
             f'window.BENCHMARK_OUTPUT_BASE_URL = "{app.config["helm.outputurl"]}";\n'
             f'window.SUITE = "{app.config["helm.suite"]}";\n'
+            f'window.PROJECT_ID = "{app.config["helm.project"]}";\n'
        )
 
 
@@ -113,6 +116,13 @@ def main():
         default=None,
         help="Experimental: The release to serve. If unset, don't serve a release, and serve the latest suite instead.",
     )
+
+    parser.add_argument(
+        "--project",
+        type=str,
+        default=None,
+        help="Experimental: The name of the project to display on the landing page.",
+    )
     args = parser.parse_args()
 
     if args.suite and args.release:
@@ -143,6 +153,8 @@ def main():
 
     app.config["helm.suite"] = args.suite or "latest"
     app.config["helm.release"] = args.release
+    app.config["helm.release"] = args.release
+    app.config["helm.project"] = args.project or "lite"
 
     print(f"After the web server has started, go to http://localhost:{args.port} to view your website.\n")
     app.run(host="0.0.0.0", port=args.port)
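The effect of the new --project flag on the served config.js can be sketched outside the server (the values below are examples; in the real server they come from the CLI flags and the output URL, and the body is returned with the new application/javascript content type):

    # Simulates the body built by serve_config() above when serving a suite (no release set).
    config = {"helm.outputurl": "benchmark_output/", "helm.suite": "v1", "helm.project": "lite", "helm.release": None}

    if config["helm.release"]:
        body = (
            f'window.BENCHMARK_OUTPUT_BASE_URL = "{config["helm.outputurl"]}";\n'
            f'window.RELEASE = "{config["helm.release"]}";\n'
            f'window.PROJECT_ID = "{config["helm.project"]}";\n'
        )
    else:
        body = (
            f'window.BENCHMARK_OUTPUT_BASE_URL = "{config["helm.outputurl"]}";\n'
            f'window.SUITE = "{config["helm.suite"]}";\n'
            f'window.PROJECT_ID = "{config["helm.project"]}";\n'
        )
    print(body)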