crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of crfm-helm might be problematic.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
helm/benchmark/scenarios/ice_scenario.py
@@ -5,8 +5,8 @@ from enum import Enum
 import pandas as pd
 
 from helm.common.optional_dependencies import handle_module_not_found_error
-from .ice_scenario_pinned_file_order import listdir_with_pinned_file_order
-from .scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.ice_scenario_pinned_file_order import listdir_with_pinned_file_order
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
 
 try:
     # pd.read_excel() uses xlrd
@@ -114,8 +114,12 @@ class ICEScenario(Scenario):
     """
     The International Corpus of English (ICE).
 
-    NOTE: This text cannot be downloaded
-
+    NOTE: This text cannot be downloaded automatically.
+    You must extract each subset zip file into args.output_path + '/scenarios/ice',
+    which is by default '/benchmark_output/scenarios/ice',
+    where args.output_path is parsed from the command line argument.
+    See helm.benchmark.runner for more details about args.output_path.
+
     The archives should extract into folders named according to the dictionary SUBSET_TO_DIRECTORY
     below.
 
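Since the ICE data must be placed by hand, here is a hypothetical helper (not part of the release; the name and signature are invented for illustration) that checks the expected layout before a run:

    import os
    from typing import List

    def check_ice_layout(output_path: str, subset_dirs: List[str]) -> None:
        # ICEScenario expects each subset extracted under <output_path>/scenarios/ice,
        # in the folder names given by SUBSET_TO_DIRECTORY.
        ice_root = os.path.join(output_path, "scenarios", "ice")
        for directory in subset_dirs:
            path = os.path.join(ice_root, directory)
            if not os.path.isdir(path):
                raise FileNotFoundError(f"Missing ICE subset directory: {path}")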
helm/benchmark/scenarios/ifeval_scenario.py
@@ -0,0 +1,53 @@
+import datasets
+import os
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class IFEvalScenario(Scenario):
+    """IFEval
+
+    IFEval contains around 500 "verifiable instructions" such as "write in more than 400 words"
+    and "mention the keyword of AI at least 3 times" which can be verified by heuristics."""
+
+    name = "ifeval"
+    description = "Instruction-Following Evaluation for Large Language Models"
+    tags = ["instruction following"]
+
+    def __init__(self):
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get IFEval from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "google/IFEval",
+            trust_remote_code=True,
+            cache_dir=cache_dir,
+            split="train",
+            revision="966cd89545d6b6acfd7638bc708b98261ca58e84",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for _, row in enumerate(dataset):
+            id = row["key"]
+            input = Input(text=row["prompt"].strip())
+            instance = Instance(
+                id=f"id{id}",
+                input=input,
+                references=[],
+                split=TEST_SPLIT,
+                extra_data={"instruction_ids": row["instruction_id_list"], "instruction_kwargs": row["kwargs"]},
+            )
+            instances.append(instance)
+
+        return instances
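The extra_data field carries the instruction IDs and kwargs that the new IFEval metrics (see helm/benchmark/metrics/ifeval/ in the file list above) verify with simple heuristics. As a purely illustrative sketch, not code from this release, of what such a verifiable-instruction check looks like:

    def follows_min_word_count(response: str, min_words: int = 400) -> bool:
        # Heuristic of the kind IFEval applies for "write in more than 400 words".
        return len(response.split()) > min_words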
helm/benchmark/scenarios/imdb_ptbr_scenario.py
@@ -0,0 +1,60 @@
+from typing import Any, List, Dict
+from pathlib import Path
+from datasets import load_dataset
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class IMDB_PTBRScenario(Scenario):
+    """
+    The IMDB dataset is a widely-used benchmark dataset for natural language processing (NLP),
+    particularly for text classification and sentiment analysis.
+    This is a translated version that is meant to evaluate PT-BR models.
+    It consists of movie reviews from the Internet Movie Database (IMDB) and
+    includes both positive and negative sentiments labeled for supervised learning.
+    """
+
+    name = "simple_classification"
+    description = "Classify movie reviews between positive or negative."
+    tags = ["classification"]
+
+    def process_dataset(self, dataset: Any, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        label_names = {0: "negativo", 1: "positivo"}
+        for example in dataset[split]:
+            input = Input(text=example["text"])
+            # NOTE: For classification scenarios, the reference outputs should be the same
+            # for all instances, and should include both correct and incorrect classes.
+            # HELM only supports single-label classification. Exactly one reference
+            # should have the CORRECT_TAG tag.
+            references = [
+                Reference(Output(text=label_names[example["label"]]), tags=[CORRECT_TAG]),
+            ]
+            instance = Instance(input=input, references=references, split=split)
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+        dataset = load_dataset("maritaca-ai/imdb_pt", cache_dir=cache_dir)
+        splits: Dict[str, str] = {
+            "train": TRAIN_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            if split not in dataset:
+                hlog(f"{split} split doesn't exist, skipping")
+                continue
+            instances.extend(self.process_dataset(dataset, splits[split]))
+
+        return instances
helm/benchmark/scenarios/imdb_scenario.py
@@ -2,8 +2,17 @@ import os
 from typing import List, Dict, Optional
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import
-from .imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    Input,
+    Output,
+)
+from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
 
 
 class IMDBScenario(Scenario):
helm/benchmark/scenarios/infinite_bench_sum_scenario.py
@@ -0,0 +1,82 @@
+import os
+import re
+from typing import List
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchSumScenario(Scenario):
+    """InfiniteBench Sum
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over super long contexts (100k+ tokens). InfiniteBench Sum is a subset of
+    InfiniteBench that requires models to generate a concise summary of the novel. The subset is referred
+    to as "En.Sum" in the original paper.
+    """
+
+    name = "infinite_bench_sum"
+    description = "Summarize a novel from InfiniteBench"
+    tags = ["summarization"]
+
+    def __init__(self, min_num_words: int, max_num_words: int):
+        self.min_num_words = min_num_words
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_sum_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.map(
+            lambda example: {"prompt_wc": count_words(example["context"]) + count_words(example["input"])}
+        ).filter(lambda example: self.min_num_words <= example["prompt_wc"] <= self.max_num_words)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            instance = Instance(
+                id=id,
+                input=input,
+                references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+                extra_data={"word_count": row["prompt_wc"]},
+            )
+            instances.append(instance)
+
+        return instances
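Note that the scenario filters by whitespace-delimited word count rather than tokens. A quick, self-contained check of the count_words helper defined above (illustrative only):

    import re

    def count_words(text: str) -> int:
        return len(re.split(r"\s+", text.strip()))

    assert count_words("  one two\nthree ") == 3

Instances whose context plus question fall outside [min_num_words, max_num_words] are dropped before any prompting happens.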
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py
@@ -2,8 +2,8 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Instance, TRAIN_SPLIT, TEST_SPLIT
-from .mmlu_scenario import MMLUScenario
+from helm.benchmark.scenarios.scenario import Instance, TRAIN_SPLIT, TEST_SPLIT
+from helm.benchmark.scenarios.mmlu_scenario import MMLUScenario
 
 
 class InteractiveQAMMLUScenario(MMLUScenario):
helm/benchmark/scenarios/koala_scenario.py
@@ -3,7 +3,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
 
 
 class KoalaScenario(Scenario):
helm/benchmark/scenarios/legal_contract_summarization_scenario.py
@@ -0,0 +1,129 @@
+import os
+import pandas as pd
+import json
+import re
+
+from typing import List
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Output,
+)
+
+
+class LegalContractSummarizationScenario(Scenario):
+    """Legal Contract Summarization
+
+    A legal contract summarization benchmark based on the paper
+    Plain English Summarization of Contracts (Manor & Li, NAACL 2019),
+    which presented a dataset of legal text snippets paired with summaries
+    written in plain English.
+
+    @inproceedings{manor-li-2019-plain,
+        title = "Plain {E}nglish Summarization of Contracts",
+        author = "Manor, Laura and
+          Li, Junyi Jessy",
+        editor = "Aletras, Nikolaos and
+          Ash, Elliott and
+          Barrett, Leslie and
+          Chen, Daniel and
+          Meyers, Adam and
+          Preotiuc-Pietro, Daniel and
+          Rosenberg, David and
+          Stent, Amanda",
+        booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2019",
+        month = jun,
+        year = "2019",
+        address = "Minneapolis, Minnesota",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/W19-2201",
+        doi = "10.18653/v1/W19-2201",
+        pages = "1--11",
+        abstract = "Unilateral legal contracts, such as terms of service, play a substantial role in modern digital life. However, few read these documents before accepting the terms within, as they are too long and the language too complicated. We propose the task of summarizing such legal documents in plain English, which would enable users to have a better understanding of the terms they are accepting. We propose an initial dataset of legal text snippets paired with summaries written in plain English. We verify the quality of these summaries manually, and show that they involve heavy abstraction, compression, and simplification. Initial experiments show that unsupervised extractive summarization methods do not perform well on this task due to the level of abstraction and style differences. We conclude with a call for resource and technique development for simplification and style transfer for legal language.",
+    }
+    """  # noqa: E501
+
+    TRAIN_RATIO: float = 0.2
+    ARTICLE_COLUMN_NAME = "original_text"
+    SUMMARY_COLUMN_NAME = "reference_summary"
+    ID_COLUMN_NAME = "uid"
+
+    name = "legal_contract_summarization"
+    description = (
+        "Plain English Summarization of Contracts [(Manor et al., 2019)](https://aclanthology.org/W19-2201.pdf)."
+    )
+    tags = ["summarization", "legal"]
+
+    def __init__(self):
+        """
+        Initializes the scenario.
+
+        """
+        super().__init__()
+
+    @staticmethod
+    def _clean(text: str) -> str:
+        return re.sub(r"\s+", " ", text)
+
+    def _load_dataset(self, output_path: str):
+        data_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(data_dir)
+
+        source_url = "https://raw.githubusercontent.com/lauramanor/legal_summarization/master/all_v1.json"
+        source_file = os.path.basename(source_url)
+        target_path = os.path.join(data_dir, source_file)
+        ensure_file_downloaded(
+            source_url=source_url,
+            target_path=target_path,
+        )
+
+        target_df = pd.DataFrame()
+        with open(target_path) as f:
+            json_data = json.load(f)
+            target_df = pd.DataFrame.from_records(list(json_data.values()))
+            target_df = target_df.dropna(
+                subset=[
+                    LegalContractSummarizationScenario.ARTICLE_COLUMN_NAME,
+                    LegalContractSummarizationScenario.SUMMARY_COLUMN_NAME,
+                    LegalContractSummarizationScenario.ID_COLUMN_NAME,
+                ]
+            )
+        # Split randomly (works better than split by order)
+        train_df = target_df.sample(frac=LegalContractSummarizationScenario.TRAIN_RATIO, random_state=0)
+        test_df = target_df.drop(train_df.index).sample(frac=1, random_state=0)
+
+        return {TRAIN_SPLIT: train_df, TEST_SPLIT: test_df}
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        dataset = self._load_dataset(output_path)
+
+        instances: List[Instance] = []
+
+        for split, split_data in dataset.items():
+            for example in split_data.itertuples():
+                id = getattr(example, LegalContractSummarizationScenario.ID_COLUMN_NAME)
+                article = LegalContractSummarizationScenario._clean(
+                    getattr(example, LegalContractSummarizationScenario.ARTICLE_COLUMN_NAME)
+                )
+                summary = LegalContractSummarizationScenario._clean(
+                    getattr(example, LegalContractSummarizationScenario.SUMMARY_COLUMN_NAME)
+                )
+                input = Input(
+                    text=article,
+                )
+                output = Output(text=summary)
+                instance = Instance(
+                    id=id,
+                    input=input,
+                    references=[Reference(output=output, tags=[CORRECT_TAG])],
+                    split=split,
+                )
+                instances.append(instance)
+
+        return instances
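For context, the 20/80 split above is deterministic: sample(frac=0.2, random_state=0) draws the training rows, and the test set is the shuffled remainder. A toy illustration of that pandas pattern (not code from this release):

    import pandas as pd

    df = pd.DataFrame({"uid": range(10)})
    train = df.sample(frac=0.2, random_state=0)                  # 2 rows, reproducible
    test = df.drop(train.index).sample(frac=1, random_state=0)   # remaining 8 rows, shuffled
    assert len(train) == 2 and len(test) == 8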
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py
@@ -0,0 +1,77 @@
+import os
+from typing import List
+
+import pandas as pd
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+
+
+class LegalOpinionSentimentClassificationScenario(Scenario):
+    """
+    A legal opinion sentiment classification task based on the paper
+    Effective Approach to Develop a Sentiment Annotator For Legal Domain in a Low Resource Setting
+    [(Ratnayaka et al., 2020)](https://arxiv.org/pdf/2011.00318.pdf).
+
+    Example prompt:
+    Classify the sentences into one of the 3 sentiment categories. Possible labels: positive, neutral, negative.
+    {Sentence}
+    Label: {positive/neutral/negative}
+
+    """
+
+    # Names of the tasks we support
+
+    name = "legal_opinion"
+    description = "Predicting the sentiment of the legal text in the positive, negative, or neutral."
+    tags = ["classification", "sentiment analysis", "legal"]
+
+    SENTIMENT_CLASSES = ["positive", "negative", "neutral"]
+    SPLIT_TO_URL = {
+        TRAIN_SPLIT: "https://osf.io/download/hfn62/",
+        TEST_SPLIT: "https://osf.io/download/q4adh/",
+    }
+
+    def create_instances(self, df: pd.DataFrame, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        assert split in [TRAIN_SPLIT, TEST_SPLIT]
+        if split == TRAIN_SPLIT:
+            phrase_column_name = "Phrase"
+            label_column_name = "Label"
+        else:
+            phrase_column_name = "sentence"
+            label_column_name = "label"
+        for row in df.itertuples():
+            phrase = getattr(row, phrase_column_name)
+            label_index = int(getattr(row, label_column_name))
+            label = LegalOpinionSentimentClassificationScenario.SENTIMENT_CLASSES[label_index]
+            instance = Instance(
+                input=Input(text=phrase), references=[Reference(Output(text=label), tags=[CORRECT_TAG])], split=split
+            )
+            instances.append(instance)
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        self.data_dir = os.path.join(output_path, "data")
+        data_dir = self.data_dir
+        ensure_directory_exists(data_dir)
+        instances: List[Instance] = []
+        for split, url in LegalOpinionSentimentClassificationScenario.SPLIT_TO_URL.items():
+            file_name = f"{split.lower()}.xlsx"
+            file_path = os.path.join(data_dir, file_name)
+            ensure_file_downloaded(
+                source_url=url,
+                target_path=os.path.join(data_dir, file_name),
+            )
+            df = pd.read_excel(file_path)
+            instances.extend(self.create_instances(df, split))
+        return instances
helm/benchmark/scenarios/legal_summarization_scenario.py
@@ -5,7 +5,17 @@ from typing import List, Optional, Any
 import datasets
 from datasets import load_dataset
 
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 _ALL_LANGUAGES = {
     "bulgarian": "bg",
helm/benchmark/scenarios/legal_support_scenario.py
@@ -3,7 +3,17 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 
 class LegalSupportScenario(Scenario):
helm/benchmark/scenarios/legalbench_scenario.py
@@ -6,7 +6,16 @@ from pathlib import Path
 from typing import List, Dict
 
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
 
 PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
 
@@ -97,10 +106,20 @@ class LegalBenchScenario(Scenario):
         # Download data from Huggingface. LegalBench provides splits for samples to
         # be used for prompt construction and for testing.
         train_dataset = datasets.load_dataset(
-            "nguha/legalbench",
+            "nguha/legalbench",
+            self.subset,
+            trust_remote_code=True,
+            cache_dir=cache_dir,
+            split="train",
+            revision="e042ea68c19df12b737fe768572f22ead61e8e37",
         )
         test_dataset = datasets.load_dataset(
-            "nguha/legalbench",
+            "nguha/legalbench",
+            self.subset,
+            trust_remote_code=True,
+            cache_dir=cache_dir,
+            split="test",
+            revision="e042ea68c19df12b737fe768572f22ead61e8e37",
        )
         assert isinstance(train_dataset, datasets.Dataset)
         assert isinstance(test_dataset, datasets.Dataset)
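Both calls now pin the dataset to a specific git revision, which keeps results reproducible even if nguha/legalbench is updated upstream. A minimal sketch of the same pattern with the Hugging Face datasets API (the subset name here is only an example; the scenario passes self.subset):

    import datasets

    dataset = datasets.load_dataset(
        "nguha/legalbench",
        "abercrombie",  # example subset name, assumed for illustration
        split="train",
        revision="e042ea68c19df12b737fe768572f22ead61e8e37",  # commit pinned in the diff
        trust_remote_code=True,
    )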
helm/benchmark/scenarios/lex_glue_scenario.py
@@ -5,8 +5,18 @@ from typing import List, Any
 import datasets
 from datasets import load_dataset
 
-from .lextreme_scenario import TaskType
-from .scenario import
+from helm.benchmark.scenarios.lextreme_scenario import TaskType
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
 
 ECTHR_A = "ecthr_a"
 ECTHR_B = "ecthr_b"
helm/benchmark/scenarios/lextreme_scenario.py
@@ -6,7 +6,17 @@ from typing import List, Any
 import datasets
 from datasets import load_dataset
 
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    Output,
+    Input,
+)
 
 
 class TaskType:
helm/benchmark/scenarios/live_qa_scenario.py
@@ -4,7 +4,7 @@ from xml.etree.ElementTree import Element
 import xml.etree.ElementTree as ET
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, TEST_SPLIT, Input, Instance, Output, Reference, Scenario
 
 
 class LiveQAScenario(Scenario):
helm/benchmark/scenarios/lm_entry_scenario.py
@@ -3,7 +3,7 @@ import os
 from typing import List
 
 from helm.common.general import ensure_file_downloaded
-from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
+from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
 
 
 class LMEntryScenario(Scenario):
helm/benchmark/scenarios/math_scenario.py
@@ -368,7 +368,15 @@ class MATHScenario(Scenario):
         cache_dir = os.path.join(output_path, "data")
         ensure_directory_exists(cache_dir)
         data = (
-            typing.cast(
+            typing.cast(
+                DatasetDict,
+                load_dataset(
+                    "hendrycks/competition_math",
+                    trust_remote_code=True,
+                    cache_dir=cache_dir,
+                    revision="71b758ecc688b2822d07ffa7f8393299f1dc7cac",
+                ),
+            )
             .sort("problem")
             .shuffle(seed=42)
         )
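Here typing.cast has no runtime effect: it only tells the type checker to treat load_dataset's union-typed return value as a DatasetDict so that .sort and .shuffle type-check. A minimal illustration of that behavior (not from this release):

    import typing

    value: object = {"train": [1, 2, 3]}
    narrowed = typing.cast(dict, value)  # no conversion or check at runtime
    assert narrowed is value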
helm/benchmark/scenarios/me_q_sum_scenario.py
@@ -2,7 +2,16 @@ import os
 from typing import List
 
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    VALID_SPLIT,
+    Input,
+    Output,
+)
 
 
 class MeQSumScenario(Scenario):