crfm-helm 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm may be problematic. See the release details on the registry page for more information.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/anthropic_red_team_annotator.py +11 -24
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/call_center_annotator.py +22 -11
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/harm_bench_annotator.py +11 -24
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +10 -5
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +90 -61
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +281 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/simple_safety_tests_annotator.py +11 -25
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation/xstest_annotator.py +20 -30
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/safety_metrics.py +39 -17
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +21 -4
- helm/benchmark/metrics/vision_language/image_metrics.py +7 -3
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/create_plots.py +1 -1
- helm/benchmark/presentation/schema.py +3 -0
- helm/benchmark/presentation/summarize.py +119 -256
- helm/benchmark/presentation/test_summarize.py +145 -3
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +105 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/finance_run_specs.py +6 -2
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +66 -52
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +83 -5
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +18 -3
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +233 -84
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_legal.yaml +566 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +42 -6
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +40 -26
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +187 -30
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +151 -47
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/assets/vhelm-aspects-1437d673.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-a1ca3f3f.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-8afb7616.png +0 -0
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +0 -5
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +255 -21
- helm/clients/palmyra_client.py +2 -6
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +69 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/cache.py +8 -30
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/key_value_store.py +9 -9
- helm/common/media_object.py +2 -2
- helm/common/mongo_key_value_store.py +3 -3
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_cache.py +1 -48
- helm/common/test_media_object.py +1 -1
- helm/common/tokenization_request.py +0 -9
- helm/config/model_deployments.yaml +1258 -33
- helm/config/model_metadata.yaml +1110 -41
- helm/config/tokenizer_configs.yaml +403 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -13
- helm/proxy/services/remote_service.py +1 -7
- helm/proxy/services/server_service.py +6 -19
- helm/proxy/services/service.py +0 -6
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.3.dist-info/METADATA +0 -355
- crfm_helm-0.5.3.dist-info/RECORD +0 -699
- helm/benchmark/data_overlap/data_overlap_spec.py +0 -86
- helm/benchmark/data_overlap/export_scenario_text.py +0 -119
- helm/benchmark/data_overlap/light_scenario.py +0 -60
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-58f97dcd.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/vhelm-framework-cde7618a.png +0 -0
- helm/benchmark/static_build/assets/vhelm-model-6d812526.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.3.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
- /helm/benchmark/{data_overlap → metrics/ifeval}/__init__.py +0 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
from helm.benchmark.scenarios.scenario import (
|
|
5
|
+
Scenario,
|
|
6
|
+
Instance,
|
|
7
|
+
Reference,
|
|
8
|
+
TEST_SPLIT,
|
|
9
|
+
CORRECT_TAG,
|
|
10
|
+
Input,
|
|
11
|
+
Output,
|
|
12
|
+
)
|
|
13
|
+
from tqdm import tqdm
|
|
14
|
+
from helm.common.media_object import MediaObject, MultimediaObject
|
|
15
|
+
from helm.common.general import ensure_file_downloaded, ensure_directory_exists
|
|
16
|
+
from helm.common.audio_utils import use_ffmpeg_to_convert_audio_file
|
|
17
|
+
import pandas as pd
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class VoxCeleb2Scenario(Scenario):
    """VoxCeleb2

    VoxCeleb2 is an audio-visual dataset consisting of short clips of human speech, extracted from
    interview videos uploaded to YouTube. This dataset contains over a million utterances from over
    6,000 speakers.

    Paper: https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf

    Citation:
    @inproceedings{Chung18b,
        author = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
        title = "VoxCeleb2: Deep Speaker Recognition",
        booktitle = "INTERSPEECH",
        year = "2018",
    }
    """

    DOWNLOADING_URL = "https://huggingface.co/datasets/ProgramComputer/voxceleb/resolve/main/vox2/vox2_test_aac.zip"
    REFERENCE_URL = (
        "https://huggingface.co/datasets/LAOS-Y/VoxCeleb2-AudioIdentity/resolve/main/voxceleb2_audioidentity.csv"
    )
    IDENTITY_INSTRUCTION = (
        "Listen to the audio and take your best guess to determine if the two speakers are the same person. "
        "Give just the letter of your answer and nothing else."
    )

    name = "voxceleb2"
    # Fixed: the original concatenated string parts without separating spaces,
    # rendering as "theirgender" and "information([Chung".
    description = (
        "A large-scale dataset of over a million utterances from over 6,000 speakers with their "
        "gender, race, identity information "
        "([Chung et al, 2018](https://www.robots.ox.ac.uk/~vgg/publications/2018/Chung18a/chung18a.pdf))."
    )
    tags: List[str] = ["audio", "identification"]
    # Multiple-choice options; index 0 ("Yes") maps to letter "A", index 1 ("No") to "B".
    options: List[str] = ["Yes", "No"]

    def _convert_answer_to_label(self, answer: bool) -> str:
        """Map the boolean "same speaker" annotation to its multiple-choice letter."""
        return "A" if answer else "B"

    def _reformat_and_convert_audio_file(
        self, ori_file_path: str, tgt_audio_data_path: str, audio_data_path: str
    ) -> str:
        """Convert a downloaded .m4a clip to .wav under tgt_audio_data_path.

        :param ori_file_path: path of the .m4a file relative to audio_data_path
        :param tgt_audio_data_path: root directory for the converted .wav files
        :param audio_data_path: root directory of the downloaded .m4a files
        :return: absolute path of the converted .wav file
        """
        tgt_audio_path = os.path.join(tgt_audio_data_path, ori_file_path.split(".m4a")[0] + ".wav")
        ensure_directory_exists(os.path.dirname(tgt_audio_path))
        use_ffmpeg_to_convert_audio_file(os.path.join(audio_data_path, ori_file_path), tgt_audio_path)
        return tgt_audio_path

    def get_instances(self, output_path: str) -> List[Instance]:
        """Build speaker-verification instances: two audio clips plus the instruction text."""
        audio_data_path = os.path.join(output_path, "audio_files")
        tgt_audio_data_path = os.path.join(output_path, "tgt_audio_files")
        ensure_file_downloaded(source_url=VoxCeleb2Scenario.DOWNLOADING_URL, target_path=audio_data_path, unpack=True)
        annotations = pd.read_csv(VoxCeleb2Scenario.REFERENCE_URL, sep=",")
        # Fixed: `instances` was initialized twice; keep a single initialization.
        instances: List[Instance] = []
        for _, row in tqdm(annotations.iterrows(), total=len(annotations)):
            tgt_first_audio_path = self._reformat_and_convert_audio_file(
                row["first"], tgt_audio_data_path, audio_data_path
            )
            tgt_second_audio_path = self._reformat_and_convert_audio_file(
                row["second"], tgt_audio_data_path, audio_data_path
            )

            answer = self._convert_answer_to_label(row["same"])
            # The given correct answer is a letter, but we need an index
            correct_answer_index: int = ord(answer) - ord("A")
            references: List[Reference] = []
            for i, option in enumerate(self.options):
                reference: Reference
                is_correct: bool = i == correct_answer_index
                reference = Reference(Output(text=option), tags=[CORRECT_TAG] if is_correct else [])
                references.append(reference)

            input = Input(
                multimedia_content=MultimediaObject(
                    [
                        MediaObject(content_type="audio/wav", location=tgt_first_audio_path),
                        MediaObject(content_type="audio/wav", location=tgt_second_audio_path),
                        MediaObject(content_type="text/plain", text=self.IDENTITY_INSTRUCTION),
                    ]
                )
            )
            instances.append(Instance(input=input, references=references, split=TEST_SPLIT))

        return instances
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import datasets
|
|
2
|
+
import os
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
CORRECT_TAG,
|
|
7
|
+
Scenario,
|
|
8
|
+
Instance,
|
|
9
|
+
Reference,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
Input,
|
|
12
|
+
Output,
|
|
13
|
+
)
|
|
14
|
+
from helm.common.general import ensure_directory_exists
|
|
15
|
+
from helm.common.hierarchical_logger import hlog
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class AutoBencherCapabilitiesScenario(Scenario):
    """AutoBencher Capabilities

    AutoBencher uses a language model to automatically search
    for datasets. AutoBencher Capabilities consists of question
    answering datasets for math, multilingual, and knowledge-intensive
    question answering created by AutoBencher.

    Paper: https://arxiv.org/abs/2407.08351"""

    name = "autobencher_capabilities"
    description = (
        "AutoBencher Capabilities consists of question answering datasets "
        "for math, multilingual, and knowledge-intensive "
        "question answering created by AutoBencher. "
        "([paper](https://arxiv.org/abs/2407.08351))"
    )
    tags = ["question answering"]

    SUBJECTS = ["math", "mt", "econ", "science", "history"]

    def __init__(self, subject: str):
        """
        :param subject: which subject's questions to load; must be one of SUBJECTS.
        :raises ValueError: if the subject is unknown.
        """
        super().__init__()
        if subject not in self.SUBJECTS:
            raise ValueError(f"Unexpected subject {subject}, available subjects are {self.SUBJECTS}")
        self.subject: str = subject

    def get_instances(self, output_path: str) -> List[Instance]:
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)

        # TODO: Switch this to the production dataset when available.
        dataset = datasets.load_dataset(
            "xlisali1/AutoBencher-capability.json",
            split="train",  # Use train split as test, so only zero-shot is supported
            cache_dir=cache_dir,
            revision="efe58dd72b6423e3f5c967f16cbea8cce3a51933",
        )
        instances: List[Instance] = []
        for row in dataset:
            # Fixed: the original used `==` here, which skipped every row of the
            # *requested* subject and kept all the others.
            if row["subject"] != self.subject:
                continue
            # Fixed: skip rows without a gold answer *before* constructing the
            # Reference, so Output(text=None) is never built.
            if row["gold_answer"] is None:
                hlog(f"WARNING: Row had no gold_answer: {row}")
                continue
            input = Input(text=row["question"])
            references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
            instance = Instance(input=input, references=references, split=TEST_SPLIT)
            instances.append(instance)
        return instances
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
from helm.common.general import ensure_file_downloaded
|
|
7
|
+
|
|
8
|
+
from .scenario import CORRECT_TAG, Scenario, Instance, Input, TEST_SPLIT, Reference, Output
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AutobencherSafetyScenario(Scenario):
    """
    Autobencher safety scenario

    AutoBencher uses a language model to automatically search
    for datasets. AutoBencher Safety consists of safety-oriented
    question answering datasets created by AutoBencher.

    Paper: https://arxiv.org/abs/2407.08351
    """

    name = "autobencher_safety"
    description = "Autobencher safety consists of question answering datasets"
    tags = ["safety"]

    def get_instances(self, output_path: str) -> List[Instance]:
        data_path = os.path.join(output_path, "data")
        os.makedirs(data_path, exist_ok=True)
        url = "https://raw.githubusercontent.com/farzaank/AutoBencher/refs/heads/main/safety_processing/process%20full%20dataset%20for%20mTurk/full_dataset.json"  # noqa: E501
        outf_path = os.path.join(data_path, "full_dataset.json")
        ensure_file_downloaded(
            source_url=url,
            target_path=outf_path,
            unpack=False,
        )

        # Fixed: the original called json.loads() on the file *path* string,
        # which raises JSONDecodeError; read the downloaded file instead.
        with open(outf_path, "r") as f:
            json_data = json.load(f)
        df = pd.DataFrame(json_data)

        # Read all the instances
        instances: List[Instance] = []

        for _, row in df.iterrows():
            references = [Reference(output=Output(text=row["gold_answer"]), tags=[CORRECT_TAG])]
            input_text = row["question"]
            input = Input(text=input_text)
            # Instance ids are "<category><row id>", matching the upstream dataset.
            id = str(row["category"]) + str(row["id"])
            instance = Instance(input=input, split=TEST_SPLIT, references=references, id=id)
            instances.append(instance)
        return instances
|
|
@@ -37,7 +37,12 @@ class Banking77Scenario(Scenario):
|
|
|
37
37
|
ensure_directory_exists(cache_dir)
|
|
38
38
|
|
|
39
39
|
# TODO: Switch this to the production dataset when available.
|
|
40
|
-
dataset = datasets.load_dataset(
|
|
40
|
+
dataset = datasets.load_dataset(
|
|
41
|
+
"PolyAI/banking77",
|
|
42
|
+
cache_dir=cache_dir,
|
|
43
|
+
revision="90d4e2ee5521c04fc1488f065b8b083658768c57",
|
|
44
|
+
trust_remote_code=True,
|
|
45
|
+
)
|
|
41
46
|
|
|
42
47
|
instances: List[Instance] = []
|
|
43
48
|
for split_name in [TRAIN_SPLIT, TEST_SPLIT]:
|
|
@@ -5,7 +5,17 @@ from typing import List, Dict
|
|
|
5
5
|
from urllib.parse import urljoin
|
|
6
6
|
|
|
7
7
|
from helm.common.general import ensure_directory_exists, ensure_file_downloaded
|
|
8
|
-
from .scenario import
|
|
8
|
+
from helm.benchmark.scenarios.scenario import (
|
|
9
|
+
Scenario,
|
|
10
|
+
Instance,
|
|
11
|
+
Reference,
|
|
12
|
+
Input,
|
|
13
|
+
CORRECT_TAG,
|
|
14
|
+
TRAIN_SPLIT,
|
|
15
|
+
VALID_SPLIT,
|
|
16
|
+
TEST_SPLIT,
|
|
17
|
+
Output,
|
|
18
|
+
)
|
|
9
19
|
|
|
10
20
|
|
|
11
21
|
class BIGBenchScenario(Scenario):
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import datasets
|
|
2
|
+
import os
|
|
3
|
+
from typing import List
|
|
4
|
+
from helm.benchmark.scenarios.scenario import (
|
|
5
|
+
Scenario,
|
|
6
|
+
Instance,
|
|
7
|
+
TEST_SPLIT,
|
|
8
|
+
Input,
|
|
9
|
+
)
|
|
10
|
+
from helm.common.general import ensure_directory_exists
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
VERSIONS = ["v0.1.0_hf", "v0.1.1", "v0.1.2", "v0.1.3"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BigCodeBenchScenario(Scenario):
    """BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions

    BigCodeBench is an easy-to-use benchmark for solving practical and challenging tasks via code.
    It aims to evaluate the true programming capabilities of large language models (LLMs) in a more realistic setting.
    The benchmark is designed for HumanEval-like function-level code generation tasks,
    but with much more complex instructions and diverse function calls."""

    name = "bigcodebench"
    description = "Benchmarking Code Generation with Diverse Function Calls and Complex Instructions"
    tags = ["coding"]

    def __init__(self, version: str):
        """
        :param version: which BigCodeBench release to load; must be one of VERSIONS.
        :raises ValueError: if the version is unknown.
        """
        super().__init__()
        # Fixed: input validation used `assert`, which is stripped under `python -O`;
        # raise ValueError instead, consistent with sibling scenarios.
        if version not in VERSIONS:
            raise ValueError("Unknown version: {}".format(version))
        self.version = version

    def get_instances(self, output_path: str) -> List[Instance]:
        # Get BigCodeBench from HuggingFace
        cache_dir = os.path.join(output_path, "data")
        ensure_directory_exists(cache_dir)
        # Each version is published as a separate split of the dataset.
        dataset = datasets.load_dataset(
            "bigcode/bigcodebench",
            revision="057dd1a30dd73d4ed59cfbaaae049870491fa4d6",
            cache_dir=cache_dir,
            split=self.version,
        )
        assert isinstance(dataset, datasets.Dataset)

        # Read all instances; there are no references because grading is execution-based.
        instances: List[Instance] = []
        for row in dataset:
            id = row["task_id"]
            input = Input(text=row["instruct_prompt"])
            instance = Instance(
                id=id,
                input=input,
                references=[],
                split=TEST_SPLIT,
            )
            instances.append(instance)

        return instances
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
from typing import Dict, List
|
|
4
|
+
|
|
5
|
+
from filelock import FileLock
|
|
6
|
+
|
|
7
|
+
from helm.common.general import ensure_directory_exists, ensure_file_downloaded, shell
|
|
8
|
+
from helm.common.hierarchical_logger import hlog
|
|
9
|
+
from helm.benchmark.scenarios.bird_sql_scenario_helper import ( # type: ignore
|
|
10
|
+
generate_comment_prompt,
|
|
11
|
+
generate_schema_prompt,
|
|
12
|
+
)
|
|
13
|
+
from helm.benchmark.scenarios.scenario import (
|
|
14
|
+
CORRECT_TAG,
|
|
15
|
+
Scenario,
|
|
16
|
+
Instance,
|
|
17
|
+
Reference,
|
|
18
|
+
VALID_SPLIT,
|
|
19
|
+
Input,
|
|
20
|
+
Output,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _ensure_file_unzipped(source_path: str, target_path: str):
    """Unzip source_path into target_path exactly once, guarded by a file lock.

    The archive is decompressed into a staging directory and then moved into
    place, so a partially-extracted target directory is never observed.
    """
    with FileLock(f"{target_path}.lock"):
        if os.path.exists(target_path):
            hlog(f"Not decompressing {source_path} because {target_path} already exists")
            return
        staging_path = target_path + ".tmp"
        ensure_directory_exists(staging_path)
        shell(["unzip", source_path, "-d", staging_path])
        shell(["mv", staging_path, target_path])
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class BIRDSQLScenario(Scenario):
    """BIRD-SQL (Dev)

    Text-to-SQL benchmark over the BIRD dev split. Each instance pairs a
    database schema + question prompt with the gold SQL query as the reference.
    """

    name = "bird_sql"
    description = "bird_sql"
    tags = ["sql"]

    COT_PROMPT = """
Think step by step, then generate a single SQL query in valid SQLite syntax. Respond with only your reasoning and SQL query in the following tag-delimited format:

<reasoning>
INSERT_YOUR_REASONING_HERE
</reasoning>
<sql>
INSERT_YOUR_SQL_QUERY_HERE
</sql>"""  # noqa: E501

    def get_instances(self, output_path: str) -> List[Instance]:
        data_root_path = os.path.join(output_path, "dev")
        ensure_file_downloaded(
            "https://bird-bench.oss-cn-beijing.aliyuncs.com/dev.zip", data_root_path, unpack=True, unpack_type="unzip"
        )
        databases_unzip_target = os.path.join(data_root_path, "unzipped_dev_databases")
        _ensure_file_unzipped(os.path.join(data_root_path, "dev_databases.zip"), databases_unzip_target)
        # Note: Zip file contains .DS_Store file at the root, which makes dev_databases unzip into a nested directory
        databases_root_path = os.path.join(databases_unzip_target, "dev_databases")

        # Pre-compute one schema prompt per database directory.
        database_schema_prompts: Dict[str, str] = {}
        for database_name in os.listdir(databases_root_path):
            database_path = os.path.join(databases_root_path, database_name, f"{database_name}.sqlite")
            if not os.path.exists(database_path):
                # Ignore stray ".DS_Store" directory
                continue
            # Fixed: replaced a leftover debug `print(...)` with the project logger.
            hlog(f"Generating schema prompt for {database_path}")
            database_schema_prompt = generate_schema_prompt(database_path, num_rows=None)
            database_schema_prompts[database_name] = database_schema_prompt

        instances: List[Instance] = []
        dataset_path = os.path.join(data_root_path, "dev.json")
        dataset = json.load(open(dataset_path, "r"))
        for row in dataset:
            question_id: int = row["question_id"]
            database_id: str = row["db_id"]
            question: str = row["question"]
            knowledge: str = row["evidence"]
            gold_sql: str = row["SQL"]

            # Prompt = schema DDL + (knowledge, instruction, question) comments + CoT suffix.
            schema_prompt = database_schema_prompts[database_id]
            comment_prompt = generate_comment_prompt(question, knowledge)
            combined_prompt = schema_prompt + "\n\n" + comment_prompt + self.COT_PROMPT
            instance = Instance(
                id=f"id{question_id}",
                input=Input(text=combined_prompt),
                references=[Reference(output=Output(text=gold_sql), tags=[CORRECT_TAG])],
                extra_data={"db_id": row["db_id"]},
                split=VALID_SPLIT,
            )
            instances.append(instance)
        return instances
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# flake8: noqa
|
|
2
|
+
# type: ignore
|
|
3
|
+
# fmt: off
|
|
4
|
+
|
|
5
|
+
from typing import Dict
|
|
6
|
+
import os
|
|
7
|
+
import sqlite3
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# The following code is copied verbatim from:
|
|
11
|
+
# https://github.com/AlibabaResearch/DAMO-ConvAI/blob/90a76ef9ef1e2960c9bdfa38c63cc81b841e813e/bird/llm/src/gpt_request.py
|
|
12
|
+
# under the following license:
|
|
13
|
+
#
|
|
14
|
+
# MIT License
|
|
15
|
+
#
|
|
16
|
+
# Copyright (c) 2022 Alibaba Research
|
|
17
|
+
#
|
|
18
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
19
|
+
# of this software and associated documentation files (the "Software"), to deal
|
|
20
|
+
# in the Software without restriction, including without limitation the rights
|
|
21
|
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
22
|
+
# copies of the Software, and to permit persons to whom the Software is
|
|
23
|
+
# furnished to do so, subject to the following conditions:
|
|
24
|
+
#
|
|
25
|
+
# The above copyright notice and this permission notice shall be included in all
|
|
26
|
+
# copies or substantial portions of the Software.
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# NOTE(review): this function is part of code copied verbatim from the BIRD
# reference implementation (see the license header above); kept byte-identical
# apart from comments.
def get_db_schemas(bench_root: str, db_name: str) -> Dict[str, str]:
    """
    Read an sqlite file, and return the CREATE commands for each of the tables in the database.
    """
    # The 'spider' benchmark stores databases under "database/"; everything else under "databases/".
    asdf = 'database' if bench_root == 'spider' else 'databases'
    # Open read-only (mode=ro) so schema inspection can never mutate the benchmark database.
    with sqlite3.connect(f'file:{bench_root}/{asdf}/{db_name}/{db_name}.sqlite?mode=ro', uri=True) as conn:
        # conn.text_factory = bytes
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        tables = cursor.fetchall()
        schemas = {}
        for table in tables:
            # Table names come from sqlite_master itself (not user input), so the
            # string interpolation below is not an injection vector here.
            cursor.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='{}';".format(table[0]))
            schemas[table[0]] = cursor.fetchone()[0]

        return schemas
|
|
45
|
+
|
|
46
|
+
def nice_look_table(column_names: list, values: list):
    """Render *column_names* and *values* as a right-justified, fixed-width text table."""
    num_cols = len(column_names)
    # A column is as wide as its widest cell, with the header row included.
    col_widths = []
    for col in range(num_cols):
        cells = [row[col] for row in values] + [column_names[col]]
        col_widths.append(max(len(str(cell)) for cell in cells))

    header_line = ''.join(f'{name.rjust(width)} ' for name, width in zip(column_names, col_widths))
    body_lines = [
        ''.join(f'{str(cell).rjust(width)} ' for cell, width in zip(row, col_widths))
        for row in values
    ]
    return header_line + '\n' + '\n'.join(body_lines)
|
|
61
|
+
|
|
62
|
+
def generate_schema_prompt(db_path, num_rows=None):
    """Build a schema prompt for every user table in the SQLite database at db_path.

    :param db_path: path to the .sqlite database file
    :param num_rows: if truthy, append up to this many example rows per table
    :return: the CREATE TABLE statements (with optional row samples) joined by blank lines
    """
    full_schema_prompt_list = []
    conn = sqlite3.connect(db_path)
    # Create a cursor object
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
    tables = cursor.fetchall()
    schemas = {}
    for table in tables:
        # Fixed: fetchall() yields 1-tuples, so the original `table == 'sqlite_sequence'`
        # comparison was always False and SQLite's internal bookkeeping table leaked
        # into the prompt. Compare against the table *name*, and actually skip it.
        if table[0] == 'sqlite_sequence':
            continue
        cursor.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='{}';".format(table[0]))
        create_prompt = cursor.fetchone()[0]
        schemas[table[0]] = create_prompt
        if num_rows:
            cur_table = table[0]
            # Backtick-quote table names that collide with SQL keywords.
            if cur_table in ['order', 'by', 'group']:
                cur_table = "`{}`".format(cur_table)

            cursor.execute("SELECT * FROM {} LIMIT {}".format(cur_table, num_rows))
            column_names = [description[0] for description in cursor.description]
            values = cursor.fetchall()
            rows_prompt = nice_look_table(column_names=column_names, values=values)
            verbose_prompt = "/* \n {} example rows: \n SELECT * FROM {} LIMIT {}; \n {} \n */".format(num_rows, cur_table, num_rows, rows_prompt)
            schemas[table[0]] = "{} \n {}".format(create_prompt, verbose_prompt)

    for k, v in schemas.items():
        full_schema_prompt_list.append(v)

    schema_prompt = "\n\n".join(full_schema_prompt_list)

    return schema_prompt
|
|
100
|
+
|
|
101
|
+
def generate_comment_prompt(question, knowledge=None):
    """Build the comment section of a text-to-SQL prompt: optional external
    knowledge, an instruction line, and the natural-language question.

    :param question: the natural-language question to answer with SQL
    :param knowledge: optional external-knowledge hint; omitted when falsy
    :return: the "--"-commented prompt lines joined by newlines
    """
    pattern_prompt_no_kg = "-- Using valid SQLite, answer the following questions for the tables provided above."
    pattern_prompt_kg = "-- Using valid SQLite and understading External Knowledge, answer the following questions for the tables provided above."
    # question_prompt = "-- {}".format(question) + '\n SELECT '
    question_prompt = "-- {}".format(question)

    # Fixed: the original tested `if not knowledge_prompt:` on an
    # already-formatted, always-non-empty string, so the no-knowledge branch was
    # unreachable and "-- External Knowledge: None" leaked into prompts.
    if not knowledge:
        result_prompt = pattern_prompt_no_kg + '\n' + question_prompt
    else:
        knowledge_prompt = "-- External Knowledge: {}".format(knowledge)
        result_prompt = knowledge_prompt + '\n' + pattern_prompt_kg + '\n' + question_prompt

    return result_prompt
|
|
114
|
+
|
|
115
|
+
def cot_wizard():
    """Return the chain-of-thought instruction suffix appended to SQL prompts."""
    return "\nGenerate the SQL after thinking step by step: "
|
|
@@ -4,7 +4,7 @@ from typing import List
|
|
|
4
4
|
|
|
5
5
|
from helm.common.general import ensure_file_downloaded
|
|
6
6
|
from helm.common.hierarchical_logger import hlog
|
|
7
|
-
from .scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
|
|
7
|
+
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Reference, Scenario, Instance, Input, TEST_SPLIT, Output
|
|
8
8
|
|
|
9
9
|
|
|
10
10
|
class BLiMPScenario(Scenario):
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
import os
|
|
3
|
+
import os.path
|
|
4
|
+
|
|
5
|
+
from datasets import load_dataset, DatasetDict
|
|
6
|
+
|
|
7
|
+
from helm.common.general import ensure_directory_exists
|
|
8
|
+
from helm.benchmark.scenarios.scenario import (
|
|
9
|
+
Input,
|
|
10
|
+
Scenario,
|
|
11
|
+
Instance,
|
|
12
|
+
Reference,
|
|
13
|
+
TRAIN_SPLIT,
|
|
14
|
+
TEST_SPLIT,
|
|
15
|
+
CORRECT_TAG,
|
|
16
|
+
Output,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class CaseHOLDScenario(Scenario):
    """
    CaseHOLD QA
    CaseHOLD is a multiple choice question answering task derived from legal citations in judicial rulings.
    CaseHOLD consists of ~53,000 questions, mined from the Harvard Law Library case law corpus.

    Dataset repository
    https://huggingface.co/datasets/casehold/casehold
    Publication
    "When Does Pretraining Help? Assessing Self-Supervised Learning for Law and the CaseHOLD Dataset"
    ICAIL, 2021
    https://reglab.stanford.edu/data/casehold-benchmark/
    https://arxiv.org/abs/2104.08671

    Data content
    The citing context from the judicial decision serves as the prompt for the question.
    The answer choices are holding statements derived from citations following text in a legal decision.
    There are five answer choices for each citing text.
    The correct answer is the holding statement that corresponds to the citing text.
    The four incorrect answers are other holding statements.

    """  # noqa: E501

    name = "casehold"
    description = "CaseHOLD (Case Holdings On Legal Decisions) is a multiple choice question answering scenario where the task is to identify the relevant holding of a cited case [(Zheng et al, 2021)](https://arxiv.org/pdf/2104.08671.pdf)."  # noqa: E501
    tags = ["question_answering", "legal"]

    # Note: Skip the validation split since we don't need it
    HELM_SPLIT_NAME_TO_DATASETS_SPLIT_NAME = {TRAIN_SPLIT: "train", TEST_SPLIT: "test"}
    NUM_REFERENCES = 5

    def get_instances(self, output_path: str) -> List[Instance]:
        """Load CaseHOLD from HuggingFace and emit one five-choice instance per example."""
        cache_location: str = os.path.join(output_path, "data")
        ensure_directory_exists(cache_location)
        dataset: DatasetDict = load_dataset(
            "casehold/casehold",
            "all",
            cache_dir=cache_location,
        )

        instances: List[Instance] = []
        for helm_split, hf_split in self.HELM_SPLIT_NAME_TO_DATASETS_SPLIT_NAME.items():
            for example in dataset[hf_split]:
                answer_label: str = example["label"]
                choices = [example[f"holding_{idx}"] for idx in range(self.NUM_REFERENCES)]
                # Tag only the holding whose index matches the gold label as correct.
                refs: List[Reference] = [
                    Reference(Output(text=choice), tags=([CORRECT_TAG] if answer_label == str(idx) else []))
                    for idx, choice in enumerate(choices)
                ]
                instances.append(
                    Instance(
                        input=Input(text=example["citing_prompt"]),
                        references=refs,
                        split=helm_split,
                        id=f"id{example['example_id']}",
                    )
                )
        return instances
|