crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic. See the registry's advisory for this release for more details.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
from typing import Dict, List, Tuple, Literal
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
import dataclasses
|
|
5
|
+
|
|
6
|
+
from copy import copy
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from helm.benchmark.scenarios.scenario import (
|
|
9
|
+
Scenario,
|
|
10
|
+
Instance,
|
|
11
|
+
Reference,
|
|
12
|
+
TRAIN_SPLIT,
|
|
13
|
+
TEST_SPLIT,
|
|
14
|
+
VALID_SPLIT,
|
|
15
|
+
CORRECT_TAG,
|
|
16
|
+
Input,
|
|
17
|
+
Output,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class MELTLanguageLogicalStatement:
    """A logical statement about a subject, phrased in Vietnamese.

    Adapted from the HELM implementation of "Transformers as Soft Reasoners
    over Language" (https://arxiv.org/abs/2002.05867).
    """

    subject: str  # the individual or group this statement applies to, e.g. "An"
    subject_category: str  # the broader group the subject belongs to, e.g. "người"
    specifier_type: Literal["một", "cái"]  # the specifier (article) used for the subject

    def generate_specified_subject(self, upper=False, specifier_type=None) -> str:
        """Prefix the subject with its specifier, similar to English "a"/"the".

        Examples:
            (subject="con mèo", specifier_type="cái", upper=False) -> "cái con mèo"
            (subject="quả táo", specifier_type="một", upper=True) -> "Một quả táo"
        """
        chosen = self.specifier_type if specifier_type is None else specifier_type
        # People — whether by category or literally the word "người" — take no specifier.
        if self.subject_category == "người" or self.subject == "người":
            return self.subject
        leading = chosen[0].upper() if upper else chosen[0].lower()
        return f"{leading}{chosen[1:]} {self.subject}"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass(frozen=True)
class MELTLanguageRule(MELTLanguageLogicalStatement):
    """A rule stating that some attributes of a subject imply another attribute.

    Adapted from the HELM implementation of "Transformers as Soft Reasoners
    over Language" (https://arxiv.org/abs/2002.05867).
    """

    condition: List[str]  # attributes that must hold (all/any) for the rule to fire
    condition_conjunction: Literal["và", "hoặc"]  # Vietnamese "and" / "or"
    consequent: str  # the attribute implied when the condition holds

    def __str__(self) -> str:
        """Render the rule as "if x (and/or y) then z" in Vietnamese.

        For example, a rule with subject "An" (category "người", specifier
        "cái" or "một"), condition ["đỏ", "tốt"], conjunction "và" and
        consequent "lạnh" renders as:
        "Nếu An là đỏ và tốt, thì An là lạnh."
        """
        joined_condition = f" {self.condition_conjunction} ".join(self.condition)
        antecedent_subject = self.generate_specified_subject()
        consequent_subject = self.generate_specified_subject(specifier_type="cái")
        return f"Nếu {antecedent_subject} là {joined_condition}, thì {consequent_subject} là {self.consequent}."
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass(frozen=True)
class MELTLanguageFact(MELTLanguageLogicalStatement):
    """A statement assigning a set of attributes to a subject.

    Adapted from the HELM implementation of "Transformers as Soft Reasoners
    over Language" (https://arxiv.org/abs/2002.05867).
    """

    specific_attributes: List[str]  # more specific (synonym) versions of the attributes
    generic_attributes: List[str]  # the generic attributes assigned to the subject
    use_specific_attributes: bool  # whether to render the specific versions (hard mode)
    upper: bool = True  # whether the rendered statement starts uppercase

    def __str__(self) -> str:
        """Render the fact as a Vietnamese sentence.

        e.g. (subject="con chó", attributes=["to", "đỏ"], specifier="cái") ->
        "Cái con chó thì to và đỏ."
        """
        # No attributes means nothing can be stated about the subject.
        if not self.generic_attributes:
            return "Không có gì."
        rendered = self.specific_attributes if self.use_specific_attributes else self.generic_attributes
        subject_phrase = self.generate_specified_subject(upper=self.upper)
        return f"{subject_phrase} là {' và '.join(rendered)}."
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def get_vocab() -> Tuple[Dict[str, List[str]], Dict[str, List[str]]]:
    """Return (attribute_groups, subjects) for the synthetic reasoning natural scenario.

    ``subjects`` maps subject categories ("người", "động vật", "thực vật") to
    the candidate subjects in each category.

    ``attribute_groups`` maps a generic attribute to its group of more specific
    synonyms (including itself), e.g. knowing something is "lạnh buốt" implies
    it is "lạnh" (but not the reverse). Groups whose key merely duplicates a
    synonym of another group are dropped before returning.
    """
    # Candidate subjects, keyed by category.
    subjects: Dict[str, List[str]] = {
        "người": ["An", "Bình", "Cường", "Duy", "Đạt", "Phương"],
        "động vật": [
            "con chó",
            "con mèo",
            "con thỏ",
            "con chuột",
            "con hổ",
            "con sư tử",
            "con gấu",
            "con sóc",
            "con bò",
            "con gấu trúc",
            "con nhím",
            "con voi",
            "con hươu cao cổ",
            "con hà mã",
        ],
        "thực vật": ["hoa anh túc", "hoa bồ công anh", "cây", "hoa hồng", "hoa hướng dương"],
    }

    # Generic attribute -> list of specific synonyms (used in hard difficulty).
    attribute_groups: Dict[str, List[str]] = {
        "trẻ": ["trẻ"],
        "mềm": ["mềm"],
        "buồn": ["buồn"],
        "sợ": ["sợ"],
        "lạnh": ["lạnh", "lạnh buốt", "mát mẻ"],
        "nóng": ["nóng", "ấm"],
        "thông minh": ["thông minh", "tài giỏi", "khôn", "sáng trí"],
        "sạch": ["sạch", "ngăn nắp"],
        "nhỏ": ["nhỏ", "bé", "tí nị"],
        "to": ["to", "khổng lồ", "bự", "lớn"],
        "tốt": ["tốt", "tử tế", "tốt bụng"],
        "đẹp": ["đẹp", "xinh"],
        "đỏ": ["đỏ", "đỏ thẫm"],
        "xanh dương": ["xanh dương", "xanh lam"],
        "xanh lục": ["xanh lục", "xanh lá cây"],
        "tím": ["tím", "tím than"],
        "chán": ["chán", "đần"],
        "cũ": ["cũ", "xưa", "cổ"],
        "mạnh": ["mạnh", "mạnh mẽ", "cơ bắp"],
        "yếu": ["yếu", "yếu đuối", "mỏng manh"],
        "nhanh": ["nhanh", "mau"],
        "chậm": ["chậm", "chậm chạp"],
        "xấu": ["xấu", "xấu xa", "ác", "độc ác"],
        "hạnh phúc": ["hạnh phúc", "hân hoan", "vui mừng", "vui vẻ"],
        "tròn": ["tròn", "hình tròn", "hình cầu"],
    }

    # Collect keys that appear as a *different* group's synonym, then rebuild
    # the mapping without them (preserving insertion order).
    duplicated_keys = {
        specific
        for generic, specifics in attribute_groups.items()
        for specific in specifics
        if specific != generic and specific in attribute_groups
    }
    deduplicated: Dict[str, List[str]] = {
        generic: specifics for generic, specifics in attribute_groups.items() if generic not in duplicated_keys
    }

    return deduplicated, subjects
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def generate_rules(
    attribute_groups: Dict[str, List[str]],
    subject: str,
    subject_category: str,
    max_rules: int = 5,
    specific_category: bool = False,
) -> List[MELTLanguageRule]:
    """Generate a random set of rules about a subject.

    Given the available attributes and the category (e.g. "người") of the
    subject (e.g. "An"), produce up to ``max_rules`` rules. Because each
    attribute is consumed from a single shuffled pool, rules never contradict
    one another and a rule's consequent never feeds another rule's condition
    (only a single step of reasoning is possible).
    """
    pool = list(attribute_groups)
    random.shuffle(pool)
    generated: List[MELTLanguageRule] = []

    # Each rule consumes 2-3 attributes from the pool; stop when the pool is
    # nearly exhausted or enough rules exist.
    while len(pool) > 2 and len(generated) < max_rules:
        chosen_subject = subject if specific_category else random.choice([subject_category, subject])
        take = random.randint(2, 3)
        picked, pool = pool[:take], pool[take:]
        generated.append(
            MELTLanguageRule(
                subject=chosen_subject,
                subject_category=subject_category,
                specifier_type="một",
                condition=picked[:-1],
                condition_conjunction=random.choice(["và", "hoặc"]),
                consequent=picked[-1],
            )
        )
    return generated
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def generate_test(
    attribute_groups: Dict[str, List[str]],
    subject: str,
    subject_category: str,
    rules: List[MELTLanguageRule],
    use_specific_attributes: bool,
    p_consequenceless=0.1,
) -> Tuple[MELTLanguageFact, List[MELTLanguageRule], MELTLanguageFact]:
    """Generate a test case for a set of rules.

    Samples a fact about the subject from which something can potentially be
    deduced given ``rules``. When the sampled fact triggers no rule, it is
    kept only with probability ``p_consequenceless``; otherwise we re-roll.

    Returns (test_fact, rules_used, target_fact), where ``target_fact`` holds
    the attributes implied by ``test_fact`` under ``rules``.

    NOTE: the re-roll is a loop rather than recursion; the original recursive
    re-roll could exhaust the recursion limit when no rule can ever fire
    (e.g. ``rules`` is empty and ``p_consequenceless`` is very small or 0).
    """
    while True:
        # The generic attributes which the test fact will assign to the subject.
        test_attributes: List[str] = random.sample(list(attribute_groups.keys()), 2)
        # The specific (synonym) versions of the test attributes.
        test_attributes_specific: List[str] = [
            random.choice(attribute_groups[subcondition]) for subcondition in test_attributes
        ]
        test_consequents: List[str] = []  # attributes implied by the test attributes and rules
        test_rules_used: List[MELTLanguageRule] = []
        for rule in rules:
            # Skip rules whose consequent is already among the assigned attributes.
            if rule.consequent in test_attributes:
                continue
            if rule.condition_conjunction == "và":
                # "and": every condition attribute must be present.
                if set(rule.condition).issubset(test_attributes):
                    test_rules_used.append(rule)
                    test_consequents.append(rule.consequent)
            elif rule.condition_conjunction == "hoặc":
                # "or": at least one condition attribute must be present.
                if not set(rule.condition).isdisjoint(test_attributes):
                    test_rules_used.append(rule)
                    test_consequents.append(rule.consequent)
        if test_consequents:
            break
        # Nothing deducible: keep the fact anyway with probability
        # p_consequenceless, otherwise re-roll.
        if random.random() <= p_consequenceless:
            break

    test_fact: MELTLanguageFact = MELTLanguageFact(
        subject,
        subject_category,
        specifier_type="cái",
        specific_attributes=test_attributes_specific,
        generic_attributes=test_attributes,
        use_specific_attributes=use_specific_attributes,
    )

    # The target shares the subject/specifier of the test fact but carries the
    # deduced attributes.
    target_fact: MELTLanguageFact = dataclasses.replace(
        test_fact,
        specific_attributes=test_consequents,
        generic_attributes=test_consequents,
    )

    return test_fact, test_rules_used, target_fact
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
class MELTSRNScenario(Scenario):
    """
    Synthetic Reasoning Natural Language benchmark inspired by "Transformers as Soft Reasoners over Language"
    https://arxiv.org/abs/2002.05867
    """

    # NOTE: "sythetic" is kept as-is — this string is a runtime identifier
    # that run specs match against.
    name = "sythetic_reasoning_natural"
    description = "Language Pattern Matching"
    tags = ["reasoning", "language", "pattern_matching"]

    def __init__(self, difficulty: str, random_seed=42):
        super().__init__()
        self.attribute_groups, self.subjects = get_vocab()

        # "easy": always name the concrete subject (e.g. "con chó") instead of
        # sometimes using its category (e.g. "động vật").
        self.specific_category: bool = difficulty == "easy"
        # "hard": facts may use synonymous attributes (e.g. "ấm" for "nóng").
        self.use_specific_attributes: bool = difficulty == "hard"
        self.include_intermediates: bool = False
        self.num_train_instances: int = 1000
        self.num_val_instances: int = 5000
        self.num_test_instances: int = 5000
        self.random_seed = random_seed

    def generate_problem(
        self,
    ) -> Tuple[List[MELTLanguageRule], MELTLanguageFact, List[MELTLanguageRule], MELTLanguageFact]:
        """Sample a subject, its rules, and a (fact, used-rules, target) triple."""
        category = random.choice(list(self.subjects.keys()))
        chosen_subject = random.choice(self.subjects[category])
        rules = generate_rules(
            self.attribute_groups, chosen_subject, category, specific_category=self.specific_category
        )
        test_fact, test_rules_used, target_fact = generate_test(
            self.attribute_groups, chosen_subject, category, rules, self.use_specific_attributes
        )
        return rules, test_fact, test_rules_used, target_fact

    def get_instances(self, output_path: str) -> List[Instance]:
        """Generate all train/valid/test instances deterministically from the seed."""
        random.seed(self.random_seed)
        instances: List[Instance] = []
        total = self.num_train_instances + self.num_val_instances + self.num_test_instances

        for index in range(total):
            rules, test_fact, test_rules_used, target_fact = self.generate_problem()

            specified_subject = test_fact.generate_specified_subject(upper=False)
            prompt = "\n".join(str(rule) for rule in rules) + "\n"
            prompt += f"Sự thật:\n{test_fact}\n"
            if self.include_intermediates:
                prompt += "Luật đã dùng:\n" + "\n".join(str(test_rule) for test_rule in test_rules_used) + "\n"
            prompt += f"Những điều sau đây có thể được xác định về {specified_subject}:"

            # First block of samples is train, then validation, then test.
            if index < self.num_train_instances:
                split = TRAIN_SPLIT
            elif index < self.num_train_instances + self.num_val_instances:
                split = VALID_SPLIT
            else:
                split = TEST_SPLIT

            instances.append(
                Instance(
                    input=Input(text=prompt),
                    references=[Reference(Output(text=str(target_fact)), tags=[CORRECT_TAG])],
                    split=split,
                )
            )

        return instances
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
from typing import Dict, List, Tuple
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
from helm.benchmark.scenarios.scenario import (
|
|
6
|
+
Scenario,
|
|
7
|
+
Instance,
|
|
8
|
+
Reference,
|
|
9
|
+
TRAIN_SPLIT,
|
|
10
|
+
TEST_SPLIT,
|
|
11
|
+
VALID_SPLIT,
|
|
12
|
+
CORRECT_TAG,
|
|
13
|
+
Input,
|
|
14
|
+
Output,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Vietnamese animal names used as substitution tokens in generated patterns.
ANIMALS = [
    "con ngựa vằn",
    "con rắn hổ mang",
    "con cò",
    "con chim cánh cụt",
    "con cá mập",
    "con sư tử",
    "con trâu",
    "con cá voi",
    "con hải cẩu",
    "con đại bàng",
    "con ngựa",
    "con chuột",
]
# Vietnamese fruit names used as substitution tokens in generated patterns.
FRUITS = [
    "quả táo",
    "quả đào",
    "quả dưa hấu",
    "quả chuối",
    "quả nho",
    "quả kiwi",
    "quả lê",
    "quả dâu tây",
    "quả việt quất",
    "quả mâm xôi",
]
# Placeholder symbols in a pattern that substitutions are applied to.
RULE_SYMBOLS = ["X", "Y", "Z"]
# Operator/relation symbols that appear verbatim in a pattern.
MATH_SYMBOLS = ["+", "-", "*", "="]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def subst(pattern: List[str], rule_symbol: str, substitute_str: str) -> List[str]:
    """Replace every occurrence of one rule symbol in a pattern.

    example:
        pattern = ["A", "+", "B", "=", "B", "+", "A"]
        rule_symbol = "A"
        substitute_str = "quả táo"
        return: ["quả táo", "+", "B", "=", "B", "+", "quả táo"]

    :param pattern: A pattern (token list) representing the rule.
    :param rule_symbol: The rule symbol to replace.
    :param substitute_str: The substitution string.
    :return: A new token list with every occurrence replaced.
    """
    assert rule_symbol in pattern
    # Rebuild the token list, swapping in the substitution wherever the
    # symbol occurs.
    return [substitute_str if token == rule_symbol else token for token in pattern]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def pattern_subst(pattern: List[str], rule_symbols: List[str], substitute_dict: Dict[str, str]) -> List[str]:
    """Apply a substitution dictionary to every rule symbol in a pattern.

    example:
        pattern = ["A", "+", "B", "=", "B", "+", "A"]
        rule_symbols = ["A", "B"]
        substitute_dict = {"A": "quả táo", "B": "quả đào"}
        return: ["quả táo", "+", "quả đào", "=", "quả đào", "+", "quả táo"]

    :param pattern: A pattern (token list) representing the rule.
    :param rule_symbols: The set of rule symbols.
    :param substitute_dict: The substitution dictionary.
    :return: The fully substituted token list.
    """
    # Replace one symbol at a time; each call to subst returns a new list.
    result = pattern
    for rule_symbol in rule_symbols:
        result = subst(result, rule_symbol, substitute_dict[rule_symbol])
    return result
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class MELTSyntheticReasoningScenario(Scenario):
|
|
101
|
+
"""
|
|
102
|
+
Synthetic Reasoning benchmark inspired by
|
|
103
|
+
"LIME: Learning Inductive Bias for Primitives of Mathematical Reasoning"
|
|
104
|
+
https://arxiv.org/abs/2101.06223
|
|
105
|
+
"""
|
|
106
|
+
|
|
107
|
+
name = "synthetic_reasoning"
|
|
108
|
+
description = "Synthetic reasoning benchmark"
|
|
109
|
+
tags = ["reasoning", "language", "pattern_matching"]
|
|
110
|
+
|
|
111
|
+
def __init__(self, mode: str, random_seed=42):
    """Set up the scenario.

    :param mode: One of "variable_substitution", "pattern_match", "induction".
    :param random_seed: Seed for the scenario's private RNG (reproducibility).
    :raises ValueError: If ``mode`` is not one of the supported modes.
    """
    super().__init__()
    self.num_train_instances: int = 1000
    self.num_val_instances: int = 5000
    self.num_test_instances: int = 5000
    # Private RNG so instance generation is deterministic per seed.
    self.rng = np.random.RandomState(random_seed)
    self.mode = mode
    # Validate with an explicit exception instead of `assert`, which is
    # silently stripped when Python runs with -O.
    if self.mode not in ["variable_substitution", "pattern_match", "induction"]:
        raise ValueError(f"Unsupported mode: {self.mode}")
|
|
119
|
+
|
|
120
|
+
def gen_subst(self, rule_symbols: List[str], tokens: List[str]) -> Tuple[Dict[str, str], str]:
    """Sample a random substitution string for each rule symbol.

    :param rule_symbols: A list of rule symbols.
    :param tokens: Tokens used to construct the substitutions.
    :return: The substitution dictionary and its string rendering,
        e.g. ({"X": "quả táo"}, 'X bởi " quả táo "').
    """
    substitute_dict: Dict[str, str] = {}
    fragments: List[str] = []
    for symbol in rule_symbols:
        # numpy randint excludes the high bound, so each substitution is
        # 1 or 2 tokens long.
        length = self.rng.randint(1, 3)
        replacement = " ".join(self.rng.choice(tokens, size=length))
        substitute_dict[symbol] = replacement
        fragments.append(f'{symbol} bởi " {replacement} "')
    # Join the per-symbol fragments with " , " (no trailing comma).
    return substitute_dict, " , ".join(fragments)
|
|
142
|
+
|
|
143
|
+
def gen_pattern(self, math_symbols: List[str], rule_symbols: List[str]) -> List[str]:
    """Generate a pattern: a random interleaving of rule and math symbols.

    Example Input: math_symbols: ["+", "="], rule_symbols: ["Y", "Y", "Z"]
    Example Output (one possibility): ["Y", "Y", "+", "Z", "="]
    """
    combined = [*rule_symbols, *math_symbols]
    # In-place shuffle with the scenario's seeded RNG keeps generation
    # deterministic.
    self.rng.shuffle(combined)
    return combined
|
|
153
|
+
|
|
154
|
+
def get_instances(self, output_path: str) -> List[Instance]:
    """Generate all train/validation/test instances for the current mode.

    Every instance is built from a randomly sampled pattern of rule and math
    symbols plus one (or two) random substitutions; the (src, tgt) text pair
    depends on ``self.mode``. Generation is reproducible because ``self.rng``
    was seeded in ``__init__``.
    """
    all_instances: List[Instance] = []

    rule_pool = RULE_SYMBOLS
    token_pool = ANIMALS + FRUITS
    math_pool = MATH_SYMBOLS

    total = self.num_train_instances + self.num_val_instances + self.num_test_instances
    for idx in range(total):
        # Sample 2-3 rule symbols (randint's upper bound is exclusive); keep
        # a sorted deduplicated copy so downstream steps are deterministic.
        chosen_rules = list(self.rng.choice(rule_pool, size=self.rng.randint(2, 4)))
        unique_rules = sorted(set(chosen_rules))

        # Sample 2-3 math symbols.
        chosen_math = list(self.rng.choice(math_pool, size=self.rng.randint(2, 4)))

        # Build the target pattern from the (possibly duplicated) rule draw.
        pattern = self.gen_pattern(chosen_math, chosen_rules)

        # First substitution and its realized result.
        subst_a, subst_a_str = self.gen_subst(unique_rules, token_pool)
        result_a = pattern_subst(pattern, unique_rules, subst_a)

        # Second, independent substitution (used by the induction mode).
        subst_b, _ = self.gen_subst(unique_rules, token_pool)
        result_b = pattern_subst(pattern, unique_rules, subst_b)

        result_a_str = " ".join(result_a)
        pattern_str = " ".join(pattern)

        src: str
        tgt: str
        if self.mode == "induction":
            result_b_str = " ".join(result_b)
            src = f"Hai kết quả: {result_a_str} | {result_b_str}"
            tgt = f"Quy luật: {pattern_str}"
        elif self.mode == "variable_substitution":
            src = f"Các quy luật: {pattern_str} | Thay thế: {subst_a_str}"
            tgt = result_a_str
        elif self.mode == "pattern_match":
            # Three decoy patterns serve as negatives for matching.
            # NOTE(review): decoys are built from the deduplicated rule set
            # while the true pattern may contain duplicates, so decoy length
            # can differ from the answer's — confirm this is intended.
            decoys = [
                " ".join(self.gen_pattern(chosen_math, unique_rules)) for _ in range(3)
            ]
            candidates = decoys + [pattern_str]
            self.rng.shuffle(candidates)
            joined_candidates = " | ".join(candidates)
            src = f"Các quy luật: {joined_candidates} | Kết quả: {result_a_str}"
            tgt = pattern_str
        else:
            raise ValueError(f"Invalid mode: {self.mode}")

        # Assign the split by the sample's position in the generation order.
        if idx < self.num_train_instances:
            split = TRAIN_SPLIT
        elif idx < self.num_train_instances + self.num_val_instances:
            split = VALID_SPLIT
        else:
            split = TEST_SPLIT

        all_instances.append(
            Instance(
                input=Input(text=src),
                references=[Reference(Output(text=tgt), tags=[CORRECT_TAG])],
                split=split,
            )
        )

    return all_instances