crfm-helm: crfm_helm-0.5.4-py3-none-any.whl → crfm_helm-0.5.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic. See the release details on the registry page for more information.
- crfm_helm-0.5.5.dist-info/METADATA +413 -0
- crfm_helm-0.5.5.dist-info/RECORD +894 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +20 -5
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +98 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +218 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +132 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +34 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +34 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +8 -7
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +34 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +34 -0
- helm/benchmark/metrics/medalign_metrics.py +34 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +34 -0
- helm/benchmark/metrics/medication_qa_metrics.py +15 -4
- helm/benchmark/metrics/mental_health_metrics.py +34 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/mimic_rrs_metrics.py +34 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +34 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +34 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +34 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +1 -1
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +19 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/toxicity_metrics.py +4 -4
- helm/benchmark/metrics/unitxt_metrics.py +4 -1
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/summarize.py +23 -10
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +203 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +8 -17
- helm/benchmark/run_expander.py +78 -8
- helm/benchmark/run_spec_factory.py +12 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +613 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +260 -0
- helm/benchmark/run_specs/experimental_run_specs.py +112 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +89 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1155 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +75 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/scenarios/aci_bench_scenario.py +120 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +128 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +69 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +106 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +105 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +153 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +157 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +131 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1546 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +131 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_sum_scenario.py +82 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +22 -24
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +88 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +429 -0
- helm/benchmark/scenarios/medbullets_scenario.py +140 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +125 -0
- helm/benchmark/scenarios/medec_scenario.py +120 -0
- helm/benchmark/scenarios/medhallu_scenario.py +66 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +105 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +112 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +98 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +89 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +71 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +141 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +141 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +271 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +10 -1
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +54 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +142 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +69 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +70 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +70 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +72 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +66 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +76 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +89 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +69 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +90 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +46 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +10 -1
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +11 -0
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +752 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/schema_capabilities.yaml +254 -0
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +298 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/{schema_medical.yaml → schema_long_context.yaml} +67 -82
- helm/benchmark/static/schema_medhelm.yaml +1081 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +109 -36
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-262903c1.js +10 -0
- helm/benchmark/static_build/assets/index-42060d71.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-9cefc3c5.js} +1 -1
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +5 -5
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +1 -1
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +1 -1
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +118 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +188 -0
- helm/clients/audio_language/qwen_audiolm_client.py +150 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +201 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +10 -18
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +2 -2
- helm/clients/image_generation/dalle_mini/model/processor.py +4 -4
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +241 -22
- helm/clients/palmyra_client.py +1 -4
- helm/clients/reka_client.py +1 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +47 -7
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +39 -13
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +175 -0
- helm/clients/vllm_client.py +4 -6
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/images_utils.py +2 -2
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/request.py +6 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1112 -19
- helm/config/model_metadata.yaml +985 -44
- helm/config/tokenizer_configs.yaml +379 -3
- helm/proxy/cli.py +2 -2
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +1 -1
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +2 -2
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.5.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
# flake8: noqa
|
|
2
|
+
# type: ignore
|
|
3
|
+
# fmt: off
|
|
4
|
+
# The following code was reproduced with minor modifications to `import` statements from the following URL:
|
|
5
|
+
# https://github.com/google-research/google-research/blob/c7f60c013623e613732a096e2a0c2872491ec912/instruction_following_eval/instructions_registry.py
|
|
6
|
+
|
|
7
|
+
# coding=utf-8
|
|
8
|
+
# Copyright 2023 The Google Research Authors.
|
|
9
|
+
#
|
|
10
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
11
|
+
# you may not use this file except in compliance with the License.
|
|
12
|
+
# You may obtain a copy of the License at
|
|
13
|
+
#
|
|
14
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
15
|
+
#
|
|
16
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
17
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
18
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
19
|
+
# See the License for the specific language governing permissions and
|
|
20
|
+
# limitations under the License.
|
|
21
|
+
|
|
22
|
+
"""Registry of all instructions."""
|
|
23
|
+
import helm.benchmark.metrics.ifeval.instructions as instructions
|
|
24
|
+
|
|
25
|
+
# Category prefixes. A full instruction id is a prefix followed by the
# instruction's short name, e.g. "keywords:existence".
_KEYWORD = "keywords:"
_LANGUAGE = "language:"
_LENGTH = "length_constraints:"
_CONTENT = "detectable_content:"
_FORMAT = "detectable_format:"
_MULTITURN = "multi-turn:"
_COMBINATION = "combination:"
_STARTEND = "startend:"
_CHANGE_CASES = "change_case:"
_PUNCTUATION = "punctuation:"

# Maps each supported instruction id to the checker class registered for it.
INSTRUCTION_DICT = {
    f"{_KEYWORD}existence": instructions.KeywordChecker,
    f"{_KEYWORD}frequency": instructions.KeywordFrequencyChecker,
    # TODO(jeffreyzhou): make a proper set of sentences to choose from
    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
    f"{_KEYWORD}forbidden_words": instructions.ForbiddenWords,
    f"{_KEYWORD}letter_frequency": instructions.LetterFrequencyChecker,
    f"{_LANGUAGE}response_language": instructions.ResponseLanguageChecker,
    f"{_LENGTH}number_sentences": instructions.NumberOfSentences,
    f"{_LENGTH}number_paragraphs": instructions.ParagraphChecker,
    f"{_LENGTH}number_words": instructions.NumberOfWords,
    f"{_LENGTH}nth_paragraph_first_word": instructions.ParagraphFirstWordCheck,
    f"{_CONTENT}number_placeholders": instructions.PlaceholderChecker,
    f"{_CONTENT}postscript": instructions.PostscriptChecker,
    f"{_FORMAT}number_bullet_lists": instructions.BulletListChecker,
    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
    f"{_FORMAT}constrained_response": instructions.ConstrainedResponseChecker,
    f"{_FORMAT}number_highlighted_sections": instructions.HighlightSectionChecker,
    f"{_FORMAT}multiple_sections": instructions.SectionChecker,
    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
    # _FORMAT + "rephrase": instructions.RephraseChecker,
    f"{_FORMAT}json_format": instructions.JsonFormat,
    f"{_FORMAT}title": instructions.TitleChecker,
    # TODO(tianjianlu): Re-enable with specific prompts.
    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
    f"{_COMBINATION}two_responses": instructions.TwoResponsesChecker,
    f"{_COMBINATION}repeat_prompt": instructions.RepeatPromptThenAnswer,
    f"{_STARTEND}end_checker": instructions.EndChecker,
    f"{_CHANGE_CASES}capital_word_frequency": instructions.CapitalWordFrequencyChecker,
    f"{_CHANGE_CASES}english_capital": instructions.CapitalLettersEnglishChecker,
    f"{_CHANGE_CASES}english_lowercase": instructions.LowercaseLettersEnglishChecker,
    f"{_PUNCTUATION}no_comma": instructions.CommaChecker,
    f"{_STARTEND}quotation": instructions.QuotationChecker,
}
|
|
84
|
+
|
|
85
|
+
# Maps each instruction id to the set of instruction ids it conflicts with.
# Entries written as "every registered id minus a few" are computed from
# INSTRUCTION_DICT; each such entry gets its own freshly built set.
INSTRUCTION_CONFLICTS = {
    f"{_KEYWORD}existence": {f"{_KEYWORD}existence"},
    f"{_KEYWORD}frequency": {f"{_KEYWORD}frequency"},
    # TODO(jeffreyzhou): make a proper set of sentences to choose from
    # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
    f"{_KEYWORD}forbidden_words": {f"{_KEYWORD}forbidden_words"},
    f"{_KEYWORD}letter_frequency": {f"{_KEYWORD}letter_frequency"},
    f"{_LANGUAGE}response_language": {
        f"{_LANGUAGE}response_language",
        f"{_FORMAT}multiple_sections",
        f"{_KEYWORD}existence",
        f"{_KEYWORD}frequency",
        f"{_KEYWORD}forbidden_words",
        f"{_STARTEND}end_checker",
        f"{_CHANGE_CASES}english_capital",
        f"{_CHANGE_CASES}english_lowercase",
    },
    f"{_LENGTH}number_sentences": {f"{_LENGTH}number_sentences"},
    f"{_LENGTH}number_paragraphs": {
        f"{_LENGTH}number_paragraphs",
        f"{_LENGTH}nth_paragraph_first_word",
        f"{_LENGTH}number_sentences",
    },
    f"{_LENGTH}number_words": {f"{_LENGTH}number_words"},
    f"{_LENGTH}nth_paragraph_first_word": {
        f"{_LENGTH}nth_paragraph_first_word",
        f"{_LENGTH}number_paragraphs",
    },
    f"{_CONTENT}number_placeholders": {f"{_CONTENT}number_placeholders"},
    f"{_CONTENT}postscript": {f"{_CONTENT}postscript"},
    f"{_FORMAT}number_bullet_lists": {f"{_FORMAT}number_bullet_lists"},
    # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace
    # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
    f"{_FORMAT}constrained_response": set(INSTRUCTION_DICT),
    f"{_FORMAT}number_highlighted_sections": {f"{_FORMAT}number_highlighted_sections"},
    f"{_FORMAT}multiple_sections": {
        f"{_FORMAT}multiple_sections",
        f"{_LANGUAGE}response_language",
        f"{_FORMAT}number_highlighted_sections",
    },
    # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
    # _FORMAT + "rephrase": instructions.RephraseChecker,
    f"{_FORMAT}json_format": set(INSTRUCTION_DICT) - {
        f"{_KEYWORD}forbidden_words",
        f"{_KEYWORD}existence",
    },
    f"{_FORMAT}title": {f"{_FORMAT}title"},
    # TODO(tianjianlu): Re-enable with specific prompts.
    # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
    f"{_COMBINATION}two_responses": set(INSTRUCTION_DICT) - {
        f"{_KEYWORD}forbidden_words",
        f"{_KEYWORD}existence",
        f"{_LANGUAGE}response_language",
        f"{_FORMAT}title",
        f"{_PUNCTUATION}no_comma",
    },
    f"{_COMBINATION}repeat_prompt": set(INSTRUCTION_DICT) - {
        f"{_KEYWORD}existence",
        f"{_FORMAT}title",
        f"{_PUNCTUATION}no_comma",
    },
    f"{_STARTEND}end_checker": {f"{_STARTEND}end_checker"},
    f"{_CHANGE_CASES}capital_word_frequency": {
        f"{_CHANGE_CASES}capital_word_frequency",
        f"{_CHANGE_CASES}english_lowercase",
        f"{_CHANGE_CASES}english_capital",
    },
    f"{_CHANGE_CASES}english_capital": {f"{_CHANGE_CASES}english_capital"},
    f"{_CHANGE_CASES}english_lowercase": {
        f"{_CHANGE_CASES}english_lowercase",
        f"{_CHANGE_CASES}english_capital",
    },
    f"{_PUNCTUATION}no_comma": {f"{_PUNCTUATION}no_comma"},
    f"{_STARTEND}quotation": {f"{_STARTEND}quotation", f"{_FORMAT}title"},
}
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def conflict_make(conflicts):
    """Makes sure if A conflicts with B, B will conflict with A.

    The dictionary is updated in place and also returned.

    Args:
      conflicts: Dictionary of potential conflicts where key is instruction id
        and value is set of instruction ids that it conflicts with.

    Returns:
      Revised version of the dictionary. All instructions conflict with
      themselves. If A conflicts with B, B will conflict with A. An id that
      only appears inside a conflict set (and not as a key) gets its own entry.
    """
    # Iterate over snapshots: entries may be added to the dictionary, and two
    # keys may share one set object, so neither container may be iterated
    # directly while it is being updated.
    for key in list(conflicts):
        for other in list(conflicts[key]):
            # Create the reverse entry on demand instead of raising KeyError.
            conflicts.setdefault(other, {other}).add(key)
        conflicts[key].add(key)
    return conflicts
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# flake8: noqa
|
|
2
|
+
# type: ignore
|
|
3
|
+
# fmt: off
|
|
4
|
+
# The following code was reproduced from the following URL:
|
|
5
|
+
# https://github.com/google-research/google-research/blob/c7f60c013623e613732a096e2a0c2872491ec912/instruction_following_eval/instructions_util.py
|
|
6
|
+
|
|
7
|
+
# coding=utf-8
|
|
8
|
+
# Copyright 2023 The Google Research Authors.
|
|
9
|
+
#
|
|
10
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
11
|
+
# you may not use this file except in compliance with the License.
|
|
12
|
+
# You may obtain a copy of the License at
|
|
13
|
+
#
|
|
14
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
15
|
+
#
|
|
16
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
17
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
18
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
19
|
+
# See the License for the specific language governing permissions and
|
|
20
|
+
# limitations under the License.
|
|
21
|
+
|
|
22
|
+
"""Utility library of instructions."""
|
|
23
|
+
|
|
24
|
+
import functools
|
|
25
|
+
import random
|
|
26
|
+
import re
|
|
27
|
+
from typing import List
|
|
28
|
+
|
|
29
|
+
import immutabledict
|
|
30
|
+
import nltk
|
|
31
|
+
|
|
32
|
+
WORD_LIST = ["western", "sentence", "signal", "dump", "spot", "opposite", "bottom", "potato", "administration", "working", "welcome", "morning", "good", "agency", "primary", "wish", "responsibility", "press", "problem", "president", "steal", "brush", "read", "type", "beat", "trainer", "growth", "lock", "bone", "case", "equal", "comfortable", "region", "replacement", "performance", "mate", "walk", "medicine", "film", "thing", "rock", "tap", "total", "competition", "ease", "south", "establishment", "gather", "parking", "world", "plenty", "breath", "claim", "alcohol", "trade", "dear", "highlight", "street", "matter", "decision", "mess", "agreement", "studio", "coach", "assist", "brain", "wing", "style", "private", "top", "brown", "leg", "buy", "procedure", "method", "speed", "high", "company", "valuable", "pie", "analyst", "session", "pattern", "district", "pleasure", "dinner", "swimming", "joke", "order", "plate", "department", "motor", "cell", "spend", "cabinet", "difference", "power", "examination", "engine", "horse", "dimension", "pay", "toe", "curve", "literature", "bother", "fire", "possibility", "debate", "activity", "passage", "hello", "cycle", "background", "quiet", "author", "effect", "actor", "page", "bicycle", "error", "throat", "attack", "character", "phone", "tea", "increase", "outcome", "file", "specific", "inspector", "internal", "potential", "staff", "building", "employer", "shoe", "hand", "direction", "garden", "purchase", "interview", "study", "recognition", "member", "spiritual", "oven", "sandwich", "weird", "passenger", "particular", "response", "reaction", "size", "variation", "a", "cancel", "candy", "exit", "guest", "condition", "fly", "price", "weakness", "convert", "hotel", "great", "mouth", "mind", "song", "sugar", "suspect", "telephone", "ear", "roof", "paint", "refrigerator", "organization", "jury", "reward", "engineering", "day", "possession", "crew", "bar", "road", "description", "celebration", "score", "mark", "letter", "shower", 
"suggestion", "sir", "luck", "national", "progress", "hall", "stroke", "theory", "offer", "story", "tax", "definition", "history", "ride", "medium", "opening", "glass", "elevator", "stomach", "question", "ability", "leading", "village", "computer", "city", "grand", "confidence", "candle", "priest", "recommendation", "point", "necessary", "body", "desk", "secret", "horror", "noise", "culture", "warning", "water", "round", "diet", "flower", "bus", "tough", "permission", "week", "prompt", "connection", "abuse", "height", "save", "corner", "border", "stress", "drive", "stop", "rip", "meal", "listen", "confusion", "girlfriend", "living", "relation", "significance", "plan", "creative", "atmosphere", "blame", "invite", "housing", "paper", "drink", "roll", "silver", "drunk", "age", "damage", "smoke", "environment", "pack", "savings", "influence", "tourist", "rain", "post", "sign", "grandmother", "run", "profit", "push", "clerk", "final", "wine", "swim", "pause", "stuff", "singer", "funeral", "average", "source", "scene", "tradition", "personal", "snow", "nobody", "distance", "sort", "sensitive", "animal", "major", "negotiation", "click", "mood", "period", "arrival", "expression", "holiday", "repeat", "dust", "closet", "gold", "bad", "sail", "combination", "clothes", "emphasis", "duty", "black", "step", "school", "jump", "document", "professional", "lip", "chemical", "front", "wake", "while", "inside", "watch", "row", "subject", "penalty", "balance", "possible", "adult", "aside", "sample", "appeal", "wedding", "depth", "king", "award", "wife", "blow", "site", "camp", "music", "safe", "gift", "fault", "guess", "act", "shame", "drama", "capital", "exam", "stupid", "record", "sound", "swing", "novel", "minimum", "ratio", "machine", "shape", "lead", "operation", "salary", "cloud", "affair", "hit", "chapter", "stage", "quantity", "access", "army", "chain", "traffic", "kick", "analysis", "airport", "time", "vacation", "philosophy", "ball", "chest", "thanks", "place", "mountain", 
"advertising", "red", "past", "rent", "return", "tour", "house", "construction", "net", "native", "war", "figure", "fee", "spray", "user", "dirt", "shot", "task", "stick", "friend", "software", "promotion", "interaction", "surround", "block", "purpose", "practice", "conflict", "routine", "requirement", "bonus", "hole", "state", "junior", "sweet", "catch", "tear", "fold", "wall", "editor", "life", "position", "pound", "respect", "bathroom", "coat", "script", "job", "teach", "birth", "view", "resolve", "theme", "employee", "doubt", "market", "education", "serve", "recover", "tone", "harm", "miss", "union", "understanding", "cow", "river", "association", "concept", "training", "recipe", "relationship", "reserve", "depression", "proof", "hair", "revenue", "independent", "lift", "assignment", "temporary", "amount", "loss", "edge", "track", "check", "rope", "estimate", "pollution", "stable", "message", "delivery", "perspective", "mirror", "assistant", "representative", "witness", "nature", "judge", "fruit", "tip", "devil", "town", "emergency", "upper", "drop", "stay", "human", "neck", "speaker", "network", "sing", "resist", "league", "trip", "signature", "lawyer", "importance", "gas", "choice", "engineer", "success", "part", "external", "worker", "simple", "quarter", "student", "heart", "pass", "spite", "shift", "rough", "lady", "grass", "community", "garage", "youth", "standard", "skirt", "promise", "blind", "television", "disease", "commission", "positive", "energy", "calm", "presence", "tune", "basis", "preference", "head", "common", "cut", "somewhere", "presentation", "current", "thought", "revolution", "effort", "master", "implement", "republic", "floor", "principle", "stranger", "shoulder", "grade", "button", "tennis", "police", "collection", "account", "register", "glove", "divide", "professor", "chair", "priority", "combine", "peace", "extension", "maybe", "evening", "frame", "sister", "wave", "code", "application", "mouse", "match", "counter", "bottle", "half", 
"cheek", "resolution", "back", "knowledge", "make", "discussion", "screw", "length", "accident", "battle", "dress", "knee", "log", "package", "it", "turn", "hearing", "newspaper", "layer", "wealth", "profile", "imagination", "answer", "weekend", "teacher", "appearance", "meet", "bike", "rise", "belt", "crash", "bowl", "equivalent", "support", "image", "poem", "risk", "excitement", "remote", "secretary", "public", "produce", "plane", "display", "money", "sand", "situation", "punch", "customer", "title", "shake", "mortgage", "option", "number", "pop", "window", "extent", "nothing", "experience", "opinion", "departure", "dance", "indication", "boy", "material", "band", "leader", "sun", "beautiful", "muscle", "farmer", "variety", "fat", "handle", "director", "opportunity", "calendar", "outside", "pace", "bath", "fish", "consequence", "put", "owner", "go", "doctor", "information", "share", "hurt", "protection", "career", "finance", "force", "golf", "garbage", "aspect", "kid", "food", "boot", "milk", "respond", "objective", "reality", "raw", "ring", "mall", "one", "impact", "area", "news", "international", "series", "impress", "mother", "shelter", "strike", "loan", "month", "seat", "anything", "entertainment", "familiar", "clue", "year", "glad", "supermarket", "natural", "god", "cost", "conversation", "tie", "ruin", "comfort", "earth", "storm", "percentage", "assistance", "budget", "strength", "beginning", "sleep", "other", "young", "unit", "fill", "store", "desire", "hide", "value", "cup", "maintenance", "nurse", "function", "tower", "role", "class", "camera", "database", "panic", "nation", "basket", "ice", "art", "spirit", "chart", "exchange", "feedback", "statement", "reputation", "search", "hunt", "exercise", "nasty", "notice", "male", "yard", "annual", "collar", "date", "platform", "plant", "fortune", "passion", "friendship", "spread", "cancer", "ticket", "attitude", "island", "active", "object", "service", "buyer", "bite", "card", "face", "steak", "proposal", 
"patient", "heat", "rule", "resident", "broad", "politics", "west", "knife", "expert", "girl", "design", "salt", "baseball", "grab", "inspection", "cousin", "couple", "magazine", "cook", "dependent", "security", "chicken", "version", "currency", "ladder", "scheme", "kitchen", "employment", "local", "attention", "manager", "fact", "cover", "sad", "guard", "relative", "county", "rate", "lunch", "program", "initiative", "gear", "bridge", "breast", "talk", "dish", "guarantee", "beer", "vehicle", "reception", "woman", "substance", "copy", "lecture", "advantage", "park", "cold", "death", "mix", "hold", "scale", "tomorrow", "blood", "request", "green", "cookie", "church", "strip", "forever", "beyond", "debt", "tackle", "wash", "following", "feel", "maximum", "sector", "sea", "property", "economics", "menu", "bench", "try", "language", "start", "call", "solid", "address", "income", "foot", "senior", "honey", "few", "mixture", "cash", "grocery", "link", "map", "form", "factor", "pot", "model", "writer", "farm", "winter", "skill", "anywhere", "birthday", "policy", "release", "husband", "lab", "hurry", "mail", "equipment", "sink", "pair", "driver", "consideration", "leather", "skin", "blue", "boat", "sale", "brick", "two", "feed", "square", "dot", "rush", "dream", "location", "afternoon", "manufacturer", "control", "occasion", "trouble", "introduction", "advice", "bet", "eat", "kill", "category", "manner", "office", "estate", "pride", "awareness", "slip", "crack", "client", "nail", "shoot", "membership", "soft", "anybody", "web", "official", "individual", "pizza", "interest", "bag", "spell", "profession", "queen", "deal", "resource", "ship", "guy", "chocolate", "joint", "formal", "upstairs", "car", "resort", "abroad", "dealer", "associate", "finger", "surgery", "comment", "team", "detail", "crazy", "path", "tale", "initial", "arm", "radio", "demand", "single", "draw", "yellow", "contest", "piece", "quote", "pull", "commercial", "shirt", "contribution", "cream", "channel", 
"suit", "discipline", "instruction", "concert", "speech", "low", "effective", "hang", "scratch", "industry", "breakfast", "lay", "join", "metal", "bedroom", "minute", "product", "rest", "temperature", "many", "give", "argument", "print", "purple", "laugh", "health", "credit", "investment", "sell", "setting", "lesson", "egg", "middle", "marriage", "level", "evidence", "phrase", "love", "self", "benefit", "guidance", "affect", "you", "dad", "anxiety", "special", "boyfriend", "test", "blank", "payment", "soup", "obligation", "reply", "smile", "deep", "complaint", "addition", "review", "box", "towel", "minor", "fun", "soil", "issue", "cigarette", "internet", "gain", "tell", "entry", "spare", "incident", "family", "refuse", "branch", "can", "pen", "grandfather", "constant", "tank", "uncle", "climate", "ground", "volume", "communication", "kind", "poet", "child", "screen", "mine", "quit", "gene", "lack", "charity", "memory", "tooth", "fear", "mention", "marketing", "reveal", "reason", "court", "season", "freedom", "land", "sport", "audience", "classroom", "law", "hook", "win", "carry", "eye", "smell", "distribution", "research", "country", "dare", "hope", "whereas", "stretch", "library", "if", "delay", "college", "plastic", "book", "present", "use", "worry", "champion", "goal", "economy", "march", "election", "reflection", "midnight", "slide", "inflation", "action", "challenge", "guitar", "coast", "apple", "campaign", "field", "jacket", "sense", "way", "visual", "remove", "weather", "trash", "cable", "regret", "buddy", "beach", "historian", "courage", "sympathy", "truck", "tension", "permit", "nose", "bed", "son", "person", "base", "meat", "usual", "air", "meeting", "worth", "game", "independence", "physical", "brief", "play", "raise", "board", "she", "key", "writing", "pick", "command", "party", "yesterday", "spring", "candidate", "physics", "university", "concern", "development", "change", "string", "target", "instance", "room", "bitter", "bird", "football", "normal", 
"split", "impression", "wood", "long", "meaning", "stock", "cap", "leadership", "media", "ambition", "fishing", "essay", "salad", "repair", "today", "designer", "night", "bank", "drawing", "inevitable", "phase", "vast", "chip", "anger", "switch", "cry", "twist", "personality", "attempt", "storage", "being", "preparation", "bat", "selection", "white", "technology", "contract", "side", "section", "station", "till", "structure", "tongue", "taste", "truth", "difficulty", "group", "limit", "main", "move", "feeling", "light", "example", "mission", "might", "wait", "wheel", "shop", "host", "classic", "alternative", "cause", "agent", "consist", "table", "airline", "text", "pool", "craft", "range", "fuel", "tool", "partner", "load", "entrance", "deposit", "hate", "article", "video", "summer", "feature", "extreme", "mobile", "hospital", "flight", "fall", "pension", "piano", "fail", "result", "rub", "gap", "system", "report", "suck", "ordinary", "wind", "nerve", "ask", "shine", "note", "line", "mom", "perception", "brother", "reference", "bend", "charge", "treat", "trick", "term", "homework", "bake", "bid", "status", "project", "strategy", "orange", "let", "enthusiasm", "parent", "concentrate", "device", "travel", "poetry", "business", "society", "kiss", "end", "vegetable", "employ", "schedule", "hour", "brave", "focus", "process", "movie", "illegal", "general", "coffee", "ad", "highway", "chemistry", "psychology", "hire", "bell", "conference", "relief", "show", "neat", "funny", "weight", "quality", "club", "daughter", "zone", "touch", "tonight", "shock", "burn", "excuse", "name", "survey", "landscape", "advance", "satisfaction", "bread", "disaster", "item", "hat", "prior", "shopping", "visit", "east", "photo", "home", "idea", "father", "comparison", "cat", "pipe", "winner", "count", "lake", "fight", "prize", "foundation", "dog", "keep", "ideal", "fan", "struggle", "peak", "safety", "solution", "hell", "conclusion", "population", "strain", "alarm", "measurement", "second", 
"train", "race", "due", "insurance", "boss", "tree", "monitor", "sick", "course", "drag", "appointment", "slice", "still", "care", "patience", "rich", "escape", "emotion", "royal", "female", "childhood", "government", "picture", "will", "sock", "big", "gate", "oil", "cross", "pin", "improvement", "championship", "silly", "help", "sky", "pitch", "man", "diamond", "most", "transition", "work", "science", "committee", "moment", "fix", "teaching", "dig", "specialist", "complex", "guide", "people", "dead", "voice", "original", "break", "topic", "data", "degree", "reading", "recording", "bunch", "reach", "judgment", "lie", "regular", "set", "painting", "mode", "list", "player", "bear", "north", "wonder", "carpet", "heavy", "officer", "negative", "clock", "unique", "baby", "pain", "assumption", "disk", "iron", "bill", "drawer", "look", "double", "mistake", "finish", "future", "brilliant", "contact", "math", "rice", "leave", "restaurant", "discount", "sex", "virus", "bit", "trust", "event", "wear", "juice", "failure", "bug", "context", "mud", "whole", "wrap", "intention", "draft", "pressure", "cake", "dark", "explanation", "space", "angle", "word", "efficiency", "management", "habit", "star", "chance", "finding", "transportation", "stand", "criticism", "flow", "door", "injury", "insect", "surprise", "apartment"] # pylint: disable=line-too-long
|
|
33
|
+
|
|
34
|
+
# ISO 639-1 codes to language names.
|
|
35
|
+
# Read-only mapping built from (code, language name) pairs; the pair order is
# the mapping's iteration order.
LANGUAGE_CODES = immutabledict.immutabledict(
    (
        ("en", "English"),
        ("es", "Spanish"),
        ("pt", "Portuguese"),
        ("ar", "Arabic"),
        ("hi", "Hindi"),
        ("fr", "French"),
        ("ru", "Russian"),
        ("de", "German"),
        ("ja", "Japanese"),
        ("it", "Italian"),
        ("bn", "Bengali"),
        ("uk", "Ukrainian"),
        ("th", "Thai"),
        ("ur", "Urdu"),
        ("ta", "Tamil"),
        ("te", "Telugu"),
        ("bg", "Bulgarian"),
        ("ko", "Korean"),
        ("pl", "Polish"),
        ("he", "Hebrew"),
        ("fa", "Persian"),
        ("vi", "Vietnamese"),
        ("ne", "Nepali"),
        ("sw", "Swahili"),
        ("kn", "Kannada"),
        ("mr", "Marathi"),
        ("gu", "Gujarati"),
        ("pa", "Punjabi"),
        ("ml", "Malayalam"),
        ("fi", "Finnish"),
    )
)
|
|
67
|
+
|
|
68
|
+
# Regex fragments used by split_into_sentences to recognise periods that do
# NOT end a sentence (titles, suffixes, acronyms, domains, decimals).
_ALPHABETS = "([A-Za-z])"
_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
_DIGITS = "([0-9])"
_MULTIPLE_DOTS = r"\.{2,}"


def split_into_sentences(text):
    """Split the text into sentences.

    Periods that are not sentence boundaries are temporarily rewritten to the
    marker "<prd>", real boundaries are tagged with "<stop>", and the text is
    finally split on "<stop>". The order of the rewriting steps matters.

    Args:
      text: A string containing one or more sentences.

    Returns:
      A list of strings where each string is a sentence, stripped of
      surrounding whitespace. A trailing empty piece is dropped.
    """

    def _protect_dot_run(match):
        # A run of two or more dots is kept verbatim and also ends a sentence.
        return "<prd>" * len(match.group(0)) + "<stop>"

    # Pad with spaces so the space-anchored patterns also match at the edges.
    buf = (" " + text + " ").replace("\n", " ")

    buf = re.sub(_PREFIXES, r"\1<prd>", buf)
    buf = re.sub(_WEBSITES, r"<prd>\1", buf)
    buf = re.sub(_DIGITS + "[.]" + _DIGITS, r"\1<prd>\2", buf)
    buf = re.sub(_MULTIPLE_DOTS, _protect_dot_run, buf)
    buf = buf.replace("Ph.D.", "Ph<prd>D<prd>")

    # Initials, acronyms and company suffixes, applied strictly in this order.
    protect_steps = (
        (r"\s" + _ALPHABETS + "[.] ", r" \1<prd> "),
        (_ACRONYMS + " " + _STARTERS, r"\1<stop> \2"),
        (
            _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
            r"\1<prd>\2<prd>\3<prd>",
        ),
        (_ALPHABETS + "[.]" + _ALPHABETS + "[.]", r"\1<prd>\2<prd>"),
        (" " + _SUFFIXES + "[.] " + _STARTERS, r" \1<stop> \2"),
        (" " + _SUFFIXES + "[.]", r" \1<prd>"),
        (" " + _ALPHABETS + "[.]", r" \1<prd>"),
    )
    for pattern, template in protect_steps:
        buf = re.sub(pattern, template, buf)

    # Move closing quotes in front of the terminator, tag every remaining
    # terminator as a boundary, then restore the protected periods.
    literal_steps = (
        (".”", "”."),
        ('."', '".'),
        ('!"', '"!'),
        ('?"', '"?'),
        (".", ".<stop>"),
        ("?", "?<stop>"),
        ("!", "!<stop>"),
        ("<prd>", "."),
    )
    for old, new in literal_steps:
        buf = buf.replace(old, new)

    pieces = [chunk.strip() for chunk in buf.split("<stop>")]
    if pieces and not pieces[-1]:
        pieces.pop()
    return pieces
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def count_words(text):
    """Counts the number of words.

    A word is a maximal run of `\\w` characters, so punctuation and
    whitespace act as separators (e.g. "It's" counts as two words).

    Args:
      text: The string to count words in.

    Returns:
      The number of words found in `text`.
    """
    # Same matches as nltk.tokenize.RegexpTokenizer(r"\w+").tokenize(text),
    # without constructing a tokenizer object on every call.
    return len(re.findall(r"\w+", text))
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
@functools.lru_cache(maxsize=None)
def _get_sentence_tokenizer():
    """Loads the NLTK Punkt English sentence tokenizer.

    The result is memoized, so the resource is loaded at most once per process.
    """
    # NOTE(review): newer NLTK releases distribute Punkt data in a different,
    # non-pickle form - confirm this resource still loads with the pinned NLTK.
    punkt_resource = "nltk:tokenizers/punkt/english.pickle"
    return nltk.data.load(punkt_resource)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def count_sentences(text):
    """Count the number of sentences.

    Args:
      text: The string to split with the cached sentence tokenizer.

    Returns:
      The number of sentences the tokenizer produced for `text`.
    """
    return len(_get_sentence_tokenizer().tokenize(text))
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def generate_keywords(num_keywords):
    """Randomly generates a few keywords.

    Args:
      num_keywords: How many distinct entries to draw from WORD_LIST.

    Returns:
      A list of `num_keywords` words sampled without replacement.
    """
    chosen = random.sample(WORD_LIST, k=num_keywords)
    return chosen
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
from helm.common.hierarchical_logger import hlog
|
|
4
|
+
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
5
|
+
from helm.benchmark.adaptation.request_state import RequestState
|
|
6
|
+
from helm.benchmark.metrics.metric import Metric
|
|
7
|
+
from helm.benchmark.metrics.metric_name import MetricName
|
|
8
|
+
from helm.benchmark.metrics.metric_service import MetricService
|
|
9
|
+
from helm.benchmark.metrics.statistic import Stat
|
|
10
|
+
|
|
11
|
+
from helm.benchmark.metrics.ifeval.instructions_registry import INSTRUCTION_DICT
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class IFEvalMetric(Metric):
    """Scores a single completion against the IFEval instructions attached to its instance."""

    def evaluate_generation(
        self,
        adapter_spec: AdapterSpec,
        request_state: RequestState,
        metric_service: MetricService,
        eval_cache_path: str,
    ) -> List[Stat]:
        """Compute the fraction of the instance's instructions that the response follows.

        The instance must carry `extra_data` with `instruction_ids` and
        `instruction_kwargs`; entry `i` of the latter configures instruction `i`.
        Exactly one completion is expected on the request result.

        Returns a one-element list holding the `ifeval_strict_accuracy` stat. An
        instruction counts as not followed when the response is blank or when its
        checker raises (the error is logged rather than propagated).
        """
        instance = request_state.instance
        prompt = instance.input.text
        assert instance.extra_data
        instruction_ids = instance.extra_data["instruction_ids"]
        instruction_kwargs = instance.extra_data["instruction_kwargs"]
        assert len(instruction_ids) > 0
        assert request_state.result
        completions = request_state.result.completions
        assert len(completions) == 1, f"Got {len(completions)} completions"
        response = completions[0].text.strip()

        # The following logic was reproduced with minor modifications from the following URL:
        # https://github.com/google-research/google-research/blob/c7f60c013623e613732a096e2a0c2872491ec912/
        # instruction_following_eval/evaluation_main.py#L96-L125
        followed_flags = []
        for index, instruction_id in enumerate(instruction_ids):
            checker = INSTRUCTION_DICT[instruction_id](instruction_id)

            # Only forward the keyword arguments that were actually supplied.
            supplied_kwargs = {}
            for key, value in instruction_kwargs[index].items():
                if value is not None:
                    supplied_kwargs[key] = value
            checker.build_description(**supplied_kwargs)

            # Checkers that list "prompt" among their args are rebuilt with the prompt text.
            checker_args = checker.get_instruction_args()
            if checker_args and "prompt" in checker_args:
                checker.build_description(prompt=prompt)

            followed = False
            if response.strip():
                try:
                    followed = checker.check_following(response)
                except Exception as e:
                    hlog(f"WARNING: Instruction following checking failed with error message {e}")
            followed_flags.append(1 if followed else 0)

        accuracy = sum(followed_flags) / len(followed_flags)
        return [Stat(MetricName("ifeval_strict_accuracy")).add(accuracy)]
|
|
@@ -9,7 +9,7 @@ from helm.benchmark.metrics.statistic import Stat
|
|
|
9
9
|
from helm.benchmark.metrics.metric import Metric
|
|
10
10
|
from helm.benchmark.metrics.metric_name import MetricName
|
|
11
11
|
from helm.benchmark.metrics.metric_service import MetricService
|
|
12
|
-
from .aesthetics_scorer import AestheticsScorer
|
|
12
|
+
from helm.benchmark.metrics.image_generation.aesthetics_scorer import AestheticsScorer
|
|
13
13
|
from helm.common.multimodal_request_utils import gather_generated_image_locations
|
|
14
14
|
|
|
15
15
|
|
|
@@ -10,7 +10,7 @@ from helm.benchmark.metrics.metric import Metric
|
|
|
10
10
|
from helm.benchmark.metrics.metric_name import MetricName
|
|
11
11
|
from helm.benchmark.metrics.metric_service import MetricService
|
|
12
12
|
from helm.common.multimodal_request_utils import gather_generated_image_locations
|
|
13
|
-
from .detectors.vitdet import ViTDetDetector
|
|
13
|
+
from helm.benchmark.metrics.image_generation.detectors.vitdet import ViTDetDetector
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class DetectionMetric(Metric):
|
|
@@ -8,7 +8,7 @@ from helm.common.general import ensure_file_downloaded, hlog
|
|
|
8
8
|
from helm.common.images_utils import open_image
|
|
9
9
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
10
10
|
from helm.common.gpu_utils import get_torch_device
|
|
11
|
-
from .base_detector import BaseDetector
|
|
11
|
+
from helm.benchmark.metrics.image_generation.detectors.base_detector import BaseDetector
|
|
12
12
|
|
|
13
13
|
|
|
14
14
|
MODEL_CONFIG_DOWNLOAD_URL: str = "https://drive.google.com/uc?id=1MLuwQ0ZN0gJQ42oVCc0aFz6Rneb1g3Rt"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import os
|
|
2
2
|
|
|
3
|
-
from .fractal_dimension_util import compute_fractal_dimension
|
|
3
|
+
from helm.benchmark.metrics.image_generation.fractal_dimension.fractal_dimension_util import compute_fractal_dimension
|
|
4
4
|
|
|
5
5
|
|
|
6
6
|
def fractal_dimension_test(image_filename: str, expected_fractal_dimension: float):
|
|
@@ -10,7 +10,7 @@ from helm.benchmark.metrics.metric import Metric
|
|
|
10
10
|
from helm.benchmark.metrics.metric_name import MetricName
|
|
11
11
|
from helm.benchmark.metrics.metric_service import MetricService
|
|
12
12
|
from helm.common.multimodal_request_utils import gather_generated_image_locations
|
|
13
|
-
from .fractal_dimension.fractal_dimension_util import compute_fractal_dimension
|
|
13
|
+
from helm.benchmark.metrics.image_generation.fractal_dimension.fractal_dimension_util import compute_fractal_dimension
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class FractalDimensionMetric(Metric):
|
|
@@ -10,7 +10,7 @@ from helm.benchmark.metrics.metric_service import MetricService
|
|
|
10
10
|
from helm.benchmark.metrics.toxicity_utils import is_prompt_toxic
|
|
11
11
|
from helm.clients.image_generation.dalle2_client import DALLE2Client
|
|
12
12
|
from helm.common.images_utils import is_blacked_out_image
|
|
13
|
-
from .nsfw_detector import NSFWDetector
|
|
13
|
+
from helm.benchmark.metrics.image_generation.nsfw_detector import NSFWDetector
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
class NSFWMetric(Metric):
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import os
|
|
2
|
+
import pytest
|
|
2
3
|
|
|
3
|
-
from .q16_toxicity_detector import Q16ToxicityDetector
|
|
4
|
+
from helm.benchmark.metrics.image_generation.q16.q16_toxicity_detector import Q16ToxicityDetector
|
|
4
5
|
|
|
5
6
|
|
|
7
|
+
@pytest.mark.skip(reason="Skipping due to flakiness.")
|
|
6
8
|
class TestQ16:
|
|
7
9
|
def setup_method(self, method):
|
|
8
10
|
self._q16_detector = Q16ToxicityDetector()
|
|
@@ -9,7 +9,7 @@ from helm.benchmark.metrics.metric import Metric
|
|
|
9
9
|
from helm.benchmark.metrics.metric_name import MetricName
|
|
10
10
|
from helm.benchmark.metrics.metric_service import MetricService
|
|
11
11
|
from helm.benchmark.metrics.toxicity_utils import is_prompt_toxic
|
|
12
|
-
from .q16.q16_toxicity_detector import Q16ToxicityDetector
|
|
12
|
+
from helm.benchmark.metrics.image_generation.q16.q16_toxicity_detector import Q16ToxicityDetector
|
|
13
13
|
from helm.common.multimodal_request_utils import gather_generated_image_locations
|
|
14
14
|
|
|
15
15
|
|
|
@@ -92,9 +92,9 @@ class SkinToneMetric(Metric):
|
|
|
92
92
|
and (Cr <= ((-2.2857 * Cb) + 432.85))
|
|
93
93
|
):
|
|
94
94
|
|
|
95
|
-
blue.append(img_rgba[i, j].item(
|
|
95
|
+
blue.append(img_rgba[i, j].item(2))
|
|
96
96
|
green.append(img_rgba[i, j].item(1))
|
|
97
|
-
red.append(img_rgba[i, j].item(
|
|
97
|
+
red.append(img_rgba[i, j].item(0))
|
|
98
98
|
else:
|
|
99
99
|
img_rgba[i, j] = [0, 0, 0, 0]
|
|
100
100
|
|
|
@@ -9,7 +9,7 @@ from helm.benchmark.metrics.metric import Metric
|
|
|
9
9
|
from helm.benchmark.metrics.metric_name import MetricName
|
|
10
10
|
from helm.benchmark.metrics.metric_service import MetricService
|
|
11
11
|
from helm.common.multimodal_request_utils import gather_generated_image_locations
|
|
12
|
-
from .watermark.watermark_detector import WatermarkDetector
|
|
12
|
+
from helm.benchmark.metrics.image_generation.watermark.watermark_detector import WatermarkDetector
|
|
13
13
|
|
|
14
14
|
|
|
15
15
|
class WatermarkMetric(Metric):
|
|
@@ -3,10 +3,10 @@ from typing import Dict, List
|
|
|
3
3
|
|
|
4
4
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
5
5
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
6
|
-
from .metric import Metric
|
|
7
|
-
from .metric_name import MetricName
|
|
8
|
-
from .metric_service import MetricService
|
|
9
|
-
from .statistic import Stat
|
|
6
|
+
from helm.benchmark.metrics.metric import Metric
|
|
7
|
+
from helm.benchmark.metrics.metric_name import MetricName
|
|
8
|
+
from helm.benchmark.metrics.metric_service import MetricService
|
|
9
|
+
from helm.benchmark.metrics.statistic import Stat
|
|
10
10
|
from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType
|
|
11
11
|
|
|
12
12
|
|
|
@@ -11,10 +11,10 @@ from helm.benchmark.metrics.efficiency_metrics import EfficiencyMetric
|
|
|
11
11
|
|
|
12
12
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
13
13
|
from helm.benchmark.adaptation.adapter_spec import AdapterSpec
|
|
14
|
-
from .metric import MetricInterface, MetricResult, PerInstanceStats, add_context
|
|
15
|
-
from .metric_name import MetricContext, MetricName
|
|
16
|
-
from .metric_service import MetricService
|
|
17
|
-
from .statistic import Stat, merge_stat
|
|
14
|
+
from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
|
|
15
|
+
from helm.benchmark.metrics.metric_name import MetricContext, MetricName
|
|
16
|
+
from helm.benchmark.metrics.metric_service import MetricService
|
|
17
|
+
from helm.benchmark.metrics.statistic import Stat, merge_stat
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
class LanguageModelingMetric(MetricInterface):
|
|
@@ -3,8 +3,8 @@ from typing import List
|
|
|
3
3
|
from helm.benchmark.adaptation.request_state import RequestState
|
|
4
4
|
from helm.benchmark.metrics.evaluate_instances_metric import EvaluateInstancesMetric
|
|
5
5
|
from helm.common.optional_dependencies import handle_module_not_found_error
|
|
6
|
-
from .metric_name import MetricName
|
|
7
|
-
from .statistic import Stat
|
|
6
|
+
from helm.benchmark.metrics.metric_name import MetricName
|
|
7
|
+
from helm.benchmark.metrics.statistic import Stat
|
|
8
8
|
|
|
9
9
|
try:
|
|
10
10
|
from sacrebleu.metrics import BLEU
|