crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic in the registry.
- crfm_helm-0.5.6.dist-info/METADATA +427 -0
- crfm_helm-0.5.6.dist-info/RECORD +941 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapter_spec.py +13 -1
- helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
- helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
- helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
- helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
- helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
- helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
- helm/benchmark/adaptation/common_adapter_specs.py +69 -4
- helm/benchmark/adaptation/prompt.py +1 -1
- helm/benchmark/annotation/aci_bench_annotator.py +95 -0
- helm/benchmark/annotation/air_bench_annotator.py +21 -6
- helm/benchmark/annotation/annotator.py +5 -0
- helm/benchmark/annotation/annotator_factory.py +3 -20
- helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
- helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
- helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
- helm/benchmark/annotation/bird_sql_annotator.py +58 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
- helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
- helm/benchmark/annotation/dischargeme_annotator.py +107 -0
- helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
- helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +99 -0
- helm/benchmark/annotation/medalign_annotator.py +100 -0
- helm/benchmark/annotation/medi_qa_annotator.py +98 -0
- helm/benchmark/annotation/medication_qa_annotator.py +87 -63
- helm/benchmark/annotation/mental_health_annotator.py +98 -0
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +214 -6
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
- helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
- helm/benchmark/annotation/omni_math_annotator.py +131 -0
- helm/benchmark/annotation/spider_annotator.py +18 -0
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
- helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
- helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
- helm/benchmark/annotation/wildbench_annotator.py +119 -0
- helm/benchmark/annotation_executor.py +35 -15
- helm/benchmark/augmentations/cleva_perturbation.py +9 -8
- helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
- helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
- helm/benchmark/augmentations/dialect_perturbation.py +4 -5
- helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
- helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
- helm/benchmark/augmentations/gender_perturbation.py +2 -2
- helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
- helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
- helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
- helm/benchmark/augmentations/person_name_perturbation.py +4 -5
- helm/benchmark/augmentations/perturbation.py +1 -1
- helm/benchmark/augmentations/space_perturbation.py +2 -2
- helm/benchmark/augmentations/suffix_perturbation.py +2 -2
- helm/benchmark/augmentations/synonym_perturbation.py +4 -3
- helm/benchmark/augmentations/test_perturbation.py +16 -13
- helm/benchmark/augmentations/translate_perturbation.py +2 -2
- helm/benchmark/augmentations/typos_perturbation.py +2 -2
- helm/benchmark/data_preprocessor.py +2 -2
- helm/benchmark/executor.py +11 -12
- helm/benchmark/huggingface_registration.py +2 -7
- helm/benchmark/metrics/aci_bench_metrics.py +14 -0
- helm/benchmark/metrics/basic_metrics.py +6 -6
- helm/benchmark/metrics/bbq_metrics.py +2 -2
- helm/benchmark/metrics/bias_metrics.py +12 -3
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
- helm/benchmark/metrics/bird_sql_metrics.py +28 -0
- helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
- helm/benchmark/metrics/classification_metrics.py +76 -12
- helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
- helm/benchmark/metrics/code_metrics.py +5 -5
- helm/benchmark/metrics/comet_metric.py +125 -0
- helm/benchmark/metrics/common_metric_specs.py +9 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
- helm/benchmark/metrics/copyright_metrics.py +4 -4
- helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
- helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +14 -0
- helm/benchmark/metrics/disinformation_metrics.py +4 -4
- helm/benchmark/metrics/dry_run_metrics.py +5 -5
- helm/benchmark/metrics/efficiency_metrics.py +6 -6
- helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
- helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
- helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
- helm/benchmark/metrics/ifeval/__init__.py +0 -0
- helm/benchmark/metrics/ifeval/instructions.py +1574 -0
- helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
- helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
- helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
- helm/benchmark/metrics/ifeval_metrics.py +55 -0
- helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
- helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
- helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
- helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
- helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/language_modeling_metrics.py +4 -4
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/machine_translation_metrics.py +2 -2
- helm/benchmark/metrics/med_dialog_metrics.py +14 -0
- helm/benchmark/metrics/medalign_metrics.py +14 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
- helm/benchmark/metrics/medec_metrics.py +101 -0
- helm/benchmark/metrics/medi_qa_metrics.py +14 -0
- helm/benchmark/metrics/medication_qa_metrics.py +10 -19
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +14 -0
- helm/benchmark/metrics/metric.py +3 -3
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
- helm/benchmark/metrics/nltk_helper.py +32 -0
- helm/benchmark/metrics/numeracy_metrics.py +4 -4
- helm/benchmark/metrics/omni_math_metrics.py +32 -0
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/output_processing_metric.py +60 -0
- helm/benchmark/metrics/output_processors.py +15 -0
- helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
- helm/benchmark/metrics/ranking_metrics.py +3 -3
- helm/benchmark/metrics/reference_metric.py +3 -3
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
- helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
- helm/benchmark/metrics/spider_metrics.py +7 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
- helm/benchmark/metrics/statistic.py +1 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
- helm/benchmark/metrics/summarization_metrics.py +20 -9
- helm/benchmark/metrics/test_bias_metrics.py +5 -1
- helm/benchmark/metrics/test_classification_metrics.py +140 -68
- helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
- helm/benchmark/metrics/test_metric.py +1 -1
- helm/benchmark/metrics/test_statistic.py +2 -2
- helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
- helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
- helm/benchmark/metrics/toxicity_metrics.py +6 -6
- helm/benchmark/metrics/unitxt_metrics.py +7 -5
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/metrics/wildbench_metrics.py +34 -0
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/model_metadata_registry.py +16 -0
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +97 -67
- helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
- helm/benchmark/reeval_run.py +202 -0
- helm/benchmark/reeval_runner.py +355 -0
- helm/benchmark/run.py +86 -90
- helm/benchmark/run_expander.py +90 -9
- helm/benchmark/run_spec_factory.py +13 -0
- helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
- helm/benchmark/run_specs/audio_run_specs.py +657 -0
- helm/benchmark/run_specs/call_center_run_specs.py +49 -0
- helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
- helm/benchmark/run_specs/classic_run_specs.py +1 -69
- helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
- helm/benchmark/run_specs/experimental_run_specs.py +142 -3
- helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
- helm/benchmark/run_specs/lite_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +141 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
- helm/benchmark/run_specs/oab_exams_specs.py +32 -0
- helm/benchmark/run_specs/safety_run_specs.py +37 -0
- helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/sql_run_specs.py +54 -0
- helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
- helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
- helm/benchmark/run_specs/vlm_run_specs.py +103 -2
- helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
- helm/benchmark/scenarios/air_bench_scenario.py +6 -1
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/__init__.py +0 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
- helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
- helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
- helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
- helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
- helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
- helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
- helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
- helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
- helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
- helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
- helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
- helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
- helm/benchmark/scenarios/banking77_scenario.py +6 -1
- helm/benchmark/scenarios/bbq_scenario.py +1 -1
- helm/benchmark/scenarios/big_bench_scenario.py +11 -1
- helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
- helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
- helm/benchmark/scenarios/blimp_scenario.py +1 -1
- helm/benchmark/scenarios/bold_scenario.py +1 -1
- helm/benchmark/scenarios/boolq_scenario.py +1 -1
- helm/benchmark/scenarios/casehold_scenario.py +79 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
- helm/benchmark/scenarios/clear_scenario.py +157 -0
- helm/benchmark/scenarios/cleva_scenario.py +2 -2
- helm/benchmark/scenarios/code_scenario.py +17 -4
- helm/benchmark/scenarios/commonsense_scenario.py +1 -1
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
- helm/benchmark/scenarios/copyright_scenario.py +1 -1
- helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
- helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
- helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
- helm/benchmark/scenarios/disinformation_scenario.py +10 -1
- helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
- helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
- helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
- helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
- helm/benchmark/scenarios/gpqa_scenario.py +80 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/grammar_scenario.py +2 -2
- helm/benchmark/scenarios/gsm_scenario.py +10 -1
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
- helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
- helm/benchmark/scenarios/headqa_scenario.py +136 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
- helm/benchmark/scenarios/ice_scenario.py +8 -4
- helm/benchmark/scenarios/ifeval_scenario.py +53 -0
- helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
- helm/benchmark/scenarios/imdb_scenario.py +11 -2
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
- helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
- helm/benchmark/scenarios/koala_scenario.py +1 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
- helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
- helm/benchmark/scenarios/legal_support_scenario.py +11 -1
- helm/benchmark/scenarios/legalbench_scenario.py +22 -3
- helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
- helm/benchmark/scenarios/lextreme_scenario.py +11 -1
- helm/benchmark/scenarios/live_qa_scenario.py +1 -1
- helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
- helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
- helm/benchmark/scenarios/math_scenario.py +9 -1
- helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
- helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
- helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
- helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
- helm/benchmark/scenarios/med_qa_scenario.py +10 -1
- helm/benchmark/scenarios/medalign_scenario.py +94 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
- helm/benchmark/scenarios/medbullets_scenario.py +145 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
- helm/benchmark/scenarios/medec_scenario.py +125 -0
- helm/benchmark/scenarios/medhallu_scenario.py +72 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +123 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
- helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
- helm/benchmark/scenarios/mmlu_scenario.py +11 -1
- helm/benchmark/scenarios/msmarco_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
- helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
- helm/benchmark/scenarios/newsqa_scenario.py +1 -1
- helm/benchmark/scenarios/numeracy_scenario.py +12 -2
- helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
- helm/benchmark/scenarios/omni_math_scenario.py +53 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
- helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
- helm/benchmark/scenarios/quac_scenario.py +10 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
- helm/benchmark/scenarios/raft_scenario.py +17 -2
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
- helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
- helm/benchmark/scenarios/scenario.py +9 -1
- helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
- helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
- helm/benchmark/scenarios/spider_scenario.py +91 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
- helm/benchmark/scenarios/summarization_scenario.py +11 -1
- helm/benchmark/scenarios/sumosum_scenario.py +157 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
- helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
- helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
- helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
- helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
- helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
- helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
- helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
- helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/test_math_scenario.py +1 -0
- helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
- helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
- helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
- helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
- helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
- helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
- helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
- helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
- helm/benchmark/scenarios/the_pile_scenario.py +1 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
- helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
- helm/benchmark/scenarios/unitxt_scenario.py +8 -2
- helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
- helm/benchmark/scenarios/vicuna_scenario.py +1 -1
- helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
- helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
- helm/benchmark/scenarios/wikifact_scenario.py +11 -1
- helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
- helm/benchmark/scenarios/wildbench_scenario.py +83 -0
- helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
- helm/benchmark/scenarios/xstest_scenario.py +1 -1
- helm/benchmark/server.py +13 -1
- helm/benchmark/slurm_runner.py +1 -1
- helm/benchmark/static/schema_audio.yaml +763 -0
- helm/benchmark/static/schema_autobencher.yaml +150 -0
- helm/benchmark/static/schema_call_center.yaml +97 -60
- helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
- helm/benchmark/static/schema_czech_bank.yaml +148 -0
- helm/benchmark/static/schema_enem_challenge.yaml +146 -0
- helm/benchmark/static/schema_enterprise.yaml +319 -0
- helm/benchmark/static/schema_finance.yaml +14 -12
- helm/benchmark/static/schema_heim.yaml +1389 -0
- helm/benchmark/static/schema_long_context.yaml +283 -0
- helm/benchmark/static/schema_medhelm.yaml +1140 -0
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
- helm/benchmark/static/schema_safety.yaml +18 -1
- helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_social_audio.yaml +224 -0
- helm/benchmark/static/schema_sql.yaml +171 -0
- helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
- helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
- helm/benchmark/static/schema_vhelm.yaml +129 -56
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
- helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
- helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
- helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
- helm/benchmark/static_build/config.js +1 -1
- helm/benchmark/static_build/index.html +6 -6
- helm/benchmark/window_services/default_window_service.py +1 -1
- helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
- helm/benchmark/window_services/ice_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
- helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
- helm/benchmark/window_services/local_window_service.py +2 -2
- helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
- helm/benchmark/window_services/test_bloom_window_service.py +3 -3
- helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
- helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
- helm/benchmark/window_services/test_gptj_window_service.py +8 -3
- helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
- helm/benchmark/window_services/test_openai_window_service.py +8 -3
- helm/benchmark/window_services/test_opt_window_service.py +3 -3
- helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
- helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
- helm/benchmark/window_services/test_t511b_window_service.py +3 -3
- helm/benchmark/window_services/test_ul2_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +4 -5
- helm/benchmark/window_services/test_yalm_window_service.py +3 -3
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/benchmark/window_services/yalm_window_service.py +1 -1
- helm/clients/ai21_client.py +3 -3
- helm/clients/aleph_alpha_client.py +1 -1
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/__init__.py +0 -0
- helm/clients/audio_language/diva_llama_client.py +120 -0
- helm/clients/audio_language/llama_omni_client.py +198 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
- helm/clients/audio_language/qwen_audiolm_client.py +152 -0
- helm/clients/audio_language/test.py +62 -0
- helm/clients/auto_client.py +4 -2
- helm/clients/azure_openai_client.py +55 -0
- helm/clients/bedrock_client.py +203 -7
- helm/clients/bedrock_utils.py +33 -0
- helm/clients/client.py +7 -7
- helm/clients/clip_scorers/clip_scorer.py +1 -1
- helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
- helm/clients/cohere_client.py +3 -3
- helm/clients/google_client.py +1 -1
- helm/clients/grok_client.py +36 -0
- helm/clients/http_model_client.py +1 -1
- helm/clients/huggingface_client.py +52 -21
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/ibm_client.py +267 -0
- helm/clients/image_generation/adobe_vision_client.py +1 -1
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
- helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
- helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
- helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
- helm/clients/image_generation/cogview2_client.py +1 -1
- helm/clients/image_generation/dalle2_client.py +1 -1
- helm/clients/image_generation/dalle3_client.py +2 -2
- helm/clients/image_generation/dalle_mini/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/data.py +1 -1
- helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
- helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
- helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
- helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
- helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
- helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
- helm/clients/image_generation/dalle_mini_client.py +1 -1
- helm/clients/image_generation/deep_floyd_client.py +1 -1
- helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
- helm/clients/image_generation/lexica_client.py +1 -1
- helm/clients/image_generation/mindalle/models/__init__.py +6 -6
- helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
- helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
- helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
- helm/clients/image_generation/mindalle_client.py +1 -1
- helm/clients/image_generation/together_image_generation_client.py +1 -1
- helm/clients/lit_gpt_client.py +2 -2
- helm/clients/mistral_client.py +62 -18
- helm/clients/nvidia_nim_client.py +0 -3
- helm/clients/openai_client.py +308 -43
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +3 -9
- helm/clients/reka_client.py +3 -3
- helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
- helm/clients/stanfordhealthcare_claude_client.py +31 -0
- helm/clients/stanfordhealthcare_google_client.py +43 -0
- helm/clients/stanfordhealthcare_http_model_client.py +93 -0
- helm/clients/stanfordhealthcare_openai_client.py +62 -0
- helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
- helm/clients/test_client.py +1 -1
- helm/clients/test_together_client.py +6 -1
- helm/clients/together_client.py +76 -9
- helm/clients/upstage_client.py +23 -0
- helm/clients/vertexai_client.py +45 -13
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/open_flamingo/__init__.py +2 -2
- helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
- helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
- helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +188 -0
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +4 -6
- helm/clients/writer_client.py +102 -0
- helm/clients/yi_client.py +0 -3
- helm/common/audio_utils.py +111 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/file_caches/local_file_cache.py +1 -1
- helm/common/file_caches/test_local_file_cache.py +1 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/images_utils.py +2 -2
- helm/common/local_context.py +140 -0
- helm/common/media_object.py +2 -2
- helm/common/multimodal_request_utils.py +26 -0
- helm/common/reeval_parameters.py +12 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +14 -2
- helm/common/response_format.py +18 -0
- helm/common/test_media_object.py +1 -1
- helm/config/model_deployments.yaml +1792 -28
- helm/config/model_metadata.yaml +1606 -51
- helm/config/tokenizer_configs.yaml +521 -4
- helm/proxy/cli.py +5 -3
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/example_queries.py +1 -1
- helm/proxy/server.py +11 -4
- helm/proxy/services/remote_service.py +1 -1
- helm/proxy/services/server_service.py +22 -86
- helm/proxy/services/test_remote_service.py +2 -2
- helm/proxy/services/test_service.py +1 -1
- helm/proxy/static/general.js +122 -0
- helm/proxy/static/help.html +99 -0
- helm/proxy/static/index.css +57 -0
- helm/proxy/static/index.html +40 -0
- helm/proxy/static/index.js +456 -0
- helm/proxy/static/info-icon.png +0 -0
- helm/proxy/test_retry.py +1 -1
- helm/proxy/token_counters/auto_token_counter.py +1 -1
- helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
- helm/tokenizers/caching_tokenizer.py +2 -30
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/http_model_tokenizer.py +1 -1
- helm/tokenizers/huggingface_tokenizer.py +3 -3
- helm/tokenizers/lit_gpt_tokenizer.py +1 -1
- helm/tokenizers/test_anthropic_tokenizer.py +6 -2
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/tokenizers/test_huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_yalm_tokenizer.py +1 -1
- helm/tokenizers/tiktoken_tokenizer.py +1 -1
- helm/tokenizers/tokenizer.py +3 -1
- helm/tokenizers/yalm_tokenizer.py +3 -3
- helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
- crfm_helm-0.5.4.dist-info/METADATA +0 -350
- crfm_helm-0.5.4.dist-info/RECORD +0 -697
- helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
- helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
- helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
- helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
- helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
- helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
- helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
- helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
- helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
- helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
- helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
- helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
- helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
- helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
- helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
- helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
- helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
- helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
- helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
- helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
- helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
- helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
- helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
- helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
- helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
- helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
- helm/tokenizers/anthropic_tokenizer.py +0 -52
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py (new file)
@@ -0,0 +1,277 @@
+import os
+import re
+
+from typing import Any, Dict, List, Tuple, Optional
+import xml.etree.ElementTree as ET
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    Input,
+    Scenario,
+    Instance,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+
+ORIGINAL_DEFINITIONS = {
+    "ABDOMINAL": "History of intra-abdominal surgery, small or large intestine resection, or small bowel obstruction",
+    "ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define “advanced” \
+as having 2 or more of the following: • Taking 2 or more medications to treat CAD • \
+History of myocardial infarction (MI) • Currently experiencing angina • Ischemia, past or present",
+    "ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
+    "ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI)",
+    "CREATININE": "Serum creatinine level above the upper normal limit",
+    "DIETSUPP-2MOS": "Taken a dietary supplement (excluding vitamin D) in the past 2 months",
+    "DRUG-ABUSE": "Current or past history of drug abuse",
+    "ENGLISH": "Patient must speak English",
+    "HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
+    "KETO-1YR": "Diagnosis of ketoacidosis within the past year",
+    "MAJOR-DIABETES": "Major diabetes-related complication. For the purposes of this annotation, we define \
+“major complication” (as opposed to “minor complication”) as any of the following that are a result of \
+(or strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • \
+Retinopathy • nephropathy • neuropathy",
+    "MAKES-DECISIONS": "Patient must make their own medical decisions",
+    "MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
+}
+# Custom definitions for better prompts
+LONG_DEFINITIONS = {
+    "ABDOMINAL": "History of intra-abdominal surgery. This could include any form of intra-abdominal surgery, \
+including but not limited to small/large intestine resection or small bowel obstruction",
+    "ADVANCED-CAD": "Advanced cardiovascular disease (CAD). For the purposes of this annotation, we define \
+“advanced” as having 2 or more of the following: (a) Taking 2 or more medications to treat CAD (b) History \
+of myocardial infarction (MI) (c) Currently experiencing angina (d) Ischemia, past or present. \
+The patient must have at least 2 of these categories (a,b,c,d) to meet this criterion, otherwise the patient \
+does not meet this criterion. For ADVANCED-CAD, be strict in your evaluation of the patient -- if they just \
+have cardiovascular disease, then they do not meet this criterion.",
+    "ALCOHOL-ABUSE": "Current alcohol use over weekly recommended limits",
+    "ASP-FOR-MI": "Use of aspirin for preventing myocardial infarction (MI).",
+    "CREATININE": "Serum creatinine level above the upper normal limit",
+    "DIETSUPP-2MOS": "Consumption of a dietary supplement (excluding vitamin D) in the past 2 months. To assess \
+this criterion, go through the list of medications_and_supplements taken from the note. If a substance could \
+potentially be used as a dietary supplement (i.e. it is commonly used as a dietary supplement, even if it \
+is not explicitly stated as being used as a dietary supplement), then the patient meets this criterion. \
+Be lenient and broad in what is considered a dietary supplement. For example, a 'multivitamin' and \
+'calcium carbonate' should always be considered a dietary supplement if they are included in this list.",
+    "DRUG-ABUSE": "Current or past history of drug abuse",
+    "ENGLISH": "Patient speaks English. Assume that the patient speaks English, unless otherwise explicitly noted. \
+If the patient's language is not mentioned in the note, then assume they speak English and thus meet \
+this criterion.",
+    "HBA1C": "Any hemoglobin A1c (HbA1c) value between 6.5% and 9.5%",
+    "KETO-1YR": "Diagnosis of ketoacidosis within the past year",
+    "MAJOR-DIABETES": "Major diabetes-related complication. Examples of “major complication” (as opposed to \
+“minor complication”) include, but are not limited to, any of the following that are a result of (or \
+strongly correlated with) uncontrolled diabetes: • Amputation • Kidney damage • Skin conditions • Retinopathy \
+• nephropathy • neuropathy. Additionally, if multiple conditions together imply a severe case of diabetes, \
+then count that as a major complication.",
+    "MAKES-DECISIONS": "Patient must make their own medical decisions. Assume that the patient makes their own \
+medical decisions, unless otherwise explicitly noted. If there is no information provided about the \
+patient's ability to make their own medical decisions, then assume they do make their own decisions and \
+therefore meet this criterion.",
+    "MI-6MOS": "Myocardial infarction (MI) within the past 6 months",
+}
+
+
+class XMLDataLoader:
+    def __init__(
+        self, path_to_folder: str, is_convert_to_numbers=True, is_split_text=True, is_remove_excessive_new_lines=True
+    ):
+        self.path_to_folder = path_to_folder
+        self.is_convert_to_numbers = is_convert_to_numbers
+        self.is_split_text = is_split_text
+        self.is_remove_excessive_new_lines = is_remove_excessive_new_lines
+
+    def load_data(self) -> List[Dict[str, Any]]:
+        """Main function: Data loader for the XML files"""
+        data = []
+        file_names = os.listdir(self.path_to_folder)
+        file_names = sorted([file for file in file_names if file.endswith(".xml")])
+        for file_name in file_names:
+            file_path = os.path.join(self.path_to_folder, file_name)
+            text, labels = self.parse_xml(file_path)
+            data.append({"patient_id": file_name.replace(".xml", ""), "ehr": text, "labels": labels})
+
+        return data
+
+    @staticmethod
+    def get_date_of_note(patient: Dict[str, Any], note_idx: int) -> Optional[str]:
+        """Get date of note for patient"""
+        assert note_idx <= len(patient["ehr"]), f"{note_idx} out of bounds for {patient['patient_id']}"
+        note: str = patient["ehr"][note_idx]
+        match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
+        date = match.group(1) if match else None
+        if not date:
+            print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
+        return date
+
+    @staticmethod
+    def get_current_date_for_patient(patient: Dict[str, Any]) -> Optional[str]:
+        """Get most recent date visible in files for a given patient"""
+        most_recent_date = None
+        for note in patient["ehr"]:
+            match = re.search(r"Record date: (\d{4}-\d{2}-\d{2})", note)
+            most_recent_date = match.group(1) if match else most_recent_date
+        if not most_recent_date:
+            print(f"ERROR - Could not find the date for patient {patient['patient_id']}")
+        return most_recent_date
+
+    def parse_xml(self, XML_file) -> Tuple[List[str], Dict[str, str]]:
+        tree = ET.parse(XML_file)
+        root = tree.getroot()
+        text_content = ""
+        result_text: List[str] = []
+        tags = {}
+        for elem in root.iter():
+            if elem.tag == "TEXT":
+                text_content = elem.text if elem.text else ""
+                if self.is_remove_excessive_new_lines:
+                    text_content = self.remove_excessive_newlines(text_content)
+                if self.is_split_text:
+                    result_text = self.split_text(text_content)
+                else:
+                    result_text = [text_content]
+            elif elem.tag == "TAGS":
+                tags = self.read_tags(root)
+        return (result_text, tags)
+
+    def read_tags(self, root) -> Dict[str, str]:
+        """Reads the tags from an XML file and returns a dictionary of tags"""
+        tags_dict = {}
+        for tag in root.iter("TAGS"):
+            for subtag in tag:
+                met_value = subtag.attrib.get("met")
+                if self.is_convert_to_numbers:
+                    met_value = 1 if met_value == "met" else 0
+                tags_dict[subtag.tag] = met_value
+        return tags_dict
+
+    def split_text(self, text: str) -> List[str]:
+        split_char = "*" * 100
+        parts = [x.strip() for x in text.split(split_char) if x.strip() != ""]
+        return parts
+
+    def remove_excessive_newlines(self, text: str) -> str:
+        text = text.replace("\n\n\n", "\n")
+        return text
+
+
+class N2C2CTMatchingScenario(Scenario):
+    """
+    From "Cohort selection for clinical trials: n2c2 2018 shared task track 1" (Stubbs et al. 2019).
+    N2C2 is a collection of 288 patients (202 train / 86 test), each with 2-5 deidentified real-world clinical notes.
+    We use the prompt LLM formulation from Wornow et al. (2024).
+
+    Citation
+    ```
+    @article{stubbs2019cohort,
+      title={Cohort selection for clinical trials: n2c2 2018 shared task track 1},
+      author={Stubbs, Amber and Filannino, Michele and Soysal, Ergin and Henry, Samuel and Uzuner, {\"O}zlem},
+      journal={Journal of the American Medical Informatics Association},
+      volume={26},
+      number={11},
+      pages={1163--1171},
+      year={2019},
+      publisher={Oxford University Press}
+    }
+    @article{wornow2024zero,
+      title={Zero-shot clinical trial patient matching with llms},
+      author={Wornow, Michael and Lozano, Alejandro and Dash, Dev and Jindal, Jenelle and Mahaffey, \
+Kenneth W and Shah, Nigam H},
+      journal={NEJM AI},
+      pages={AIcs2400360},
+      year={2024},
+      publisher={Massachusetts Medical Society}
+    }
+    ```
+    """
+
+    name = "n2c2_ct_matching"
+    description = (
+        "N2C2-CT is a benchmark designed to evaluate a model's ability to match patients to "
+        "appropriate clinical trials based on eligibility criteria. Each example includes a clinical "
+        "note and a trial description. The model is tasked with determining whether the patient "
+        "is a valid candidate for the trial. This benchmark supports automation and decision "
+        "support in clinical research enrollment."
+    )
+    tags = []  # TODO
+
+    POSSIBLE_ANSWER_CHOICES: List[str] = [
+        "yes",
+        "no",
+    ]
+
+    def __init__(self, data_path: str, subject: str):
+        super().__init__()
+        self.subject: str = subject  # specific inclusion criterion to assess
+        self.data_path: str = data_path
+        self.path_to_train_dir: str = os.path.join(self.data_path, "train/")
+        self.path_to_test_dir: str = os.path.join(self.data_path, "test/")
+
+    def create_prompt(self, patient: Dict[str, Any]) -> str:
+        # Cast None values to empty strings during string formatting, but keep the original functions returning None
+        notes_list = [
+            f"## Note #{i+1}\nDate: {XMLDataLoader.get_date_of_note(patient, i) or ''}\n{note}"
+            for i, note in enumerate(patient["ehr"])
+        ]
+        notes: str = ("\n" + "*" * 50 + "\n\n").join(notes_list)
+        current_date = XMLDataLoader.get_current_date_for_patient(patient)
+        prompt = f"""
+# Task
+Your job is to decide whether the given patient meets the inclusion criterion for a clinical trial.
+
+# Inclusion Criterion
+The inclusion criterion being assessed is: "{self.subject}".
+The definition of the inclusion criterion is: "{LONG_DEFINITIONS[self.subject]}".
+
+# Patient Clinical Notes
+Below is a set of {len(patient['ehr'])} clinical notes describing the patient's current health status. \
+Each note is separated by a header with the date that the note was written, as well as a long list of asterisks.
+
+{'-' * 100}
+
+{notes}
+
+{'-' * 100}
+
+# Current Date
+Assume that the current date is: {current_date}
+
+# Question
+Does the patient meet the inclusion criterion "{self.subject}"?
+"""
+        return prompt
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        for split in ["train", "test"]:
+            # limit to zero shot setting
+            if split == "test":
+                path_to_data = self.path_to_train_dir if split == "train" else self.path_to_test_dir
+                ensure_directory_exists(path_to_data)
+
+                # Load dataset
+                dataloader = XMLDataLoader(path_to_data)
+                dataset = dataloader.load_data()
+
+                # Create instances
+                for patient in dataset:
+                    is_met: bool = patient["labels"][self.subject]
+                    correct_answer: str = "yes" if is_met else "no"
+
+                    # Build `References`. The possible answer choices are "yes" or "no"
+                    references: List[Reference] = [
+                        Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+                        for answer in N2C2CTMatchingScenario.POSSIBLE_ANSWER_CHOICES
+                    ]
+
+                    instances.append(
+                        Instance(
+                            input=Input(text=self.create_prompt(patient)),
+                            references=references,
+                            split=TRAIN_SPLIT if split == "train" else TEST_SPLIT,
+                        )
+                    )
+
+        return instances
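For context, here is a minimal usage sketch (not part of the diff) showing how the new scenario could be exercised directly. The `data_path` value is a hypothetical location for the n2c2 `train/` and `test/` XML folders, which are gated and must be obtained separately:

```python
# Hypothetical usage sketch; assumes crfm-helm 0.5.6 is installed and the
# n2c2 cohort-selection XML files have been downloaded to /data/n2c2.
from helm.benchmark.scenarios.n2c2_ct_matching_scenario import N2C2CTMatchingScenario

scenario = N2C2CTMatchingScenario(data_path="/data/n2c2", subject="ADVANCED-CAD")
instances = scenario.get_instances(output_path="output")
# Only test-split instances are produced (the loop skips "train" for zero-shot use),
# each with "yes"/"no" references and CORRECT_TAG on the labeled answer.
print(len(instances), instances[0].split)
```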
@@ -10,7 +10,7 @@ from helm.common.hierarchical_logger import htrack_block, hlog
 from typing import List, Dict
 
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists, asdict_without_nones
-from .scenario import (
+from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     Reference,
@@ -11,11 +11,21 @@ from typing import List, Optional, Tuple, Dict
 
 from helm.benchmark.adaptation.adapters.adapter_factory import ADAPT_GENERATION
 from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.common.local_context import LocalContext
 from helm.benchmark.window_services.tokenizer_service import TokenizerService
 from helm.common.authentication import Authentication
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.proxy.services.server_service import ServerService
-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
 
 try:
     import sympy
@@ -30,7 +40,7 @@ except ModuleNotFoundError as e:
 # https://github.com/stanford-crfm/benchmarking/issues/569
 def get_test_tokenizer_service() -> TokenizerService:
     # Pointed to the default local path set in run.py (--local-path)
-    return TokenizerService(
+    return TokenizerService(LocalContext(base_path="prod_env"))
 
 
 SOLUTION_TAG: str = "solution"
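Taken together, the two hunks above show a small refactor in this test helper: the tokenizer service is now built from the new `helm.common.local_context.LocalContext` (added in this release; see `helm/common/local_context.py +140` in the file list) rather than the previous `ServerService`-based wiring. A minimal sketch of the new pattern, assuming only what the hunks show (the full `TokenizerService`/`LocalContext` signatures beyond this are assumptions):

```python
# Sketch of the new test wiring implied by the hunks above.
from helm.common.local_context import LocalContext
from helm.benchmark.window_services.tokenizer_service import TokenizerService

def get_test_tokenizer_service() -> TokenizerService:
    # "prod_env" is the default local path that run.py sets via --local-path
    return TokenizerService(LocalContext(base_path="prod_env"))
```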
@@ -0,0 +1,57 @@
+from typing import List, Any
+from pathlib import Path
+from datasets import load_dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    CORRECT_TAG,
+    TEST_SPLIT,
+    Input,
+    Output,
+)
+
+
+class OABExamsScenario(Scenario):
+    """
+    The OAB Exam is a mandatory test for anyone who wants to practice law in Brazil. The exam consists of
+    an objective test with 80 multiple-choice questions covering all areas of law and a written phase focused
+    on a specific legal area (e.g., Civil, Criminal, Labor Law), where candidates must draft a legal document
+    and answer four essay questions.
+
+    This dataset comprises the exams that occurred between 2010 and 2018.
+
+    The dataset can be found at: https://huggingface.co/datasets/eduagarcia/oab_exams
+    """
+
+    name = "oab_exams"
+    description = "OAB exams dataset"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("eduagarcia/oab_exams", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question = example["question"]
+            choices = example["choices"]
+            answer = example["answerKey"]
+            # Skip every nullified (canceled) question
+            if example["nullified"]:
+                continue
+            answers_dict = dict(zip(choices["label"], choices["text"]))
+            correct_answer = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+
+            instance = Instance(
+                input=Input(text=question), split=TEST_SPLIT, references=list(map(answer_to_reference, choices["text"]))
+            )
+            instances.append(instance)
+        return instances
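
The nested `answer_to_reference` above is the standard HELM idiom for multiple-choice scenarios: every answer choice becomes a `Reference`, and only the one matching `answerKey` carries `CORRECT_TAG`. A minimal illustration with a hypothetical example row (field names taken from the hunk above; the values are invented):

# Hypothetical row mirroring the dataset fields used above; values are invented.
example = {
    "question": "Which body regulates the legal profession in Brazil?",
    "choices": {"label": ["A", "B", "C"], "text": ["OAB", "STF", "CNJ"]},
    "answerKey": "A",
    "nullified": False,
}

answers_dict = dict(zip(example["choices"]["label"], example["choices"]["text"]))
correct_answer = answers_dict[example["answerKey"]]

# Each choice text becomes one reference; only the correct one is tagged.
for text in example["choices"]["text"]:
    print(text, ["correct"] if text == correct_answer else [])
# OAB ['correct']
# STF []
# CNJ []
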
@@ -0,0 +1,53 @@
+import datasets
+import os
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    Input,
+    Output,
+    CORRECT_TAG,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class OmniMATHScenario(Scenario):
+    """Omni-MATH: A Universal Olympiad Level Mathematic Benchmark for Large Language Models
+
+    Omni-MATH is a comprehensive and challenging benchmark specifically designed to assess LLMs' mathematical
+    reasoning at the Olympiad level. The dataset focuses exclusively on Olympiad mathematics and comprises a \
+    vast collection of 4428 competition-level problems. These problems are meticulously categorized into 33 \
+    (and potentially more) sub-domains and span 10 distinct difficulty levels, enabling a nuanced \
+    analysis of model performance across various mathematical disciplines and levels of complexity."""
+
+    name = "omni_math"
+    description = "A Universal Olympiad Level Mathematic Benchmark for Large Language Models"
+    tags = ["math"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get Omni-MATH from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "KbsdJames/Omni-MATH",
+            revision="40ba231d8f16e29ecd40e6407e2c8640145a8f62",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for idx, row in enumerate(dataset):
+
+            input = Input(text=row["problem"])
+            instance = Instance(
+                input=input,
+                references=[Reference(Output(text=row["answer"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
@@ -2,7 +2,16 @@ from typing import List, Dict, Any, DefaultDict
 from datasets import load_dataset, Dataset
 from collections import defaultdict

-from .scenario import
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Reference,
+    Scenario,
+    Instance,
+    Input,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    Output,
+)


 class OpenAssistantScenario(Scenario):
@@ -110,7 +119,7 @@ class OpenAssistantScenario(Scenario):
             return instances

         # Download the raw data from Huggingface
-        dataset: Any = load_dataset("OpenAssistant/oasst1")
+        dataset: Any = load_dataset("OpenAssistant/oasst1", revision="fdf72ae0827c1cda404aff25b6603abec9e3399b")

         # Get the instances for each split
         train_instances = get_split_instances(dataset["train"], TRAIN_SPLIT)
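
Both the Omni-MATH scenario above and this OpenAssistant change pin `datasets.load_dataset` to an explicit `revision` (a commit SHA in the Hugging Face dataset repository), so the benchmark keeps producing the same instances even if the upstream dataset is edited later. A minimal sketch of the pattern; the repo id and SHA below are placeholders, not values from this diff:

import datasets

# Pinning `revision` to a commit SHA freezes the dataset snapshot,
# making instance generation reproducible across runs.
dataset = datasets.load_dataset(
    "example-org/example-dataset",  # placeholder repo id
    revision="0123456789abcdef0123456789abcdef01234567",  # placeholder SHA
    split="train",
)
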
@@ -0,0 +1,79 @@
+import json
+import os
+import re
+from typing import List, Optional
+
+import datasets
+import tiktoken
+
+from helm.benchmark.scenarios.scenario import (
+    CORRECT_TAG,
+    Output,
+    Reference,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    Input,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class OpenAIMRCRScenario(Scenario):
+    """OpenAI MRCR scenario
+
+    OpenAI MRCR (Multi-round co-reference resolution) is a long-context dataset for benchmarking
+    an LLM's ability to distinguish between multiple needles hidden in context. This eval is
+    inspired by the MRCR eval first introduced by Gemini (https://arxiv.org/pdf/2409.12640v2).
+
+    The task is as follows: the model is given a long, multi-turn, synthetically generated
+    conversation between user and model in which the user asks for a piece of writing about a topic,
+    e.g. "write a poem about tapirs" or "write a blog post about rocks". Hidden in this conversation
+    are 2, 4, or 8 identical asks, and the model is ultimately prompted to return the i-th instance
+    of one of those asks. For example, "Return the 2nd poem about tapirs".
+
+    Reference: https://huggingface.co/datasets/openai/mrcr"""
+
+    name = "openai_mrcr"
+    description = "OpenAI MRCR (Multi-round co-reference resolution) is a long-context dataset for benchmarking an LLM's ability to distinguish between multiple needles hidden in context. This eval is inspired by the MRCR eval first introduced by [Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2)."  # noqa: E501
+    tags = ["long_context", "mrcr"]
+
+    NEEDLES_OPTIONS = [2, 4, 8]
+
+    def __init__(self, needles: int, max_num_words: Optional[int] = None):
+        super().__init__()
+        self.needles = needles
+        self.max_num_words = max_num_words
+        if needles not in self.NEEDLES_OPTIONS:
+            raise Exception(f"Needles must be one of {self.NEEDLES_OPTIONS}")
+        self.tokenizer = tiktoken.get_encoding("o200k_base")
+
+    def count_words(self, messages: list[dict]) -> int:
+        return sum([len(re.split(r"\s+", m["content"].strip())) for m in messages])
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "openai/mrcr",
+            cache_dir=cache_dir,
+            split="train",
+            data_files=[f"{self.needles}needle.parquet"],
+            revision="204b0d4e8d9ca5c0a90bf942fdb2a5969094adc0",
+        )
+        instances = []
+        for idx, row in enumerate(dataset):
+            messages = json.loads(row["prompt"])
+            if self.max_num_words and self.count_words(messages) > self.max_num_words:
+                continue
+            input = Input(messages=messages)
+            references = [Reference(output=Output(text=row["answer"]), tags=[CORRECT_TAG])]
+            instance = Instance(
+                id=f"{self.needles}needle{idx}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+                extra_data={"random_string_to_prepend": row["random_string_to_prepend"]},
+            )
+            instances.append(instance)
+
+        return instances
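
As a usage sketch: the scenario is constructed with one of the supported needle counts and an optional word cap, then instances are materialized from the matching parquet shard. The module path below is assumed from this package's naming conventions, and running it for real requires network access to download the dataset:

# Minimal sketch; module path assumed, not confirmed by this diff.
from helm.benchmark.scenarios.openai_mrcr_scenario import OpenAIMRCRScenario

scenario = OpenAIMRCRScenario(needles=2, max_num_words=50_000)
instances = scenario.get_instances(output_path="./openai_mrcr")

# Each instance carries the full multi-turn conversation as chat messages and
# one gold answer: the i-th piece of writing the final user turn asks for.
print(len(instances), instances[0].id)  # e.g. "2needle0"
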