PyPI - crfm-helm - Versions diffs - 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl - Mend

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crfm-helm might be problematic. Click here for more details.

Files changed (652) hide show

crfm_helm-0.5.6.dist-info/METADATA +427 -0
crfm_helm-0.5.6.dist-info/RECORD +941 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
helm/benchmark/adaptation/adapter_spec.py +13 -1
helm/benchmark/adaptation/adapters/adapter_factory.py +15 -1
helm/benchmark/adaptation/adapters/binary_ranking_adapter.py +1 -1
helm/benchmark/adaptation/adapters/chat_adapter.py +49 -0
helm/benchmark/adaptation/adapters/ehr_instruction_adapter.py +108 -0
helm/benchmark/adaptation/adapters/generation_adapter.py +1 -1
helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +4 -4
helm/benchmark/adaptation/adapters/language_modeling_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/generation_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multimodal/test_in_context_learning_multimodal_adapter.py +4 -2
helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_calibrated_adapter.py +1 -1
helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/adapters/multiple_choice_joint_chain_of_thought_adapter.py +87 -0
helm/benchmark/adaptation/adapters/multiple_choice_separate_adapter.py +1 -1
helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
helm/benchmark/adaptation/adapters/test_generation_adapter.py +3 -3
helm/benchmark/adaptation/adapters/test_language_modeling_adapter.py +2 -2
helm/benchmark/adaptation/adapters/test_multiple_choice_joint_adapter.py +2 -2
helm/benchmark/adaptation/common_adapter_specs.py +69 -4
helm/benchmark/adaptation/prompt.py +1 -1
helm/benchmark/annotation/aci_bench_annotator.py +95 -0
helm/benchmark/annotation/air_bench_annotator.py +21 -6
helm/benchmark/annotation/annotator.py +5 -0
helm/benchmark/annotation/annotator_factory.py +3 -20
helm/benchmark/annotation/autobencher_capabilities_annotator.py +107 -0
helm/benchmark/annotation/autobencher_safety_annotator.py +98 -0
helm/benchmark/annotation/bigcodebench_annotator.py +108 -0
helm/benchmark/annotation/bird_sql_annotator.py +58 -0
helm/benchmark/annotation/chw_care_plan_annotator.py +93 -0
helm/benchmark/annotation/czech_bank_qa_annotator.py +78 -0
helm/benchmark/annotation/dischargeme_annotator.py +107 -0
helm/benchmark/annotation/ehr_sql_annotator.py +87 -0
helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +131 -0
helm/benchmark/annotation/image2struct/image_compiler_annotator.py +6 -1
helm/benchmark/annotation/live_qa_annotator.py +1 -1
helm/benchmark/annotation/med_dialog_annotator.py +99 -0
helm/benchmark/annotation/medalign_annotator.py +100 -0
helm/benchmark/annotation/medi_qa_annotator.py +98 -0
helm/benchmark/annotation/medication_qa_annotator.py +87 -63
helm/benchmark/annotation/mental_health_annotator.py +98 -0
helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
helm/benchmark/annotation/mimic_rrs_annotator.py +100 -0
helm/benchmark/annotation/model_as_judge.py +214 -6
helm/benchmark/annotation/mtsamples_procedures_annotator.py +98 -0
helm/benchmark/annotation/mtsamples_replicate_annotator.py +101 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_template.txt +152 -0
helm/benchmark/annotation/omni_math/gpt_evaluation_zero_shot_template.txt +36 -0
helm/benchmark/annotation/omni_math_annotator.py +131 -0
helm/benchmark/annotation/spider_annotator.py +18 -0
helm/benchmark/annotation/starr_patient_instructions_annotator.py +98 -0
helm/benchmark/annotation/wildbench/eval_template.pairwise.v2.md +75 -0
helm/benchmark/annotation/wildbench/eval_template.score.v2.md +66 -0
helm/benchmark/annotation/wildbench_annotator.py +119 -0
helm/benchmark/annotation_executor.py +35 -15
helm/benchmark/augmentations/cleva_perturbation.py +9 -8
helm/benchmark/augmentations/contraction_expansion_perturbation.py +2 -2
helm/benchmark/augmentations/contrast_sets_perturbation.py +2 -2
helm/benchmark/augmentations/dialect_perturbation.py +4 -5
helm/benchmark/augmentations/extra_space_perturbation.py +2 -2
helm/benchmark/augmentations/filler_words_perturbation.py +2 -2
helm/benchmark/augmentations/gender_perturbation.py +2 -2
helm/benchmark/augmentations/lowercase_perturbation.py +2 -2
helm/benchmark/augmentations/mild_mix_perturbation.py +6 -6
helm/benchmark/augmentations/misspelling_perturbation.py +2 -2
helm/benchmark/augmentations/person_name_perturbation.py +4 -5
helm/benchmark/augmentations/perturbation.py +1 -1
helm/benchmark/augmentations/space_perturbation.py +2 -2
helm/benchmark/augmentations/suffix_perturbation.py +2 -2
helm/benchmark/augmentations/synonym_perturbation.py +4 -3
helm/benchmark/augmentations/test_perturbation.py +16 -13
helm/benchmark/augmentations/translate_perturbation.py +2 -2
helm/benchmark/augmentations/typos_perturbation.py +2 -2
helm/benchmark/data_preprocessor.py +2 -2
helm/benchmark/executor.py +11 -12
helm/benchmark/huggingface_registration.py +2 -7
helm/benchmark/metrics/aci_bench_metrics.py +14 -0
helm/benchmark/metrics/basic_metrics.py +6 -6
helm/benchmark/metrics/bbq_metrics.py +2 -2
helm/benchmark/metrics/bias_metrics.py +12 -3
helm/benchmark/metrics/bias_word_lists.py +1 -1
helm/benchmark/metrics/bigcodebench_metrics.py +25 -0
helm/benchmark/metrics/bird_sql_metrics.py +28 -0
helm/benchmark/metrics/chw_care_plan_metrics.py +14 -0
helm/benchmark/metrics/classification_metrics.py +76 -12
helm/benchmark/metrics/cleva_harms_metrics.py +10 -9
helm/benchmark/metrics/code_metrics.py +5 -5
helm/benchmark/metrics/comet_metric.py +125 -0
helm/benchmark/metrics/common_metric_specs.py +9 -2
helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +72 -0
helm/benchmark/metrics/copyright_metrics.py +4 -4
helm/benchmark/metrics/czech_bank_qa_metrics.py +29 -0
helm/benchmark/metrics/decodingtrust_fairness_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_privacy_metrics.py +2 -2
helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +2 -2
helm/benchmark/metrics/dischargeme_metrics.py +14 -0
helm/benchmark/metrics/disinformation_metrics.py +4 -4
helm/benchmark/metrics/dry_run_metrics.py +5 -5
helm/benchmark/metrics/efficiency_metrics.py +6 -6
helm/benchmark/metrics/ehr_sql_metrics.py +103 -0
helm/benchmark/metrics/evaluate_instances_metric.py +3 -3
helm/benchmark/metrics/evaluate_reference_metrics.py +144 -16
helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +103 -0
helm/benchmark/metrics/gpt4_audio_critique_metrics.py +167 -0
helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +36 -0
helm/benchmark/metrics/ifeval/__init__.py +0 -0
helm/benchmark/metrics/ifeval/instructions.py +1574 -0
helm/benchmark/metrics/ifeval/instructions_registry.py +182 -0
helm/benchmark/metrics/ifeval/instructions_registry.pyi +3 -0
helm/benchmark/metrics/ifeval/instructions_util.py +153 -0
helm/benchmark/metrics/ifeval_metrics.py +55 -0
helm/benchmark/metrics/image_generation/aesthetics_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detection_metrics.py +1 -1
helm/benchmark/metrics/image_generation/detectors/vitdet.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension/test_fractal_dimension_util.py +1 -1
helm/benchmark/metrics/image_generation/fractal_dimension_metric.py +1 -1
helm/benchmark/metrics/image_generation/nsfw_metrics.py +1 -1
helm/benchmark/metrics/image_generation/q16/test_q16.py +3 -1
helm/benchmark/metrics/image_generation/q16_toxicity_metrics.py +1 -1
helm/benchmark/metrics/image_generation/skin_tone_metrics.py +2 -2
helm/benchmark/metrics/image_generation/watermark/test_watermark_detector.py +1 -1
helm/benchmark/metrics/image_generation/watermark_metrics.py +1 -1
helm/benchmark/metrics/instruction_following_critique_metrics.py +4 -4
helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
helm/benchmark/metrics/language_modeling_metrics.py +4 -4
helm/benchmark/metrics/llm_jury_metrics.py +46 -0
helm/benchmark/metrics/machine_translation_metrics.py +2 -2
helm/benchmark/metrics/med_dialog_metrics.py +14 -0
helm/benchmark/metrics/medalign_metrics.py +14 -0
helm/benchmark/metrics/medcalc_bench_metrics.py +124 -0
helm/benchmark/metrics/medec_metrics.py +101 -0
helm/benchmark/metrics/medi_qa_metrics.py +14 -0
helm/benchmark/metrics/medication_qa_metrics.py +10 -19
helm/benchmark/metrics/melt_bias_metric.py +234 -0
helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
helm/benchmark/metrics/melt_metric_specs.py +43 -0
helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
helm/benchmark/metrics/mental_health_metrics.py +14 -0
helm/benchmark/metrics/metric.py +3 -3
helm/benchmark/metrics/metric_service.py +11 -11
helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
helm/benchmark/metrics/mimic_rrs_metrics.py +14 -0
helm/benchmark/metrics/mimiciv_billing_code_metrics.py +96 -0
helm/benchmark/metrics/mtsamples_procedures_metrics.py +14 -0
helm/benchmark/metrics/mtsamples_replicate_metrics.py +14 -0
helm/benchmark/metrics/nltk_helper.py +32 -0
helm/benchmark/metrics/numeracy_metrics.py +4 -4
helm/benchmark/metrics/omni_math_metrics.py +32 -0
helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
helm/benchmark/metrics/output_processing_metric.py +60 -0
helm/benchmark/metrics/output_processors.py +15 -0
helm/benchmark/metrics/paraphrase_generation_metrics.py +2 -2
helm/benchmark/metrics/ranking_metrics.py +3 -3
helm/benchmark/metrics/reference_metric.py +3 -3
helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
helm/benchmark/metrics/{bhasa_metrics.py → seahelm_metrics.py} +3 -3
helm/benchmark/metrics/seahelm_metrics_specs.py +10 -0
helm/benchmark/metrics/spider_metrics.py +7 -0
helm/benchmark/metrics/starr_patient_instructions_metrics.py +14 -0
helm/benchmark/metrics/statistic.py +1 -1
helm/benchmark/metrics/summac/model_summac.py +2 -3
helm/benchmark/metrics/summarization_critique_metrics.py +4 -4
helm/benchmark/metrics/summarization_metrics.py +20 -9
helm/benchmark/metrics/test_bias_metrics.py +5 -1
helm/benchmark/metrics/test_classification_metrics.py +140 -68
helm/benchmark/metrics/test_evaluate_reference_metrics.py +15 -0
helm/benchmark/metrics/test_metric.py +1 -1
helm/benchmark/metrics/test_statistic.py +2 -2
helm/benchmark/metrics/tokens/ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/auto_token_cost_estimator.py +6 -6
helm/benchmark/metrics/tokens/cohere_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/free_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/gooseai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/openai_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_ai21_token_cost_estimator.py +1 -1
helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +3 -3
helm/benchmark/metrics/toxicity_metrics.py +6 -6
helm/benchmark/metrics/unitxt_metrics.py +7 -5
helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
helm/benchmark/metrics/vision_language/image_metrics.py +1 -1
helm/benchmark/metrics/vision_language/image_utils.py +2 -2
helm/benchmark/metrics/wildbench_metrics.py +34 -0
helm/benchmark/model_deployment_registry.py +6 -8
helm/benchmark/model_metadata_registry.py +16 -0
helm/benchmark/presentation/contamination.py +3 -3
helm/benchmark/presentation/create_plots.py +33 -12
helm/benchmark/presentation/run_display.py +13 -0
helm/benchmark/presentation/schema.py +2 -1
helm/benchmark/presentation/summarize.py +97 -67
helm/benchmark/presentation/torr_robustness_summarizer.py +178 -0
helm/benchmark/reeval_run.py +202 -0
helm/benchmark/reeval_runner.py +355 -0
helm/benchmark/run.py +86 -90
helm/benchmark/run_expander.py +90 -9
helm/benchmark/run_spec_factory.py +13 -0
helm/benchmark/run_specs/air_bench_run_specs.py +21 -3
helm/benchmark/run_specs/audio_run_specs.py +657 -0
helm/benchmark/run_specs/call_center_run_specs.py +49 -0
helm/benchmark/run_specs/capabilities_run_specs.py +308 -0
helm/benchmark/run_specs/classic_run_specs.py +1 -69
helm/benchmark/run_specs/enem_challenge_specs.py +31 -0
helm/benchmark/run_specs/enterprise_run_specs.py +280 -0
helm/benchmark/run_specs/experimental_run_specs.py +142 -3
helm/benchmark/run_specs/imdb_ptbr_run_specs.py +30 -0
helm/benchmark/run_specs/lite_run_specs.py +2 -2
helm/benchmark/run_specs/long_context_run_specs.py +141 -0
helm/benchmark/run_specs/medhelm_run_specs.py +1260 -0
helm/benchmark/run_specs/melt_run_specs.py +783 -0
helm/benchmark/run_specs/mmlu_clinical_afr_run_specs.py +49 -0
helm/benchmark/run_specs/oab_exams_specs.py +32 -0
helm/benchmark/run_specs/safety_run_specs.py +37 -0
helm/benchmark/run_specs/{bhasa_run_specs.py → seahelm_run_specs.py} +44 -44
helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
helm/benchmark/run_specs/sql_run_specs.py +54 -0
helm/benchmark/run_specs/tweetsentbr_run_specs.py +32 -0
helm/benchmark/run_specs/unitxt_run_specs.py +14 -5
helm/benchmark/run_specs/vlm_run_specs.py +103 -2
helm/benchmark/run_specs/winogrande_afr_run_specs.py +47 -0
helm/benchmark/runner.py +5 -5
helm/benchmark/scenarios/aci_bench_scenario.py +126 -0
helm/benchmark/scenarios/air_bench_scenario.py +6 -1
helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +5 -3
helm/benchmark/scenarios/anthropic_red_team_scenario.py +1 -1
helm/benchmark/scenarios/audio_language/__init__.py +0 -0
helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +130 -0
helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +154 -0
helm/benchmark/scenarios/audio_language/ami_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/audio_mnist_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audio_pairs_scenario.py +62 -0
helm/benchmark/scenarios/audio_language/audiocaps_scenario.py +59 -0
helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +152 -0
helm/benchmark/scenarios/audio_language/common_voice_15_scenario.py +99 -0
helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
helm/benchmark/scenarios/audio_language/covost2_scenario.py +163 -0
helm/benchmark/scenarios/audio_language/fleurs_fairness_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/fleurs_scenario.py +312 -0
helm/benchmark/scenarios/audio_language/iemocap_audio_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/librispeech_fairness_scenario.py +96 -0
helm/benchmark/scenarios/audio_language/librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/meld_audio_scenario.py +113 -0
helm/benchmark/scenarios/audio_language/multilingual_librispeech_scenario.py +80 -0
helm/benchmark/scenarios/audio_language/mustard_scenario.py +142 -0
helm/benchmark/scenarios/audio_language/mutox_scenario.py +254 -0
helm/benchmark/scenarios/audio_language/parade_scenario.py +97 -0
helm/benchmark/scenarios/audio_language/speech_robust_bench_scenario.py +124 -0
helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +83 -0
helm/benchmark/scenarios/audio_language/voice_jailbreak_attacks_scenario.py +87 -0
helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +105 -0
helm/benchmark/scenarios/autobencher_capabilities_scenario.py +68 -0
helm/benchmark/scenarios/autobencher_safety_scenario.py +51 -0
helm/benchmark/scenarios/babi_qa_scenario.py +1 -1
helm/benchmark/scenarios/banking77_scenario.py +6 -1
helm/benchmark/scenarios/bbq_scenario.py +1 -1
helm/benchmark/scenarios/big_bench_scenario.py +11 -1
helm/benchmark/scenarios/bigcodebench_scenario.py +58 -0
helm/benchmark/scenarios/bird_sql_scenario.py +94 -0
helm/benchmark/scenarios/bird_sql_scenario_helper.py +118 -0
helm/benchmark/scenarios/blimp_scenario.py +1 -1
helm/benchmark/scenarios/bold_scenario.py +1 -1
helm/benchmark/scenarios/boolq_scenario.py +1 -1
helm/benchmark/scenarios/casehold_scenario.py +79 -0
helm/benchmark/scenarios/chw_care_plan_scenario.py +106 -0
helm/benchmark/scenarios/civil_comments_scenario.py +1 -1
helm/benchmark/scenarios/clear_scenario.py +157 -0
helm/benchmark/scenarios/cleva_scenario.py +2 -2
helm/benchmark/scenarios/code_scenario.py +17 -4
helm/benchmark/scenarios/commonsense_scenario.py +1 -1
helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +97 -0
helm/benchmark/scenarios/copyright_scenario.py +1 -1
helm/benchmark/scenarios/covid_dialog_scenario.py +10 -1
helm/benchmark/scenarios/cti_to_mitre_scenario.py +240 -0
helm/benchmark/scenarios/custom_mcqa_scenario.py +1 -1
helm/benchmark/scenarios/czech_bank_qa_scenario.py +130 -0
helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +1 -1
helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/dialogue_scenarios.py +13 -2
helm/benchmark/scenarios/dischargeme_scenario.py +172 -0
helm/benchmark/scenarios/disinformation_scenario.py +10 -1
helm/benchmark/scenarios/dyck_language_scenario.py +10 -1
helm/benchmark/scenarios/echr_judgment_classification_scenario.py +113 -0
helm/benchmark/scenarios/ehr_sql_scenario.py +137 -0
helm/benchmark/scenarios/ehrshot_scenario.py +1519 -0
helm/benchmark/scenarios/enem_challenge_scenario.py +58 -0
helm/benchmark/scenarios/entity_data_imputation_scenario.py +11 -1
helm/benchmark/scenarios/entity_matching_scenario.py +12 -2
helm/benchmark/scenarios/financial_phrasebank_scenario.py +94 -0
helm/benchmark/scenarios/gold_commodity_news_scenario.py +124 -0
helm/benchmark/scenarios/gpqa_scenario.py +80 -0
helm/benchmark/scenarios/grammar.py +2 -2
helm/benchmark/scenarios/grammar_scenario.py +2 -2
helm/benchmark/scenarios/gsm_scenario.py +10 -1
helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +50 -0
helm/benchmark/scenarios/harm_bench_scenario.py +1 -1
helm/benchmark/scenarios/headqa_scenario.py +136 -0
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +37 -0
helm/benchmark/scenarios/ice_scenario.py +8 -4
helm/benchmark/scenarios/ifeval_scenario.py +53 -0
helm/benchmark/scenarios/imdb_ptbr_scenario.py +60 -0
helm/benchmark/scenarios/imdb_scenario.py +11 -2
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +79 -0
helm/benchmark/scenarios/interactive_qa_mmlu_scenario.py +2 -2
helm/benchmark/scenarios/koala_scenario.py +1 -1
helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
helm/benchmark/scenarios/legal_contract_summarization_scenario.py +129 -0
helm/benchmark/scenarios/legal_opinion_sentiment_classification_scenario.py +77 -0
helm/benchmark/scenarios/legal_summarization_scenario.py +11 -1
helm/benchmark/scenarios/legal_support_scenario.py +11 -1
helm/benchmark/scenarios/legalbench_scenario.py +22 -3
helm/benchmark/scenarios/lex_glue_scenario.py +12 -2
helm/benchmark/scenarios/lextreme_scenario.py +11 -1
helm/benchmark/scenarios/live_qa_scenario.py +1 -1
helm/benchmark/scenarios/lm_entry_scenario.py +1 -1
helm/benchmark/scenarios/lsat_qa_scenario.py +1 -1
helm/benchmark/scenarios/math_scenario.py +9 -1
helm/benchmark/scenarios/me_q_sum_scenario.py +10 -1
helm/benchmark/scenarios/med_dialog_scenario.py +25 -22
helm/benchmark/scenarios/med_mcqa_scenario.py +10 -1
helm/benchmark/scenarios/med_paragraph_simplification_scenario.py +10 -1
helm/benchmark/scenarios/med_qa_scenario.py +10 -1
helm/benchmark/scenarios/medalign_scenario.py +94 -0
helm/benchmark/scenarios/medalign_scenario_helper.py +432 -0
helm/benchmark/scenarios/medbullets_scenario.py +145 -0
helm/benchmark/scenarios/medcalc_bench_scenario.py +127 -0
helm/benchmark/scenarios/medec_scenario.py +125 -0
helm/benchmark/scenarios/medhallu_scenario.py +72 -0
helm/benchmark/scenarios/medi_qa_scenario.py +111 -0
helm/benchmark/scenarios/medication_qa_scenario.py +8 -2
helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
helm/benchmark/scenarios/melt_scenarios.py +793 -0
helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
helm/benchmark/scenarios/mental_health_scenario.py +123 -0
helm/benchmark/scenarios/mimic_bhc_scenario.py +103 -0
helm/benchmark/scenarios/mimic_rrs_scenario.py +98 -0
helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +77 -0
helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py +74 -0
helm/benchmark/scenarios/mmlu_pro_scenario.py +95 -0
helm/benchmark/scenarios/mmlu_scenario.py +11 -1
helm/benchmark/scenarios/msmarco_scenario.py +1 -1
helm/benchmark/scenarios/mtsamples_procedures_scenario.py +144 -0
helm/benchmark/scenarios/mtsamples_replicate_scenario.py +142 -0
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +277 -0
helm/benchmark/scenarios/narrativeqa_scenario.py +1 -1
helm/benchmark/scenarios/natural_qa_scenario.py +1 -1
helm/benchmark/scenarios/newsqa_scenario.py +1 -1
helm/benchmark/scenarios/numeracy_scenario.py +12 -2
helm/benchmark/scenarios/oab_exams_scenario.py +57 -0
helm/benchmark/scenarios/omni_math_scenario.py +53 -0
helm/benchmark/scenarios/open_assistant_scenario.py +11 -2
helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
helm/benchmark/scenarios/opinions_qa_scenario.py +1 -1
helm/benchmark/scenarios/pubmed_qa_scenario.py +59 -43
helm/benchmark/scenarios/quac_scenario.py +10 -1
helm/benchmark/scenarios/race_based_med_scenario.py +152 -0
helm/benchmark/scenarios/raft_scenario.py +17 -2
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +1 -1
helm/benchmark/scenarios/ruler_qa_scenario_helper.py +171 -0
helm/benchmark/scenarios/ruler_qa_scenarios.py +88 -0
helm/benchmark/scenarios/scenario.py +9 -1
helm/benchmark/scenarios/{bhasa_scenario.py → seahelm_scenario.py} +7 -2
helm/benchmark/scenarios/self_instruct_scenario.py +1 -1
helm/benchmark/scenarios/shc_bmt_scenario.py +75 -0
helm/benchmark/scenarios/shc_cdi_scenario.py +75 -0
helm/benchmark/scenarios/shc_conf_scenario.py +76 -0
helm/benchmark/scenarios/shc_ent_scenario.py +77 -0
helm/benchmark/scenarios/shc_gip_scenario.py +74 -0
helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
helm/benchmark/scenarios/shc_ptbm_scenario.py +81 -0
helm/benchmark/scenarios/shc_sei_scenario.py +94 -0
helm/benchmark/scenarios/shc_sequoia_scenario.py +77 -0
helm/benchmark/scenarios/simple_safety_tests_scenario.py +1 -1
helm/benchmark/scenarios/spider_scenario.py +91 -0
helm/benchmark/scenarios/starr_patient_instructions_scenario.py +97 -0
helm/benchmark/scenarios/summarization_scenario.py +11 -1
helm/benchmark/scenarios/sumosum_scenario.py +157 -0
helm/benchmark/scenarios/synthetic_efficiency_scenario.py +1 -1
helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +11 -1
helm/benchmark/scenarios/synthetic_reasoning_scenario.py +11 -1
helm/benchmark/scenarios/test_bigcodebench_scenario.py +26 -0
helm/benchmark/scenarios/test_czech_bank_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_enem_challenge_scenario.py +53 -0
helm/benchmark/scenarios/test_ewok_scenario.py +6 -2
helm/benchmark/scenarios/test_gold_commodity_news_scenario.py +18 -0
helm/benchmark/scenarios/test_gpqa_scenario.py +44 -0
helm/benchmark/scenarios/test_ifeval_scenario.py +36 -0
helm/benchmark/scenarios/test_imdb_ptbr_scenario.py +27 -0
helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
helm/benchmark/scenarios/test_math_scenario.py +1 -0
helm/benchmark/scenarios/test_mmlu_clinical_afr_scenario.py +21 -0
helm/benchmark/scenarios/test_mmlu_pro_scenario.py +53 -0
helm/benchmark/scenarios/test_oab_exams_scenario.py +51 -0
helm/benchmark/scenarios/test_omni_math_scenario.py +27 -0
helm/benchmark/scenarios/test_tweetsentbr_scenario.py +24 -0
helm/benchmark/scenarios/test_wildbench_scenario.py +15 -0
helm/benchmark/scenarios/test_winogrande_afr_scenario.py +19 -0
helm/benchmark/scenarios/thai_exam_scenario.py +10 -1
helm/benchmark/scenarios/the_pile_scenario.py +1 -1
helm/benchmark/scenarios/truthful_qa_scenario.py +12 -2
helm/benchmark/scenarios/tweetsentbr_scenario.py +66 -0
helm/benchmark/scenarios/twitter_aae_scenario.py +1 -1
helm/benchmark/scenarios/unitxt_scenario.py +8 -2
helm/benchmark/scenarios/verifiability_judgment_scenario.py +1 -1
helm/benchmark/scenarios/vicuna_scenario.py +1 -1
helm/benchmark/scenarios/vision_language/blink_scenario.py +140 -0
helm/benchmark/scenarios/vision_language/mm_star_scenario.py +95 -0
helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
helm/benchmark/scenarios/vision_language/vqa_rad_scenario.py +88 -0
helm/benchmark/scenarios/wikifact_scenario.py +11 -1
helm/benchmark/scenarios/wikitext_103_scenario.py +1 -1
helm/benchmark/scenarios/wildbench_scenario.py +83 -0
helm/benchmark/scenarios/winogrande_afr_scenario.py +78 -0
helm/benchmark/scenarios/wmt_14_scenario.py +14 -2
helm/benchmark/scenarios/xstest_scenario.py +1 -1
helm/benchmark/server.py +13 -1
helm/benchmark/slurm_runner.py +1 -1
helm/benchmark/static/schema_audio.yaml +763 -0
helm/benchmark/static/schema_autobencher.yaml +150 -0
helm/benchmark/static/schema_call_center.yaml +97 -60
helm/benchmark/static/{schema_medical.yaml → schema_capabilities.yaml} +100 -101
helm/benchmark/static/schema_czech_bank.yaml +148 -0
helm/benchmark/static/schema_enem_challenge.yaml +146 -0
helm/benchmark/static/schema_enterprise.yaml +319 -0
helm/benchmark/static/schema_finance.yaml +14 -12
helm/benchmark/static/schema_heim.yaml +1389 -0
helm/benchmark/static/schema_long_context.yaml +283 -0
helm/benchmark/static/schema_medhelm.yaml +1140 -0
helm/benchmark/static/schema_melt.yaml +1257 -0
helm/benchmark/static/schema_mmlu_winogrande_afr.yaml +1045 -0
helm/benchmark/static/schema_safety.yaml +18 -1
helm/benchmark/static/{schema_bhasa.yaml → schema_seahelm.yaml} +30 -16
helm/benchmark/static/schema_slphelm.yaml +162 -0
helm/benchmark/static/schema_social_audio.yaml +224 -0
helm/benchmark/static/schema_sql.yaml +171 -0
helm/benchmark/static/{schema_tables.yaml → schema_torr.yaml} +169 -36
helm/benchmark/static/schema_tweetsentbr.yaml +146 -0
helm/benchmark/static/schema_vhelm.yaml +129 -56
helm/benchmark/static/schema_video.yaml +219 -0
helm/benchmark/static_build/assets/helm-safety-2907a7b6.png +0 -0
helm/benchmark/static_build/assets/index-94295e78.js +10 -0
helm/benchmark/static_build/assets/index-b9779128.css +1 -0
helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
helm/benchmark/static_build/assets/medhelm-v1-overview-3ddfcd65.png +0 -0
helm/benchmark/static_build/assets/{react-d4a0b69b.js → react-f82877fd.js} +1 -1
helm/benchmark/static_build/assets/{recharts-6d337683.js → recharts-4037aff0.js} +1 -1
helm/benchmark/static_build/assets/{tremor-54a99cc4.js → tremor-38a10867.js} +2 -2
helm/benchmark/static_build/config.js +1 -1
helm/benchmark/static_build/index.html +6 -6
helm/benchmark/window_services/default_window_service.py +1 -1
helm/benchmark/window_services/encoder_decoder_window_service.py +4 -4
helm/benchmark/window_services/ice_window_service.py +1 -1
helm/benchmark/window_services/image_generation/lexica_search_window_service.py +1 -1
helm/benchmark/window_services/image_generation/openai_dalle_window_service.py +1 -1
helm/benchmark/window_services/local_window_service.py +2 -2
helm/benchmark/window_services/test_anthropic_window_service.py +3 -3
helm/benchmark/window_services/test_bloom_window_service.py +3 -3
helm/benchmark/window_services/test_gpt2_window_service.py +7 -2
helm/benchmark/window_services/test_gpt4_window_service.py +8 -3
helm/benchmark/window_services/test_gptj_window_service.py +8 -3
helm/benchmark/window_services/test_gptneox_window_service.py +3 -3
helm/benchmark/window_services/test_openai_window_service.py +8 -3
helm/benchmark/window_services/test_opt_window_service.py +3 -3
helm/benchmark/window_services/test_palmyra_window_service.py +3 -3
helm/benchmark/window_services/test_t0pp_window_service.py +3 -3
helm/benchmark/window_services/test_t511b_window_service.py +3 -3
helm/benchmark/window_services/test_ul2_window_service.py +3 -3
helm/benchmark/window_services/test_utils.py +4 -5
helm/benchmark/window_services/test_yalm_window_service.py +3 -3
helm/benchmark/window_services/tokenizer_service.py +7 -8
helm/benchmark/window_services/yalm_window_service.py +1 -1
helm/clients/ai21_client.py +3 -3
helm/clients/aleph_alpha_client.py +1 -1
helm/clients/anthropic_client.py +69 -29
helm/clients/audio_language/__init__.py +0 -0
helm/clients/audio_language/diva_llama_client.py +120 -0
helm/clients/audio_language/llama_omni_client.py +198 -0
helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
helm/clients/audio_language/qwen2_audiolm_client.py +190 -0
helm/clients/audio_language/qwen_audiolm_client.py +152 -0
helm/clients/audio_language/test.py +62 -0
helm/clients/auto_client.py +4 -2
helm/clients/azure_openai_client.py +55 -0
helm/clients/bedrock_client.py +203 -7
helm/clients/bedrock_utils.py +33 -0
helm/clients/client.py +7 -7
helm/clients/clip_scorers/clip_scorer.py +1 -1
helm/clients/clip_scorers/multilingual_clip_scorer.py +1 -1
helm/clients/cohere_client.py +3 -3
helm/clients/google_client.py +1 -1
helm/clients/grok_client.py +36 -0
helm/clients/http_model_client.py +1 -1
helm/clients/huggingface_client.py +52 -21
helm/clients/huggingface_pipeline_client.py +138 -0
helm/clients/ibm_client.py +267 -0
helm/clients/image_generation/adobe_vision_client.py +1 -1
helm/clients/image_generation/aleph_alpha_image_generation_client.py +1 -1
helm/clients/image_generation/cogview2/sr_pipeline/__init__.py +3 -3
helm/clients/image_generation/cogview2/sr_pipeline/direct_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/iterative_sr.py +5 -2
helm/clients/image_generation/cogview2/sr_pipeline/sr_group.py +2 -2
helm/clients/image_generation/cogview2_client.py +1 -1
helm/clients/image_generation/dalle2_client.py +1 -1
helm/clients/image_generation/dalle3_client.py +2 -2
helm/clients/image_generation/dalle_mini/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/data.py +1 -1
helm/clients/image_generation/dalle_mini/model/__init__.py +5 -5
helm/clients/image_generation/dalle_mini/model/configuration.py +2 -2
helm/clients/image_generation/dalle_mini/model/modeling.py +3 -3
helm/clients/image_generation/dalle_mini/model/processor.py +5 -5
helm/clients/image_generation/dalle_mini/model/tokenizer.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/__init__.py +1 -1
helm/clients/image_generation/dalle_mini/vqgan_jax/convert_pt_model_to_jax.py +2 -2
helm/clients/image_generation/dalle_mini/vqgan_jax/modeling_flax_vqgan.py +1 -1
helm/clients/image_generation/dalle_mini_client.py +1 -1
helm/clients/image_generation/deep_floyd_client.py +1 -1
helm/clients/image_generation/huggingface_diffusers_client.py +1 -1
helm/clients/image_generation/lexica_client.py +1 -1
helm/clients/image_generation/mindalle/models/__init__.py +6 -6
helm/clients/image_generation/mindalle/models/stage1/vqgan.py +1 -1
helm/clients/image_generation/mindalle/models/stage2/transformer.py +1 -1
helm/clients/image_generation/mindalle/utils/__init__.py +3 -3
helm/clients/image_generation/mindalle_client.py +1 -1
helm/clients/image_generation/together_image_generation_client.py +1 -1
helm/clients/lit_gpt_client.py +2 -2
helm/clients/mistral_client.py +62 -18
helm/clients/nvidia_nim_client.py +0 -3
helm/clients/openai_client.py +308 -43
helm/clients/openai_responses_client.py +174 -0
helm/clients/palmyra_client.py +3 -9
helm/clients/reka_client.py +3 -3
helm/clients/stanfordhealthcare_azure_openai_client.py +58 -0
helm/clients/stanfordhealthcare_claude_client.py +31 -0
helm/clients/stanfordhealthcare_google_client.py +43 -0
helm/clients/stanfordhealthcare_http_model_client.py +93 -0
helm/clients/stanfordhealthcare_openai_client.py +62 -0
helm/clients/stanfordhealthcare_shc_openai_client.py +42 -0
helm/clients/test_client.py +1 -1
helm/clients/test_together_client.py +6 -1
helm/clients/together_client.py +76 -9
helm/clients/upstage_client.py +23 -0
helm/clients/vertexai_client.py +45 -13
helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
helm/clients/vision_language/huggingface_vlm_client.py +2 -2
helm/clients/vision_language/idefics_client.py +6 -2
helm/clients/vision_language/open_flamingo/__init__.py +2 -2
helm/clients/vision_language/open_flamingo/src/factory.py +3 -3
helm/clients/vision_language/open_flamingo/src/flamingo.py +2 -2
helm/clients/vision_language/open_flamingo/src/flamingo_lm.py +2 -2
helm/clients/vision_language/paligemma_client.py +2 -2
helm/clients/vision_language/qwen2_vlm_client.py +188 -0
helm/clients/vision_language/qwen_vlm_client.py +7 -5
helm/clients/vllm_client.py +4 -6
helm/clients/writer_client.py +102 -0
helm/clients/yi_client.py +0 -3
helm/common/audio_utils.py +111 -0
helm/common/context.py +80 -0
helm/common/credentials_utils.py +5 -5
helm/common/file_caches/local_file_cache.py +1 -1
helm/common/file_caches/test_local_file_cache.py +1 -1
helm/common/general.py +9 -2
helm/common/hierarchical_logger.py +46 -3
helm/common/images_utils.py +2 -2
helm/common/local_context.py +140 -0
helm/common/media_object.py +2 -2
helm/common/multimodal_request_utils.py +26 -0
helm/common/reeval_parameters.py +12 -0
helm/common/remote_context.py +61 -0
helm/common/request.py +14 -2
helm/common/response_format.py +18 -0
helm/common/test_media_object.py +1 -1
helm/config/model_deployments.yaml +1792 -28
helm/config/model_metadata.yaml +1606 -51
helm/config/tokenizer_configs.yaml +521 -4
helm/proxy/cli.py +5 -3
helm/proxy/critique/mechanical_turk_utils.py +1 -1
helm/proxy/example_queries.py +1 -1
helm/proxy/server.py +11 -4
helm/proxy/services/remote_service.py +1 -1
helm/proxy/services/server_service.py +22 -86
helm/proxy/services/test_remote_service.py +2 -2
helm/proxy/services/test_service.py +1 -1
helm/proxy/static/general.js +122 -0
helm/proxy/static/help.html +99 -0
helm/proxy/static/index.css +57 -0
helm/proxy/static/index.html +40 -0
helm/proxy/static/index.js +456 -0
helm/proxy/static/info-icon.png +0 -0
helm/proxy/test_retry.py +1 -1
helm/proxy/token_counters/auto_token_counter.py +1 -1
helm/tokenizers/aleph_alpha_tokenizer.py +1 -1
helm/tokenizers/caching_tokenizer.py +2 -30
helm/tokenizers/grok_tokenizer.py +53 -0
helm/tokenizers/http_model_tokenizer.py +1 -1
helm/tokenizers/huggingface_tokenizer.py +3 -3
helm/tokenizers/lit_gpt_tokenizer.py +1 -1
helm/tokenizers/test_anthropic_tokenizer.py +6 -2
helm/tokenizers/test_grok_tokenizer.py +33 -0
helm/tokenizers/test_huggingface_tokenizer.py +1 -1
helm/tokenizers/test_yalm_tokenizer.py +1 -1
helm/tokenizers/tiktoken_tokenizer.py +1 -1
helm/tokenizers/tokenizer.py +3 -1
helm/tokenizers/yalm_tokenizer.py +3 -3
helm/tokenizers/yalm_tokenizer_data/test_yalm_tokenizer.py +1 -1
crfm_helm-0.5.4.dist-info/METADATA +0 -350
crfm_helm-0.5.4.dist-info/RECORD +0 -697
helm/benchmark/metrics/bhasa_metrics_specs.py +0 -10
helm/benchmark/static_build/assets/01-694cb9b7.png +0 -0
helm/benchmark/static_build/assets/accenture-6f97eeda.png +0 -0
helm/benchmark/static_build/assets/ai21-0eb91ec3.png +0 -0
helm/benchmark/static_build/assets/aisingapore-6dfc9acf.png +0 -0
helm/benchmark/static_build/assets/aleph-alpha-7ce10034.png +0 -0
helm/benchmark/static_build/assets/anthropic-70d8bc39.png +0 -0
helm/benchmark/static_build/assets/bigscience-7f0400c0.png +0 -0
helm/benchmark/static_build/assets/cohere-3550c6cb.png +0 -0
helm/benchmark/static_build/assets/cresta-9e22b983.png +0 -0
helm/benchmark/static_build/assets/cuhk-8c5631e9.png +0 -0
helm/benchmark/static_build/assets/eleutherai-b9451114.png +0 -0
helm/benchmark/static_build/assets/google-06d997ad.png +0 -0
helm/benchmark/static_build/assets/index-05c76bb1.css +0 -1
helm/benchmark/static_build/assets/index-3ee38b3d.js +0 -10
helm/benchmark/static_build/assets/meta-5580e9f1.png +0 -0
helm/benchmark/static_build/assets/microsoft-f5ee5016.png +0 -0
helm/benchmark/static_build/assets/mistral-18e1be23.png +0 -0
helm/benchmark/static_build/assets/nvidia-86fa75c1.png +0 -0
helm/benchmark/static_build/assets/openai-3f8653e4.png +0 -0
helm/benchmark/static_build/assets/scb10x-204bd786.png +0 -0
helm/benchmark/static_build/assets/tii-24de195c.png +0 -0
helm/benchmark/static_build/assets/together-a665a35b.png +0 -0
helm/benchmark/static_build/assets/tsinghua-keg-97d4b395.png +0 -0
helm/benchmark/static_build/assets/wellsfargo-a86a6c4a.png +0 -0
helm/benchmark/static_build/assets/yandex-38e09d70.png +0 -0
helm/tokenizers/anthropic_tokenizer.py +0 -52
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info/licenses}/LICENSE +0 -0
{crfm_helm-0.5.4.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/melt_translation_scenario.py ADDED Viewed

@@ -0,0 +1,152 @@
+from typing import Any, Dict, List, Optional
+from datasets import load_dataset, Dataset
+from helm.common.hierarchical_logger import htrack_block
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    TEST_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+class MELTTranslationScenario(Scenario):
+    name = "melt_translation"
+    description = "Machine Translation scenario."
+    tags = ["machine_translation"]
+    def __init__(
+        self,
+        dataset_name: str,
+        revision: str,
+        source_language: str,
+        target_language: str,
+        subset: Optional[str] = None,
+        splits: Optional[Dict[str, str]] = None,
+    ):
+        """Initializes the question answering scenario.
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            source_language: The source language to use.
+            target_language: The target language to use.
+            subset: The subset of the dataset to use. Defaults to "".
+            splits: The splits to use for the dataset. Defaults to None.
+        """
+        super().__init__()
+        self.MAX_TRAIN_INSTANCES = 20_000
+        valid_languages = set(["vi", "en"])
+        self.dataset_name = dataset_name
+        self.subset = subset
+        self.revision = revision
+        self.splits = splits
+        self.source_language = source_language
+        self.target_language = target_language
+        if self.source_language not in valid_languages or self.target_language not in valid_languages:
+            raise ValueError("Supported languages: vi, en.")
+        if self.source_language == self.target_language:
+            raise ValueError("The source language and the target language should be different.")
+        if self.source_language != "en" and self.target_language != "en":
+            raise ValueError("One of the languages should be English.")
+    def get_instances_for_splits(self, splits: Dict[str, str]) -> List[Instance]:
+        """
+        Helper for generating instances for a split.
+        Args:
+            splits (dict): Which splits to partition the data into.
+        Returns:
+            List[Instance]: Instances from the file for the specified split.
+        """
+        with htrack_block("Loading the HuggingFace dataset. The first time could take several minutes."):
+            hf_dataset: Any = load_dataset(
+                self.dataset_name,
+                self.subset,
+                revision=self.revision,
+                trust_remote_code=True,
+            )
+        instances: List[Instance] = []
+        for dataset_split_name, helm_split_name in splits.items():
+            if helm_split_name == TRAIN_SPLIT:
+                hf_dataset[dataset_split_name] = hf_dataset[dataset_split_name].shuffle(seed=42)[
+                    : self.MAX_TRAIN_INSTANCES
+                ]
+                hf_dataset[dataset_split_name] = Dataset.from_dict(hf_dataset[dataset_split_name])
+            for example in hf_dataset[dataset_split_name]:
+                source_sentence = example[self.source_language]
+                target_sentence = example[self.target_language]
+                instances.append(
+                    Instance(
+                        input=Input(text=source_sentence),
+                        references=[Reference(Output(text=target_sentence), tags=[CORRECT_TAG])],
+                        split=helm_split_name,
+                    )
+                )
+        return instances
+    def get_instances(self, output_path: str) -> List[Instance]:
+        if self.splits is None:
+            splits = {"train": TRAIN_SPLIT, "validation": VALID_SPLIT, "test": TEST_SPLIT}
+        else:
+            splits = {}
+            if "train" in self.splits:
+                splits[self.splits[TRAIN_SPLIT]] = TRAIN_SPLIT
+            if "validation" in self.splits:
+                splits[self.splits[VALID_SPLIT]] = VALID_SPLIT
+            if "test" in self.splits:
+                splits[self.splits[TEST_SPLIT]] = TEST_SPLIT
+        instances: List[Instance] = self.get_instances_for_splits(splits=splits)
+        return instances
+class MELTTranslationOPUS100Scenario(MELTTranslationScenario):
+    """
+    Scenario for the OPUS100 dataset.
+    """
+    name = "melt_translation_opus100"
+    description = "OPUS100 dataset for machine translation."
+    tags = ["machine_translation"]
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="vietgpt/opus100_envi",
+            revision="45df06fb0b31edc882d7c8d34389261f995e5208",
+            splits={
+                TRAIN_SPLIT: "train",
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+            **kwargs,
+        )
+class MELTTranslationPhoMTScenario(MELTTranslationScenario):
+    """
+    Scenario for the PhoMT dataset.
+    """
+    name = "melt_translation_phomt"
+    description = "PhoMT dataset for machine translation."
+    tags = ["machine_translation"]
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="ura-hcmut/PhoMT",
+            revision="74386685db01dc038860ff0a90d9f5fbde284bf7",
+            splits={
+                TRAIN_SPLIT: "train",
+                VALID_SPLIT: "validation",
+                TEST_SPLIT: "test",
+            },
+            **kwargs,
+        )

helm/benchmark/scenarios/mental_health_scenario.py ADDED Viewed

@@ -0,0 +1,123 @@
+import pandas as pd
+from typing import List
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    PassageQuestionInput,
+    Output,
+)
+from helm.common.general import check_file_exists
+class MentalHealthScenario(Scenario):
+    """
+    This scenario evaluates language models' ability to generate appropriate counseling responses
+    in mental health conversations. The dataset contains counseling dialogues covering
+    various topics including workplace issues, anxiety, suicidal thoughts, relationship
+    problems, and more.
+    Each dialogue consists of interactions between a counselor and a client, where the counselor
+    demonstrates expert mental health counseling techniques. The dialogues were selected based on high
+    quality scores from multiple evaluators.
+    Example dialogue structure:
+    ```
+    counselor: Hi there, to start can you tell me your name and a little bit about what's been going on?
+    client: I sleep too much... I'm 23, female and work as IT professional. I feel like I'm not fitting in...
+    counselor: I can see you have been facing challenges with feeling like you don't fit in...
+    ```
+    The task is to generate the next counselor response given the conversation history. Models
+    are evaluated on their ability to:
+    1. Provide empathetic and supportive responses
+    2. Follow proper mental health counseling protocols
+    3. Generate contextually appropriate interventions
+    The dataset includes:
+    - 7 complete dialogues covering different mental health topics
+    - Metadata about dialogue topic and type
+    - Gold-standard counselor responses as references
+    - Full conversation history for context
+    Each instance includes:
+    - input: Previous conversation turns formatted with speaker labels
+    - reference: The actual counselor's response (gold standard)
+    - metadata: Topic and type of mental health conversation
+    """
+    name = "mental_health"
+    description = (
+        "MentalHealth is a benchmark focused on evaluating empathetic communication in"
+        "mental health counseling. It includes simulated conversations between patients"
+        "and counselors, where the task is to generate compassionate and appropriate counselor"
+        "responses. The benchmark assesses a model's ability to support patients emotionally"
+        "and meaningfully engage in therapeutic conversations."
+    )
+    tags = ["dialogue", "counseling", "mental_health", "empathy", "healthcare"]
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+    def process_dialogue_data(self, data: pd.DataFrame) -> List[Instance]:
+        """
+        Process the dialogue data into evaluation instances.
+        Each instance represents a point in the conversation where the model needs to generate
+        a counselor response.
+        Args:
+            data (pd.DataFrame): DataFrame containing processed dialogues with columns:
+                - context: Previous conversation history
+                - gold_counselor_response: Reference counselor response
+                - topic: Type of conversation (e.g., "Anxiety", "Workplace")
+                - dialogue_type: Numerical type identifier
+        Returns:
+            List[Instance]: List of processed instances ready for evaluation
+        """
+        instances: List[Instance] = []
+        for _, row in data.iterrows():
+            # Format input with clear section breaks and instructions
+            input_text = (
+                f"Topic: {row['topic']}\n"
+                f"Type: {row['dialogue_type']}\n"
+                f"\nPrevious conversation:\n{row['context']}\n"
+                f"\nGenerate an empathetic and appropriate counselor response:"
+            )
+            # Create input with empty passage since all context is in question
+            prompt = PassageQuestionInput(passage="", question=input_text)
+            # Create instance with gold standard response
+            instance = Instance(
+                input=prompt,
+                references=[Reference(Output(text=row["gold_counselor_response"]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+        return instances
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """
+        Load and process the mental health dialogue dataset.
+        Args:
+            output_path (str): Path for any cached or intermediate files
+        Returns:
+            List[Instance]: List of processed instances for evaluation
+        """
+        check_file_exists(
+            self.data_path, msg=f"[MentalHealthScenario] Required data file not found: '{self.data_path}'"
+        )
+        dialogue_data = pd.read_csv(self.data_path)
+        # Process into instances
+        instances = self.process_dialogue_data(dialogue_data)
+        return instances

helm/benchmark/scenarios/mimic_bhc_scenario.py ADDED Viewed

@@ -0,0 +1,103 @@
+import json
+from typing import Dict, List
+from helm.common.general import check_file_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+class MIMICBHCScenario(Scenario):
+    """
+    MIMIC-IV-BHC presents a curated collection of preprocessed discharge notes with labeled brief hospital
+    course (BHC) summaries. This dataset is derived from MIMIC-IV (https://doi.org/10.1093/jamia/ocae312).
+    In total, the dataset contains 270,033 clinical notes.
+    The splits are provided by the dataset itself.
+    Sample Synthetic Prompt:
+        Summarize the clinical note into a brief hospital course.
+        Clinical Note:
+        <SEX> M <SERVICE> SURGERY <ALLERGIES> No Known Allergies \/ Adverse Drug Reactions
+        ...
+        continue to follow-up with your health care providers as an outpatient.
+        Brief Hospital Course:
+        Mr. ___ was pre-admitted on ___ for liver transplantation
+        ...
+        discharged home to continue home medications and follow-up as an outpatient.
+    @article{aali2024dataset,
+        title={A dataset and benchmark for hospital course summarization with adapted large language models},
+        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
+        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
+        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
+        journal={Journal of the American Medical Informatics Association},
+        volume={32},
+        number={3},
+        pages={470--479},
+        year={2024},
+        publisher={Oxford University Press}
+    }
+    @article{aali2024mimic,
+        title={MIMIC-IV-Ext-BHC: Labeled Clinical Notes Dataset for Hospital Course Summarization},
+        author={Aali, Asad and Van Veen, Dave and Arefeen, YI and Hom, Jason and Bluethgen, Christian
+        and Reis, Eduardo Pontes and Gatidis, Sergios and Clifford, Namuun and Daws, Joseph
+        and Tehrani, Arash and Kim, Jangwon and Chaudhari, Akshay},
+        journal={PhysioNet},
+        year={2024}
+    }
+    """
+    name = "mimic_bhc"
+    description = (
+        "MIMIC-BHC is a benchmark focused on summarization of discharge notes into Brief"
+        "Hospital Course (BHC) sections. It consists of curated discharge notes from MIMIC-IV,"
+        "each paired with its corresponding BHC summary. The benchmark evaluates a model's"
+        "ability to condense detailed clinical information into accurate, concise summaries that"
+        "reflect the patient's hospital stay."
+    )
+    tags = ["summarization", "biomedical"]
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(self.data_path, msg=f"[MIMICBHCScenario] Required data file not found: '{self.data_path}'")
+        instances: List[Instance] = []
+        # Limit to zero shot setting for now
+        splits: Dict[str, str] = {
+            # "train": TRAIN_SPLIT,
+            # "validate": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        with open(self.data_path, "r") as f:
+            data = [json.loads(line) for line in f]
+        for data_split, split in splits.items():
+            clinical_notes: List[str] = [x["input"] for x in data]
+            bhc_summaries: List[str] = [x["target"] for x in data]
+            assert len(clinical_notes) == len(bhc_summaries), "Notes and summaries must have the same length"
+            for clinical_note, bhc_summary in zip(clinical_notes, bhc_summaries):
+                if not clinical_note or not bhc_summary:
+                    continue
+                instances.append(
+                    Instance(
+                        input=Input(text=clinical_note),
+                        references=[Reference(Output(text=bhc_summary), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+        return instances

helm/benchmark/scenarios/mimic_rrs_scenario.py ADDED Viewed

@@ -0,0 +1,98 @@
+import os
+from typing import Dict, List
+from helm.common.general import check_file_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+class MIMICRRSScenario(Scenario):
+    """
+    MIMIC-RRS is a biomedical question answering (QA) dataset collected from MIMIC-III and MIMIC-CXR
+    radiology reports.
+    In this scenario, we only consider the radiology reports from MIMIC-III.
+    In total, the dataset contains 73,259 reports.
+    The splits are provided by the dataset itself.
+    Sample Synthetic Prompt:
+        Generate the impressions of a radiology report based on its findings.
+        Findings:
+        The heart is normal in size. The lungs are clear.
+        Impressions:
+    @inproceedings{Chen_2023,
+        title={Toward Expanding the Scope of Radiology Report Summarization to Multiple Anatomies and Modalities},
+        url={http://dx.doi.org/10.18653/v1/2023.acl-short.41},
+        DOI={10.18653/v1/2023.acl-short.41},
+        booktitle={Proceedings of the 61st Annual Meeting of the Association
+                   for Computational Linguistics (Volume 2: Short Papers)},
+        publisher={Association for Computational Linguistics},
+        author={Chen, Zhihong and Varma, Maya and Wan, Xiang and Langlotz, Curtis and Delbrouck, Jean-Benoit},
+        year={2023},
+        pages={469–484}
+    }
+    """
+    name = "mimic_rrs"
+    description = (
+        "MIMIC-RRS is a benchmark constructed from radiology reports in the MIMIC-III"
+        "database. It contains pairs of 'Findings' and 'Impression' sections, enabling evaluation"
+        "of a model's ability to summarize diagnostic imaging observations into concise, clinically"
+        "relevant conclusions."
+    )
+    tags = ["question_answering", "biomedical"]
+    def __init__(self, data_path: str):
+        super().__init__()
+        self.data_path = data_path
+    def get_instances(self, output_path: str) -> List[Instance]:
+        instances: List[Instance] = []
+        # Limit to zero shot setting for now
+        splits: Dict[str, str] = {
+            # "train": TRAIN_SPLIT,
+            # "validate": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for data_split, split in splits.items():
+            split_findings_name: str = f"{data_split}.findings.tok"
+            split_impressions_name: str = f"{data_split}.impression.tok"
+            findings_path: str = os.path.join(self.data_path, split_findings_name)
+            impressions_path: str = os.path.join(self.data_path, split_impressions_name)
+            check_file_exists(
+                findings_path, msg=f"[MIMICRRSScenario] Required findings file not found: '{findings_path}'"
+            )
+            check_file_exists(
+                impressions_path, msg=f"[MIMICRRSScenario] Required impressions file not found: '{impressions_path}'"
+            )
+            findings: List[str] = self.read_file(findings_path)
+            impressions: List[str] = self.read_file(impressions_path)
+            assert len(findings) == len(impressions), "Findings and impressions must have the same length"
+            for finding, impression in zip(findings, impressions):
+                if not finding or not impression:
+                    continue
+                instances.append(
+                    Instance(
+                        input=Input(text=finding),
+                        references=[Reference(Output(text=impression), tags=[CORRECT_TAG])],
+                        split=split,
+                    )
+                )
+        return instances
+    def read_file(self, file_path: str) -> List[str]:
+        with open(file_path, "r") as file:
+            lines = file.readlines()
+        lines = [line.strip() for line in lines]
+        return lines

helm/benchmark/scenarios/mimiciv_billing_code_scenario.py ADDED Viewed

@@ -0,0 +1,77 @@
+import pandas as pd
+import numpy as np
+from typing import List
+from helm.common.general import check_file_exists
+from helm.benchmark.scenarios.scenario import (
+    Input,
+    Scenario,
+    Instance,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Reference,
+    Output,
+)
+class MIMICIVBillingCodeScenario(Scenario):
+    """
+    A scenario for MIMIC-IV discharge summaries where the task is to predict the ICD-10 code(s).
+    - Input:  The clinical note (column "text").
+    - Output: The list of ICD-10 codes (column "target").
+    """
+    name = "mimiciv_billing_code"
+    description = (
+        "MIMIC-IV Billing Code is a benchmark derived from discharge summaries in the"
+        "MIMIC-IV database, paired with their corresponding ICD-10 billing codes. The task"
+        "requires models to extract structured billing codes based on free-text clinical notes,"
+        "reflecting real-world hospital coding tasks for financial reimbursement."
+    )
+    tags = ["question_answering", "biomedical"]
+    def __init__(self, data_path: str):
+        """
+        :param data_path: Path to the mimiciv_icd10.feather file.
+        """
+        super().__init__()
+        self.data_path = data_path
+    def get_instances(self, output_path: str) -> List[Instance]:
+        check_file_exists(
+            self.data_path, msg=f"[MIMICIVBilligCodeScenario] Required data file not found: '{self.data_path}'"
+        )
+        # Read the preprocessed MIMIC-IV data (.feather format)
+        df = pd.read_feather(self.data_path)  # columns: ["text", "target", ...]
+        instances: List[Instance] = []
+        # Use the entire dataset as one split (TEST_SPLIT)
+        for idx, row in df.iterrows():
+            try:
+                note_text: str = row["text"]
+                icd10_codes = row["target"]
+                # Convert numpy array to list if necessary
+                if isinstance(icd10_codes, np.ndarray):
+                    icd10_codes = icd10_codes.tolist()
+                elif not isinstance(icd10_codes, list):
+                    icd10_codes = [str(icd10_codes)]  # Handle single values
+                # Convert all codes to strings and join
+                codes_str = ",".join(str(code) for code in icd10_codes)
+                # Create one Instance per row
+                instance = Instance(
+                    input=Input(text=note_text),
+                    references=[Reference(Output(text=codes_str), tags=[CORRECT_TAG])],
+                    split=TEST_SPLIT,
+                )
+                instances.append(instance)
+            except Exception as e:
+                print(f"Error processing row {idx}: {str(e)}")
+                continue
+        return instances

helm/benchmark/scenarios/mmlu_clinical_afr_scenario.py ADDED Viewed

@@ -0,0 +1,74 @@
+import csv
+import os
+from typing import Dict, List
+from helm.common.general import ensure_file_downloaded
+from helm.common.hierarchical_logger import hlog
+from .scenario import Scenario, Instance, Reference, TRAIN_SPLIT, VALID_SPLIT, TEST_SPLIT, CORRECT_TAG, Input, Output
+class MMLU_Clinical_Afr_Scenario(Scenario):
+    """
+    https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages
+    """
+    name = "mmlu_clinical_afr"
+    description = "Massive Multitask Language Understanding (MMLU) translated into 11 African low-resource languages"
+    tags = ["knowledge", "multiple_choice", "low_resource_languages"]
+    def __init__(self, subject: str = "clinical_knowledge", lang: str = "af"):
+        super().__init__()
+        self.subject: str = subject
+        self.lang: str = lang
+    def download_mmlu_clinical_afr(self, path: str):
+        ensure_file_downloaded(
+            source_url="https://github.com/InstituteforDiseaseModeling/Bridging-the-Gap-Low-Resource-African-Languages/raw/refs/heads/main/data/evaluation_benchmarks_afr_release.zip",  # noqa: E501
+            target_path=path,
+            unpack=True,
+            unpack_type="unzip",
+        )
+    def process_csv(self, csv_path: str, split: str) -> List[Instance]:
+        instances: List[Instance] = []
+        hlog(f"Reading {csv_path}")
+        with open(csv_path) as f:
+            reader = csv.reader(f, delimiter=",")
+            for row in reader:
+                question, answers, correct_choice = row[0], row[1:-1], row[-1]
+                answers_dict = dict(zip(["A", "B", "C", "D"], answers))
+                correct_answer: str = answers_dict[correct_choice]
+                def answer_to_reference(answer: str) -> Reference:
+                    return Reference(Output(text=answer), tags=[CORRECT_TAG] if answer == correct_answer else [])
+                instance = Instance(
+                    input=Input(text=question),
+                    references=list(map(answer_to_reference, answers)),
+                    split=split,
+                )
+                instances.append(instance)
+        return instances
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data
+        desired_dir = "mmlu_cm_ck_vir"
+        data_path: str = os.path.join(output_path, desired_dir)
+        self.download_mmlu_clinical_afr(data_path)
+        # Read all the instances
+        instances: List[Instance] = []
+        splits: Dict[str, str] = {
+            "dev": TRAIN_SPLIT,
+            "val": VALID_SPLIT,
+            "test": TEST_SPLIT,
+        }
+        for split in splits:
+            csv_path: str = os.path.join(data_path, desired_dir, f"{self.subject}_{split}_{self.lang}.csv")
+            if not os.path.exists(csv_path):
+                hlog(f"{csv_path} doesn't exist, skipping")
+                continue
+            instances.extend(self.process_csv(csv_path, splits[split]))
+        return instances

crfm-helm 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl

Potentially problematic release.

crfm-helm 0.5.4py3-none-any.whl → 0.5.6py3-none-any.whl