crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
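
Most of the scenario changes below follow a single pattern: each scenario module imports `TaxonomyInfo` and `ScenarioMetadata` and gains a `get_metadata()` method that returns descriptive metadata for the scenario. The real definitions of those two classes live in `helm/benchmark/presentation/taxonomy_info.py` (+20) and `helm/benchmark/scenarios/scenario.py` (+31) listed above; the sketch below is only an assumption inferred from the keyword arguments visible in the hunks, not the upstream code.

```python
# Hypothetical sketch of the two new classes, inferred from the keyword arguments
# used in the get_metadata() methods below; the actual definitions are in
# helm/benchmark/presentation/taxonomy_info.py and helm/benchmark/scenarios/scenario.py.
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class TaxonomyInfo:
    task: Optional[str] = None       # e.g. "summarization"
    what: Optional[str] = None       # e.g. "legal contracts"
    when: Optional[str] = None       # e.g. "before 2019"
    who: Optional[str] = None        # e.g. "lawyers"
    language: Optional[str] = None   # e.g. "English"


@dataclass(frozen=True)
class ScenarioMetadata:
    name: str                        # scenario identifier, e.g. "lsat_qa"
    display_name: str                # leaderboard display name
    description: str                 # markdown description, usually with a citation
    main_metric: str                 # e.g. "exact_match", "rouge_l"
    main_split: str                  # e.g. "test", "valid"
    taxonomy: Optional[TaxonomyInfo] = None
    short_display_name: Optional[str] = None
```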
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py

```diff
@@ -4,6 +4,7 @@ from typing import List
 
 from datasets import load_dataset, Features, Value, Sequence, Dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -88,3 +90,22 @@ class InfiniteBenchEnMCScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_mc",
+            display_name="∞Bench En.MC",
+            description="∞Bench En.MC is a multiple-choice question answering task that requires "
+            "locating and processing information within a novel, performing reasoning "
+            "through aggregation or filtering to derive answers. ([Zhang et al., "
+            "2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py

```diff
@@ -2,6 +2,7 @@ import os
 import re
 from typing import List
 from datasets import load_dataset, Features, Value, Sequence, Dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     CORRECT_TAG,
     TEST_SPLIT,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -77,3 +79,20 @@ class InfiniteBenchEnSumScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="infinite_bench_en_sum",
+            display_name="∞Bench En.Sum",
+            description="∞Bench En.Sum is a summarization task that requires generating a concise "
+            "summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))",
+            taxonomy=TaxonomyInfo(
+                task="multi-hop question answering",
+                what="Novels",
+                when="Before 2024",
+                who="Novel authors",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/koala_scenario.py

```diff
@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class KoalaScenario(Scenario):
@@ -39,3 +40,22 @@ class KoalaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="koala",
+            display_name="Koala test dataset",
+            short_display_name="Koala test dataset",
+            description="The test dataset from the [Koala "
+            "paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Web users",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/kpi_edgar_scenario.py

```diff
@@ -3,6 +3,7 @@ from typing import List, Dict
 import json
 import re
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -149,3 +151,22 @@ class KPIEDGARScenario(Scenario):
         with open(target_path, "r") as f:
             raw_dataset = json.load(f)
         return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="kpi_edgar",
+            display_name="KPI-EDGAR Financial Documents (Named Entity Recognition)",
+            short_display_name=None,
+            description="A named entity recognition beenchmark based on the paper KPI-EDGAR - A Novel "
+            "Dataset and Accompanying Metric for Relation Extraction from Financial "
+            "Documents [(Deußer et al., 2022)](https://arxiv.org/pdf/2210.09163.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="named entity recognition",
+                what="financial reports",
+                when="before 2022",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="adjusted_macro_f1_score",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/legal_contract_summarization_scenario.py

```diff
@@ -4,6 +4,7 @@ import json
 import re
 
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     CORRECT_TAG,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -127,3 +129,21 @@ class LegalContractSummarizationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_contract_summarization",
+            display_name="Legal Contract Summarization",
+            short_display_name=None,
+            description="Plain English Summarization of Contracts [(Manor et al., "
+            "2019)](https://aclanthology.org/W19-2201.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="summarization",
+                what="legal contracts (e.g. terms of service, license agreements)",
+                when="before 2019",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/legal_summarization_scenario.py

```diff
@@ -5,6 +5,7 @@ from typing import List, Optional, Any
 import datasets
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 _ALL_LANGUAGES = {
@@ -205,3 +207,51 @@ class LegalSummarizationScenario(Scenario):
             )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset_name == "BillSum":
+            return ScenarioMetadata(
+                name="billsum_legal_summarization",
+                display_name="BillSum",
+                description="The BillSum benchmark for legal text summarization ([Kornilova & Eidelmann, "
+                "2020](https://aclanthology.org/D19-5406/)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization", what="legal text from US bills", when=None, who="lawyers", language="English"
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "MultiLexSum":
+            return ScenarioMetadata(
+                name="multilexsum_legal_summarization",
+                display_name="MultiLexSum",
+                description="The MultiLexSum benchmark for legal text summarization ([Shen et al., "
+                "2022](https://arxiv.org/abs/2206.10883)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from US civil rights lawsuits",
+                    when=None,
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "EurLexSum":
+            return ScenarioMetadata(
+                name="eurlexsum_legal_summarization",
+                display_name="EurLexSum",
+                description="The EurLexSum benchmark for legal text summarization ([Aumiller et al., "
+                "2022](https://arxiv.org/abs/2210.13448)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from EU legislation",
+                    when="1960 - 2020",
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset_name}")
```
helm/benchmark/scenarios/legal_support_scenario.py

```diff
@@ -2,6 +2,7 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -102,3 +104,14 @@ class LegalSupportScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_support",
+            display_name="LegalSupport",
+            description="Scenario introduced in this work to measure fine-grained legal reasoning "
+            "through reverse entailment.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/legalbench_scenario.py

```diff
@@ -5,6 +5,7 @@ import datasets
 from pathlib import Path
 from typing import List, Dict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PROMPT_SETTINGS_URL = "https://raw.githubusercontent.com/HazyResearch/legalbench/main/helm_prompt_settings.jsonl"
@@ -144,3 +146,20 @@ class LegalBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="LegalBench",
+            description="LegalBench is a large collaboratively constructed benchmark of legal reasoning "
+            "tasks [(Guha et al, 2023)](https://arxiv.org/pdf/2308.11462.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="public legal and admininstrative documents, manually " "constructed questions",
+                when="before 2023",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/lex_glue_scenario.py

```diff
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 ECTHR_A = "ecthr_a"
@@ -261,3 +262,13 @@ class LexGLUEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lex_glue",
+            display_name="LexGLUE",
+            description="A Benchmark Dataset for Legal Language Understanding in English",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/lextreme_scenario.py

```diff
@@ -16,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Output,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -466,3 +467,13 @@ class LEXTREMEScenario(Scenario):
         for subset in self.subsets:
             instances.extend(self.get_instances_for_subset(subset, output_path))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lextreme",
+            display_name="LEXTREME",
+            description="A Multilingual Legal Benchmark for Natural Language Understanding",
+            taxonomy=None,
+            main_metric="classification_macro_f1",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/lsat_qa_scenario.py

```diff
@@ -2,6 +2,7 @@ import os
 import json
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -143,3 +145,15 @@ class LSATScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="lsat_qa",
+            display_name="LSAT",
+            description="The LSAT benchmark for measuring analytical reasoning on the Law School "
+            "Admission Test (LSAT; [Zhong et al., "
+            "2021](https://arxiv.org/pdf/2104.06598.pdf)).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/madinah_qa_scenario.py (new file)

```diff
@@ -0,0 +1,73 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MadinahQAScenario(Scenario):
+    """MadinahQA Scenario"""
+
+    name = "madinah_qa"
+    description = "Arabic language competency benchmark"
+    tags = ["language", "multiple_choice"]
+
+    OPTIONS = ["A", "B", "C", "D"]
+    HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}
+    SUBSETS = ["Arabic Language (General)", "Arabic Language (Grammar)"]
+
+    def __init__(self, subset: str):
+        super().__init__()
+        subset = subset.replace("_", " ")
+        if subset not in self.SUBSETS:
+            raise Exception(f"Unknown subset: {subset}")
+        self.subset = subset
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        instances: List[Instance] = []
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "MBZUAI/MadinahQA",
+            self.subset,
+            revision="62e7c86ac5c07245a5a952722691d77ddb41f695",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row_index, row in enumerate(dataset):
+                input = Input(text=row["Question"])
+                references: List[Reference] = []
+                correct_option_index = ord(row["Answer Key"]) - ord("A") + 1
+                for option_index in range(1, 6):
+                    column_name = f"Option {option_index}"
+                    if not row[column_name]:
+                        continue
+                    references.append(
+                        Reference(
+                            output=Output(text=row[column_name]),
+                            tags=[CORRECT_TAG] if option_index == correct_option_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=f"id{row_index}",
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
```
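
Since `madinah_qa_scenario.py` is a brand-new file, its constructor and `get_instances()` signature are fully visible in the hunk above. A minimal usage sketch (the output path is a placeholder; loading the data requires the Hugging Face `datasets` package and network access):

```python
# Minimal usage sketch for the new scenario above. "./scenario_output" is a
# placeholder path; the subset must be one of MadinahQAScenario.SUBSETS.
from helm.benchmark.scenarios.madinah_qa_scenario import MadinahQAScenario

scenario = MadinahQAScenario(subset="Arabic Language (General)")
instances = scenario.get_instances(output_path="./scenario_output")
print(len(instances), instances[0].split)  # number of instances and their split
```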
helm/benchmark/scenarios/math_scenario.py

```diff
@@ -4,6 +4,7 @@ import typing
 from typing import Dict, List, Optional
 from datasets import load_dataset, DatasetDict
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -450,3 +452,34 @@ class MATHScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        taxonomy = TaxonomyInfo(
+            task="numeric answer question answering",
+            what="math competitions (AMC, AIME, etc.)",
+            when="before 2021",
+            who="problem setters",
+            language="synthetic",
+        )
+        if self.use_chain_of_thought:
+            return ScenarioMetadata(
+                name="math_chain_of_thought",
+                display_name="MATH",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems with chain-of-thought style reasoning [(Hendrycks et al., "
+                "2021)](https://arxiv.org/pdf/2103.03874.pdf).",
+                taxonomy=taxonomy,
+                main_metric="math_equiv_chain_of_thought",
+                main_split="test",
+            )
+        else:
+            return ScenarioMetadata(
+                name="math_regular",
+                display_name="MATH",
+                description="The MATH benchmark for measuring mathematical problem solving on competition "
+                "math problems [(Hendrycks et al., "
+                "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/be83ab3ecd0db773eb2dc1b0a17836a1-Abstract-round2.html).",
+                taxonomy=taxonomy,
+                main_metric="math_equiv",
+                main_split="test",
+            )
```
helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py (new file)

```diff
@@ -0,0 +1,68 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class MBZUAIHumanTranslatedArabicMMLUScenario(Scenario):
+    """MBZUAI Human-Translated Arabic MMLU
+
+    A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark from this paper:
+
+    - https://arxiv.org/pdf/2009.03300.pdf
+    """  # noqa: E501
+
+    name = "mbzuai_human_translated_arabic_mmlu"
+    description = (
+        "A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark"
+    )
+    tags = ["knowledge", "multiple_choice"]
+
+    def __init__(self, subject: str):
+        super().__init__()
+        self.subject: str = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset = datasets.load_dataset(
+            "MBZUAI/human_translated_arabic_mmlu",
+            self.subject,
+            revision="5ed7830fd678cfa6f2d7f0a1a13a4e1a1fa422ac",
+            cache_dir=cache_dir,
+            split="test",
+        )
+        assert isinstance(dataset, datasets.Dataset)
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row_index, row in enumerate(dataset):
+            input = Input(text=row["question"])
+            references: List[Reference] = []
+            for choice_index, choice in enumerate(row["choices"]):
+                references.append(
+                    Reference(
+                        output=Output(text=choice),
+                        tags=[CORRECT_TAG] if choice_index == row["answer"] else [],
+                    )
+                )
+            instance = Instance(
+                id=f"id-{self.subject}-{row_index}",
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
```
helm/benchmark/scenarios/med_dialog_scenario.py

```diff
@@ -2,8 +2,18 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    ALL_SPLITS,
+    CORRECT_TAG,
+    Input,
+    Output,
+    ScenarioMetadata,
+)
 
 
 class MedDialogScenario(Scenario):
@@ -133,3 +143,24 @@ class MedDialogScenario(Scenario):
         )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="med_dialog",
+            display_name="MedDialog",
+            short_display_name="MedDialog",
+            description="MedDialog is a benchmark of real-world doctor-patient conversations focused on "
+            "health-related concerns and advice. Each dialogue is paired with a "
+            "one-sentence summary that reflects the core patient question or exchange. The "
+            "benchmark evaluates a model's ability to condense medical dialogue into "
+            "concise, informative summaries.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate summaries of doctor-patient conversations",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="med_dialog_accuracy",
+            main_split="test",
+        )
```
helm/benchmark/scenarios/med_mcqa_scenario.py

```diff
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -109,3 +111,15 @@ class MedMCQAScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="med_mcqa",
+            display_name="MedMCQA",
+            description='MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to '
+            "address real-world medical entrance exam questions ([Flores et al. "
+            "2020](https://arxiv.org/abs/2203.14371)).",
+            taxonomy=TaxonomyInfo(task="question answering", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )
```