crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0
helm/benchmark/scenarios/mmlu_scenario.py

@@ -2,6 +2,7 @@ import csv
 import os
 from typing import Dict, List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -124,3 +126,22 @@ class MMLUScenario(Scenario):
             instances.extend(self.process_csv(csv_path, splits[split]))

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="mmlu",
+            display_name="MMLU (Massive Multitask Language Understanding)",
+            short_display_name="MMLU",
+            description="The Massive Multitask Language Understanding (MMLU) benchmark for "
+            "knowledge-intensive question answering across 57 domains [(Hendrycks et al., "
+            "2021)](https://arxiv.org/pdf/2009.03300.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="math, science, history, etc.",
+                when="before 2021",
+                who="various online sources",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/msmarco_scenario.py

@@ -4,6 +4,7 @@ import random
 from collections import defaultdict
 from typing import Dict, List, Tuple, Optional, Union

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     VALID_SPLIT,
     CORRECT_TAG,
+    ScenarioMetadata,
     make_rank_tag,
     make_relevance_tag,
     Input,
@@ -657,3 +659,31 @@ class MSMARCOScenario(Scenario):
         valid_instances = self.get_valid_instances()

         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.track == self.REGULAR_TRACK:
+            return ScenarioMetadata(
+                name="msmarco_regular",
+                display_name="MS MARCO (regular track)",
+                short_display_name="MS MARCO (regular)",
+                description="The MS MARCO benchmark's regular track for passage retrieval in information "
+                "retrieval "
+                "[(https://microsoft.github.io/msmarco/)](https://microsoft.github.io/msmarco/).",
+                taxonomy=TaxonomyInfo(task="information retrieval", what="?", when="?", who="?", language="English"),
+                main_metric="RR@10",
+                main_split="valid",
+            )
+        elif self.track == self.TREC_TRACK:
+            return ScenarioMetadata(
+                name="msmarco_trec",
+                display_name="MS MARCO (TREC track)",
+                short_display_name="MS MARCO (TREC)",
+                description="The MS MARCO benchmark's deep learning TREC track for passage retrieval in "
+                "information retrieval "
+                "[(https://trec.nist.gov)](https://microsoft.github.io/msmarco/).",
+                taxonomy=TaxonomyInfo(task="information retrieval", what="?", when="?", who="?", language="English"),
+                main_metric="NDCG@10",
+                main_split="valid",
+            )
+        else:
+            raise Exception(f"Unknown track {self.track}")
helm/benchmark/scenarios/mtsamples_procedures_scenario.py

@@ -1,6 +1,7 @@
 import os
 import requests
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -142,3 +144,23 @@ class MTSamplesProceduresScenario(Scenario):
                 print(f"Error processing {file_name}: {e}")

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mtsamples_procedures",
+            display_name="MTSamples Procedures",
+            description="MTSamples Procedures is a benchmark composed of transcribed operative notes, "
+            "focused on documenting surgical procedures. Each example presents a brief "
+            "patient case involving a surgical intervention, and the model is tasked with "
+            "generating a coherent and clinically accurate procedural summary or treatment "
+            "plan.",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Document and extract information about medical procedures",
+                when="Post-procedure",
+                who="Clinician, Researcher",
+                language="English",
+            ),
+            main_metric="mtsamples_procedures_accuracy",
+            main_split="test",
+        )
helm/benchmark/scenarios/mtsamples_replicate_scenario.py

@@ -1,6 +1,7 @@
 import os
 import requests
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -140,3 +142,23 @@ class MTSamplesReplicateScenario(Scenario):
                 print(f"Error processing {file_name}: {e}")

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="mtsamples_replicate",
+            display_name="MTSamples",
+            short_display_name="MTSamples",
+            description="MTSamples Replicate is a benchmark that provides transcribed medical reports "
+            "from various specialties. It is used to evaluate a model's ability to generate "
+            "clinically appropriate treatment plans based on unstructured patient "
+            "documentation [(MTSamples, 2025)](https://mtsamples.com).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Generate treatment plans based on clinical notes",
+                when="Post-diagnosis",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="mtsamples_replicate_accuracy",
+            main_split="test",
+        )
helm/benchmark/scenarios/n2c2_ct_matching_scenario.py

@@ -4,6 +4,7 @@ import re
 from typing import Any, Dict, List, Tuple, Optional
 import xml.etree.ElementTree as ET

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )

 ORIGINAL_DEFINITIONS = {
@@ -275,3 +277,21 @@ class N2C2CTMatchingScenario(Scenario):
             )

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="n2c2_ct_matching",
+            display_name="N2C2-CT Matching",
+            short_display_name="N2C2-CT",
+            description="A dataset that provides clinical notes and asks the model to classify whether "
+            "the patient is a valid candidate for a provided clinical trial.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Classify whether a patient is a valid candidate for a clinical " "trial based on clinical notes",
+                when="Pre-Trial",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/narrativeqa_scenario.py

@@ -3,6 +3,7 @@ import random
 import csv
 from typing import List, Dict

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     Input,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )


@@ -155,3 +157,20 @@ class NarrativeQAScenario(Scenario):
             instances.extend(self.get_split_instances(summaries_file=summaries_file, qaps_file=qaps_file, split=split))

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="narrative_qa",
+            display_name="NarrativeQA",
+            description="The NarrativeQA benchmark for reading comprehension over narratives [(Kočiský "
+            "et al., 2017)](https://aclanthology.org/Q18-1023/).",
+            taxonomy=TaxonomyInfo(
+                task="short-answer question answering",
+                what="passages are books and movie scripts, questions are unknown",
+                when="2018",
+                who="annotators from summaries",
+                language="English",
+            ),
+            main_metric="f1_score",
+            main_split="test",
+        )
helm/benchmark/scenarios/natural_qa_scenario.py

@@ -6,6 +6,7 @@ import os
 import re
 import html
 import random
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import htrack_block, hlog
 from typing import List, Dict

@@ -20,6 +21,7 @@ from helm.benchmark.scenarios.scenario import (
     PassageQuestionInput,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -324,3 +326,33 @@ class NaturalQAScenario(Scenario):
             instances.extend(self.get_file_instances(target_path))

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.context_mode == "closedbook":
+            name = "natural_qa_closedbook"
+            display_name = "NaturalQuestions (closed-book)"
+        elif self.context_mode == "openbook_longans":
+            name = "natural_qa_openbook_longans"
+            display_name = "NaturalQuestions (open-book)"
+        elif self.context_mode == "openbook_wiki":
+            name = "natural_qa_openbook_wiki"
+            display_name = "NaturalQuestions (open-book Wiki)"
+        else:
+            raise Exception(f"Unknown context_mode {self.context_mode}")
+        return ScenarioMetadata(
+            name=name,
+            display_name=display_name,
+            description="The NaturalQuestions [(Kwiatkowski et al., "
+            "2019)](https://aclanthology.org/Q19-1026/) benchmark for question answering "
+            "based on naturally-occurring queries through Google Search. The input does not "
+            "include the Wikipedia page with the answer.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="passages from Wikipedia, questions from search queries",
+                when="2010s",
+                who="web users",
+                language="English",
+            ),
+            main_metric="f1_score",
+            main_split="valid",
+        )
helm/benchmark/scenarios/omni_math_scenario.py

@@ -1,6 +1,7 @@
 import datasets
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     Input,
     Output,
     CORRECT_TAG,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -51,3 +53,19 @@ class OmniMATHScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="Omni-MATH",
+            description=self.description,
+            main_metric="omni_math_accuracy",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="mathematics",
+                what="universal Olympiad level mathematic benchmark",
+                who="human annotators",
+                when="2024",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/open_assistant_scenario.py

@@ -2,6 +2,7 @@ from typing import List, Dict, Any, DefaultDict
 from datasets import load_dataset, Dataset
 from collections import defaultdict

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     TRAIN_SPLIT,
     VALID_SPLIT,
     Output,
+    ScenarioMetadata,
 )


@@ -126,3 +128,23 @@ class OpenAssistantScenario(Scenario):
         valid_instances = get_split_instances(dataset["validation"], VALID_SPLIT)

         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="open_assistant",
+            display_name="Open Assistant",
+            short_display_name="Open Assistant",
+            description="LAION’s OpenAssistant Conversations Dataset (OASST1) that consists of 66,497 "
+            "conversation trees ([Köpf et al., "
+            "2023](https://openreview.net/forum?id=VSJotgbPHF)). We only use the initial "
+            "prompt in each conversation.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Human-written dialogues and response rankings",
+                when="2023",
+                who="Open Assistant participants",
+                language="35 languages",
+            ),
+            main_metric="Helpfulness",
+            main_split="valid",
+        )
helm/benchmark/scenarios/openai_mrcr_scenario.py

@@ -6,6 +6,7 @@ from typing import List, Optional
 import datasets
 import tiktoken

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Output,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -77,3 +79,16 @@ class OpenAIMRCRScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="openai_mrcr",
+            display_name="OpenAI MRCR",
+            description="OpenAI MRCR (Multi-round co-reference resolution) is a long context dataset "
+            "for benchmarking an LLM's ability to distinguish between multiple needles "
+            "hidden in context. This eval is inspired by the MRCR eval first introduced by "
+            "[Vodrahalli et al., 2024](https://arxiv.org/pdf/2409.12640v2).",
+            taxonomy=TaxonomyInfo(task="MRCR", what="Synthetic data", when="2025", who="None", language="English"),
+            main_metric="openai_mrcr_accuracy",
+            main_split="test",
+        )
helm/benchmark/scenarios/pubmed_qa_scenario.py

@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )


@@ -186,3 +188,23 @@ class PubMedQAScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="pubmed_qa",
+            display_name="PubMedQA",
+            description="PubMedQA is a biomedical question-answering dataset that evaluates a model's "
+            "ability to interpret scientific literature. It consists of PubMed abstracts "
+            "paired with yes/no/maybe questions derived from the content. The benchmark "
+            "assesses a model's capability to reason over biomedical texts and provide "
+            "factually grounded answers.",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Answer questions based on PubMed abstracts",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/quac_scenario.py

@@ -3,6 +3,7 @@ import os
 import random
 from typing import List, Tuple

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -192,3 +194,15 @@ class QuACScenario(Scenario):
             instances.extend(self.get_split_instances(split_path, split=split_tag))

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="quac",
+            display_name="QuAC (Question Answering in Context)",
+            short_display_name="QuAC",
+            description="The QuAC benchmark for question answering in the context of dialogues [(Choi "
+            "et al., 2018)](https://aclanthology.org/D18-1241/).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="f1_score",
+            main_split="valid",
+        )
helm/benchmark/scenarios/race_based_med_scenario.py

@@ -4,6 +4,7 @@ import os
 from typing import Dict, List
 from docx import Document

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Input,
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded

@@ -150,3 +152,24 @@ class RaceBasedMedScenario(Scenario):
             )

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="race_based_med",
+            display_name="RaceBias",
+            description="RaceBias is a benchmark used to evaluate language models for racially biased "
+            "or inappropriate content in medical question-answering scenarios. Each "
+            "instance consists of a medical question and a model-generated response. The "
+            "task is to classify whether the response contains race-based, harmful, or "
+            "inaccurate content. This benchmark supports research into bias detection and "
+            "fairness in clinical AI systems.",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Identify race-based bias in LLM-generated medical responses",
+                when="Any",
+                who="Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/raft_scenario.py

@@ -5,6 +5,7 @@ import datasets
 from pathlib import Path
 from typing import List, Dict

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )

 PROMPT_SETTINGS_URL = "https://www.dropbox.com/s/a5cyevryzw8rt4f/prompt_construction_settings.json?dl=0"
@@ -144,3 +146,16 @@ class RAFTScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="raft",
+            display_name="RAFT (Real-world Annotated Few-Shot)",
+            short_display_name="RAFT",
+            description="The Real-world annotated few-shot (RAFT) meta-benchmark of 11 real-world text "
+            "classification tasks [(Alex et al., "
+            "2021)](https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/ca46c1b9512a7a8315fa3c5a946e8265-Abstract-round2.html).",
+            taxonomy=TaxonomyInfo(task="text classification", what="?", when="?", who="?", language="English"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/real_toxicity_prompts_scenario.py

@@ -3,8 +3,9 @@ import os
 import random
 from typing import List, Dict, Optional

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata

 TOXIC_SUB_SPLIT: str = "toxic"
 NONTOXIC_SUB_SPLIT: str = "non-toxic"
@@ -57,3 +58,15 @@ class RealToxicityPromptsScenario(Scenario):
         random.shuffle(instances)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="real_toxicity_prompts",
+            display_name="RealToxicityPrompts",
+            description="The RealToxicityPrompts dataset for measuring toxicity in prompted model "
+            "generations [(Gehman et al., "
+            "2020)](https://aclanthology.org/2020.findings-emnlp.301/).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="unknown",
+            main_split="test",
+        )
helm/benchmark/scenarios/ruler_qa_scenarios.py

@@ -1,6 +1,7 @@
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.ruler_qa_scenario_helper import generate_samples  # type: ignore
 from helm.benchmark.scenarios.scenario import (
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )


@@ -78,6 +80,25 @@ class RULERHotpotQAScenario(_RULERQAScenario):
     def __init__(self, max_num_words: int):
         super().__init__("hotpotqa", max_num_words)

+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ruler_hotpotqa",
+            display_name="RULER HotPotQA",
+            description="RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., "
+            "2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., "
+            "2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question "
+            "answering as a long-context scenario.",
+            taxonomy=TaxonomyInfo(
+                task="question answering with retrieval-augmented generation",
+                what="Wikipedia articles",
+                when="Before 2018",
+                who="Wikipedia authors",
+                language="English",
+            ),
+            main_metric="ruler_string_match_part",
+            main_split="valid",
+        )
+

 class RULERSQuADScenario(_RULERQAScenario):
     name = "ruler_squad"
@@ -86,3 +107,22 @@ class RULERSQuADScenario(_RULERQAScenario):

     def __init__(self, max_num_words: int):
         super().__init__("squad", max_num_words)
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ruler_squad",
+            display_name="RULER SQuAD",
+            description="RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., "
+            "2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., "
+            "2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question "
+            "answering as a long-context scenario.",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="Wikipedia articles",
+                when="Before 2018",
+                who="Wikipedia authors and crowdworkers",
+                language="English",
+            ),
+            main_metric="ruler_string_match_part",
+            main_split="valid",
+        )
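
The common thread in the scenario hunks above is a new get_metadata() hook that returns a ScenarioMetadata (with a nested TaxonomyInfo) describing each benchmark's name, taxonomy, main metric, and main split. Below is a minimal usage sketch, not part of this diff, of how that hook might be exercised against 0.5.9; the subject="anatomy" constructor argument is an assumption for illustration only.

# Minimal sketch (assumes crfm-helm >= 0.5.9 is installed; not taken from the diff).
from helm.benchmark.scenarios.mmlu_scenario import MMLUScenario

scenario = MMLUScenario(subject="anatomy")  # assumption: MMLUScenario is constructed with a subject name
metadata = scenario.get_metadata()          # new in 0.5.9; returns a ScenarioMetadata
print(metadata.name)           # "mmlu"
print(metadata.main_metric)    # "exact_match"
print(metadata.main_split)     # "test"
print(metadata.taxonomy.task)  # "multiple-choice question answering"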