crfm-helm 0.5.7__py3-none-any.whl → 0.5.9__py3-none-any.whl
This diff represents the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/METADATA +7 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/RECORD +315 -282
- helm/benchmark/adaptation/adapter_spec.py +10 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/bbq_metrics.py +12 -0
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +311 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/safety_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/ultra_suite_asr_classification_metrics.py +52 -0
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/run_display.py +13 -3
- helm/benchmark/presentation/run_entry.py +2 -2
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run.py +1 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +140 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +362 -52
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +6 -2
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/anthropic_red_team_scenario.py +12 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +24 -54
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +19 -48
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +22 -61
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +21 -29
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +21 -60
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/banking77_scenario.py +21 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bird_sql_scenario.py +18 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +32 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/czech_bank_qa_scenario.py +18 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/fin_qa_scenario.py +20 -0
- helm/benchmark/scenarios/financebench_scenario.py +21 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +21 -0
- helm/benchmark/scenarios/harm_bench_gcg_transfer_scenario.py +12 -1
- helm/benchmark/scenarios/harm_bench_scenario.py +12 -1
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +21 -0
- helm/benchmark/scenarios/infinite_bench_en_sum_scenario.py +19 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +19 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +33 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +20 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +24 -1
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +21 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +19 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/openai_mrcr_scenario.py +15 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/ruler_qa_scenarios.py +40 -0
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/simple_safety_tests_scenario.py +12 -1
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/spider_scenario.py +18 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/thai_exam_scenario.py +95 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +19 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +11 -30
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/audio-table-Dn5NMMeJ.png +0 -0
- helm/benchmark/static_build/assets/index-oIeiQW2g.css +1 -0
- helm/benchmark/static_build/assets/index-qOFpOyHb.js +10 -0
- helm/benchmark/static_build/assets/react-BteFIppM.js +85 -0
- helm/benchmark/static_build/assets/recharts-DxuQtTOs.js +97 -0
- helm/benchmark/static_build/assets/tremor-DR4fE7ko.js +10 -0
- helm/benchmark/static_build/index.html +5 -6
- helm/clients/ai21_client.py +2 -0
- helm/clients/aleph_alpha_client.py +2 -0
- helm/clients/anthropic_client.py +7 -1
- helm/clients/audio_language/diva_llama_client.py +2 -0
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/llama_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_5_omni_client.py +2 -1
- helm/clients/audio_language/qwen2_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_audiolm_client.py +2 -1
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/bedrock_client.py +2 -0
- helm/clients/cohere_client.py +3 -0
- helm/clients/google_client.py +2 -0
- helm/clients/http_model_client.py +2 -0
- helm/clients/huggingface_client.py +2 -1
- helm/clients/ibm_client.py +3 -1
- helm/clients/image_generation/adobe_vision_client.py +2 -0
- helm/clients/image_generation/aleph_alpha_image_generation_client.py +2 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/cogview2_client.py +2 -1
- helm/clients/image_generation/dalle2_client.py +2 -0
- helm/clients/image_generation/dalle_mini_client.py +2 -1
- helm/clients/image_generation/deep_floyd_client.py +2 -0
- helm/clients/image_generation/huggingface_diffusers_client.py +2 -1
- helm/clients/image_generation/lexica_client.py +2 -0
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/image_generation/mindalle_client.py +2 -1
- helm/clients/image_generation/together_image_generation_client.py +2 -0
- helm/clients/megatron_client.py +2 -0
- helm/clients/mistral_client.py +2 -0
- helm/clients/moderation_api_client.py +2 -0
- helm/clients/openai_client.py +36 -20
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/palmyra_client.py +2 -1
- helm/clients/reka_client.py +2 -1
- helm/clients/stanfordhealthcare_azure_openai_client.py +2 -2
- helm/clients/stanfordhealthcare_http_model_client.py +2 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +52 -11
- helm/clients/vertexai_client.py +12 -2
- helm/clients/vision_language/huggingface_vision2seq_client.py +2 -1
- helm/clients/vision_language/huggingface_vlm_client.py +2 -0
- helm/clients/vision_language/idefics_client.py +2 -1
- helm/clients/vision_language/open_flamingo_client.py +2 -1
- helm/clients/vision_language/paligemma_client.py +2 -1
- helm/clients/vision_language/palmyra_vision_client.py +2 -0
- helm/clients/vision_language/qwen2_vlm_client.py +2 -1
- helm/clients/vision_language/qwen_vlm_client.py +2 -1
- helm/clients/writer_client.py +2 -0
- helm/common/hierarchical_logger.py +20 -0
- helm/common/optional_dependencies.py +1 -1
- helm/common/test_general.py +4 -0
- helm/config/model_deployments.yaml +300 -1
- helm/config/model_metadata.yaml +302 -9
- helm/config/tokenizer_configs.yaml +92 -4
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- helm/benchmark/static_build/assets/react-f82877fd.js +0 -85
- helm/benchmark/static_build/assets/recharts-4037aff0.js +0 -97
- helm/benchmark/static_build/assets/tremor-38a10867.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.9.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{air-overview-d2e6c49f.png → air-overview-DpBbyagA.png} +0 -0
- /helm/benchmark/static_build/assets/{crfm-logo-74391ab8.png → crfm-logo-Du4T1uWZ.png} +0 -0
- /helm/benchmark/static_build/assets/{heim-logo-3e5e3aa4.png → heim-logo-BJtQlEbV.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-logo-simple-2ed5400b.png → helm-logo-simple-DzOhNN41.png} +0 -0
- /helm/benchmark/static_build/assets/{helm-safety-2907a7b6.png → helm-safety-COfndXuS.png} +0 -0
- /helm/benchmark/static_build/assets/{helmhero-28e90f4d.png → helmhero-D9TvmJsp.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-eac29843.png → medhelm-overview-CND0EIsy.png} +0 -0
- /helm/benchmark/static_build/assets/{medhelm-v1-overview-3ddfcd65.png → medhelm-v1-overview-Cu2tphBB.png} +0 -0
- /helm/benchmark/static_build/assets/{overview-74aea3d8.png → overview-BwypNWnk.png} +0 -0
- /helm/benchmark/static_build/assets/{process-flow-bd2eba96.png → process-flow-DWDJC733.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-aspects-1437d673.png → vhelm-aspects-NiDQofvP.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-framework-a1ca3f3f.png → vhelm-framework-NxJE4fdA.png} +0 -0
- /helm/benchmark/static_build/assets/{vhelm-model-8afb7616.png → vhelm-model-ypCL5Yvq.png} +0 -0

helm/benchmark/scenarios/thai_exam_scenario.py

@@ -2,6 +2,7 @@ import os
 from typing import Dict, List
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -13,6 +14,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -142,3 +144,96 @@ class ThaiExamScenario(Scenario):
             instances.extend(self.process_jsonl(jsonl_path, splits[split]))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.exam == "onet":
+            return ScenarioMetadata(
+                name="thai_exam_onet",
+                display_name="ONET",
+                description="The Ordinary National Educational Test (ONET) is an examination for students "
+                "in Thailand. We select the grade-12 ONET exam, which comprises 5 subjects and "
+                "each question has 5 choices. These subjects are Thai, English, Mathematics, "
+                "Social Studies, and Science. Amounting to a total of 170 questions and "
+                "options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "ic":
+            return ScenarioMetadata(
+                name="thai_exam_ic",
+                display_name="IC",
+                description="The Investment Consultant (IC) examination, a licensing test for investment "
+                "professionals in Thailand. Developed by the Stock Exchange of Thailand (SET), "
+                "features 4 choices per question. We extracted questions for levels 1, 2, and 3 "
+                "resulting in a total of 95 questions and options.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="licensing for investment professionals",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tgat":
+            return ScenarioMetadata(
+                name="thai_exam_tgat",
+                display_name="TGAT",
+                description="The Thai General Aptitude Test (TGAT), a national high school examination in "
+                "Thailand. Focuses on critical and logical thinking skills. We collected a "
+                "total of 90 questions and answers. The TGAT consists of four choices per "
+                "question.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school level questions on reasoning",
+                    when="?",
+                    who="n/a",
+                    language="English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "tpat1":
+            return ScenarioMetadata(
+                name="thai_exam_tpat1",
+                display_name="TPAT-1",
+                description="TBD",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school / medical school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        elif self.exam == "a_level":
+            return ScenarioMetadata(
+                name="thai_exam_a_level",
+                display_name="A-Level",
+                description="An academic knowledge assessment examination (Applied Knowledge Level) that "
+                "covers general foundational subjects taught in schools. The content assessed "
+                "in this examination aligns with the curriculum guidelines and emphasizes the "
+                "practical application of knowledge in daily life. We collected a total of 175 "
+                "questions and answers.\n",
+                taxonomy=TaxonomyInfo(
+                    task="question answering",
+                    what="high school academic knowledge",
+                    when="?",
+                    who="n/a",
+                    language="Thai and English",
+                ),
+                main_metric="exact_match",
+                main_split="test",
+            )
+        else:
+            raise ValueError(f"Unknown exam: {self.exam}")

helm/benchmark/scenarios/the_pile_scenario.py

@@ -5,9 +5,10 @@ import sys
 import requests
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 
 class ThePileScenario(Scenario):
@@ -146,3 +147,14 @@ class ThePileScenario(Scenario):
         instances = [instances[i] for i in indices]
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="the_pile",
+            display_name="The Pile",
+            description="The Pile corpus for measuring lanugage model performance across various "
+            "domains [(Gao et al., 2020)](https://arxiv.org/pdf/2101.00027.pdf).",
+            taxonomy=TaxonomyInfo(task="language modeling", what="?", when="?", who="?", language="English, code"),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/truthful_qa_scenario.py

@@ -2,6 +2,7 @@ import csv
 import os
 from typing import List, Dict, Any
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -154,3 +156,15 @@ class TruthfulQAScenario(Scenario):
         valid_instances: List[Instance] = get_split_instances(VALID_SPLIT, data[split_k:])
 
         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="truthful_qa",
+            display_name="TruthfulQA",
+            description="The TruthfulQA benchmarking for measuring model truthfulness and commonsense "
+            "knowledge in question answering [(Lin et al., "
+            "2022)](https://aclanthology.org/2022.acl-long.229/).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )

helm/benchmark/scenarios/twitter_aae_scenario.py

@@ -2,9 +2,10 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 CODALAB_URI_TEMPLATE: str = (
     "https://worksheets.codalab.org/rest/bundles/0x31485f8c37ad481fb9f4e9bf7ccff6e5/contents/blob/"
@@ -56,3 +57,21 @@ class TwitterAAEScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="twitter_aae",
+            display_name="TwitterAAE",
+            description="The TwitterAAE corpus of [Blodgett et al. "
+            "(2016)](https://aclanthology.org/D16-1120/) for measuring language model "
+            "performance in tweets as a function of speaker dialect.",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English (AAE-aligned and White-aligned)",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )

helm/benchmark/scenarios/vicuna_scenario.py

@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class VicunaScenario(Scenario):
@@ -47,3 +48,22 @@ class VicunaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="vicuna",
+            display_name="Vicuna",
+            short_display_name="Vicuna",
+            description="The set of prompts used by the "
+            "[Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Unknown",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/wikifact_scenario.py

@@ -2,6 +2,7 @@ import os
 from typing import List, Dict
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, flatten_list
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PID_TO_NAME = {
@@ -183,3 +185,21 @@ class WIKIFactScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wikifact",
+            display_name="WikiFact",
+            description="Scenario introduced in this work, inspired by [Petroni et al. "
+            "(2019)](https://aclanthology.org/D19-1250/), to more extensively test factual "
+            "knowledge.",
+            taxonomy=TaxonomyInfo(
+                task="knowledge base completion",
+                what="entity-relation-entity triples in natural language form",
+                when="?",
+                who="automatically generated from templates",
+                language="structured English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/wildbench_scenario.py

@@ -2,11 +2,13 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -81,3 +83,19 @@ class WildBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="WildBench",
+            description=self.description,
+            main_metric="wildbench_score_rescaled",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="GPT-judged instruction following with instructions collected from real-user conversations",
+                who="real-world users",
+                when="2024",
+                language="English",
+            ),
+        )

helm/benchmark/scenarios/wmt_14_scenario.py

@@ -1,5 +1,6 @@
 from typing import List, Any
 from datasets import load_dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import htrack_block
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -106,3 +108,20 @@ class WMT14Scenario(Scenario):
             )
         )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wmt_14",
+            display_name="WMT 2014",
+            description="WMT 2014 is a collection of machine translation datasets "
+            "[(website)](https://www.statmt.org/wmt14/index.html).",
+            taxonomy=TaxonomyInfo(
+                task="machine translation",
+                what="multilingual sentences",
+                when="before 2014",
+                who="Europarl, news, Common Crawl, etc.",
+                language="English, French, Czech, etc.",
+            ),
+            main_metric="bleu_4",
+            main_split="test",
+        )

helm/benchmark/static/schema_arabic.yaml

@@ -92,6 +92,12 @@ metrics:
     short_display_name: PEM
     description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
     lower_is_better: false
+  - name: alrage_score
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: ALRAGE Score
+    short_display_name: Score
+    description: Score of the output judged by GPT-4o.
+    lower_is_better: false
 
 ############################################################
 perturbations: []
@@ -134,17 +140,20 @@ run_groups:
   - name: arabic_scenarios
     display_name: Arabic Scenarios
     description: Arabic Scenarios
-    category:
+    category: Scenarios
     subgroups:
-      - mmmlu
-      - arabic_mmlu
       - alghafa
-      -
+      - arabic_mmlu
+      - arabic_exams
+      - madinah_qa
       - aratrust
+      - alrage
+      - mbzuai_human_translated_arabic_mmlu
 
-  - name:
-    display_name:
-
+  - name: mbzuai_human_translated_arabic_mmlu
+    display_name: MBZUAI Human-Translated Arabic MMLU
+    short_display_name: Translated MMLU
+    description: A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark.
     metric_groups:
       - accuracy
       - efficiency
@@ -160,8 +169,8 @@ run_groups:
       language: Arabic
 
   - name: arabic_mmlu
-    display_name:
-    description:
+    display_name: ArabicMMLU
+    description: ArabicMMLU
     metric_groups:
       - accuracy
       - efficiency
@@ -193,9 +202,9 @@ run_groups:
       when: "before 2023"
       language: Arabic
 
-  - name:
-    display_name: EXAMS
-    description: EXAMS
+  - name: arabic_exams
+    display_name: Arabic EXAMS
+    description: Arabic EXAMS
     metric_groups:
       - accuracy
       - efficiency
@@ -226,3 +235,37 @@ run_groups:
       who: "academic exams writers and takers"
       when: "before 2024"
       language: Arabic
+
+  - name: alrage
+    display_name: ALRAGE
+    description: ALRAGE
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: alrage_score
+      main_split: test
+    taxonomy:
+      task: "openbook (RAG) open-ended question answering"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Arabic
+
+  - name: madinah_qa
+    display_name: MadinahQA
+    description: Arabic language competency benchmark
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions about Arabic language"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic

helm/benchmark/static/schema_long_context.yaml

@@ -191,31 +191,12 @@ run_groups:
     description: Scenarios for evaluating long context capabilities
     category: All scenarios
     subgroups:
-      - ruler_hotpotqa
       - ruler_squad
-      -
-      - infinite_bench_en_qa
+      - ruler_hotpotqa
       - infinite_bench_en_mc
+      - infinite_bench_en_sum
       - openai_mrcr
 
-  - name: ruler_hotpotqa
-    display_name: RULER HotPotQA
-    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
-    metric_groups:
-      - accuracy
-      - general_information
-      - annotation_metrics
-    environment:
-      main_name: ruler_string_match_part
-      main_split: valid
-    taxonomy:
-      task: question answering with retrieval-augmented generation
-      what: Wikipedia articles
-      who: Wikipedia authors
-      when: Before 2018
-      language: English
-
-
   - name: ruler_squad
     display_name: RULER SQuAD
     description: RULER SQuAD is an augmented version of SQuAD ([Rajpurkar et al., 2018](https://arxiv.org/abs/1806.03822)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a single-hop question answering as a long-context scenario.
@@ -233,21 +214,21 @@ run_groups:
       when: Before 2018
       language: English
 
-  - name:
-    display_name:
-    description:
+  - name: ruler_hotpotqa
+    display_name: RULER HotPotQA
+    description: RULER HotPotQA is an augmented version of HotPotQA ([Yang et al., 2018](https://arxiv.org/abs/1809.09600)) introduced by [Hsieh et al., 2024](https://arxiv.org/abs/2404.06654) to simulate a multi-hop question answering as a long-context scenario.
     metric_groups:
       - accuracy
      - general_information
      - annotation_metrics
     environment:
-      main_name:
-      main_split:
+      main_name: ruler_string_match_part
+      main_split: valid
     taxonomy:
-      task: question answering
-      what:
-      who:
-      when: Before
+      task: question answering with retrieval-augmented generation
+      what: Wikipedia articles
+      who: Wikipedia authors
+      when: Before 2018
       language: English
 
   - name: infinite_bench_en_mc

helm/benchmark/static/schema_medhelm.yaml

@@ -484,6 +484,8 @@
       - ehrshot
       - head_qa
       - medbullets
+      - med_qa
+      - med_mcqa
       - medalign
       - shc_ptbm_med
       - shc_sei_med
@@ -657,6 +659,40 @@
       when: Any
       language: English
 
+  - name: med_qa
+    display_name: MedQA
+    description: MedQA is an open domain question answering dataset composed of questions from professional medical board exams ([Jin et al. 2020](https://arxiv.org/pdf/2009.13081.pdf)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
+  - name: med_mcqa
+    display_name: MedMCQA
+    description: MedMCQA is a "multiple-choice question answering (MCQA) dataset designed to address real-world medical entrance exam questions ([Flores et al. 2020](https://arxiv.org/abs/2203.14371)).
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: valid
+    taxonomy:
+      task: question answering
+      what: n/a
+      who: n/a
+      when: n/a
+      language: English
+
   - name: medalign
     display_name: MedAlign
     short_display_name: MedAlign