crfm-helm 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +60 -125
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +293 -229
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +300 -1
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/melt_toxicity_metric.py +1 -1
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +3 -3
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/model_deployment_registry.py +11 -19
- helm/benchmark/presentation/create_plots.py +11 -2
- helm/benchmark/presentation/schema.py +10 -22
- helm/benchmark/presentation/summarize.py +189 -14
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/run.py +7 -1
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +191 -0
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +2 -55
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +48 -1
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +5 -11
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +82 -0
- helm/benchmark/scenarios/aratrust_scenario.py +95 -0
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/{ultra_suite_asr_classification.py → ultra_suite_asr_classification_scenario.py} +9 -8
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +13 -5
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +13 -5
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +70 -0
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +480 -1
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +1 -1
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +47 -20
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medalign_scenario_helper.py +19 -125
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/melt_scenarios.py +2 -2
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +25 -1
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +350 -2
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +271 -0
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_long_context.yaml +24 -6
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen2_5_omni_client.py +19 -7
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/huggingface_client.py +2 -2
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +33 -20
- helm/clients/openai_responses_client.py +34 -8
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -13
- helm/clients/vertexai_client.py +19 -11
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/common/critique_request.py +0 -1
- helm/common/hierarchical_logger.py +83 -34
- helm/common/object_spec.py +23 -8
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +525 -172
- helm/config/model_metadata.yaml +185 -10
- helm/config/tokenizer_configs.yaml +100 -2
- helm/proxy/cli.py +1 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/retry.py +5 -0
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/tokenizers/grok_tokenizer.py +2 -0
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -794
- helm/benchmark/static_build/assets/index-94295e78.js +0 -10
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.6.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
helm/benchmark/scenarios/test_healtha_br_scenario.py
ADDED

@@ -0,0 +1,57 @@
+import pytest
+from tempfile import TemporaryDirectory
+
+from helm.benchmark.scenarios.healthqa_br_scenario import HEALTHQA_BR_Scenario
+from helm.benchmark.scenarios.scenario import TEST_SPLIT, CORRECT_TAG, Output, Reference
+
+
+@pytest.mark.scenarios
+def test_healthqa_br_instance():
+    scenario = HEALTHQA_BR_Scenario()
+    with TemporaryDirectory() as tmpdir:
+        instances = scenario.get_instances(tmpdir)
+
+    instance = instances[35]
+
+    assert instance.split == TEST_SPLIT
+
+    assert instance.input.text.startswith("Homem de 22 anos de idade procura a Unidade Básica")
+
+    assert instance.references == [
+        Reference(
+            output=Output(
+                text="administração de relaxante muscular, colocando o paciente em posição de Trendelenburg, com "
+                "tentativa de redução do volume."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="encaminhamento do paciente ao Serviço de Urgência do Hospital com o pedido de avaliação "
+                "imediata do cirurgião."
+            ),
+            tags=[CORRECT_TAG],
+        ),
+        Reference(
+            output=Output(
+                text="tentativa de redução manual do aumento de volume da região inguinescrotal para a cavidade "
+                "abdominal."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(
+                text="transiluminação do escroto para tentar diferenciar hérnia inguinal de hidrocele comunicante."
+            ),
+            tags=[],
+        ),
+        Reference(
+            output=Output(text="prescrição de antiemético e solicitação de ecografia da região inguinescrotal."),
+            tags=[],
+        ),
+    ]
+
+    correct_refs = [ref for ref in instance.references if CORRECT_TAG in ref.tags]
+    assert len(correct_refs) == 1
+
+    assert instance.references[1].is_correct
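For readers unfamiliar with the HELM scenario API, the structures exercised by this new test are small: a scenario yields Instance objects, each reference wraps an Output, and the gold answer carries CORRECT_TAG. A minimal sketch using the same classes the test imports (the vignette text and options below are invented placeholders, not HealthQA-BR data):

```python
# Sketch of the Instance/Reference structure checked by the test above.
# The input text and option strings are placeholders, not actual HealthQA-BR content.
from helm.benchmark.scenarios.scenario import CORRECT_TAG, Input, Instance, Output, Reference, TEST_SPLIT

instance = Instance(
    input=Input(text="Example clinical vignette ..."),
    references=[
        Reference(output=Output(text="option A"), tags=[]),
        Reference(output=Output(text="option B"), tags=[CORRECT_TAG]),  # gold answer
    ],
    split=TEST_SPLIT,
)
assert instance.references[1].is_correct  # is_correct is True when CORRECT_TAG is present
```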
helm/benchmark/scenarios/the_pile_scenario.py
CHANGED

@@ -5,9 +5,10 @@ import sys
 import requests
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog, htrack, htrack_block
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 
 class ThePileScenario(Scenario):
@@ -146,3 +147,14 @@ class ThePileScenario(Scenario):
         instances = [instances[i] for i in indices]
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="the_pile",
+            display_name="The Pile",
+            description="The Pile corpus for measuring lanugage model performance across various "
+            "domains [(Gao et al., 2020)](https://arxiv.org/pdf/2101.00027.pdf).",
+            taxonomy=TaxonomyInfo(task="language modeling", what="?", when="?", who="?", language="English, code"),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )
helm/benchmark/scenarios/truthful_qa_scenario.py
CHANGED

@@ -2,6 +2,7 @@ import csv
 import os
 from typing import List, Dict, Any
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -154,3 +156,15 @@ class TruthfulQAScenario(Scenario):
         valid_instances: List[Instance] = get_split_instances(VALID_SPLIT, data[split_k:])
 
         return train_instances + valid_instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="truthful_qa",
+            display_name="TruthfulQA",
+            description="The TruthfulQA benchmarking for measuring model truthfulness and commonsense "
+            "knowledge in question answering [(Lin et al., "
+            "2022)](https://aclanthology.org/2022.acl-long.229/).",
+            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
+            main_metric="exact_match",
+            main_split="valid",
+        )
helm/benchmark/scenarios/twitter_aae_scenario.py
CHANGED

@@ -2,9 +2,10 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.common.hierarchical_logger import hlog
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 CODALAB_URI_TEMPLATE: str = (
     "https://worksheets.codalab.org/rest/bundles/0x31485f8c37ad481fb9f4e9bf7ccff6e5/contents/blob/"
@@ -56,3 +57,21 @@ class TwitterAAEScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="twitter_aae",
+            display_name="TwitterAAE",
+            description="The TwitterAAE corpus of [Blodgett et al. "
+            "(2016)](https://aclanthology.org/D16-1120/) for measuring language model "
+            "performance in tweets as a function of speaker dialect.",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English (AAE-aligned and White-aligned)",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )
helm/benchmark/scenarios/vicuna_scenario.py
CHANGED

@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class VicunaScenario(Scenario):
@@ -47,3 +48,22 @@ class VicunaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="vicuna",
+            display_name="Vicuna",
+            short_display_name="Vicuna",
+            description="The set of prompts used by the "
+            "[Vicuna](https://lmsys.org/blog/2023-03-30-vicuna/) team to evaluate "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Unknown",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
helm/benchmark/scenarios/wikifact_scenario.py
CHANGED

@@ -2,6 +2,7 @@ import os
 from typing import List, Dict
 import json
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded, flatten_list
 from helm.common.hierarchical_logger import hlog
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 PID_TO_NAME = {
@@ -183,3 +185,21 @@ class WIKIFactScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wikifact",
+            display_name="WikiFact",
+            description="Scenario introduced in this work, inspired by [Petroni et al. "
+            "(2019)](https://aclanthology.org/D19-1250/), to more extensively test factual "
+            "knowledge.",
+            taxonomy=TaxonomyInfo(
+                task="knowledge base completion",
+                what="entity-relation-entity triples in natural language form",
+                when="?",
+                who="automatically generated from templates",
+                language="structured English",
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/wildbench_scenario.py
CHANGED

@@ -2,11 +2,13 @@ import datasets
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -81,3 +83,19 @@ class WildBenchScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="WildBench",
+            description=self.description,
+            main_metric="wildbench_score_rescaled",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="GPT-judged instruction following with instructions collected from real-user conversations",
+                who="real-world users",
+                when="2024",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/wmt_14_scenario.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import List, Any
 from datasets import load_dataset
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import htrack_block
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -106,3 +108,13 @@ class WMT14Scenario(Scenario):
                 )
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="wmt_14",
+            display_name="WMT 2014",
+            description="WMT 2014 is a collection of machine translation datasets.",
+            taxonomy=TaxonomyInfo(task="machine translation", what="n/a", when="n/a", who="n/a", language="English"),
+            main_metric="bleu_4",
+            main_split="test",
+        )
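Every scenario diff above follows the same recipe: import TaxonomyInfo and ScenarioMetadata, then add a get_metadata method declaring the display name, taxonomy, main metric, and main split. A condensed sketch of that pattern applied to a hypothetical scenario (all field values here are illustrative, not taken from the release):

```python
# Illustrative sketch of the get_metadata pattern added across scenarios in this release.
# MyScenario and its field values are hypothetical placeholders.
from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata


class MyScenario(Scenario):
    name = "my_scenario"
    description = "A hypothetical scenario."
    tags = ["question_answering"]

    def get_metadata(self) -> ScenarioMetadata:
        # Declares how the scenario appears in HELM's presentation layer.
        return ScenarioMetadata(
            name="my_scenario",
            display_name="My Scenario",
            description=self.description,
            taxonomy=TaxonomyInfo(task="question answering", what="?", when="?", who="?", language="English"),
            main_metric="exact_match",
            main_split="test",
        )
```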
helm/benchmark/slurm_jobs.py
CHANGED
@@ -13,7 +13,6 @@ except ModuleNotFoundError as e:
 
 
 class SlurmJobState:
-    # TODO: Convert to StrEnum after upgrading to Python 3.11
     # Non-exhaustive list of Slurm job states.
     # See: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
 
@@ -81,7 +80,7 @@ def get_slurm_job_state(job_id: int) -> str:
     except subprocess.CalledProcessError as e:
         # Default CalledProcessError message doesn't have output, so re-raise here to include the output.
         raise Exception(f"{str(e)} output: {e.output}")
-    search_result = re.search("JobState=(\w+)", scontrol_output.decode())
+    search_result = re.search(r"JobState=(\w+)", scontrol_output.decode())
    if not search_result:
        raise Exception(f"Could not extract JobState from scontrol: {scontrol_output.decode()}")
    return search_result.group(1)
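The only functional change in slurm_jobs.py is the r-prefix on the regular expression: newer Python versions warn about the unrecognized escape sequence "\w" in a plain string literal, while a raw string matches identically without the warning. A small illustration (the scontrol output line is a fabricated sample, not captured from Slurm):

```python
# Illustration of the raw-string regex fix; the scontrol output is a fabricated sample.
import re

scontrol_output = b"JobId=123 JobName=helm-run JobState=RUNNING Reason=None"
match = re.search(r"JobState=(\w+)", scontrol_output.decode())
assert match is not None and match.group(1) == "RUNNING"
```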
helm/benchmark/slurm_runner.py
CHANGED
@@ -26,7 +26,7 @@ from helm.benchmark.slurm_jobs import (
     FAILURE_SLURM_JOB_STATES,
 )
 from helm.common.general import ensure_directory_exists
-from helm.common.hierarchical_logger import hlog, htrack_block
+from helm.common.hierarchical_logger import hlog, htrack_block, setup_default_logging
 
 from helm.benchmark.runner_config_registry import RUNNER_CONFIG
 
@@ -343,7 +343,14 @@ def main():
         help="Path to the RunSpec JSON file",
         required=True,
     )
+    parser.add_argument(
+        "--log-config",
+        type=str,
+        default=None,
+        help="PATH to a YAML file to customize logging",
+    )
     args = parser.parse_args()
+    setup_default_logging(args.log_config)
 
     # Deserialize SlurmRunner and RunSpec from the given files, then run the RunSpec with the SlurmRunner.
     with open(args.slurm_runner_spec_path, "r") as f:
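slurm_runner.py now routes the new --log-config argument into setup_default_logging before doing any work. A minimal sketch of making the same call from a standalone script (the YAML path is a placeholder; passing None keeps the default configuration, matching the argument's default above):

```python
# Sketch: configuring HELM's hierarchical logging the same way slurm_runner.main() now does.
# "my_logging.yaml" is a placeholder path; None falls back to the default setup.
from helm.common.hierarchical_logger import hlog, setup_default_logging

setup_default_logging(None)  # or setup_default_logging("my_logging.yaml")
hlog("logging configured")
```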
helm/benchmark/static/schema_arabic.yaml
ADDED

@@ -0,0 +1,271 @@
+---
+# Schema for Arabic scenarios
+############################################################
+metrics:
+  # Infrastructure metrics:
+  - name: num_perplexity_tokens
+    display_name: '# tokens'
+    description: Average number of tokens in the predicted output (for language modeling, the input too).
+  - name: num_bytes
+    display_name: '# bytes'
+    description: Average number of bytes in the predicted output (for language modeling, the input too).
+
+  - name: num_references
+    display_name: '# ref'
+    description: Number of references.
+  - name: num_train_trials
+    display_name: '# trials'
+    description: Number of trials, where in each trial we choose an independent, random set of training instances.
+  - name: estimated_num_tokens_cost
+    display_name: 'cost'
+    description: An estimate of the number of tokens (including prompt and output completions) needed to perform the request.
+  - name: num_prompt_tokens
+    display_name: '# prompt tokens'
+    description: Number of tokens in the prompt.
+  - name: num_prompt_characters
+    display_name: '# prompt chars'
+    description: Number of characters in the prompt.
+  - name: num_completion_tokens
+    display_name: '# completion tokens'
+    description: Actual number of completion tokens (over all completions).
+  - name: num_output_tokens
+    display_name: '# output tokens'
+    description: Actual number of output tokens.
+  - name: max_num_output_tokens
+    display_name: 'Max output tokens'
+    description: Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences).
+  - name: num_requests
+    display_name: '# requests'
+    description: Number of distinct API requests.
+  - name: num_instances
+    display_name: '# eval'
+    description: Number of evaluation instances.
+  - name: num_train_instances
+    display_name: '# train'
+    description: Number of training instances (e.g., in-context examples).
+  - name: prompt_truncated
+    display_name: truncated
+    description: Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples).
+  - name: finish_reason_length
+    display_name: finish b/c length
+    description: Fraction of instances where the the output was terminated because of the max tokens limit.
+  - name: finish_reason_stop
+    display_name: finish b/c stop
+    description: Fraction of instances where the the output was terminated because of the stop sequences.
+  - name: finish_reason_endoftext
+    display_name: finish b/c endoftext
+    description: Fraction of instances where the the output was terminated because the end of text token was generated.
+  - name: finish_reason_unknown
+    display_name: finish b/c unknown
+    description: Fraction of instances where the the output was terminated for unknown reasons.
+  - name: num_completions
+    display_name: '# completions'
+    description: Number of completions.
+  - name: predicted_index
+    display_name: Predicted index
+    description: Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice).
+  - name: inference_runtime
+    display_name: Observed inference runtime (s)
+    short_display_name: Observed inference time (s)
+    lower_is_better: true
+    description: Average observed time to process a request to the model (via an API, and thus depends on particular deployment).
+
+  # Accuracy metrics:
+  - name: exact_match
+    display_name: Exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_exact_match
+    display_name: Quasi-exact match
+    short_display_name: EM
+    description: Fraction of instances that the predicted output matches a correct reference up to light processing.
+    lower_is_better: false
+  - name: prefix_exact_match
+    display_name: Prefix exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference exactly.
+    lower_is_better: false
+  - name: quasi_prefix_exact_match
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: Prefix quasi-exact match
+    short_display_name: PEM
+    description: Fraction of instances that the predicted output matches the prefix of a correct reference up to light processing.
+    lower_is_better: false
+  - name: alrage_score
+    # TODO: should call this prefix_quasi_exact_match
+    display_name: ALRAGE Score
+    short_display_name: Score
+    description: Score of the output judged by GPT-4o.
+    lower_is_better: false
+
+############################################################
+perturbations: []
+
+############################################################
+metric_groups:
+  - name: accuracy
+    display_name: Accuracy
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: ${main_name}
+        split: ${main_split}
+
+  - name: efficiency
+    display_name: Efficiency
+    aggregation_strategies:
+      - mean
+    metrics:
+      - name: inference_runtime
+        split: ${main_split}
+
+  - name: general_information
+    display_name: General information
+    hide_win_rates: true
+    metrics:
+      - name: num_instances
+        split: ${main_split}
+      - name: num_train_instances
+        split: ${main_split}
+      - name: prompt_truncated
+        split: ${main_split}
+      - name: num_prompt_tokens
+        split: ${main_split}
+      - name: num_output_tokens
+        split: ${main_split}
+
+############################################################
+run_groups:
+  - name: arabic_scenarios
+    display_name: Arabic Scenarios
+    description: Arabic Scenarios
+    category: Scenarios
+    subgroups:
+      - alghafa
+      - arabic_mmlu
+      - arabic_exams
+      - madinah_qa
+      - aratrust
+      - alrage
+      - mbzuai_human_translated_arabic_mmlu
+
+  - name: mbzuai_human_translated_arabic_mmlu
+    display_name: MBZUAI Human-Translated Arabic MMLU
+    short_display_name: Translated MMLU
+    description: A translation from MBZUAI by human translators of the Massive Multitask Language Understanding benchmark.
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: multiple-choice question answering
+      what: math, science, history, etc.
+      who: various online sources
+      when: before 2021
+      language: Arabic
+
+  - name: arabic_mmlu
+    display_name: ArabicMMLU
+    description: ArabicMMLU
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions across various disciplines"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic
+
+  - name: alghafa
+    display_name: AlGhafa
+    description: AlGhafa
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "multiple choice question answering"
+      what: Various
+      who: Various
+      when: "before 2023"
+      language: Arabic
+
+  - name: arabic_exams
+    display_name: Arabic EXAMS
+    description: Arabic EXAMS
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "multiple choice question answering"
+      what: High school examinations
+      who: High school examinations writers and test-takers
+      when: before 2020
+      language: Arabic
+
+  - name: aratrust
+    display_name: AraTrust
+    description: AraTrust
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions across various disciplines"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic
+
+  - name: alrage
+    display_name: ALRAGE
+    description: ALRAGE
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: alrage_score
+      main_split: test
+    taxonomy:
+      task: "openbook (RAG) open-ended question answering"
+      what: "?"
+      who: "?"
+      when: "?"
+      language: Arabic
+
+  - name: madinah_qa
+    display_name: MadinahQA
+    description: Arabic language competency benchmark
+    metric_groups:
+      - accuracy
+      - efficiency
+      - general_information
+    environment:
+      main_name: exact_match
+      main_split: test
+    taxonomy:
+      task: "question answering"
+      what: "academic questions about Arabic language"
+      who: "academic exams writers and takers"
+      when: "before 2024"
+      language: Arabic
helm/benchmark/static/schema_classic.yaml
CHANGED

@@ -1683,23 +1683,6 @@ run_groups:
       when: n/a
       language: synthetic
 
-  - name: numeracy
-    display_name: Numerical reasoning
-    description: Scenario introduced in this work to test numerical reasoning via symbolic regression.
-    metric_groups:
-      - accuracy
-      - efficiency
-      - general_information
-    environment:
-      main_name: absolute_value_difference
-      main_split: test
-    taxonomy:
-      task: next-word prediction
-      what: Dyck formal language
-      who: n/a
-      when: n/a
-      language: synthetic
-
   - name: synthetic_reasoning
     display_name: Synthetic reasoning (abstract symbols)
     description: Synthetic reasoning tasks defined using abstract symbols based on LIME [(Wu et al., 2021)](https://proceedings.mlr.press/v139/wu21c.html).
helm/benchmark/static/schema_long_context.yaml
CHANGED

@@ -194,7 +194,8 @@
       - ruler_hotpotqa
       - ruler_squad
      - infinite_bench_en_sum
-      - infinite_bench_en_qa
+      # - infinite_bench_en_qa
+      - infinite_bench_en_mc
      - openai_mrcr
 
  - name: ruler_hotpotqa
@@ -232,18 +233,35 @@
       when: Before 2018
       language: English
 
-  - name: infinite_bench_en_qa
-    display_name: ∞Bench En.QA
-    description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+  # - name: infinite_bench_en_qa
+  #   display_name: ∞Bench En.QA
+  #   description: ∞Bench En.QA is a open-ended question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
+  #   metric_groups:
+  #     - accuracy
+  #     - general_information
+  #     - annotation_metrics
+  #   environment:
+  #     main_name: f1_score
+  #     main_split: test
+  #   taxonomy:
+  #     task: question answering
+  #     what: Novels
+  #     who: Novel authors
+  #     when: Before 2024
+  #     language: English
+
+  - name: infinite_bench_en_mc
+    display_name: ∞Bench En.MC
+    description: ∞Bench En.MC is a multiple-choice question answering task that requires locating and processing information within a novel, performing reasoning through aggregation or filtering to derive answers. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))
     metric_groups:
       - accuracy
       - general_information
       - annotation_metrics
     environment:
-      main_name: f1_score
+      main_name: exact_match
       main_split: test
     taxonomy:
-      task: question answering
+      task: multiple-choice question answering
       what: Novels
       who: Novel authors
       when: Before 2024