crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0

helm/benchmark/scenarios/aci_bench_scenario.py

@@ -1,6 +1,7 @@
 import json
 import os
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_file_downloaded

@@ -124,3 +126,24 @@ class ACIBenchScenario(Scenario):
         instances.extend(self.process_json(test_json, TEST_SPLIT))

         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="aci_bench",
+            display_name="ACI-Bench",
+            description="ACI-Bench is a benchmark of real-world patient-doctor conversations paired "
+            "with structured clinical notes. The benchmark evaluates a model's ability to "
+            "understand spoken medical dialogue and convert it into formal clinical "
+            "documentation, covering sections such as history of present illness, physical "
+            "exam findings, results, and assessment and plan [(Yim et al., "
+            "2024)](https://www.nature.com/articles/s41597-023-02487-3).",
+            taxonomy=TaxonomyInfo(
+                task="Text generation",
+                what="Extract and structure information from patient-doctor " "conversations",
+                when="Any",
+                who="Clinician",
+                language="English",
+            ),
+            main_metric="aci_bench_accuracy",
+            main_split="test",
+        )

helm/benchmark/scenarios/air_bench_scenario.py

@@ -2,6 +2,7 @@ import datasets
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -9,6 +10,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists

@@ -53,3 +55,22 @@ class AIRBench2024Scenario(Scenario):
             instance = Instance(input=input, references=references, split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="air_bench_2024",
+            display_name="AIRBench 2024",
+            description="AIRBench 2024 is a AI safety benchmark that aligns with emerging government "
+            "regulations and company policies. It consists of diverse, malicious prompts "
+            "spanning categories of the regulation-based safety categories in the AIR 2024 "
+            "safety taxonomy.\n",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction-following text generation",
+                what="malicious prompts",
+                when="2024",
+                who="dataset authors and language models",
+                language="English",
+            ),
+            main_metric="air_score",
+            main_split="test",
+        )

helm/benchmark/scenarios/alrage_scenario.py

@@ -0,0 +1,54 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class ALRAGEScenario(Scenario):
+    """ALRAGE"""  # noqa: E501
+
+    name = "alrage"
+    description = "ALRAGE"
+    tags = ["open-book question answering"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset: datasets.Dataset = datasets.load_dataset(
+            "OALL/ALRAGE",
+            revision="4827b2ed2436aea578e84d9bd4150b66ab8bbe0e",
+            split="train",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            input = Input(text=f"السؤال:\n{row['question']}\n\nالسياقات المقترحة:\n{row['candidates']}\n")
+            references: List[Reference] = []
+            references = [
+                Reference(
+                    output=Output(text=row["gold_answer"]),
+                    tags=[CORRECT_TAG],
+                )
+            ]
+            instance = Instance(
+                id=row["id"],
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances

helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py

@@ -2,7 +2,8 @@ import re
 from typing import List, Any, Dict
 from datasets import load_dataset

-from helm.benchmark.
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TRAIN_SPLIT, TEST_SPLIT, ScenarioMetadata


 class AnthropicHHRLHFScenario(Scenario):
@@ -88,3 +89,24 @@ class AnthropicHHRLHFScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="anthropic_hh_rlhf",
+            display_name="Anthropic RLHF dataset",
+            short_display_name="Anthropic RLHF dataset",
+            description="The dialogue datasets released by Anthropic to facilitate research in model "
+            "helpfulness and harmlessness ([Bai et al., "
+            "2022](https://arxiv.org/pdf/2204.05862.pdf); [Ganguli et al., "
+            "2022](https://arxiv.org/pdf/2209.07858.pdf)). We only use the first utterance "
+            "of each dialogue.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Human-LM dialogues and preference labels",
+                when="2022",
+                who="Workers from MTurk and Upwork, language models from Anthropic",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )

helm/benchmark/scenarios/arabic_exams_scenario.py

@@ -0,0 +1,114 @@
+import os
+from typing import List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.hierarchical_logger import hwarn
+
+
+class ArabicEXAMSScenario(Scenario):
+    """The Arabic subset of the EXAMS High School Examinations Dataset for Multilingual Question Answering
+
+    We use the Open Arabic LLM Leaderboard (OALL) version mirror of the Arabic subset of EXAMS, which is in-turn based
+    on the AceGPT version.
+
+    See: https://www.tii.ae/news/introducing-open-arabic-llm-leaderboard-empowering-arabic-language-modeling-community
+
+    References:
+
+    ```
+    @misc{huang2024acegptlocalizinglargelanguage,
+        title={AceGPT, Localizing Large Language Models in Arabic},
+        author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Juncai He and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu},
+        year={2024},
+        eprint={2309.12053},
+        archivePrefix={arXiv},
+        primaryClass={cs.CL},
+        url={https://arxiv.org/abs/2309.12053},
+    }```
+
+    ```
+    @inproceedings{hardalov-etal-2020-exams,
+        title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+        author = "Hardalov, Momchil and
+          Mihaylov, Todor and
+          Zlatkova, Dimitrina and
+          Dinkov, Yoan and
+          Koychev, Ivan and
+          Nakov, Preslav",
+        editor = "Webber, Bonnie and
+          Cohn, Trevor and
+          He, Yulan and
+          Liu, Yang",
+        booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.emnlp-main.438/",
+        doi = "10.18653/v1/2020.emnlp-main.438",
+        pages = "5427--5444",
+        abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+    }```
+    """  # noqa: E501
+
+    name = "arabic_exams"
+    description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. "  # noqa: E501
+    tags = ["knowledge", "multiple_choice"]
+
+    CHOICES = ["A", "B", "C", "D"]
+    # Remap validation split to train split
+    HF_SPLIT_TO_HELM_SPLIT = {"validation": TRAIN_SPLIT, "test": TEST_SPLIT}
+
+    def __init__(self, subject: str):
+        super().__init__()
+        self.subject: str = subject.replace("_", " ")
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits = datasets.load_dataset(
+            "OALL/Arabic_EXAMS",
+            revision="bc7a29346dbcaa16a8cd883b1f3e681ab2b7ff2a",
+            cache_dir=cache_dir,
+        )
+
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            for row in dataset:
+                subject = row["id"].split("-")[0]
+                if self.subject != "all" and self.subject != subject:
+                    continue
+                input = Input(text=row["question"])
+                references: List[Reference] = []
+                if row["answer"] not in self.CHOICES:
+                    hwarn(f"Invalid value in answer column in row: {row}")
+                    continue
+                correct_choice = row["answer"]
+                for choice in self.CHOICES:
+                    references.append(
+                        Reference(
+                            output=Output(text=row[choice]),
+                            tags=[CORRECT_TAG] if choice == correct_choice else [],
+                        )
+                    )
+                instance = Instance(
+                    id=row["id"],
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances

helm/benchmark/scenarios/arabic_mmlu_scenario.py

@@ -19,8 +19,6 @@ from helm.benchmark.scenarios.scenario import (
 class ArabicMMLUScenario(Scenario):
     """ArabicMMLU

-    EXPERIMENTAL: This scenario may have future reverse incompatible changes.
-
     ArabicMMLU is the first multi-task language understanding benchmark
     for Arabic language, sourced from school exams across diverse educational
     levels in different countries spanning North Africa, the Levant, and the
@@ -39,12 +37,16 @@
     OPTIONS = ["A", "B", "C", "D"]
     HF_SPLIT_TO_HELM_SPLIT = {"dev": TRAIN_SPLIT, "test": TEST_SPLIT}

+    def __init__(self, subset: str):
+        super().__init__()
+        self.subset = subset.replace("_", " ")
+
     def get_instances(self, output_path: str) -> List[Instance]:
         cache_dir = os.path.join(output_path, "data")
         ensure_directory_exists(cache_dir)
         dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
             "MBZUAI/ArabicMMLU",
-
+            self.subset,
             revision="7aa530e2893ac420352b3f5c1a1310c010e9758b",
             cache_dir=cache_dir,
         )
@@ -63,7 +65,9 @@
                     continue
                 references.append(
                     Reference(
-
+                        # Need to convert column to string because the references are floats
+                        # for the subject "Math (Primary School)"
+                        output=Output(text=str(row[column_name])),
                         tags=[CORRECT_TAG] if option_index == correct_option_index else [],
                     )
                 )

helm/benchmark/scenarios/aratrust_scenario.py

@@ -47,8 +47,25 @@ class AraTrustScenario(Scenario):
     description = "aratrust"
     tags = ["trustworthiness"]

+    CATEGORIES = [
+        "Ethics",
+        "Illegal",
+        "Mental Health",
+        "Offensive",
+        "Physical Health",
+        "Privacy",
+        "Trustfulness",
+        "Unfairness",
+    ]
     OPTION_KEYS = ["A", "B", "C"]

+    def __init__(self, category: str):
+        super().__init__()
+        category = category.replace("_", " ")
+        if category not in self.CATEGORIES and category != "all":
+            raise Exception(f"Unknown category {category}")
+        self.category = category
+
     def get_instances(self, output_path: str) -> List[Instance]:
         cache_dir = os.path.join(output_path, "data")
         ensure_directory_exists(cache_dir)
@@ -60,6 +77,8 @@ class AraTrustScenario(Scenario):
         )
         instances: List[Instance] = []
         for row_index, row in enumerate(dataset):
+            if self.category != "all" and self.category != row["Category"]:
+                continue
             question_text = row["Question"]
             option_texts = [row[option_key] for option_key in self.OPTION_KEYS if row[option_key]]
             joined_option_texts = "\n".join(option_texts)

helm/benchmark/scenarios/babi_qa_scenario.py

@@ -1,6 +1,7 @@
 import os
 from typing import List

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )


@@ -139,3 +141,16 @@ class BabiQAScenario(Scenario):
                     story.append(fact)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="babi_qa",
+            display_name="bAbI",
+            description="The bAbI benchmark for measuring understanding and reasoning [(Weston et al., "
+            "2015)](https://arxiv.org/pdf/1502.05698.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="question answering", what="reasoning", when="2015", who="synthetic", language="English"
+            ),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )

helm/benchmark/scenarios/bbq_scenario.py

@@ -3,6 +3,7 @@ import os
 import random
 from typing import List, Dict, Tuple

+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     DEFAULT_TEST_SIZE,
     PassageQuestionInput,
     Output,
+    ScenarioMetadata,
 )

 AMBIGUOUS_TAG = "ambiguous"
@@ -237,3 +239,16 @@ class BBQScenario(Scenario):
             instances.append(instance)

         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="bbq",
+            display_name="BBQ (Bias Benchmark for Question Answering)",
+            short_display_name="BBQ",
+            description="The Bias Benchmark for Question Answering (BBQ) for measuring social bias in "
+            "question answering in ambiguous and unambigous context [(Parrish et al., "
+            "2022)](https://aclanthology.org/2022.findings-acl.165/).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )