crfm-helm 0.5.7__py3-none-any.whl → 0.5.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crfm-helm might be problematic.
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/METADATA +5 -77
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/RECORD +228 -197
- helm/benchmark/adaptation/adapter_spec.py +5 -0
- helm/benchmark/adaptation/adapters/multimodal/multiple_choice_joint_multimodal_adapter.py +11 -3
- helm/benchmark/adaptation/adapters/multiple_choice_joint_adapter.py +11 -8
- helm/benchmark/annotation/aci_bench_annotator.py +11 -22
- helm/benchmark/annotation/alrage_annotator.py +90 -0
- helm/benchmark/annotation/chw_care_plan_annotator.py +10 -21
- helm/benchmark/annotation/dischargeme_annotator.py +11 -22
- helm/benchmark/annotation/med_dialog_annotator.py +11 -22
- helm/benchmark/annotation/medalign_annotator.py +11 -22
- helm/benchmark/annotation/medi_qa_annotator.py +11 -22
- helm/benchmark/annotation/medication_qa_annotator.py +11 -22
- helm/benchmark/annotation/mental_health_annotator.py +11 -22
- helm/benchmark/annotation/mimic_bhc_annotator.py +11 -22
- helm/benchmark/annotation/mimic_rrs_annotator.py +11 -22
- helm/benchmark/annotation/model_as_judge.py +23 -18
- helm/benchmark/annotation/mtsamples_procedures_annotator.py +11 -22
- helm/benchmark/annotation/mtsamples_replicate_annotator.py +11 -22
- helm/benchmark/annotation/starr_patient_instructions_annotator.py +11 -22
- helm/benchmark/metrics/air_bench_metrics.py +3157 -1
- helm/benchmark/metrics/alrage_metric.py +35 -0
- helm/benchmark/metrics/basic_metrics.py +267 -2
- helm/benchmark/metrics/classification_metrics.py +19 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +12 -1
- helm/benchmark/metrics/dry_run_metrics.py +30 -1
- helm/benchmark/metrics/efficiency_metrics.py +74 -0
- helm/benchmark/metrics/ehr_sql_metrics.py +57 -1
- helm/benchmark/metrics/evaluate_reference_metrics.py +299 -0
- helm/benchmark/metrics/gpqa_chain_of_thought_metric.py +13 -1
- helm/benchmark/metrics/helpdesk_call_summarization_metrics.py +13 -1
- helm/benchmark/metrics/ifeval_metrics.py +13 -1
- helm/benchmark/metrics/instruction_following_critique_metrics.py +41 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +21 -0
- helm/benchmark/metrics/language_modeling_metrics.py +13 -1
- helm/benchmark/metrics/live_qa_metrics.py +13 -1
- helm/benchmark/metrics/llm_jury_metrics.py +13 -1
- helm/benchmark/metrics/medcalc_bench_metrics.py +14 -1
- helm/benchmark/metrics/medec_metrics.py +25 -2
- helm/benchmark/metrics/metric.py +25 -0
- helm/benchmark/metrics/mimiciv_billing_code_metrics.py +32 -1
- helm/benchmark/metrics/omni_math_metrics.py +13 -1
- helm/benchmark/metrics/seahelm_metrics.py +14 -1
- helm/benchmark/metrics/summac/model_summac.py +2 -2
- helm/benchmark/metrics/summarization_metrics.py +129 -1
- helm/benchmark/metrics/toxicity_metrics.py +31 -1
- helm/benchmark/metrics/wildbench_metrics.py +21 -1
- helm/benchmark/presentation/schema.py +5 -22
- helm/benchmark/presentation/summarize.py +180 -11
- helm/benchmark/presentation/taxonomy_info.py +20 -0
- helm/benchmark/run_expander.py +4 -0
- helm/benchmark/run_specs/arabic_run_specs.py +134 -16
- helm/benchmark/run_specs/bluex_run_specs.py +1 -1
- helm/benchmark/run_specs/classic_run_specs.py +2 -2
- helm/benchmark/run_specs/long_context_run_specs.py +2 -2
- helm/benchmark/run_specs/medhelm/__init__.py +0 -0
- helm/benchmark/run_specs/medhelm/benchmark_config.py +219 -0
- helm/benchmark/run_specs/medhelm_run_specs.py +360 -50
- helm/benchmark/scenarios/aci_bench_scenario.py +23 -0
- helm/benchmark/scenarios/air_bench_scenario.py +21 -0
- helm/benchmark/scenarios/alrage_scenario.py +54 -0
- helm/benchmark/scenarios/anthropic_hh_rlhf_scenario.py +23 -1
- helm/benchmark/scenarios/arabic_exams_scenario.py +114 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +8 -4
- helm/benchmark/scenarios/aratrust_scenario.py +19 -0
- helm/benchmark/scenarios/babi_qa_scenario.py +15 -0
- helm/benchmark/scenarios/bbq_scenario.py +15 -0
- helm/benchmark/scenarios/best_chatgpt_prompts.yaml +473 -0
- helm/benchmark/scenarios/bluex_scenario.py +6 -2
- helm/benchmark/scenarios/bold_scenario.py +15 -0
- helm/benchmark/scenarios/boolq_scenario.py +20 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +23 -0
- helm/benchmark/scenarios/civil_comments_scenario.py +13 -0
- helm/benchmark/scenarios/clear_scenario.py +23 -0
- helm/benchmark/scenarios/cleva_scenario.py +479 -0
- helm/benchmark/scenarios/code_scenario.py +28 -0
- helm/benchmark/scenarios/commonsense_scenario.py +26 -0
- helm/benchmark/scenarios/compositional_instructions.yaml +70 -0
- helm/benchmark/scenarios/conv_fin_qa_calc_scenario.py +21 -0
- helm/benchmark/scenarios/copyright_scenario.py +35 -1
- helm/benchmark/scenarios/cti_to_mitre_scenario.py +21 -0
- helm/benchmark/scenarios/decodingtrust_adv_demonstration_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_adv_robustness_scenario.py +23 -1
- helm/benchmark/scenarios/decodingtrust_fairness_scenario.py +22 -1
- helm/benchmark/scenarios/decodingtrust_machine_ethics_scenario.py +21 -1
- helm/benchmark/scenarios/decodingtrust_ood_robustness_scenario.py +13 -0
- helm/benchmark/scenarios/decodingtrust_privacy_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_stereotype_bias_scenario.py +13 -1
- helm/benchmark/scenarios/decodingtrust_toxicity_prompts_scenario.py +13 -1
- helm/benchmark/scenarios/dischargeme_scenario.py +24 -0
- helm/benchmark/scenarios/disinformation_scenario.py +22 -0
- helm/benchmark/scenarios/dyck_language_scenario.py +15 -0
- helm/benchmark/scenarios/ehrshot_scenario.py +22 -0
- helm/benchmark/scenarios/enem_challenge_scenario.py +19 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +14 -0
- helm/benchmark/scenarios/entity_matching_scenario.py +14 -0
- helm/benchmark/scenarios/financial_phrasebank_scenario.py +21 -0
- helm/benchmark/scenarios/gold_commodity_news_scenario.py +21 -0
- helm/benchmark/scenarios/gpqa_scenario.py +18 -0
- helm/benchmark/scenarios/grammar_scenario.py +20 -1
- helm/benchmark/scenarios/gsm_scenario.py +15 -0
- helm/benchmark/scenarios/headqa_scenario.py +22 -0
- helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py +13 -0
- helm/benchmark/scenarios/ice_scenario.py +21 -1
- helm/benchmark/scenarios/ifeval_scenario.py +18 -0
- helm/benchmark/scenarios/imdb_scenario.py +15 -0
- helm/benchmark/scenarios/koala_scenario.py +21 -1
- helm/benchmark/scenarios/kpi_edgar_scenario.py +21 -0
- helm/benchmark/scenarios/legal_contract_summarization_scenario.py +20 -0
- helm/benchmark/scenarios/legal_summarization_scenario.py +50 -0
- helm/benchmark/scenarios/legal_support_scenario.py +13 -0
- helm/benchmark/scenarios/legalbench_scenario.py +20 -0
- helm/benchmark/scenarios/lex_glue_scenario.py +11 -0
- helm/benchmark/scenarios/lextreme_scenario.py +11 -0
- helm/benchmark/scenarios/lsat_qa_scenario.py +14 -0
- helm/benchmark/scenarios/madinah_qa_scenario.py +73 -0
- helm/benchmark/scenarios/math_scenario.py +26 -0
- helm/benchmark/scenarios/mbzuai_human_translated_arabic_mmlu.py +68 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +32 -1
- helm/benchmark/scenarios/med_mcqa_scenario.py +14 -0
- helm/benchmark/scenarios/med_qa_scenario.py +14 -0
- helm/benchmark/scenarios/medalign_scenario.py +23 -0
- helm/benchmark/scenarios/medbullets_scenario.py +22 -0
- helm/benchmark/scenarios/medcalc_bench_scenario.py +22 -0
- helm/benchmark/scenarios/medec_scenario.py +23 -0
- helm/benchmark/scenarios/medhallu_scenario.py +23 -0
- helm/benchmark/scenarios/medhelm/__init__.py +0 -0
- helm/benchmark/scenarios/medhelm/judges.yaml +14 -0
- helm/benchmark/scenarios/medhelm_configurable_scenario.py +101 -0
- helm/benchmark/scenarios/medi_qa_scenario.py +23 -0
- helm/benchmark/scenarios/medication_qa_scenario.py +31 -1
- helm/benchmark/scenarios/mental_health_scenario.py +23 -0
- helm/benchmark/scenarios/mimic_bhc_scenario.py +24 -0
- helm/benchmark/scenarios/mimic_rrs_scenario.py +23 -0
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +22 -0
- helm/benchmark/scenarios/mmlu_pro_scenario.py +18 -0
- helm/benchmark/scenarios/mmlu_scenario.py +15 -0
- helm/benchmark/scenarios/msmarco_scenario.py +30 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +22 -0
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +22 -0
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +20 -0
- helm/benchmark/scenarios/narrativeqa_scenario.py +20 -0
- helm/benchmark/scenarios/natural_qa_scenario.py +32 -0
- helm/benchmark/scenarios/omni_math_scenario.py +18 -0
- helm/benchmark/scenarios/open_assistant_scenario.py +22 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +22 -0
- helm/benchmark/scenarios/quac_scenario.py +14 -0
- helm/benchmark/scenarios/race_based_med_scenario.py +23 -0
- helm/benchmark/scenarios/raft_scenario.py +15 -0
- helm/benchmark/scenarios/real_toxicity_prompts_scenario.py +14 -1
- helm/benchmark/scenarios/scenario.py +31 -0
- helm/benchmark/scenarios/seahelm_scenario.py +348 -0
- helm/benchmark/scenarios/self_instruct_scenario.py +29 -1
- helm/benchmark/scenarios/shc_bmt_scenario.py +22 -0
- helm/benchmark/scenarios/shc_cdi_scenario.py +20 -0
- helm/benchmark/scenarios/shc_conf_scenario.py +23 -0
- helm/benchmark/scenarios/shc_ent_scenario.py +21 -0
- helm/benchmark/scenarios/shc_gip_scenario.py +20 -0
- helm/benchmark/scenarios/shc_privacy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +22 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +23 -0
- helm/benchmark/scenarios/shc_sequoia_scenario.py +21 -0
- helm/benchmark/scenarios/situation_prompts.yaml +49 -0
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +22 -0
- helm/benchmark/scenarios/summarization_scenario.py +37 -0
- helm/benchmark/scenarios/synthetic_efficiency_scenario.py +22 -1
- helm/benchmark/scenarios/synthetic_reasoning_natural_scenario.py +13 -0
- helm/benchmark/scenarios/test_alrage_scenario.py +23 -0
- helm/benchmark/scenarios/test_arabic_exams_scenario.py +21 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +1 -1
- helm/benchmark/scenarios/test_bluex_scenario.py +2 -2
- helm/benchmark/scenarios/the_pile_scenario.py +13 -1
- helm/benchmark/scenarios/truthful_qa_scenario.py +14 -0
- helm/benchmark/scenarios/twitter_aae_scenario.py +20 -1
- helm/benchmark/scenarios/vicuna_scenario.py +21 -1
- helm/benchmark/scenarios/wikifact_scenario.py +20 -0
- helm/benchmark/scenarios/wildbench_scenario.py +18 -0
- helm/benchmark/scenarios/wmt_14_scenario.py +12 -0
- helm/benchmark/static/schema_arabic.yaml +55 -12
- helm/benchmark/static/schema_long_context.yaml +17 -17
- helm/benchmark/static/schema_medhelm.yaml +36 -0
- helm/benchmark/static/schema_slp.yaml +219 -0
- helm/benchmark/static_build/assets/index-671a5e06.js +10 -0
- helm/benchmark/static_build/assets/index-9352595e.css +1 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/audio_language/llama_omni/arguments.py +61 -0
- helm/clients/audio_language/llama_omni/constants.py +9 -0
- helm/clients/audio_language/llama_omni/conversation.py +213 -0
- helm/clients/audio_language/llama_omni/model/__init__.py +0 -0
- helm/clients/audio_language/llama_omni/model/builder.py +88 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech2s_llama.py +190 -0
- helm/clients/audio_language/llama_omni/model/language_model/omni_speech_llama.py +118 -0
- helm/clients/audio_language/llama_omni/model/omni_speech_arch.py +249 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_encoder/speech_encoder.py +27 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/generation.py +622 -0
- helm/clients/audio_language/llama_omni/model/speech_generator/speech_generator.py +104 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/builder.py +9 -0
- helm/clients/audio_language/llama_omni/model/speech_projector/speech_projector.py +27 -0
- helm/clients/audio_language/llama_omni/preprocess.py +295 -0
- helm/clients/audio_language/llama_omni/utils.py +202 -0
- helm/clients/audio_language/qwen_omni/configuration_qwen2_5_omni.py +519 -0
- helm/clients/audio_language/qwen_omni/modeling_qwen2_5_omni.py +4308 -0
- helm/clients/audio_language/qwen_omni/processing_qwen2_5_omni.py +270 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/__init__.py +0 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/__init__.py +8 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/audio_process.py +56 -0
- helm/clients/audio_language/qwen_omni/qwen2_5_omni_utils/v2_5/vision_process.py +380 -0
- helm/clients/image_generation/cogview2/sr_pipeline/dsr_model.py +1 -1
- helm/clients/image_generation/mindalle/models/stage1/layers.py +2 -2
- helm/clients/openai_client.py +31 -19
- helm/clients/openai_responses_client.py +27 -3
- helm/clients/openrouter_client.py +31 -0
- helm/clients/test_openrouter_client.py +69 -0
- helm/clients/together_client.py +48 -11
- helm/clients/vertexai_client.py +8 -2
- helm/config/model_deployments.yaml +75 -1
- helm/config/model_metadata.yaml +70 -2
- helm/config/tokenizer_configs.yaml +19 -1
- helm/proxy/example_queries.py +8 -8
- helm/proxy/server.py +2 -1
- helm/proxy/static/index.css +4 -0
- helm/proxy/static/index.js +7 -1
- helm/benchmark/metrics/aci_bench_metrics.py +0 -14
- helm/benchmark/metrics/chw_care_plan_metrics.py +0 -14
- helm/benchmark/metrics/dischargeme_metrics.py +0 -14
- helm/benchmark/metrics/med_dialog_metrics.py +0 -14
- helm/benchmark/metrics/medalign_metrics.py +0 -14
- helm/benchmark/metrics/medi_qa_metrics.py +0 -14
- helm/benchmark/metrics/medication_qa_metrics.py +0 -14
- helm/benchmark/metrics/mental_health_metrics.py +0 -14
- helm/benchmark/metrics/mimic_bhc_metrics.py +0 -14
- helm/benchmark/metrics/mimic_rrs_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +0 -14
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +0 -14
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +0 -14
- helm/benchmark/static_build/assets/index-b9779128.css +0 -1
- helm/benchmark/static_build/assets/index-e439d5e1.js +0 -10
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.7.dist-info → crfm_helm-0.5.8.dist-info}/top_level.txt +0 -0
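
Nearly all of the scenario changes listed above follow one pattern: each scenario module imports TaxonomyInfo and ScenarioMetadata and gains a get_metadata() method, as the hunks below show. The following minimal sketch only illustrates that pattern; ExampleScenario, its class attributes, and every field value are illustrative placeholders rather than code from this release.

from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
from helm.benchmark.scenarios.scenario import Scenario, ScenarioMetadata


class ExampleScenario(Scenario):
    """Hypothetical scenario used only to illustrate the new metadata hook."""

    name = "example_scenario"
    description = "Illustrative placeholder scenario."
    tags = ["example"]

    def get_metadata(self) -> ScenarioMetadata:
        # Same fields that the real scenarios in the hunks below populate.
        return ScenarioMetadata(
            name=self.name,
            display_name="Example Scenario",
            description=self.description,
            taxonomy=TaxonomyInfo(
                task="question answering",
                what="placeholder subject matter",
                when="n/a",
                who="n/a",
                language="English",
            ),
            main_metric="exact_match",
            main_split="test",
        )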
helm/benchmark/scenarios/ehrshot_scenario.py
@@ -7,6 +7,7 @@ from functools import partial
 from tqdm import tqdm
 from typing import Any, Dict, List, Optional, Mapping
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import check_file_exists, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Reference,
     Output,
+    ScenarioMetadata,
 )
 
 ##################################
@@ -1517,3 +1519,23 @@ class EHRSHOTScenario(Scenario):
         )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="ehrshot",
+            display_name="EHRSHOT",
+            description="EHRSHOT is a benchmark designed to evaluate a model's ability to predict "
+            "future clinical events using structured EHR code sequences. Each instance "
+            "contains a patient's historical EHR data and a forward-looking clinical "
+            "question about whether a particular diagnosis, lab result, or hospital event "
+            "will occur [(Wornow et al., 2023)](https://arxiv.org/abs/2307.02028).",
+            taxonomy=TaxonomyInfo(
+                task="Classification",
+                what="Predict whether a medical event will occur in the future based " "on EHR codes",
+                when="Future prediction",
+                who="Clinician, Insurer",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/enem_challenge_scenario.py
@@ -2,6 +2,7 @@ from typing import List, Any
 from pathlib import Path
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -10,6 +11,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -56,3 +58,20 @@ class ENEMChallengeScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="enem_challenge",
+            display_name="ENEM Challenge",
+            short_display_name=None,
+            description="ENEM Challenge",
+            taxonomy=TaxonomyInfo(
+                task="multiple-choice question answering",
+                what="general academic subjects",
+                when="between 2009 and 2023",
+                who="brazilian ministry of education",
+                language="Portuguese",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/entity_data_imputation_scenario.py
@@ -3,6 +3,7 @@ import pandas as pd
 from pathlib import Path
 from typing import List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -160,3 +162,15 @@ class EntityDataImputationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_data_imputation",
+            display_name="Data imputation",
+            description="Scenario from [Mei et al. "
+            "(2021)](https://ieeexplore.ieee.org/document/9458712/) that tests the ability "
+            "to impute missing entities in a data table.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/entity_matching_scenario.py
@@ -2,6 +2,7 @@ import pandas as pd
 from pathlib import Path
 from typing import Dict, List, Tuple
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.hierarchical_logger import hlog
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.entity_matching_scenario_fixed_random_state import set_fixed_random_state_for_dataset
 
@@ -155,3 +157,15 @@ class EntityMatchingScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="entity_matching",
+            display_name="Entity matching",
+            description="Scenario from Magellan [(Konda et al., "
+            "2016)](https://dl.acm.org/doi/10.14778/3007263.3007314) that tests the ability "
+            "to determine if two entities match.",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="quasi_exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/financial_phrasebank_scenario.py
@@ -2,6 +2,7 @@ import os
 import random
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -92,3 +94,22 @@ Possible labels:\n1. positive\n2. neutral\n3. negative""" # noqa: E501
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="financial_phrasebank",
+            display_name="Financial Phrasebank (Sentiment Classification)",
+            short_display_name=None,
+            description="A sentiment classification benchmark based on the dataset from Good Debt or "
+            "Bad Debt - Detecting Semantic Orientations in Economic Texts [(Malo et al., "
+            "2013)](https://arxiv.org/abs/1307.5336).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis",
+                what="phrases from financial news texts and company press releases",
+                when="before 2013",
+                who="annotators with adequate business education background",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )
helm/benchmark/scenarios/gold_commodity_news_scenario.py
@@ -6,6 +6,7 @@ from typing import List
 
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.runner import TRAIN_SPLIT
 from helm.common.general import ensure_directory_exists, ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
@@ -16,6 +17,7 @@ from helm.benchmark.scenarios.scenario import (
     Reference,
     Scenario,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -122,3 +124,22 @@ class GoldCommodityNewsScenario(Scenario):
         for train_index in train_indexes:
             instances[train_index] = dataclasses.replace(instances[train_index], split=TRAIN_SPLIT)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="gold_commodity_news",
+            display_name="Gold Commodity News",
+            short_display_name=None,
+            description="A classification benchmark based on a dataset of human-annotated gold "
+            "commodity news headlines ([Sinha & Khandait, "
+            "2019](https://arxiv.org/abs/2009.04202)).",
+            taxonomy=TaxonomyInfo(
+                task="text classification",
+                what="gold commodity news headlines",
+                when="2000-2019",
+                who="financial journalists",
+                language="English",
+            ),
+            main_metric="classification_weighted_f1",
+            main_split="test",
+        )
helm/benchmark/scenarios/gpqa_scenario.py
@@ -2,6 +2,7 @@ import datasets
 import os
 import random
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -78,3 +80,19 @@ class GPQAScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="GPQA",
+            description=self.description,
+            main_metric="chain_of_thought_correctness",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="question answering",
+                what="complex questions across various disciplines",
+                who="domain experts",
+                when="2024",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/grammar_scenario.py
@@ -1,6 +1,7 @@
 from typing import List
 
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 from helm.benchmark.scenarios.grammar import read_grammar, generate_derivations, Derivation, get_values, get_tags
 
 
@@ -41,3 +42,21 @@ class GrammarScenario(Scenario):
         instances: List[Instance] = list(map(derivation_to_instance, derivations))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="grammar",
+            display_name="Best ChatGPT Prompts",
+            short_display_name="Best ChatGPT Prompts",
+            description="A list of “best ChatGPT prompts to power your workflow” summarized by "
+            "[GRIDFITI](https://gridfiti.com/best-chatgpt-prompts/).",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="2023",
+                who="Gridfiti Staff",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
helm/benchmark/scenarios/gsm_scenario.py
@@ -2,6 +2,7 @@ import json
 import os
 from typing import Dict, List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -12,6 +13,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -65,3 +67,16 @@ class GSM8KScenario(Scenario):
                 ),
             )
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="gsm",
+            display_name="GSM8K (Grade school math word problems)",
+            short_display_name="GSM8K",
+            description="The grade school math word problems dataset (GSM8K) for testing mathematical "
+            "reasoning on grade-school math problems [(Cobbe et al., "
+            "2021)](https://arxiv.org/pdf/2110.14168.pdf).",
+            taxonomy=TaxonomyInfo(task="?", what="n/a", when="n/a", who="n/a", language="synthetic"),
+            main_metric="exact_match_indicator",
+            main_split="test",
+        )
helm/benchmark/scenarios/headqa_scenario.py
@@ -3,6 +3,7 @@ from typing import List, Optional
 
 from datasets import DatasetDict, load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     TEST_SPLIT,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     Output,
     Reference,
     Scenario,
+    ScenarioMetadata,
 )
 from helm.common.general import ensure_directory_exists
 
@@ -134,3 +136,23 @@ class HeadQAScenario(Scenario):
         )
 
         return instances
+
+    def get_metadata(self):
+        return ScenarioMetadata(
+            name="head_qa",
+            display_name="HeadQA",
+            description="HeadQA is a benchmark consisting of biomedical multiple-choice questions "
+            "intended to evaluate a model's medical knowledge and reasoning. Each instance "
+            "presents a clinical or scientific question with four answer options, requiring "
+            "the model to select the most appropriate answer [(Vilares et al., "
+            "2019)](https://arxiv.org/abs/1906.04701).",
+            taxonomy=TaxonomyInfo(
+                task="Question answering",
+                what="Medical knowledge testing",
+                when="Any",
+                who="Medical student, Researcher",
+                language="English",
+            ),
+            main_metric="exact_match",
+            main_split="test",
+        )
helm/benchmark/scenarios/helpdesk_call_summarization_scenario.py
@@ -2,11 +2,13 @@ import csv
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
     TEST_SPLIT,
     Input,
+    ScenarioMetadata,
 )
 
 
@@ -35,3 +37,14 @@ class HelpdeskCallSummarizationScenario(Scenario):
             instance = Instance(id=instance_id, input=input, references=[], split=TEST_SPLIT)
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="helpdesk_call_summarization",
+            display_name="Helpdesk Call summarization",
+            short_display_name=None,
+            description="Helpdesk Call summarization",
+            taxonomy=TaxonomyInfo(task="summarization", what="n/a", when="?", who="n/a", language="English"),
+            main_metric="unknown",
+            main_split="test",
+        )
helm/benchmark/scenarios/ice_scenario.py
@@ -4,9 +4,10 @@ from typing import List, Union
 from enum import Enum
 import pandas as pd
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.benchmark.scenarios.ice_scenario_pinned_file_order import listdir_with_pinned_file_order
-from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input
+from helm.benchmark.scenarios.scenario import Scenario, Instance, TEST_SPLIT, Input, ScenarioMetadata
 
 try:
     # pd.read_excel() uses xlrd
@@ -467,3 +468,22 @@ class ICEScenario(Scenario):
             instances.append(Instance(Input(text=t), references=[], split=TEST_SPLIT))
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="ice",
+            display_name="ICE (International Corpus of English)",
+            short_display_name="ICE",
+            description="The International Corpus of English (ICE) drawn from English speakers from "
+            "various places in the world, initiated by [Greenbaum "
+            "(1991)](https://www.cambridge.org/core/journals/english-today/article/abs/ice-the-international-corpus-of-english/47808205394C538393C3FD8E62E5E701).",
+            taxonomy=TaxonomyInfo(
+                task="language modeling",
+                what="?",
+                when="?",
+                who="?",
+                language="English varieties from different nations",
+            ),
+            main_metric="bits_per_byte",
+            main_split="test",
+        )
helm/benchmark/scenarios/ifeval_scenario.py
@@ -1,8 +1,10 @@
 import datasets
 import os
 from typing import List
+from helm.benchmark.presentation.schema import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
+    ScenarioMetadata,
     Instance,
     Input,
     TEST_SPLIT,
@@ -51,3 +53,19 @@ class IFEvalScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name=self.name,
+            display_name="IFEval",
+            description=self.description,
+            main_metric="ifeval_strict_accuracy",
+            main_split="test",
+            taxonomy=TaxonomyInfo(
+                task="instruction following",
+                what="verifiable general domain instruction following",
+                who="human annotators",
+                when="2023",
+                language="English",
+            ),
+        )
helm/benchmark/scenarios/imdb_scenario.py
@@ -1,6 +1,7 @@
 import os
 from typing import List, Dict, Optional
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -11,6 +12,7 @@ from helm.benchmark.scenarios.scenario import (
     VALID_SPLIT,
     Input,
     Output,
+    ScenarioMetadata,
 )
 from helm.benchmark.scenarios.imdb_scenario_pinned_file_order import listdir_with_pinned_file_order
 
@@ -143,3 +145,16 @@ class IMDBScenario(Scenario):
         for split in [TRAIN_SPLIT, VALID_SPLIT]:
             instances.extend(self.get_split_instances(target_path, split, contrast_map))
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="imdb",
+            display_name="IMDB",
+            description="The IMDB benchmark for sentiment analysis in movie review [(Maas et al., "
+            "2011)](https://aclanthology.org/P11-1015/).",
+            taxonomy=TaxonomyInfo(
+                task="sentiment analysis", what="movie reviews", when="?", who="?", language="English"
+            ),
+            main_metric="quasi_exact_match",
+            main_split="valid",
+        )
helm/benchmark/scenarios/koala_scenario.py
@@ -2,8 +2,9 @@ import json
 import os
 from typing import List
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded
-from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT
+from helm.benchmark.scenarios.scenario import Scenario, Instance, Input, TEST_SPLIT, ScenarioMetadata
 
 
 class KoalaScenario(Scenario):
@@ -39,3 +40,22 @@ class KoalaScenario(Scenario):
             )
             instances.append(instance)
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="koala",
+            display_name="Koala test dataset",
+            short_display_name="Koala test dataset",
+            description="The test dataset from the [Koala "
+            "paper](https://bair.berkeley.edu/blog/2023/04/03/koala/) for evaluating "
+            "instruction-following models.",
+            taxonomy=TaxonomyInfo(
+                task="open-ended instruction following",
+                what="Instructions for LLMs",
+                when="Before 2023",
+                who="Web users",
+                language="English",
+            ),
+            main_metric="Helpfulness",
+            main_split="test",
+        )
helm/benchmark/scenarios/kpi_edgar_scenario.py
@@ -3,6 +3,7 @@ from typing import List, Dict
 import json
 import re
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Scenario,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -149,3 +151,22 @@ class KPIEDGARScenario(Scenario):
         with open(target_path, "r") as f:
             raw_dataset = json.load(f)
         return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="kpi_edgar",
+            display_name="KPI-EDGAR Financial Documents (Named Entity Recognition)",
+            short_display_name=None,
+            description="A named entity recognition beenchmark based on the paper KPI-EDGAR - A Novel "
+            "Dataset and Accompanying Metric for Relation Extraction from Financial "
+            "Documents [(Deußer et al., 2022)](https://arxiv.org/pdf/2210.09163.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="named entity recognition",
+                what="financial reports",
+                when="before 2022",
+                who="financial experts",
+                language="English",
+            ),
+            main_metric="adjusted_macro_f1_score",
+            main_split="test",
+        )
helm/benchmark/scenarios/legal_contract_summarization_scenario.py
@@ -4,6 +4,7 @@ import json
 import re
 
 from typing import List
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.common.general import ensure_file_downloaded, ensure_directory_exists
 from helm.benchmark.scenarios.scenario import (
     Input,
@@ -14,6 +15,7 @@ from helm.benchmark.scenarios.scenario import (
     TEST_SPLIT,
     CORRECT_TAG,
     Output,
+    ScenarioMetadata,
 )
 
 
@@ -127,3 +129,21 @@ class LegalContractSummarizationScenario(Scenario):
             instances.append(instance)
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        return ScenarioMetadata(
+            name="legal_contract_summarization",
+            display_name="Legal Contract Summarization",
+            short_display_name=None,
+            description="Plain English Summarization of Contracts [(Manor et al., "
+            "2019)](https://aclanthology.org/W19-2201.pdf).",
+            taxonomy=TaxonomyInfo(
+                task="summarization",
+                what="legal contracts (e.g. terms of service, license agreements)",
+                when="before 2019",
+                who="lawyers",
+                language="English",
+            ),
+            main_metric="rouge_l",
+            main_split="test",
+        )
helm/benchmark/scenarios/legal_summarization_scenario.py
@@ -5,6 +5,7 @@ from typing import List, Optional, Any
 import datasets
 from datasets import load_dataset
 
+from helm.benchmark.presentation.taxonomy_info import TaxonomyInfo
 from helm.benchmark.scenarios.scenario import (
     Scenario,
     Instance,
@@ -15,6 +16,7 @@ from helm.benchmark.scenarios.scenario import (
     CORRECT_TAG,
     Input,
     Output,
+    ScenarioMetadata,
 )
 
 _ALL_LANGUAGES = {
@@ -205,3 +207,51 @@ class LegalSummarizationScenario(Scenario):
         )
 
         return instances
+
+    def get_metadata(self) -> ScenarioMetadata:
+        if self.dataset_name == "BillSum":
+            return ScenarioMetadata(
+                name="billsum_legal_summarization",
+                display_name="BillSum",
+                description="The BillSum benchmark for legal text summarization ([Kornilova & Eidelmann, "
+                "2020](https://aclanthology.org/D19-5406/)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization", what="legal text from US bills", when=None, who="lawyers", language="English"
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "MultiLexSum":
+            return ScenarioMetadata(
+                name="multilexsum_legal_summarization",
+                display_name="MultiLexSum",
+                description="The MultiLexSum benchmark for legal text summarization ([Shen et al., "
+                "2022](https://arxiv.org/abs/2206.10883)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from US civil rights lawsuits",
+                    when=None,
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        elif self.dataset_name == "EurLexSum":
+            return ScenarioMetadata(
+                name="eurlexsum_legal_summarization",
+                display_name="EurLexSum",
+                description="The EurLexSum benchmark for legal text summarization ([Aumiller et al., "
+                "2022](https://arxiv.org/abs/2210.13448)).",
+                taxonomy=TaxonomyInfo(
+                    task="summarization",
+                    what="legal text from EU legislation",
+                    when="1960 - 2020",
+                    who="lawyers",
+                    language="English",
+                ),
+                main_metric="rouge_2",
+                main_split="test",
+            )
+        else:
+            raise Exception(f"Unknown dataset {self.dataset_name}")