crfm-helm 0.5.5__py3-none-any.whl → 0.5.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/METADATA +74 -53
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/RECORD +262 -182
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +2 -2
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/live_qa_annotator.py +1 -1
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/codeinsights_code_efficiency_metrics.py +186 -0
- helm/benchmark/metrics/codeinsights_code_evaluation_metrics.py +477 -0
- helm/benchmark/metrics/codeinsights_correct_code_metrics.py +366 -0
- helm/benchmark/metrics/codeinsights_edge_case_metrics.py +92 -0
- helm/benchmark/metrics/codeinsights_metric_specs.py +51 -0
- helm/benchmark/metrics/comet_metric.py +1 -1
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/copyright_metrics.py +1 -1
- helm/benchmark/metrics/decodingtrust_stereotype_bias_metrics.py +1 -1
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/evaluate_reference_metrics.py +1 -1
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/image_generation/clip_score_metrics.py +13 -2
- helm/benchmark/metrics/image_generation/fractal_dimension/fractal_dimension_util.py +1 -1
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/lmkt_metric_specs.py +12 -0
- helm/benchmark/metrics/lmkt_metrics.py +47 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +2 -3
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +16 -26
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +43 -13
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +7 -1
- helm/benchmark/presentation/summarize.py +84 -61
- helm/benchmark/presentation/test_create_plots.py +4 -1
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +84 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/arabic_run_specs.py +73 -0
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/bluex_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +0 -53
- helm/benchmark/run_specs/codeinsights_run_specs.py +192 -0
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/healthqa_br_run_specs.py +40 -0
- helm/benchmark/run_specs/heim_run_specs.py +3 -1
- helm/benchmark/run_specs/lmkt_run_specs.py +144 -0
- helm/benchmark/run_specs/long_context_run_specs.py +114 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/multilingual_run_specs.py +50 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +163 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/alghafa_scenario.py +126 -0
- helm/benchmark/scenarios/arabic_mmlu_scenario.py +78 -0
- helm/benchmark/scenarios/aratrust_scenario.py +76 -0
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/casual_conversations2_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/mustard_scenario.py +1 -1
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification_scenario.py +104 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_transcription_scenario.py +99 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +118 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +86 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +117 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/bluex_scenario.py +66 -0
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/cleva_scenario.py +1 -1
- helm/benchmark/scenarios/codeinsights_code_efficiency_scenario.py +197 -0
- helm/benchmark/scenarios/codeinsights_correct_code_scenario.py +78 -0
- helm/benchmark/scenarios/codeinsights_edge_case_scenario.py +192 -0
- helm/benchmark/scenarios/codeinsights_student_coding_scenario.py +162 -0
- helm/benchmark/scenarios/codeinsights_student_mistake_scenario.py +188 -0
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/exams_multilingual_scenario.py +115 -0
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/healthqa_br_scenario.py +80 -0
- helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py +90 -0
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/lmkt_scenarios.py +288 -0
- helm/benchmark/scenarios/math_scenario.py +21 -20
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +27 -130
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +13 -8
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mmmlu_scenario.py +85 -0
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/seahelm_scenario.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_alghafa_scenario.py +29 -0
- helm/benchmark/scenarios/test_aratrust_scenario.py +21 -0
- helm/benchmark/scenarios/test_bluex_scenario.py +59 -0
- helm/benchmark/scenarios/test_exams_multilingual_scenario.py +29 -0
- helm/benchmark/scenarios/test_healtha_br_scenario.py +57 -0
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/slurm_jobs.py +1 -2
- helm/benchmark/slurm_runner.py +8 -1
- helm/benchmark/static/schema_arabic.yaml +228 -0
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_classic.yaml +0 -17
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +81 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/index-e439d5e1.js +10 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/image_generation/clip_window_service.py +1 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +209 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +102 -55
- helm/clients/openai_responses_client.py +176 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/test_huggingface_client.py +3 -3
- helm/clients/together_client.py +31 -6
- helm/clients/vertexai_client.py +17 -9
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/vllm_client.py +43 -7
- helm/clients/vllm_granite_thinking_client.py +56 -0
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/critique_request.py +0 -1
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +104 -12
- helm/common/local_context.py +140 -0
- helm/common/object_spec.py +23 -8
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/common/test_logging.py +94 -0
- helm/config/model_deployments.yaml +995 -45
- helm/config/model_metadata.yaml +780 -59
- helm/config/tokenizer_configs.yaml +224 -3
- helm/proxy/cli.py +4 -2
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/retry.py +5 -0
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +55 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/metrics/numeracy_metrics.py +0 -72
- helm/benchmark/metrics/test_numeracy_metrics.py +0 -95
- helm/benchmark/scenarios/numeracy_scenario.py +0 -793
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.7.dist-info}/top_level.txt +0 -0
- /helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/exams_multilingual_scenario.py
ADDED
@@ -0,0 +1,115 @@
+import os
+from typing import Dict, List
+
+import datasets
+
+from helm.common.general import ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    VALID_SPLIT,
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    TRAIN_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+from helm.common.hierarchical_logger import hwarn
+
+
+class EXAMSMultilingualScenario(Scenario):
+    """EXAMS: A Multi-subject High School Examinations Dataset
+
+    EXAMS is a benchmark dataset for multilingual and cross-lingual
+    question answering from high school examinations. It consists of
+    more than 24,000 high-quality high school exam questions in 16
+    languages, covering 8 language families and 24 school subjects
+    from Natural Sciences and Social Sciences, among others.
+
+    - https://huggingface.co/datasets/mhardalov/exams
+    - https://aclanthology.org/2020.emnlp-main.438/
+
+    Note: Some dataset rows have the value '@' in the `answerKey` column.
+    These rows will be ignored.
+
+    ```
+    @inproceedings{hardalov-etal-2020-exams,
+        title = "{EXAMS}: A Multi-subject High School Examinations Dataset for Cross-lingual and Multilingual Question Answering",
+        author = "Hardalov, Momchil and
+          Mihaylov, Todor and
+          Zlatkova, Dimitrina and
+          Dinkov, Yoan and
+          Koychev, Ivan and
+          Nakov, Preslav",
+        editor = "Webber, Bonnie and
+          Cohn, Trevor and
+          He, Yulan and
+          Liu, Yang",
+        booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
+        month = nov,
+        year = "2020",
+        address = "Online",
+        publisher = "Association for Computational Linguistics",
+        url = "https://aclanthology.org/2020.emnlp-main.438/",
+        doi = "10.18653/v1/2020.emnlp-main.438",
+        pages = "5427--5444",
+        abstract = "We propose EXAMS {--} a new benchmark dataset for cross-lingual and multilingual question answering for high school examinations. We collected more than 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others.EXAMS offers unique fine-grained evaluation framework across multiple languages and subjects, which allows precise analysis and comparison of the proposed models. We perform various experiments with existing top-performing multilingual pre-trained models and show that EXAMS offers multiple challenges that require multilingual knowledge and reasoning in multiple domains. We hope that EXAMS will enable researchers to explore challenging reasoning and knowledge transfer methods and pre-trained models for school question answering in various languages which was not possible by now. The data, code, pre-trained models, and evaluation are available at http://github.com/mhardalov/exams-qa."
+    }```
+    """  # noqa: E501
+
+    name = "exams_multilingual"
+    description = "EXAMS is a benchmark dataset for multilingual and cross-lingual question answering from high school examinations. "  # noqa: E501
+    tags = ["knowledge", "multiple_choice"]
+
+    CHOICES = ["A", "B", "C", "D", "E"]
+    HF_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "test": TEST_SPLIT, "validation": VALID_SPLIT}
+
+    def __init__(self, language: str, subject: str):
+        super().__init__()
+        self.language = language
+        self.subject = subject
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+        dataset_splits: Dict[str, datasets.Dataset] = datasets.load_dataset(
+            "mhardalov/exams",
+            "multilingual",
+            revision="4ff10804abb3341f8815cacd778181177bba7edd",
+            cache_dir=cache_dir,
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for split_name, dataset in dataset_splits.items():
+            assert isinstance(dataset, datasets.Dataset)
+            for row in dataset:
+                question = row["question"]
+                question_info = row["info"]
+                if self.subject != "all" and question_info["subject"] != self.subject:
+                    continue
+                if self.language != "all" and question_info["language"] != self.language:
+                    continue
+                input = Input(text=question["stem"])
+                references: List[Reference] = []
+                if row["answerKey"] not in self.CHOICES:
+                    hwarn(f"Invalid value in answerKey column in row: {row}")
+                    continue
+                correct_choice_index = ord(row["answerKey"]) - ord("A")
+                for choice_index, choice_text in enumerate(question["choices"]["text"]):
+                    references.append(
+                        Reference(
+                            output=Output(text=choice_text),
+                            tags=[CORRECT_TAG] if choice_index == correct_choice_index else [],
+                        )
+                    )
+                instance = Instance(
+                    id=row["id"],
+                    input=input,
+                    references=references,
+                    split=self.HF_SPLIT_TO_HELM_SPLIT[split_name],
+                )
+                instances.append(instance)
+
+        return instances
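The new EXAMSMultilingualScenario maps the dataset's letter answer key onto the choice list by alphabetical offset (ord(answerKey) - ord("A")) and skips rows whose `answerKey` falls outside A to E, such as the '@' rows noted in the docstring. Below is a minimal usage sketch, not part of the diff, assuming a local output directory and network access to the Hugging Face Hub:

```python
# Hypothetical sketch: instantiate the new scenario directly and materialize
# its instances outside of a full helm-run invocation.
from helm.benchmark.scenarios.exams_multilingual_scenario import EXAMSMultilingualScenario

scenario = EXAMSMultilingualScenario(language="all", subject="all")
instances = scenario.get_instances(output_path="exams_output")  # downloads mhardalov/exams

# Each instance holds one exam question; the correct Reference carries CORRECT_TAG,
# derived from the answer-key offset computed in get_instances above.
print(len(instances), instances[0].input.text[:80])
```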
helm/benchmark/scenarios/grammar.py
CHANGED
@@ -2,7 +2,7 @@ from collections import defaultdict
 from dataclasses import dataclass, field, replace
 from functools import cached_property
 from typing import List, Optional
-from helm.common.hierarchical_logger import
+from helm.common.hierarchical_logger import hwarn
 
 import dacite
 import re
@@ -111,7 +111,7 @@ def validate_grammar(grammar: Grammar):
         # Make sure all categories are defined
         for category in expansion.categories:
             if category not in grammar.category_to_rules:
-
+                hwarn(f"Category {category} is not defined")
 
 
 def read_grammar(path: str) -> Grammar:
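This hunk is part of a wider logging cleanup in this release (see `helm/common/hierarchical_logger.py` and the new `helm/common/test_logging.py` in the file list above): warning sites now call an `hwarn` helper instead of formatting their own warning strings. A minimal sketch of the calling convention as it appears throughout this diff, assuming `hwarn` accepts a plain message string:

```python
# Sketch based on the call sites visible in this diff; hwarn is assumed to
# take a single message string, like the hlog-style helpers it replaces.
from helm.common.hierarchical_logger import hwarn

category = "noun_phrase"  # illustrative value only
hwarn(f"Category {category} is not defined")
```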
helm/benchmark/scenarios/headqa_scenario.py
CHANGED
@@ -57,7 +57,12 @@ class HeadQAScenario(Scenario):
     SKIP_TEXTQA: bool = False
 
     name = "head_qa"
-    description =
+    description = (
+        "HeadQA is a benchmark consisting of biomedical multiple-choice questions intended to"
+        "evaluate a model's medical knowledge and reasoning. Each instance presents a clinical"
+        "or scientific question with four answer options, requiring the model to select the most"
+        "appropriate answer."
+    )
     tags = ["question_answering", "biomedical", "medicine"]
 
     def __init__(self, language: str = "en", category: Optional[str] = None):
helm/benchmark/scenarios/healthqa_br_scenario.py
ADDED
@@ -0,0 +1,80 @@
+from typing import Any, List
+import re
+from pathlib import Path
+from datasets import load_dataset
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class HEALTHQA_BR_Scenario(Scenario):
+    """
+    HealthQA-BR is a large-scale benchmark designed to evaluate the clinical knowledge of Large Language Models (LLMs)
+    within the Brazilian Unified Health System (SUS) context. It comprises 5,632 multiple-choice questions sourced from
+    nationwide licensing exams and residency tests, reflecting real challenges faced by Brazil's public health sector.
+    Unlike benchmarks focused on the U.S. medical landscape, HealthQA-BR targets the Brazilian healthcare ecosystem,
+    covering a wide range of medical specialties and interdisciplinary professions such as nursing, dentistry,
+    psychology, social work, pharmacy, and physiotherapy. This comprehensive approach enables a detailed assessment
+    of AI models’ ability to collaborate effectively in the team-based patient care typical of SUS.
+    """
+
+    name = "healthqa_br"
+    description = "MQA benchmark with questions from Brazilian entrance exams"
+    tags = ["knowledge", "multiple_choice", "pt-br"]
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Download the raw data and read all the dialogues
+        dataset: Any
+        # Read all the instances
+        instances: List[Instance] = []
+        cache_dir = str(Path(output_path) / "data")
+
+        dataset = load_dataset("Larxel/healthqa-br", cache_dir=cache_dir)
+        for example in dataset["train"]:
+            question_choices = example["question"]
+            answer = example["answer"].strip().upper()
+
+            # Separate the question statement from the alternatives
+            question_text, choices_text = self.split_question_and_choices(question_choices)
+
+            # Extract alternatives from text choices_text
+            pattern = r"'([A-Z])':\s*'([^']+)'"
+            matches = re.findall(pattern, choices_text)
+            answers_dict = {label: text for label, text in matches}
+
+            if answer not in answers_dict:
+                continue
+
+            correct_answer_text = answers_dict[answer]
+
+            def answer_to_reference(answer: str) -> Reference:
+                return Reference(Output(text=answer), tags=[CORRECT_TAG] if correct_answer_text == answer else [])
+
+            instance = Instance(
+                input=Input(text=question_text),
+                split=TEST_SPLIT,
+                references=[answer_to_reference(text) for text in answers_dict.values()],
+            )
+            instances.append(instance)
+        return instances
+
+    def split_question_and_choices(self, full_text: str):
+        # Search for the first occurrence of the alternative pattern
+        match = re.search(r"\n'[A-Z]':\s*'.+", full_text)
+        if match:
+            # Everything before the alternatives
+            question_part = full_text[: match.start()].strip()
+            # All of the alternatives (from match to end)
+            choices_part = full_text[match.start() :].strip()
+        else:
+            # If you don't find a pattern, consider everything as a question, and no alternative.
+            question_part = full_text.strip()
+            choices_part = ""
+
+        return question_part, choices_part
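HEALTHQA_BR_Scenario splits each raw question into a stem and lettered alternatives with two regular expressions: one to find where the "'A': '...'" block starts, and one to pull out each letter/text pair. A standalone sketch of that parsing, using an invented Portuguese sample string purely for illustration:

```python
# Self-contained sketch of the parsing logic in HEALTHQA_BR_Scenario;
# the sample text is made up and only mimics the expected "'A': '...'" layout.
import re

full_text = "Qual conduta é indicada?\n'A': 'Observar'\n'B': 'Encaminhar'"
match = re.search(r"\n'[A-Z]':\s*'.+", full_text)
question_part = full_text[: match.start()].strip() if match else full_text.strip()
choices_part = full_text[match.start():].strip() if match else ""

answers_dict = dict(re.findall(r"'([A-Z])':\s*'([^']+)'", choices_part))
print(question_part)   # Qual conduta é indicada?
print(answers_dict)    # {'A': 'Observar', 'B': 'Encaminhar'}
```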
helm/benchmark/scenarios/infinite_bench_en_mc_scenario.py
ADDED
@@ -0,0 +1,90 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnMCScenario(Scenario):
+    """InfiniteBench En.MC
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.MC is a subset of
+    InfiniteBench that requires models to perform multiple-choice question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_mc"
+    description = "∞Bench En.MC is a multiple-choice question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_choice_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            assert len(row["answer"]) == 1
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            references = [
+                Reference(Output(text=option), tags=[CORRECT_TAG] if option == row["answer"][0] else [])
+                for option in row["options"]
+            ]
+            instance = Instance(
+                id=id,
+                input=input,
+                references=references,
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
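Both new InfiniteBench scenarios (En.MC here and En.QA below) gate examples on a whitespace word count before building instances, so `max_num_words` bounds the prompt length in words rather than tokens. A self-contained sketch of that filter with invented example rows:

```python
# Standalone sketch of the word-count filter shared by the En.MC / En.QA scenarios;
# the rows below are invented for illustration only.
import re
from typing import Dict, List


def count_words(text: str) -> int:
    return len(re.split(r"\s+", text.strip()))


def within_budget(example: Dict, max_num_words: int) -> bool:
    total = count_words(example["context"]) + count_words(example["input"])
    total += sum(count_words(option) for option in example.get("options", []))
    return total <= max_num_words


rows: List[Dict] = [
    {"context": "a " * 50, "input": "Which?", "options": ["yes", "no"]},
    {"context": "a " * 5000, "input": "Which?", "options": ["yes", "no"]},
]
print([within_budget(r, max_num_words=100) for r in rows])  # [True, False]
```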
helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py
ADDED
@@ -0,0 +1,85 @@
+import os
+import re
+from typing import List
+
+from datasets import load_dataset, Features, Value, Sequence, Dataset
+
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Input,
+    Reference,
+    Output,
+    CORRECT_TAG,
+    TEST_SPLIT,
+)
+from helm.common.general import ensure_directory_exists
+
+
+class InfiniteBenchEnQAScenario(Scenario):
+    """InfiniteBench En.QA
+
+    InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
+    understand, and reason over long contexts (100k+ tokens). InfiniteBench En.QA is a subset of
+    InfiniteBench that requires models to perform open-form question answering on questions that necessitate
+    long-range dependency and reasoning, beyond simple short passage retrieval.
+    """
+
+    name = "infinite_bench_en_qa"
+    description = "∞Bench En.QA is an open-ended question answering task that necessitates long-range dependency and reasoning. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
+    tags = ["question_answering"]
+
+    def __init__(self, max_num_words: int):
+        self.max_num_words = max_num_words
+        super().__init__()
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        # Get InfiniteBench from HuggingFace
+        cache_dir = os.path.join(output_path, "data")
+        ensure_directory_exists(cache_dir)
+
+        # Define the features schema
+        ft = Features(
+            {
+                "id": Value("int64"),
+                "context": Value("string"),
+                "input": Value("string"),
+                "answer": Sequence(Value("string")),
+                "options": Sequence(Value("string")),
+            }
+        )
+
+        # Load the dataset with the specified features
+        dataset = load_dataset(
+            "xinrongzhang2022/InfiniteBench",
+            split="longbook_qa_eng",
+            features=ft,
+            revision="90f0394333616266d9fe85824ceaf505093cbaa5",
+        )
+
+        assert isinstance(dataset, Dataset)
+
+        def count_words(text: str) -> int:
+            return len(re.split(r"\s+", text.strip()))
+
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"])
+            + count_words(example["input"])
+            + sum(count_words(option) for option in example["options"])
+            <= self.max_num_words
+        )
+
+        # Read all instances
+        instances: List[Instance] = []
+        for row in dataset:
+            id = row["id"]
+            input = Input(text=row["context"] + "\n\n" + row["input"])
+            instance = Instance(
+                id=id,
+                input=input,
+                references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
+                split=TEST_SPLIT,
+            )
+            instances.append(instance)
+
+        return instances
helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py}
RENAMED
@@ -14,21 +14,19 @@ from helm.benchmark.scenarios.scenario import (
 from helm.common.general import ensure_directory_exists
 
 
-class InfiniteBenchSumScenario(Scenario):
-    """InfiniteBench Sum
+class InfiniteBenchEnSumScenario(Scenario):
+    """InfiniteBench En.Sum
 
     InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
-    understand, and reason over super long contexts (100k+ tokens). InfiniteBench Sum is a subset of
-    InfiniteBench that requires models to generate a concise summary of the novel.
-    to as "En.Sum" in the original paper.
+    understand, and reason over super long contexts (100k+ tokens). InfiniteBench En.Sum is a subset of
+    InfiniteBench that requires models to generate a concise summary of the novel.
     """
 
-    name = "
-    description = "
+    name = "infinite_bench_en_sum"
+    description = "∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
     tags = ["summarization"]
 
-    def __init__(self,
-        self.min_num_words = min_num_words
+    def __init__(self, max_num_words: int):
         self.max_num_words = max_num_words
         super().__init__()
 
@@ -61,9 +59,9 @@ class InfiniteBenchSumScenario(Scenario):
         def count_words(text: str) -> int:
             return len(re.split(r"\s+", text.strip()))
 
-        dataset = dataset.
-        lambda example:
-        )
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"]) + count_words(example["input"]) <= self.max_num_words
+        )
 
         # Read all instances
         instances: List[Instance] = []
@@ -75,7 +73,6 @@ class InfiniteBenchSumScenario(Scenario):
                 input=input,
                 references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
                 split=TEST_SPLIT,
-                extra_data={"word_count": row["prompt_wc"]},
             )
             instances.append(instance)
 
helm/benchmark/scenarios/kpi_edgar_scenario.py
ADDED
@@ -0,0 +1,151 @@
+import os
+from typing import List, Dict
+import json
+import re
+
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class KPIEDGARScenario(Scenario):
+    """A financial named entity recognition (NER) scenario based on KPI-EDGAR (T. Deußer et al., 2022).
+
+    This scenario has been modified from the paper. The original paper has 12 entity types and requires the model
+    to extract pairs of related entities. This scenario only use four named entity types (kpi, cy, py, py1) and only
+    requires the model to extract individual entities.
+
+    Paper:
+    T. Deußer et al.,
+    “KPI-EDGAR: A Novel Dataset and Accompanying Metric for Relation Extraction from Financial Documents.” 2022.
+    https://arxiv.org/abs/2210.09163
+
+    Prompt format:
+
+    ```
+    Context: {Sentence}
+    Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.
+    kpi: Key Performance Indicators expressible in numerical and monetary value, cy: Current Year monetary value, py: Prior Year monetary value, py1: Two Year Past Value.
+    Answer:
+    ```
+
+    Example input:
+
+    ```
+    Context: The following table summarizes our total share-based compensation expense and excess tax benefits recognized : As of December 28 , 2019 , there was $ 284 million of total unrecognized compensation cost related to nonvested share-based compensation grants .
+    Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.
+    kpi: Key Performance Indicators expressible in numerical and monetary value, cy: Current Year monetary value, py: Prior Year monetary value, py1: Two Year Past Value.
+    Answer:
+    ```
+
+    Example reference:
+    ```
+    284 [cy], total unrecognized compensation cost [kpi]
+    ```"""  # noqa: E501
+
+    name = "kpi_edgar"
+    description = "Named Entity Recognition from financial documents."
+    tags = ["named_entity_recognition", "finance"]
+
+    TAG_DICT = {
+        "kpi": "Key Performance Indicators expressible in numerical and monetary value",
+        "cy": "Current Year monetary value",
+        "py": "Prior Year monetary value",
+        "py1": "Two Year Past Value",
+    }
+    TAG_DESCRIPTIONS = ", ".join(["%s: %s" % (key, val) for (key, val) in TAG_DICT.items()]) + "."
+    TAG_PAREN_RE = (r"\[", r"\]")
+    TAG_PAREN = tuple((e.strip("\\") for e in TAG_PAREN_RE))
+    TAG_PAREN_ESC = ("(", ")")
+    DATASET_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "valid": VALID_SPLIT, "test": TEST_SPLIT}
+    JSON_URL = "https://raw.githubusercontent.com/tobideusser/kpi-edgar/2ec7084dcd55b4979bbe288d4aa1e962c685c9ab/data/kpi_edgar.json"  # noqa: E501
+    JSON_FILENAME = "kpi_edgar.json"
+
+    @staticmethod
+    def get_sentences(dataset: List[Dict]) -> List[Dict]:
+        return [
+            sentence
+            for document in dataset
+            for segment in document["segments"]
+            for sentence in segment["sentences"] or []
+        ]
+
+    @staticmethod
+    def escape_parenthesis(text: str) -> str:
+        tmp0 = re.sub(KPIEDGARScenario.TAG_PAREN_RE[0], KPIEDGARScenario.TAG_PAREN_ESC[0], text)
+        tmp1 = re.sub(KPIEDGARScenario.TAG_PAREN_RE[1], KPIEDGARScenario.TAG_PAREN_ESC[1], tmp0)
+        return tmp1
+
+    @staticmethod
+    def get_output_text(
+        words: List[str],
+        annotations: List[Dict],
+    ) -> str:
+        # def get_entity_for_annotation(words: List[str], annotation: Dict) -> str
+        entities: List[str] = []
+        for annotation in annotations:
+            annotation_type = annotation["type_"]
+            if annotation_type not in KPIEDGARScenario.TAG_DICT:
+                continue
+            start_idx = annotation["start"]
+            end_idx = annotation["end"]
+            annotated_words = words[start_idx:end_idx]
+            phrase = KPIEDGARScenario.escape_parenthesis(" ".join(annotated_words))
+            entities.append(
+                "%s %s%s%s" % (phrase, KPIEDGARScenario.TAG_PAREN[0], annotation_type, KPIEDGARScenario.TAG_PAREN[1])
+            )
+
+        return ", ".join(entities)
+
+    @staticmethod
+    def sentences_to_instances(sentences: List[Dict]) -> List[Instance]:
+        instances: List[Instance] = []
+        for sentence in sentences:
+            dataset_split: str = sentence["split_type"]
+            if dataset_split is None:
+                continue
+            split = KPIEDGARScenario.DATASET_SPLIT_TO_HELM_SPLIT[dataset_split]
+
+            words: List[str] = [word_dict["value"] for word_dict in sentence["words"]]
+            passage = KPIEDGARScenario.escape_parenthesis(" ".join(words))
+            input_text = (
+                "Context: %s\n"
+                "Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.\n"  # noqa: E501
+                "%s" % (passage, KPIEDGARScenario.TAG_DESCRIPTIONS)
+            )
+
+            annotations = sentence["entities_anno"]
+            output_text = KPIEDGARScenario.get_output_text(words, annotations)
+            if not output_text:
+                continue
+
+            instances.append(
+                Instance(
+                    input=Input(text=input_text),
+                    references=[Reference(Output(text=output_text), tags=[CORRECT_TAG])],
+                    split=split,
+                )
+            )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+
+        base_url = self.JSON_URL
+        dataset_file_name = self.JSON_FILENAME
+        target_path = os.path.join(data_path, dataset_file_name)
+        ensure_file_downloaded(source_url=base_url, target_path=target_path)
+
+        with open(target_path, "r") as f:
+            raw_dataset = json.load(f)
+        return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))