crfm-helm 0.5.5__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm might be problematic.
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/METADATA +27 -13
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/RECORD +203 -156
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/WHEEL +1 -1
- helm/benchmark/adaptation/adapters/in_context_learning_adapter.py +3 -3
- helm/benchmark/adaptation/adapters/test_adapter.py +4 -4
- helm/benchmark/annotation/air_bench_annotator.py +1 -1
- helm/benchmark/annotation/bigcodebench_annotator.py +3 -3
- helm/benchmark/annotation/bird_sql_annotator.py +2 -2
- helm/benchmark/annotation/chw_care_plan_annotator.py +7 -12
- helm/benchmark/annotation/ehr_sql_annotator.py +2 -2
- helm/benchmark/annotation/helpdesk_call_summarization_annotator.py +7 -7
- helm/benchmark/annotation/mimic_bhc_annotator.py +100 -0
- helm/benchmark/annotation/model_as_judge.py +12 -16
- helm/benchmark/annotation/omni_math_annotator.py +13 -14
- helm/benchmark/annotation/wildbench_annotator.py +9 -9
- helm/benchmark/executor.py +11 -12
- helm/benchmark/metrics/aci_bench_metrics.py +9 -29
- helm/benchmark/metrics/bias_word_lists.py +1 -1
- helm/benchmark/metrics/chw_care_plan_metrics.py +10 -30
- helm/benchmark/metrics/classification_metrics.py +3 -3
- helm/benchmark/metrics/cleva_harms_metrics.py +2 -2
- helm/benchmark/metrics/conv_fin_qa_calc_metrics.py +2 -2
- helm/benchmark/metrics/dischargeme_metrics.py +9 -29
- helm/benchmark/metrics/efficiency_metrics.py +3 -3
- helm/benchmark/metrics/gpt4_audio_refusal_metrics.py +145 -0
- helm/benchmark/metrics/ifeval_metrics.py +2 -2
- helm/benchmark/metrics/kpi_edgar_metrics.py +121 -0
- helm/benchmark/metrics/llm_jury_metrics.py +46 -0
- helm/benchmark/metrics/med_dialog_metrics.py +9 -29
- helm/benchmark/metrics/medalign_metrics.py +9 -29
- helm/benchmark/metrics/medi_qa_metrics.py +9 -29
- helm/benchmark/metrics/medication_qa_metrics.py +10 -30
- helm/benchmark/metrics/melt_bias_metric.py +234 -0
- helm/benchmark/metrics/melt_bias_word_lists.py +1367 -0
- helm/benchmark/metrics/melt_metric_specs.py +43 -0
- helm/benchmark/metrics/melt_toxicity_metric.py +107 -0
- helm/benchmark/metrics/mental_health_metrics.py +9 -29
- helm/benchmark/metrics/metric_service.py +11 -11
- helm/benchmark/metrics/mimic_bhc_metrics.py +14 -0
- helm/benchmark/metrics/mimic_rrs_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_procedures_metrics.py +9 -29
- helm/benchmark/metrics/mtsamples_replicate_metrics.py +9 -29
- helm/benchmark/metrics/openai_mrcr_metrics.py +52 -0
- helm/benchmark/metrics/ruler_qa_metrics.py +34 -0
- helm/benchmark/metrics/starr_patient_instructions_metrics.py +9 -29
- helm/benchmark/metrics/summac/model_summac.py +1 -2
- helm/benchmark/metrics/summarization_metrics.py +2 -1
- helm/benchmark/metrics/tokens/test_openai_token_cost_estimator.py +2 -2
- helm/benchmark/metrics/toxicity_metrics.py +2 -2
- helm/benchmark/metrics/unitxt_metrics.py +3 -4
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -2
- helm/benchmark/metrics/vision_language/image_utils.py +2 -2
- helm/benchmark/model_deployment_registry.py +6 -8
- helm/benchmark/presentation/contamination.py +3 -3
- helm/benchmark/presentation/create_plots.py +33 -12
- helm/benchmark/presentation/run_display.py +13 -0
- helm/benchmark/presentation/schema.py +2 -1
- helm/benchmark/presentation/summarize.py +76 -59
- helm/benchmark/reeval_run.py +3 -4
- helm/benchmark/reeval_runner.py +3 -3
- helm/benchmark/run.py +78 -73
- helm/benchmark/run_expander.py +12 -1
- helm/benchmark/run_spec_factory.py +7 -6
- helm/benchmark/run_specs/audio_run_specs.py +52 -8
- helm/benchmark/run_specs/enterprise_run_specs.py +20 -0
- helm/benchmark/run_specs/experimental_run_specs.py +31 -1
- helm/benchmark/run_specs/long_context_run_specs.py +67 -15
- helm/benchmark/run_specs/medhelm_run_specs.py +146 -41
- helm/benchmark/run_specs/melt_run_specs.py +783 -0
- helm/benchmark/run_specs/speech_disorder_audio_run_specs.py +169 -0
- helm/benchmark/run_specs/vlm_run_specs.py +28 -0
- helm/benchmark/runner.py +5 -5
- helm/benchmark/scenarios/aci_bench_scenario.py +7 -1
- helm/benchmark/scenarios/audio_language/air_bench_chat_scenario.py +3 -1
- helm/benchmark/scenarios/audio_language/air_bench_foundation_scenario.py +5 -5
- helm/benchmark/scenarios/audio_language/corebench_scenario.py +77 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_asr_classification.py +103 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_classification_scenario.py +110 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_breakdown_scenario.py +78 -0
- helm/benchmark/scenarios/audio_language/ultra_suite_disorder_symptoms_scenario.py +109 -0
- helm/benchmark/scenarios/audio_language/vocal_sound_scenario.py +15 -1
- helm/benchmark/scenarios/audio_language/voxceleb2_scenario.py +1 -2
- helm/benchmark/scenarios/autobencher_capabilities_scenario.py +2 -2
- helm/benchmark/scenarios/chw_care_plan_scenario.py +14 -13
- helm/benchmark/scenarios/clear_scenario.py +11 -7
- helm/benchmark/scenarios/dischargeme_scenario.py +36 -21
- helm/benchmark/scenarios/ehr_sql_scenario.py +7 -1
- helm/benchmark/scenarios/ehrshot_scenario.py +28 -55
- helm/benchmark/scenarios/grammar.py +2 -2
- helm/benchmark/scenarios/headqa_scenario.py +6 -1
- helm/benchmark/scenarios/infinite_bench_en_qa_scenario.py +85 -0
- helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} +10 -13
- helm/benchmark/scenarios/kpi_edgar_scenario.py +151 -0
- helm/benchmark/scenarios/med_dialog_scenario.py +6 -1
- helm/benchmark/scenarios/medalign_scenario.py +9 -3
- helm/benchmark/scenarios/medalign_scenario_helper.py +8 -5
- helm/benchmark/scenarios/medbullets_scenario.py +7 -2
- helm/benchmark/scenarios/medcalc_bench_scenario.py +4 -2
- helm/benchmark/scenarios/medec_scenario.py +6 -1
- helm/benchmark/scenarios/medhallu_scenario.py +7 -1
- helm/benchmark/scenarios/medi_qa_scenario.py +10 -4
- helm/benchmark/scenarios/medication_qa_scenario.py +7 -1
- helm/benchmark/scenarios/melt_ir_scenario.py +171 -0
- helm/benchmark/scenarios/melt_knowledge_scenario.py +246 -0
- helm/benchmark/scenarios/melt_lm_scenarios.py +252 -0
- helm/benchmark/scenarios/melt_scenarios.py +793 -0
- helm/benchmark/scenarios/melt_srn_scenario.py +342 -0
- helm/benchmark/scenarios/melt_synthetic_reasoning_scenario.py +222 -0
- helm/benchmark/scenarios/melt_translation_scenario.py +152 -0
- helm/benchmark/scenarios/mental_health_scenario.py +16 -5
- helm/benchmark/scenarios/mimic_bhc_scenario.py +12 -7
- helm/benchmark/scenarios/mimic_rrs_scenario.py +17 -8
- helm/benchmark/scenarios/mimiciv_billing_code_scenario.py +14 -8
- helm/benchmark/scenarios/mmlu_pro_scenario.py +1 -1
- helm/benchmark/scenarios/mtsamples_procedures_scenario.py +5 -2
- helm/benchmark/scenarios/mtsamples_replicate_scenario.py +3 -2
- helm/benchmark/scenarios/n2c2_ct_matching_scenario.py +11 -5
- helm/benchmark/scenarios/numeracy_scenario.py +2 -1
- helm/benchmark/scenarios/openai_mrcr_scenario.py +79 -0
- helm/benchmark/scenarios/pubmed_qa_scenario.py +6 -1
- helm/benchmark/scenarios/race_based_med_scenario.py +18 -8
- helm/benchmark/scenarios/ruler_qa_scenario_helper.py +2 -2
- helm/benchmark/scenarios/ruler_qa_scenarios.py +2 -2
- helm/benchmark/scenarios/shc_bmt_scenario.py +12 -6
- helm/benchmark/scenarios/shc_cdi_scenario.py +11 -6
- helm/benchmark/scenarios/shc_conf_scenario.py +12 -6
- helm/benchmark/scenarios/shc_ent_scenario.py +11 -6
- helm/benchmark/scenarios/shc_gip_scenario.py +13 -5
- helm/benchmark/scenarios/shc_privacy_scenario.py +78 -0
- helm/benchmark/scenarios/shc_proxy_scenario.py +76 -0
- helm/benchmark/scenarios/shc_ptbm_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sei_scenario.py +12 -7
- helm/benchmark/scenarios/shc_sequoia_scenario.py +13 -5
- helm/benchmark/scenarios/starr_patient_instructions_scenario.py +15 -8
- helm/benchmark/scenarios/test_infinite_bench_en_qa_scenario.py +18 -0
- helm/benchmark/scenarios/test_infinite_bench_en_sum_scenario.py +31 -0
- helm/benchmark/scenarios/truthful_qa_scenario.py +2 -1
- helm/benchmark/scenarios/vision_language/msr_vtt_scenario.py +75 -0
- helm/benchmark/server.py +2 -1
- helm/benchmark/static/schema_audio.yaml +60 -49
- helm/benchmark/static/schema_enterprise.yaml +21 -0
- helm/benchmark/static/schema_long_context.yaml +63 -20
- helm/benchmark/static/schema_medhelm.yaml +272 -213
- helm/benchmark/static/schema_melt.yaml +1257 -0
- helm/benchmark/static/schema_slphelm.yaml +162 -0
- helm/benchmark/static/schema_vhelm.yaml +26 -26
- helm/benchmark/static/schema_video.yaml +219 -0
- helm/benchmark/static_build/assets/index-94295e78.js +10 -0
- helm/benchmark/static_build/assets/index-b9779128.css +1 -0
- helm/benchmark/static_build/assets/medhelm-overview-eac29843.png +0 -0
- helm/benchmark/static_build/assets/{tremor-9cefc3c5.js → tremor-38a10867.js} +1 -1
- helm/benchmark/static_build/index.html +4 -4
- helm/benchmark/window_services/encoder_decoder_window_service.py +3 -3
- helm/benchmark/window_services/test_utils.py +3 -4
- helm/benchmark/window_services/tokenizer_service.py +7 -8
- helm/clients/anthropic_client.py +69 -29
- helm/clients/audio_language/diva_llama_client.py +4 -2
- helm/clients/audio_language/qwen2_5_omni_client.py +197 -0
- helm/clients/audio_language/qwen2_audiolm_client.py +8 -6
- helm/clients/audio_language/qwen_audiolm_client.py +4 -2
- helm/clients/audio_language/test.py +62 -0
- helm/clients/bedrock_client.py +3 -1
- helm/clients/client.py +7 -7
- helm/clients/grok_client.py +36 -0
- helm/clients/huggingface_client.py +42 -3
- helm/clients/huggingface_pipeline_client.py +138 -0
- helm/clients/image_generation/dalle_mini/model/configuration.py +1 -1
- helm/clients/image_generation/dalle_mini/model/modeling.py +1 -1
- helm/clients/image_generation/dalle_mini/model/processor.py +1 -1
- helm/clients/image_generation/dalle_mini/model/tokenizer.py +1 -1
- helm/clients/openai_client.py +100 -54
- helm/clients/openai_responses_client.py +174 -0
- helm/clients/palmyra_client.py +2 -5
- helm/clients/reka_client.py +2 -2
- helm/clients/together_client.py +31 -4
- helm/clients/vertexai_client.py +6 -0
- helm/clients/vision_language/huggingface_vision2seq_client.py +6 -4
- helm/clients/vision_language/huggingface_vlm_client.py +2 -2
- helm/clients/vision_language/idefics_client.py +6 -2
- helm/clients/vision_language/paligemma_client.py +2 -2
- helm/clients/vision_language/qwen2_vlm_client.py +66 -53
- helm/clients/vision_language/qwen_vlm_client.py +7 -5
- helm/clients/writer_client.py +102 -0
- helm/common/context.py +80 -0
- helm/common/credentials_utils.py +5 -5
- helm/common/general.py +9 -2
- helm/common/hierarchical_logger.py +46 -3
- helm/common/local_context.py +140 -0
- helm/common/remote_context.py +61 -0
- helm/common/request.py +8 -0
- helm/config/model_deployments.yaml +864 -193
- helm/config/model_metadata.yaml +667 -53
- helm/config/tokenizer_configs.yaml +144 -3
- helm/proxy/cli.py +3 -1
- helm/proxy/critique/mechanical_turk_utils.py +1 -1
- helm/proxy/services/server_service.py +21 -85
- helm/tokenizers/grok_tokenizer.py +53 -0
- helm/tokenizers/huggingface_tokenizer.py +1 -1
- helm/tokenizers/test_grok_tokenizer.py +33 -0
- helm/benchmark/scenarios/test_infinite_bench_sum_scenario.py +0 -46
- helm/benchmark/static_build/assets/index-262903c1.js +0 -10
- helm/benchmark/static_build/assets/index-42060d71.css +0 -1
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/licenses/LICENSE +0 -0
- {crfm_helm-0.5.5.dist-info → crfm_helm-0.5.6.dist-info}/top_level.txt +0 -0
- helm/benchmark/static_build/assets/{medhelm-overview-3ddfcd65.png → medhelm-v1-overview-3ddfcd65.png} +0 -0
helm/benchmark/scenarios/{infinite_bench_sum_scenario.py → infinite_bench_en_sum_scenario.py} RENAMED

@@ -14,21 +14,19 @@ from helm.benchmark.scenarios.scenario import (
 from helm.common.general import ensure_directory_exists
 
 
-class InfiniteBenchSumScenario(Scenario):
-    """InfiniteBench Sum
+class InfiniteBenchEnSumScenario(Scenario):
+    """InfiniteBench En.Sum
 
     InfiniteBench is a benchmark tailored for evaluating the capabilities of language models to process,
-    understand, and reason over super long contexts (100k+ tokens). InfiniteBench Sum is a subset of
-    InfiniteBench that requires models to generate a concise summary of the novel.
-    to as "En.Sum" in the original paper.
+    understand, and reason over super long contexts (100k+ tokens). InfiniteBench En.Sum is a subset of
+    InfiniteBench that requires models to generate a concise summary of the novel.
     """
 
-    name = "…
-    description = "…
+    name = "infinite_bench_en_sum"
+    description = "∞Bench En.Sum is a summarization task that requires generating a concise summary of a novel. ([Zhang et al., 2024](https://arxiv.org/abs/2402.13718))"  # noqa: E501
     tags = ["summarization"]
 
-    def __init__(self, …
-        self.min_num_words = min_num_words
+    def __init__(self, max_num_words: int):
         self.max_num_words = max_num_words
         super().__init__()
 
@@ -61,9 +59,9 @@ class InfiniteBenchSumScenario(Scenario):
         def count_words(text: str) -> int:
             return len(re.split(r"\s+", text.strip()))
 
-        dataset = dataset.…
-            lambda example: …
-        )
+        dataset = dataset.filter(
+            lambda example: count_words(example["context"]) + count_words(example["input"]) <= self.max_num_words
+        )
 
         # Read all instances
         instances: List[Instance] = []
 
@@ -75,7 +73,6 @@ class InfiniteBenchSumScenario(Scenario):
                 input=input,
                 references=[Reference(Output(text=row["answer"][0]), tags=[CORRECT_TAG])],
                 split=TEST_SPLIT,
-                extra_data={"word_count": row["prompt_wc"]},
             )
             instances.append(instance)
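As a quick orientation, here is a minimal usage sketch for the renamed scenario. It is not part of the diff; only the class name, the `max_num_words` constructor argument, and the `get_instances(output_path)` signature come from the hunks above, while the output path and word limit are arbitrary placeholders.

```python
# Hypothetical usage sketch; only the class name, constructor argument, and
# get_instances signature are taken from the diff above.
from helm.benchmark.scenarios.infinite_bench_en_sum_scenario import InfiniteBenchEnSumScenario

# max_num_words caps the combined word count of "context" + "input",
# via the dataset.filter(...) call shown in the hunk above.
scenario = InfiniteBenchEnSumScenario(max_num_words=100_000)
instances = scenario.get_instances(output_path="scratch/infinite_bench_en_sum")
print(f"{len(instances)} test instances")
```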
helm/benchmark/scenarios/kpi_edgar_scenario.py ADDED

@@ -0,0 +1,151 @@
+import os
+from typing import List, Dict
+import json
+import re
+
+from helm.common.general import ensure_file_downloaded, ensure_directory_exists
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    TEST_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+)
+
+
+class KPIEDGARScenario(Scenario):
+    """A financial named entity recognition (NER) scenario based on KPI-EDGAR (T. Deußer et al., 2022).
+
+    This scenario has been modified from the paper. The original paper has 12 entity types and requires the model
+    to extract pairs of related entities. This scenario only use four named entity types (kpi, cy, py, py1) and only
+    requires the model to extract individual entities.
+
+    Paper:
+    T. Deußer et al.,
+    “KPI-EDGAR: A Novel Dataset and Accompanying Metric for Relation Extraction from Financial Documents.” 2022.
+    https://arxiv.org/abs/2210.09163
+
+    Prompt format:
+
+    ```
+    Context: {Sentence}
+    Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.
+    kpi: Key Performance Indicators expressible in numerical and monetary value, cy: Current Year monetary value, py: Prior Year monetary value, py1: Two Year Past Value.
+    Answer:
+    ```
+
+    Example input:
+
+    ```
+    Context: The following table summarizes our total share-based compensation expense and excess tax benefits recognized : As of December 28 , 2019 , there was $ 284 million of total unrecognized compensation cost related to nonvested share-based compensation grants .
+    Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.
+    kpi: Key Performance Indicators expressible in numerical and monetary value, cy: Current Year monetary value, py: Prior Year monetary value, py1: Two Year Past Value.
+    Answer:
+    ```
+
+    Example reference:
+    ```
+    284 [cy], total unrecognized compensation cost [kpi]
+    ```"""  # noqa: E501
+
+    name = "kpi_edgar"
+    description = "Named Entity Recognition from financial documents."
+    tags = ["named_entity_recognition", "finance"]
+
+    TAG_DICT = {
+        "kpi": "Key Performance Indicators expressible in numerical and monetary value",
+        "cy": "Current Year monetary value",
+        "py": "Prior Year monetary value",
+        "py1": "Two Year Past Value",
+    }
+    TAG_DESCRIPTIONS = ", ".join(["%s: %s" % (key, val) for (key, val) in TAG_DICT.items()]) + "."
+    TAG_PAREN_RE = (r"\[", r"\]")
+    TAG_PAREN = tuple((e.strip("\\") for e in TAG_PAREN_RE))
+    TAG_PAREN_ESC = ("(", ")")
+    DATASET_SPLIT_TO_HELM_SPLIT = {"train": TRAIN_SPLIT, "valid": VALID_SPLIT, "test": TEST_SPLIT}
+    JSON_URL = "https://raw.githubusercontent.com/tobideusser/kpi-edgar/2ec7084dcd55b4979bbe288d4aa1e962c685c9ab/data/kpi_edgar.json"  # noqa: E501
+    JSON_FILENAME = "kpi_edgar.json"
+
+    @staticmethod
+    def get_sentences(dataset: List[Dict]) -> List[Dict]:
+        return [
+            sentence
+            for document in dataset
+            for segment in document["segments"]
+            for sentence in segment["sentences"] or []
+        ]
+
+    @staticmethod
+    def escape_parenthesis(text: str) -> str:
+        tmp0 = re.sub(KPIEDGARScenario.TAG_PAREN_RE[0], KPIEDGARScenario.TAG_PAREN_ESC[0], text)
+        tmp1 = re.sub(KPIEDGARScenario.TAG_PAREN_RE[1], KPIEDGARScenario.TAG_PAREN_ESC[1], tmp0)
+        return tmp1
+
+    @staticmethod
+    def get_output_text(
+        words: List[str],
+        annotations: List[Dict],
+    ) -> str:
+        # def get_entity_for_annotation(words: List[str], annotation: Dict) -> str
+        entities: List[str] = []
+        for annotation in annotations:
+            annotation_type = annotation["type_"]
+            if annotation_type not in KPIEDGARScenario.TAG_DICT:
+                continue
+            start_idx = annotation["start"]
+            end_idx = annotation["end"]
+            annotated_words = words[start_idx:end_idx]
+            phrase = KPIEDGARScenario.escape_parenthesis(" ".join(annotated_words))
+            entities.append(
+                "%s %s%s%s" % (phrase, KPIEDGARScenario.TAG_PAREN[0], annotation_type, KPIEDGARScenario.TAG_PAREN[1])
+            )
+
+        return ", ".join(entities)
+
+    @staticmethod
+    def sentences_to_instances(sentences: List[Dict]) -> List[Instance]:
+        instances: List[Instance] = []
+        for sentence in sentences:
+            dataset_split: str = sentence["split_type"]
+            if dataset_split is None:
+                continue
+            split = KPIEDGARScenario.DATASET_SPLIT_TO_HELM_SPLIT[dataset_split]
+
+            words: List[str] = [word_dict["value"] for word_dict in sentence["words"]]
+            passage = KPIEDGARScenario.escape_parenthesis(" ".join(words))
+            input_text = (
+                "Context: %s\n"
+                "Task: Extract key performance indicators (KPIs) and values from the above text. Also, specify one of the following categories to each of the extracted KPIs and values in brackets.\n"  # noqa: E501
+                "%s" % (passage, KPIEDGARScenario.TAG_DESCRIPTIONS)
+            )
+
+            annotations = sentence["entities_anno"]
+            output_text = KPIEDGARScenario.get_output_text(words, annotations)
+            if not output_text:
+                continue
+
+            instances.append(
+                Instance(
+                    input=Input(text=input_text),
+                    references=[Reference(Output(text=output_text), tags=[CORRECT_TAG])],
+                    split=split,
+                )
+            )
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        data_path = os.path.join(output_path, "data")
+        ensure_directory_exists(data_path)
+
+        base_url = self.JSON_URL
+        dataset_file_name = self.JSON_FILENAME
+        target_path = os.path.join(data_path, dataset_file_name)
+        ensure_file_downloaded(source_url=base_url, target_path=target_path)
+
+        with open(target_path, "r") as f:
+            raw_dataset = json.load(f)
+        return KPIEDGARScenario.sentences_to_instances(KPIEDGARScenario.get_sentences(raw_dataset))
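To illustrate how the static helpers above turn token-level annotations into a reference string, here is a toy example. The sentence and annotations are invented, not taken from the KPI-EDGAR data; only the helper names and the `[tag]` formatting come from the code above.

```python
# Toy illustration of KPIEDGARScenario.get_output_text; the words/annotations
# below are invented, only the helper behavior comes from the code above.
from helm.benchmark.scenarios.kpi_edgar_scenario import KPIEDGARScenario

words = ["Revenue", "was", "$", "10", "million", "in", "2019", "."]
annotations = [
    {"type_": "kpi", "start": 0, "end": 1},    # "Revenue"
    {"type_": "cy", "start": 3, "end": 4},     # "10"
    {"type_": "other", "start": 6, "end": 7},  # skipped: not in TAG_DICT
]

print(KPIEDGARScenario.get_output_text(words, annotations))
# Expected: "Revenue [kpi], 10 [cy]"
```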
helm/benchmark/scenarios/med_dialog_scenario.py CHANGED

@@ -90,7 +90,12 @@ class MedDialogScenario(Scenario):
     """
 
     name = "med_dialog"
-    description = …
+    description = (
+        "MedDialog is a benchmark of real-world doctor-patient conversations focused on health-related"
+        "concerns and advice. Each dialogue is paired with a one-sentence summary"
+        "that reflects the core patient question or exchange. The benchmark evaluates a model's"
+        "ability to condense medical dialogue into concise, informative summaries."
+    )
     tags = ["dialogue", "biomedical"]
 
     def __init__(self, subset: str):
helm/benchmark/scenarios/medalign_scenario.py CHANGED

@@ -60,12 +60,18 @@ class MedalignScenario(Scenario):
     """
 
     name = "medalign"
-    description = …
+    description = (
+        "MedAlign is a benchmark that evaluates a model's ability to interpret and follow"
+        "instructions grounded in longitudinal electronic health records (EHR). Each instance"
+        "includes an event-stream style patient record and a natural language question or task,"
+        "requiring clinically informed reading comprehension and reasoning."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]
 
-    def __init__(self, max_length: int):
+    def __init__(self, max_length: int, data_path: str):
         super().__init__()
         self.max_length = max_length
+        self.data_path = data_path
 
     def process_tsv(self, data) -> List[Instance]:
         instances: List[Instance] = []
 
@@ -84,5 +90,5 @@ class MedalignScenario(Scenario):
         return instances
 
     def get_instances(self, output_path: str) -> List[Instance]:
-        dataset = return_dataset_dataframe(self.max_length)
+        dataset = return_dataset_dataframe(self.max_length, self.data_path)
         return self.process_tsv(dataset)
helm/benchmark/scenarios/medalign_scenario_helper.py CHANGED

@@ -19,6 +19,7 @@ from typing import Any, Dict, Optional, Union, Callable
 from langchain.schema import Document
 import langchain_community
 
+from helm.common.general import check_file_exists
 
 
 def get_instructions(path_to_instructions: str) -> Dict[int, Dict[str, Any]]:
 
@@ -399,19 +400,21 @@ def add_reference_responses(prompts_df, path_to_reference_responses) -> pd.DataFrame:
     Returns:
         pd.DataFrame: DataFrame containing the processed data.
     """
-    gold_df = pd.read_csv(path_to_reference_responses)
+    gold_df = pd.read_csv(path_to_reference_responses, sep='\t')
     gold_df = gold_df.query("annotator_num == 'Annotator_1'")
     gold_df = gold_df[["instruction_id", "clinician_response"]]
     merged_df = gold_df.merge(prompts_df, on="instruction_id", how="inner")
     return merged_df
 
 
-def return_dataset_dataframe(max_length: int) -> pd.DataFrame:
+def return_dataset_dataframe(max_length: int, data_path: str) -> pd.DataFrame:
     target_context_length = max_length
     generation_length = 256
-    path_to_instructions = "…
-    …
-    …
+    path_to_instructions = os.path.join(data_path, "clinician-reviewed-model-responses.tsv")
+    check_file_exists(path_to_instructions, msg=f"[MedAlignScenario] Required instructions file not found: '{path_to_instructions}'")
+    path_to_ehrs = os.path.join(data_path, "medalign_ehr_xml")
+    path_to_reference_responses = os.path.join(data_path, "clinician-instruction-responses.tsv")
+    check_file_exists(path_to_reference_responses, msg=f"[MedAlignScenario] Required clinician responses file not found: '{path_to_reference_responses}'")
     use_RAG = False
     include_ehr = True
     tokenizer = "tiktoken"
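Taken together, the two MedAlign hunks above thread a user-supplied `data_path` from the scenario constructor down to `return_dataset_dataframe`. Below is a hedged sketch of how this might be driven directly; the constructor and `get_instances` signatures come from the diff, the expected directory contents are inferred from the `os.path.join` calls above, and the concrete paths and `max_length` value are placeholders.

```python
# Hedged sketch; signatures are from the diff, the directory contents are
# inferred from return_dataset_dataframe above.
from helm.benchmark.scenarios.medalign_scenario import MedalignScenario

# data_path is expected to contain, per the helper changes above:
#   clinician-reviewed-model-responses.tsv
#   clinician-instruction-responses.tsv
#   medalign_ehr_xml/   (directory of EHR XML files)
scenario = MedalignScenario(max_length=16000, data_path="/path/to/medalign")
instances = scenario.get_instances(output_path="scratch/medalign")
```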
helm/benchmark/scenarios/medbullets_scenario.py CHANGED

@@ -67,8 +67,13 @@ class MedBulletsScenario(Scenario):
         "https://raw.githubusercontent.com/HanjieChen/ChallengeClinicalQA/refs/heads/main/medbullets/"
     )
 
-    name = "…
-    description = …
+    name = "medbullets"
+    description = (
+        "Medbullets is a benchmark of USMLE-style medical questions designed to assess a"
+        "model’s ability to understand and apply clinical knowledge. Each question is accompanied"
+        "by a patient scenario and five multiple-choice options, similar to those found on"
+        "Step 2 and Step 3 on the US medical licensing exam."
+    )
     tags = ["reasoning", "biomedical"]
 
     # Define the possible answer choices
helm/benchmark/scenarios/medcalc_bench_scenario.py CHANGED

@@ -71,8 +71,10 @@ class MedCalcBenchScenario(Scenario):
 
     name = "medcalc_bench"
     description = (
-        "…
-        "…
+        "MedCalc-Bench is a benchmark designed to evaluate models on their ability to compute"
+        "clinically relevant values from patient notes. Each instance consists of a clinical note"
+        "describing the patient's condition, a diagnostic question targeting a specific medical"
+        "value, and a ground truth response."
     )
     tags = ["knowledge", "reasoning", "biomedical"]
 
helm/benchmark/scenarios/medec_scenario.py CHANGED

@@ -50,7 +50,12 @@ class MedecScenario(Scenario):
     TEST_URL = f"https://raw.githubusercontent.com/abachaa/MEDEC/{GIT_HASH}/MEDEC-MS/MEDEC-MS-TestSet-with-GroundTruth-and-ErrorType.csv"  # noqa: E501
 
     name = "medec"
-    description = …
+    description = (
+        "Medec is a benchmark composed of clinical narratives that include either correct"
+        "documentation or medical errors. Each entry includes sentence-level identifiers and an"
+        "associated correction task. The model must review the narrative and either identify"
+        "the erroneous sentence and correct it, or confirm that the text is entirely accurate."
+    )
     tags = ["error_detection", "error_correction", "biomedical"]
 
     def download_csv(self, url: str, output_path: str, file_name: str) -> str:
helm/benchmark/scenarios/medhallu_scenario.py CHANGED

@@ -20,7 +20,13 @@ class MedHalluScenario(Scenario):
     """
 
     name = "medhallu"
-    description = …
+    description = (
+        "MedHallu is a benchmark focused on evaluating factual correctness in biomedical"
+        "question answering. Each instance contains a PubMed-derived knowledge snippet, a"
+        "biomedical question, and a model-generated answer. The task is to classify whether the"
+        "answer is factually correct or contains hallucinated (non-grounded) information. This"
+        "benchmark is designed to assess the factual reliability of medical language models."
+    )
     tags = ["knowledge", "reasoning", "biomedical"]
 
     def create_instance(self, question, knowledge, answer, label, split):
helm/benchmark/scenarios/medi_qa_scenario.py CHANGED

@@ -49,9 +49,11 @@ class MediQAScenario(Scenario):
 
     name = "medi_qa"
     description = (
-        "…
-        "…
-        "…
+        "MEDIQA is a benchmark designed to evaluate a model's ability to retrieve and generate"
+        "medically accurate answers to patient-generated questions. Each instance includes a"
+        "consumer health question, a set of candidate answers (used in ranking tasks), relevance"
+        "annotations, and optionally, additional context. The benchmark focuses on supporting"
+        "patient understanding and accessibility in health communication."
     )
     tags = ["knowledge", "biomedical"]
 
@@ -88,7 +90,11 @@ class MediQAScenario(Scenario):
 
     def get_instances(self, output_path: str) -> List[Instance]:
         # Load the MEDIQA dataset from Hugging Face
-        dataset = load_dataset(…
+        dataset = load_dataset(
+            "bigbio/mediqa_qa",
+            trust_remote_code=True,
+            revision="9288641f4c785c95dc9079fa526dabb12efdb041",
+        )
 
         # Process all the instances
         instances: List[Instance] = []
helm/benchmark/scenarios/medication_qa_scenario.py CHANGED

@@ -31,7 +31,13 @@ class MedicationQAScenario(Scenario):
     FILENAME = "MedInfo2019-QA-Medications.xlsx"
 
     name = "medication_qa"
-    description = …
+    description = (
+        "MedicationQA is a benchmark composed of open-ended consumer health questions"
+        "specifically focused on medications. Each example consists of a free-form question"
+        "and a corresponding medically grounded answer. The benchmark evaluates a model's"
+        "ability to provide accurate, accessible, and informative medication-related responses"
+        "for a lay audience."
+    )
     tags = ["knowledge", "generation", "question_answering", "biomedical"]
 
     def download_medication_qa(self, path: str):
helm/benchmark/scenarios/melt_ir_scenario.py ADDED

@@ -0,0 +1,171 @@
+from typing import List, Optional
+
+from datasets import load_dataset, Dataset
+from helm.common.hierarchical_logger import hlog
+from helm.benchmark.scenarios.scenario import (
+    Scenario,
+    Instance,
+    Reference,
+    TRAIN_SPLIT,
+    VALID_SPLIT,
+    CORRECT_TAG,
+    Input,
+    Output,
+    make_rank_tag,
+)
+
+
+class MELTInformationRetrievalScenario(Scenario):
+    name = "melt_information_retrieval"
+    description = "Scenario for information retrieval tasks."
+    tags = ["information_retrieval"]
+
+    """ Dictionary mapping task track tuples to the number of queries. """
+    NUM_TRAIN_QUERIES = 1000
+
+    """ Upper and lower bounds on top-k.
+
+    The top-k number represents the number of passages we will consider per
+    query. Max top-k for the train and validation files are set to the number
+    of passages included in the corresponding top-k files.
+    """
+    MIN_TOPK: int = 11
+    MAX_TRAIN_TOPK: int = 20
+    MAX_VALID_TOPK: int = 1000
+
+    def __init__(
+        self, dataset_name: str, revision: str, subset: Optional[str] = None, valid_topk: Optional[int] = None
+    ):
+        """The constructor for the MSMARCOScenario.
+
+        Args:
+            dataset_name: The name of the dataset.
+            revision: The revision of the dataset to use.
+            subset: The subset of the dataset to use. Defaults to "".
+            valid_topk: If set, specifies the number of top documents for which the
+                validation instances will be created. Must be in the range
+                [self.MIN_TOPK, self.MAX_VALID_TOPK].
+        """
+        super().__init__()
+
+        # Input validation
+        self.dataset_name = dataset_name
+        self.revision = revision
+        self.subset = subset
+        self.valid_topk: Optional[int] = valid_topk
+        if self.valid_topk is not None:
+            assert valid_topk and self.MIN_TOPK <= valid_topk <= self.MAX_VALID_TOPK
+
+    def get_train_instances(self) -> List[Instance]:
+        """Get training instances.
+        References for each instance are selected as follows:
+        1. We select 1 correct reference, where the documents included
+            corresponds to the best document for the given train query.
+        2. We create 1 wrong reference, where the document included
+            corresponds to a non-gold document for the given train query.
+        """
+        dataset = load_dataset(
+            self.dataset_name,
+            self.subset,
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        instances = []
+        for i, sample in enumerate(dataset["train"]):
+
+            if i >= self.NUM_TRAIN_QUERIES:
+                break
+
+            references = [
+                Reference(Output(text=sample["positive"]), tags=[CORRECT_TAG]),
+                Reference(Output(text=sample["negative"]), tags=[]),
+            ]
+
+            instances.append(Instance(Input(text=sample["query"]), references=references, split=TRAIN_SPLIT))
+        return instances
+
+    def get_valid_instances(self) -> List[Instance]:
+        """Get validation instances.
+        By default, we create a reference for each Document ID for which there
+        is a judgment with respect to the provided Query ID.
+
+        If self.valid_topk is not None, we ensure that a reference is created
+        for all the documents that appear in top self.valid_topk documents for
+        the given validation query.
+        """
+        dataset = load_dataset(
+            self.dataset_name,
+            f"runs-{self.subset}",
+            revision=self.revision,
+            trust_remote_code=True,
+        )
+        instances = []
+        for sample in dataset["bm25"]:
+            references = []
+
+            for k, passage_dict in enumerate(Dataset.from_dict(sample["passages"])):
+                if self.valid_topk is None or k >= self.valid_topk:
+                    break
+                tags = []
+                tags.append(f"docid={passage_dict['id']}")
+                if k == 0:
+                    tags.append(CORRECT_TAG)
+                tags.append(make_rank_tag(rank=k + 1))  # Top-k rank
+                references.append(Reference(Output(text=passage_dict["passage"]), tags=tags))
+
+            instances.append(Instance(Input(text=sample["query"]), references=references, split=VALID_SPLIT))
+
+        return instances
+
+    def get_instances(self, output_path: str) -> List[Instance]:
+        """Get instances for this scenario.
+
+        Refer to the documentation of the following methods for details on how
+        the instances are created:
+        * self.get_train_instances
+        * self.get_valid_instances
+        """
+
+        hlog("Preparing training instances.")
+        train_instances = self.get_train_instances()
+
+        hlog("Preparing validation instances.")
+        valid_instances = self.get_valid_instances()
+
+        return train_instances + valid_instances
+
+
+class MELTInformationRetrievalMMARCOScenario(MELTInformationRetrievalScenario):
+    """
+    Scenario for the MMARCO dataset.
+    """
+
+    name = "melt_information_retrieval_mmarco"
+    description = "MMARCO dataset for information retrieval in Vietnamese."
+    tags = ["information_retrieval"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="unicamp-dl/mmarco",
+            revision="6d039c4638c0ba3e46a9cb7b498b145e7edc6230",
+            subset="vietnamese",
+            **kwargs,
+        )
+
+
+class MELTInformationRetrievalMRobustScenario(MELTInformationRetrievalScenario):
+    """
+    Scenario for the MRobust dataset.
+    """
+
+    name = "melt_information_retrieval_mrobust"
+    description = "MRobust dataset for information retrieval in Vietnamese."
+    tags = ["information_retrieval"]
+
+    def __init__(self, **kwargs):
+        super().__init__(
+            dataset_name="unicamp-dl/mrobust",
+            revision="fda452a7fbfd9550db2f78d9d98e6b3ec16734df",
+            subset="vietnamese",
+            **kwargs,
+        )