crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crfm-helm has been flagged as possibly problematic by the registry diff service.
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
- helm/benchmark/adaptation/adapter_spec.py +32 -31
- helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
- helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
- helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
- helm/benchmark/annotation/air_bench_annotator.py +64 -0
- helm/benchmark/annotation/annotator_factory.py +6 -0
- helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
- helm/benchmark/annotation/live_qa_annotator.py +84 -0
- helm/benchmark/annotation/medication_qa_annotator.py +81 -0
- helm/benchmark/augmentations/perturbation.py +17 -1
- helm/benchmark/augmentations/test_perturbation.py +30 -0
- helm/benchmark/augmentations/translate_perturbation.py +1 -0
- helm/benchmark/huggingface_registration.py +16 -6
- helm/benchmark/metrics/air_bench_metrics.py +56 -0
- helm/benchmark/metrics/efficiency_metrics.py +9 -2
- helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
- helm/benchmark/metrics/fin_qa_metrics.py +60 -0
- helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
- helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
- helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
- helm/benchmark/metrics/live_qa_metrics.py +23 -0
- helm/benchmark/metrics/medication_qa_metrics.py +23 -0
- helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
- helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
- helm/benchmark/metrics/unitxt_metrics.py +20 -10
- helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
- helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
- helm/benchmark/model_metadata_registry.py +5 -1
- helm/benchmark/presentation/schema.py +54 -4
- helm/benchmark/presentation/test_schema.py +11 -0
- helm/benchmark/run.py +16 -2
- helm/benchmark/run_expander.py +112 -63
- helm/benchmark/run_spec_factory.py +15 -10
- helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
- helm/benchmark/run_specs/classic_run_specs.py +15 -11
- helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
- helm/benchmark/run_specs/experimental_run_specs.py +33 -0
- helm/benchmark/run_specs/finance_run_specs.py +33 -0
- helm/benchmark/run_specs/vlm_run_specs.py +444 -65
- helm/benchmark/scenarios/air_bench_scenario.py +50 -0
- helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
- helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
- helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
- helm/benchmark/scenarios/legalbench_scenario.py +6 -2
- helm/benchmark/scenarios/math_scenario.py +1 -1
- helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
- helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
- helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
- helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
- helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
- helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
- helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
- helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
- helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
- helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
- helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
- helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
- helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
- helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
- helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
- helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
- helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
- helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
- helm/benchmark/static/schema_air_bench.yaml +3149 -0
- helm/benchmark/static/schema_classic.yaml +3 -59
- helm/benchmark/static/schema_finance.yaml +143 -0
- helm/benchmark/static/schema_image2structure.yaml +447 -0
- helm/benchmark/static/schema_instruction_following.yaml +3 -52
- helm/benchmark/static/schema_lite.yaml +3 -61
- helm/benchmark/static/schema_medical.yaml +255 -0
- helm/benchmark/static/schema_mmlu.yaml +3 -61
- helm/benchmark/static/schema_tables.yaml +200 -0
- helm/benchmark/static/schema_thai.yaml +223 -0
- helm/benchmark/static/schema_unitxt.yaml +3 -61
- helm/benchmark/static/schema_vhelm.yaml +824 -0
- helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
- helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
- helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
- helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
- helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
- helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
- helm/benchmark/static_build/index.html +2 -2
- helm/clients/anthropic_client.py +78 -14
- helm/clients/auto_client.py +11 -0
- helm/clients/client.py +24 -7
- helm/clients/cohere_client.py +98 -3
- helm/clients/huggingface_client.py +71 -12
- helm/clients/openai_client.py +11 -5
- helm/clients/reka_client.py +189 -0
- helm/clients/test_client.py +3 -3
- helm/clients/test_huggingface_client.py +19 -3
- helm/clients/test_together_client.py +72 -2
- helm/clients/together_client.py +199 -2
- helm/clients/vertexai_client.py +117 -64
- helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
- helm/clients/vision_language/huggingface_vlm_client.py +12 -4
- helm/clients/vision_language/idefics_client.py +2 -2
- helm/clients/vision_language/paligemma_client.py +146 -0
- helm/clients/vision_language/palmyra_vision_client.py +84 -0
- helm/clients/yi_client.py +31 -0
- helm/common/critique_request.py +10 -1
- helm/common/images_utils.py +29 -3
- helm/config/model_deployments.yaml +504 -12
- helm/config/model_metadata.yaml +579 -52
- helm/config/tokenizer_configs.yaml +100 -1
- helm/proxy/critique/model_critique_client.py +32 -4
- helm/proxy/services/server_service.py +1 -1
- helm/tokenizers/auto_tokenizer.py +1 -1
- helm/tokenizers/cohere_tokenizer.py +44 -2
- helm/tokenizers/huggingface_tokenizer.py +36 -13
- helm/tokenizers/test_cohere_tokenizer.py +39 -0
- helm/tokenizers/test_huggingface_tokenizer.py +5 -1
- helm/benchmark/static/schema_vlm.yaml +0 -576
- helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
- helm/benchmark/static_build/assets/index-d839df55.js +0 -9
- helm/benchmark/test_model_deployment_definition.py +0 -90
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
- {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0
helm/benchmark/metrics/evaluate_reference_metrics.py

@@ -10,6 +10,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.code_scenario import CodeReference
 from helm.benchmark.scenarios.scenario import Reference
+from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import GeneratedOutput
 from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
 from nltk.metrics.scores import f_measure

@@ -21,6 +22,7 @@ import string
 from . import code_metrics_helper
 import nltk

+
 try:
     nltk.data.find("tokenizers/punkt")
 except LookupError:

@@ -188,6 +190,19 @@ def bleu_4(gold: str, pred: str) -> float:
     return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1))


+def cider(gold: str, pred: str) -> float:
+    try:
+        from pycocoevalcap.cider.cider import Cider
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["vlm"])
+
+    cider_evaluator = Cider()
+    candidate = {"caption": [pred]}
+    reference = {"caption": [gold]}
+    average_score, _ = cider_evaluator.compute_score(reference, candidate)
+    return average_score
+
+
 def extract_set_from_text(
     set_str: str,
     set_start_str: str = " is ",

@@ -325,6 +340,7 @@ def compute_reference_metrics(
         "math_equiv_chain_of_thought": is_equiv_chain_of_thought,
         "code_eval_acc": code_eval,
         "pass": code_eval,
+        "cider": cider,
         "f1_score": f1_score,
         "rouge_1": get_rouge_function("rouge1"),
         "rouge_2": get_rouge_function("rouge2"),
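For reference, a minimal sketch of how the newly registered `cider` reference metric can be exercised on its own. The caption pair below is made up; the sketch assumes crfm-helm 0.5.2 is installed with the `vlm` extra so that `pycocoevalcap` is available, as implied by `handle_module_not_found_error(e, ["vlm"])`.

```python
# Minimal sketch; assumes the "vlm" extra is installed so pycocoevalcap imports.
from helm.benchmark.metrics.evaluate_reference_metrics import cider

gold = "a man riding a wave on top of a surfboard"   # made-up gold caption
pred = "a surfer rides a large wave on a surfboard"  # made-up model output
print(cider(gold, pred))  # single-pair CIDEr score as a float
```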
helm/benchmark/metrics/fin_qa_metrics.py

@@ -0,0 +1,60 @@
+import math
+import json
+from typing import List, Union
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.fin_qa_metrics_helper import (  # type: ignore
+    equal_program,
+    eval_program,
+    program_tokenization,
+)
+
+
+def _get_program_accuracy(reference_program: List[str], generated_program: List[str]) -> float:
+    return 1.0 if equal_program(reference_program, generated_program) else 0.0
+
+
+def _get_execution_accuracy(reference_execution: str, generated_program: List[str], table: List[List[str]]) -> float:
+    invalid_flag: int
+    generated_result: Union[str, float]
+    invalid_flag, generated_result = eval_program(generated_program, table)
+    if invalid_flag:
+        return 0.0
+    if reference_execution == "yes" or reference_execution == "no":
+        return 1.0 if reference_execution == generated_result else 0
+    else:
+        if not isinstance(generated_result, float):
+            return 0.0
+        return 1.0 if math.isclose(float(reference_execution), generated_result) else 0
+
+
+class FinQAMetric(Metric):
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert len(request_state.instance.references) == 3
+        reference_text = request_state.instance.references[0].output.text
+        reference_program = program_tokenization(reference_text)
+        reference_execution = request_state.instance.references[1].output.text
+        table: List[List[str]] = json.loads(request_state.instance.references[2].output.text)
+
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        generated_text = request_state.result.completions[0].text.strip()
+        generated_program = program_tokenization(generated_text)
+
+        return [
+            Stat(MetricName("program_accuracy")).add(_get_program_accuracy(reference_program, generated_program)),
+            Stat(MetricName("execution_accuracy")).add(
+                _get_execution_accuracy(reference_execution, generated_program, table)
+            ),
+        ]
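Based on the asserts above, a sketch of the per-instance layout `FinQAMetric` expects: three references holding the gold program, the gold execution result, and the table serialized as JSON. The values below are illustrative, not taken from the dataset.

```python
import json

# Illustrative instance layout (made-up values):
references = [
    "divide(120, 100)",                           # references[0]: gold reasoning program
    "1.2",                                        # references[1]: gold execution result
    json.dumps([["net income", "100", "120"]]),   # references[2]: table rows as JSON
]
# program_accuracy   -> 1.0 iff the generated program is symbolically equal to references[0]
# execution_accuracy -> 1.0 iff executing the generated program over the table matches references[1]
```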
helm/benchmark/metrics/fin_qa_metrics_helper.py

@@ -0,0 +1,398 @@
+# type: ignore
+# flake8: noqa
+# fmt: off
+"""Evaluation metrics for FinQA.
+
+This evaluation code is reproduced from the following URL with the following license.
+
+URL: https://github.com/czyssrs/FinQA/blob/0f16e2867befa6840783e58be38c9efb9229d742/code/evaluate/evaluate.py
+
+License: MIT License
+
+Copyright (c) 2021 Zhiyu Chen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE."""
+
+from sympy import simplify
+
+
+all_ops = ["add", "subtract", "multiply", "divide", "exp", "greater", "table_max", \
+    "table_min", "table_sum", "table_average"]
+
+
+def str_to_num(text):
+
+    text = text.replace(",", "")
+    try:
+        num = float(text)
+    except ValueError:
+        if "%" in text:
+            text = text.replace("%", "")
+            try:
+                num = float(text)
+                num = num / 100.0
+            except ValueError:
+                num = "n/a"
+        elif "const" in text:
+            text = text.replace("const_", "")
+            if text == "m1":
+                text = "-1"
+            num = float(text)
+        else:
+            num = "n/a"
+    return num
+
+def process_row(row_in):
+
+    row_out = []
+    invalid_flag = 0
+
+    for num in row_in:
+        num = num.replace("$", "").strip()
+        num = num.split("(")[0].strip()
+
+        num = str_to_num(num)
+
+        if num == "n/a":
+            invalid_flag = 1
+            break
+
+        row_out.append(num)
+
+    if invalid_flag:
+        return "n/a"
+
+    return row_out
+
+
+def eval_program(program, table):
+    '''
+    calculate the numerical results of the program
+    '''
+
+    invalid_flag = 0
+    this_res = "n/a"
+
+    try:
+        program = program[:-1] # remove EOF
+        # check structure
+        for ind, token in enumerate(program):
+            if ind % 4 == 0:
+                if token.strip("(") not in all_ops:
+                    return 1, "n/a"
+            if (ind + 1) % 4 == 0:
+                if token != ")":
+                    return 1, "n/a"
+
+
+        program = "|".join(program)
+        steps = program.split(")")[:-1]
+
+
+        res_dict = {}
+
+        # print(program)
+
+        for ind, step in enumerate(steps):
+            step = step.strip()
+
+            if len(step.split("(")) > 2:
+                invalid_flag = 1
+                break
+            op = step.split("(")[0].strip("|").strip()
+            args = step.split("(")[1].strip("|").strip()
+
+            # print(args)
+            # print(op)
+
+            arg1 = args.split("|")[0].strip()
+            arg2 = args.split("|")[1].strip()
+
+            if op == "add" or op == "subtract" or op == "multiply" or op == "divide" or op == "exp" or op == "greater":
+
+                if "#" in arg1:
+                    arg1 = res_dict[int(arg1.replace("#", ""))]
+                else:
+                    # print(arg1)
+                    arg1 = str_to_num(arg1)
+                    if arg1 == "n/a":
+                        invalid_flag = 1
+                        break
+
+                if "#" in arg2:
+                    arg2 = res_dict[int(arg2.replace("#", ""))]
+                else:
+                    arg2 = str_to_num(arg2)
+                    if arg2 == "n/a":
+                        invalid_flag = 1
+                        break
+
+                if op == "add":
+                    this_res = arg1 + arg2
+                elif op == "subtract":
+                    this_res = arg1 - arg2
+                elif op == "multiply":
+                    this_res = arg1 * arg2
+                elif op == "divide":
+                    this_res = arg1 / arg2
+                elif op == "exp":
+                    this_res = arg1 ** arg2
+                elif op == "greater":
+                    this_res = "yes" if arg1 > arg2 else "no"
+
+
+                # print("ind: ", ind)
+                # print(this_res)
+                res_dict[ind] = this_res
+
+
+            elif "table" in op:
+                table_dict = {}
+                for row in table:
+                    table_dict[row[0]] = row[1:]
+
+                if "#" in arg1:
+                    arg1 = res_dict[int(arg1.replace("#", ""))]
+                else:
+                    if arg1 not in table_dict:
+                        invalid_flag = 1
+                        break
+
+                    cal_row = table_dict[arg1]
+                    num_row = process_row(cal_row)
+
+                if num_row == "n/a":
+                    invalid_flag = 1
+                    break
+                if op == "table_max":
+                    this_res = max(num_row)
+                elif op == "table_min":
+                    this_res = min(num_row)
+                elif op == "table_sum":
+                    this_res = sum(num_row)
+                elif op == "table_average":
+                    this_res = sum(num_row) / len(num_row)
+
+                # this_res = round(this_res, 5)
+
+                res_dict[ind] = this_res
+
+            # print(this_res)
+
+        if this_res != "yes" and this_res != "no" and this_res != "n/a":
+            # print(this_res)
+            this_res = round(this_res, 5)
+
+    except:
+        invalid_flag = 1
+
+
+    return invalid_flag, this_res
+
+
+def equal_program(program1, program2):
+    '''
+    symbolic program if equal
+    program1: gold
+    program2: pred
+    '''
+
+    sym_map = {}
+
+    program1 = program1[:-1] # remove EOF
+    program1 = "|".join(program1)
+    steps = program1.split(")")[:-1]
+
+    invalid_flag = 0
+    sym_ind = 0
+    step_dict_1 = {}
+
+    # symbolic map
+    for ind, step in enumerate(steps):
+
+        step = step.strip()
+
+        assert len(step.split("(")) <= 2
+
+        op = step.split("(")[0].strip("|").strip()
+        args = step.split("(")[1].strip("|").strip()
+
+        arg1 = args.split("|")[0].strip()
+        arg2 = args.split("|")[1].strip()
+
+        step_dict_1[ind] = step
+
+        if "table" in op:
+            if step not in sym_map:
+                sym_map[step] = "a" + str(sym_ind)
+                sym_ind += 1
+
+        else:
+            if "#" not in arg1:
+                if arg1 not in sym_map:
+                    sym_map[arg1] = "a" + str(sym_ind)
+                    sym_ind += 1
+
+            if "#" not in arg2:
+                if arg2 not in sym_map:
+                    sym_map[arg2] = "a" + str(sym_ind)
+                    sym_ind += 1
+
+
+    # check program 2
+    step_dict_2 = {}
+    try:
+        program2 = program2[:-1] # remove EOF
+        # check structure
+        for ind, token in enumerate(program2):
+            if ind % 4 == 0:
+                if token.strip("(") not in all_ops:
+                    print("structure error")
+                    return False
+            if (ind + 1) % 4 == 0:
+                if token != ")":
+                    print("structure error")
+                    return False
+
+        program2 = "|".join(program2)
+        steps = program2.split(")")[:-1]
+
+        for ind, step in enumerate(steps):
+            step = step.strip()
+
+            if len(step.split("(")) > 2:
+                return False
+            op = step.split("(")[0].strip("|").strip()
+            args = step.split("(")[1].strip("|").strip()
+
+            # print(args)
+            # print(op)
+
+            arg1 = args.split("|")[0].strip()
+            arg2 = args.split("|")[1].strip()
+
+            step_dict_2[ind] = step
+
+            if "table" in op:
+                if step not in sym_map:
+                    return False
+
+            else:
+                if "#" not in arg1:
+                    if arg1 not in sym_map:
+                        return False
+                else:
+                    if int(arg1.strip("#")) >= ind:
+                        return False
+
+                if "#" not in arg2:
+                    if arg2 not in sym_map:
+                        return False
+                else:
+                    if int(arg2.strip("#")) >= ind:
+                        return False
+    except:
+        return False
+
+    def symbol_recur(step, step_dict):
+
+        step = step.strip()
+        op = step.split("(")[0].strip("|").strip()
+        args = step.split("(")[1].strip("|").strip()
+
+        arg1 = args.split("|")[0].strip()
+        arg2 = args.split("|")[1].strip()
+
+        # print(op)
+        # print(arg1)
+        # print(arg2)
+
+        if "table" in op:
+            # as var
+            return sym_map[step]
+
+        if "#" in arg1:
+            arg1_ind = int(arg1.replace("#", ""))
+            arg1_part = symbol_recur(step_dict[arg1_ind], step_dict)
+        else:
+            arg1_part = sym_map[arg1]
+
+
+        if "#" in arg2:
+            arg2_ind = int(arg2.replace("#", ""))
+            arg2_part = symbol_recur(step_dict[arg2_ind], step_dict)
+        else:
+            arg2_part = sym_map[arg2]
+
+        if op == "add":
+            return "( " + arg1_part + " + " + arg2_part + " )"
+        elif op == "subtract":
+            return "( " + arg1_part + " - " + arg2_part + " )"
+        elif op == "multiply":
+            return "( " + arg1_part + " * " + arg2_part + " )"
+        elif op == "divide":
+            return "( " + arg1_part + " / " + arg2_part + " )"
+        elif op == "exp":
+            return "( " + arg1_part + " ** " + arg2_part + " )"
+        elif op == "greater":
+            return "( " + arg1_part + " > " + arg2_part + " )"
+
+
+    # # derive symbolic program 1
+    # print(program1)
+    steps = program1.split(")")[:-1]
+    # print(steps)
+    # print(steps)
+    # print(sym_map)
+    sym_prog1 = symbol_recur(steps[-1], step_dict_1)
+    sym_prog1 = simplify(sym_prog1, evaluate=False)
+    # print("########")
+    # print(sym_prog1)
+
+    try:
+        # derive symbolic program 2
+        steps = program2.split(")")[:-1]
+        sym_prog2 = symbol_recur(steps[-1], step_dict_2)
+        sym_prog2 = simplify(sym_prog2, evaluate=False)
+        # print(sym_prog2)
+    except:
+        return False
+
+    return sym_prog1 == sym_prog2
+
+
+def program_tokenization(original_program):
+    original_program = original_program.split(', ')
+    program = []
+    for tok in original_program:
+        cur_tok = ''
+        for c in tok:
+            if c == ')':
+                if cur_tok != '':
+                    program.append(cur_tok)
+                    cur_tok = ''
+            cur_tok += c
+            if c in ['(', ')']:
+                program.append(cur_tok)
+                cur_tok = ''
+        if cur_tok != '':
+            program.append(cur_tok)
+    program.append('EOF')
+    return program
+# fmt: on
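A minimal sketch of the reproduced FinQA helpers in action. The program strings and table values below are made up, but the tokenization and execution behavior follows the code above.

```python
from helm.benchmark.metrics.fin_qa_metrics_helper import (
    equal_program,
    eval_program,
    program_tokenization,
)

table = [["revenue", "$ 100", "$ 200"]]  # made-up table row

gold = program_tokenization("divide(200, 100)")
pred = program_tokenization("divide(200, 100)")
# program_tokenization("divide(200, 100)") == ["divide(", "200", "100", ")", "EOF"]

print(equal_program(gold, pred))  # True: identical programs are symbolically equal
print(eval_program(pred, table))  # (0, 2.0): invalid_flag 0 and the executed result

print(eval_program(program_tokenization("table_average(revenue, none)"), table))
# (0, 150.0): averages the "revenue" row of the table
```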
helm/benchmark/metrics/gpt4v_originality_critique_metrics.py

@@ -0,0 +1,126 @@
+from typing import Dict, List
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
+from helm.benchmark.metrics.metric_name import MetricContext, MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat
+from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType
+from helm.common.hierarchical_logger import hlog
+from helm.common.request import RequestResult, Request, GeneratedOutput
+from helm.common.media_object import MultimediaObject, IMAGE_TYPE, MediaObject
+
+
+class GPT4VCritiqueMetric(MetricInterface):
+    """
+    Critique evaluation for evaluating how original the generated text are given the image by GPT4V.
+    """
+
+    # We can add more evaluation aspects here
+    ORIGINALITY_NAME: str = "originality_gpt4v"
+    ORIGINALITY_ANSWER_TO_SCORE: Dict[str, int] = {
+        "I’ve seen something like this before to the point it’s become tiresome.": 1,
+        "The text is not really original, but it has some originality to it.": 2,
+        "Neutral.": 3,
+        "I find the text to be fresh and original.": 4,
+        "I find the text to be extremely creative and out of this world.": 5,
+    }
+
+    def __init__(self, num_respondents: int):
+        self._num_respondents = num_respondents
+
+    def __repr__(self) -> str:
+        return "GPT4CritiqueMetric()"
+
+    def evaluate(
+        self,
+        scenario_state: ScenarioState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+        parallelism: int,
+    ) -> MetricResult:
+        request_states: List[RequestState] = scenario_state.request_states
+
+        all_stats: Dict[MetricName, Stat] = {}
+        per_instance_stats: List[PerInstanceStats] = []
+        for request_state in request_states:
+            context = MetricContext.from_instance(request_state.instance)
+            stats_without_context = self.evaluate_generation(
+                scenario_state.adapter_spec,
+                request_state,
+                metric_service,
+                eval_cache_path,
+            )
+            stats = [add_context(stat_without_context, context) for stat_without_context in stats_without_context]
+            for stat in stats:
+                merge_stat(all_stats, stat)
+            assert request_state.instance.id is not None
+            per_instance_stats.append(
+                PerInstanceStats(
+                    instance_id=request_state.instance.id,
+                    perturbation=request_state.instance.perturbation,
+                    train_trial_index=request_state.train_trial_index,
+                    stats=stats,
+                )
+            )
+        return MetricResult(aggregated_stats=list(all_stats.values()), per_instance_stats=per_instance_stats)
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        input_request: Request = request_state.request
+        # Predicted outputs and their originality scores
+        assert request_state.result is not None
+        request_result: RequestResult = request_state.result
+        # Get input image and generated response for the originality evaluation
+        assert input_request.multimodal_prompt is not None
+        completions: List[GeneratedOutput] = request_result.completions
+        input_text: str = completions[0].text
+        input_media: MultimediaObject = input_request.multimodal_prompt
+        image_objects: List[MediaObject] = [
+            item for item in input_media.media_objects if item.is_type(IMAGE_TYPE) and item.location
+        ]
+
+        template = CritiqueTaskTemplate(
+            name="vhelm_gpt4v_originality",
+            # TODO: Add proper instructions
+            instructions="Answer the multiple choice question by just giving the letter of the correct "
+            "answer.\n\n{{prompt}}",
+            num_respondents=self._num_respondents,
+            questions=[
+                CritiqueQuestionTemplate(
+                    name=self.ORIGINALITY_NAME,
+                    question_type=QuestionType.MULTIPLE_CHOICE,
+                    text="How original is the text, given it was created with the image?",
+                    options=list(self.ORIGINALITY_ANSWER_TO_SCORE.keys()),
+                    media_object=image_objects[0],  # we only take the first image as input
+                )
+            ],
+        )
+        request = CritiqueRequest(template=template, fields={"prompt": input_text})
+
+        # send to critique request
+        result = metric_service.make_critique_request(request)
+        if not result or not result.responses:
+            # Skip computing metrics if there aren't any responses yet
+            hlog("Waiting for responses to be generated.")
+            return []
+
+        stats: Dict[str, Stat] = {}
+        for question in template.questions:
+            stats[question.name] = Stat(MetricName(question.name))
+
+        for response in result.responses:
+            for answer_name, answer in response.answers.items():
+                assert isinstance(answer, str)
+                answer_value: float
+                answer_value = self.ORIGINALITY_ANSWER_TO_SCORE[answer]
+                stats[answer_name].add(answer_value)
+
+        return list(stats.values())
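To make the scoring concrete, a small sketch of the answer-to-score mapping that `GPT4VCritiqueMetric` applies to critique responses. The two answers below are made-up respondent choices; the real metric accumulates these values into a `Stat` per question, as shown above.

```python
from helm.benchmark.metrics.gpt4v_originality_critique_metrics import GPT4VCritiqueMetric

answers = ["Neutral.", "I find the text to be fresh and original."]  # hypothetical responses
scores = [GPT4VCritiqueMetric.ORIGINALITY_ANSWER_TO_SCORE[answer] for answer in answers]
print(scores)                     # [3, 4]
print(sum(scores) / len(scores))  # 3.5, the kind of value aggregated under "originality_gpt4v"
```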
helm/benchmark/metrics/live_qa_metrics.py

@@ -0,0 +1,23 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class LiveQAScoreMetric(Metric):
+    """Score metrics for LiveQA."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        score = request_state.annotations["live_qa"]["score"]
+        return [Stat(MetricName("live_qa_score")).add(score)]
helm/benchmark/metrics/medication_qa_metrics.py

@@ -0,0 +1,23 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MedicationQAScoreMetric(Metric):
+    """Score metrics for MedicationQA."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        score = request_state.annotations["medication_qa"]["score"]
+        return [Stat(MetricName("medication_qa_score")).add(score)]
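Both LiveQAScoreMetric and MedicationQAScoreMetric simply surface a score that the corresponding annotator (live_qa_annotator.py and medication_qa_annotator.py in the file list above) attached to the request state. A sketch of the annotation shape they expect, with a made-up score:

```python
# Hypothetical annotation dict as the MedicationQA annotator would attach it to a request state;
# the metric only reads it back and reports it as a Stat.
annotations = {"medication_qa": {"score": 0.8}}  # made-up score
print(annotations["medication_qa"]["score"])     # 0.8 -> reported as "medication_qa_score"
```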