crfm-helm 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of crfm-helm might be problematic.

Files changed (125)
  1. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/METADATA +19 -5
  2. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/RECORD +121 -76
  3. helm/benchmark/adaptation/adapter_spec.py +32 -31
  4. helm/benchmark/adaptation/adapters/multimodal/in_context_learning_multimodal_adapter.py +1 -0
  5. helm/benchmark/adaptation/adapters/multimodal/multimodal_prompt.py +7 -0
  6. helm/benchmark/adaptation/adapters/multimodal/test_multimodal_prompt.py +2 -0
  7. helm/benchmark/annotation/air_bench_annotator.py +64 -0
  8. helm/benchmark/annotation/annotator_factory.py +6 -0
  9. helm/benchmark/annotation/image2structure/lilypond_compiler_annotator.py +1 -1
  10. helm/benchmark/annotation/live_qa_annotator.py +84 -0
  11. helm/benchmark/annotation/medication_qa_annotator.py +81 -0
  12. helm/benchmark/augmentations/perturbation.py +17 -1
  13. helm/benchmark/augmentations/test_perturbation.py +30 -0
  14. helm/benchmark/augmentations/translate_perturbation.py +1 -0
  15. helm/benchmark/huggingface_registration.py +16 -6
  16. helm/benchmark/metrics/air_bench_metrics.py +56 -0
  17. helm/benchmark/metrics/efficiency_metrics.py +9 -2
  18. helm/benchmark/metrics/evaluate_reference_metrics.py +16 -0
  19. helm/benchmark/metrics/fin_qa_metrics.py +60 -0
  20. helm/benchmark/metrics/fin_qa_metrics_helper.py +398 -0
  21. helm/benchmark/metrics/gpt4v_originality_critique_metrics.py +126 -0
  22. helm/benchmark/metrics/instruction_following_critique_metrics.py +1 -0
  23. helm/benchmark/metrics/live_qa_metrics.py +23 -0
  24. helm/benchmark/metrics/medication_qa_metrics.py +23 -0
  25. helm/benchmark/metrics/prometheus_vision_critique_metrics.py +185 -0
  26. helm/benchmark/metrics/reka_vibe_critique_metrics.py +158 -0
  27. helm/benchmark/metrics/unitxt_metrics.py +20 -10
  28. helm/benchmark/metrics/vision_language/emd_utils.py +4 -0
  29. helm/benchmark/metrics/vision_language/image_metrics.py +104 -21
  30. helm/benchmark/model_metadata_registry.py +5 -1
  31. helm/benchmark/presentation/schema.py +54 -4
  32. helm/benchmark/presentation/test_schema.py +11 -0
  33. helm/benchmark/run.py +16 -2
  34. helm/benchmark/run_expander.py +112 -63
  35. helm/benchmark/run_spec_factory.py +15 -10
  36. helm/benchmark/run_specs/air_bench_run_specs.py +40 -0
  37. helm/benchmark/run_specs/classic_run_specs.py +15 -11
  38. helm/benchmark/run_specs/decodingtrust_run_specs.py +3 -1
  39. helm/benchmark/run_specs/experimental_run_specs.py +33 -0
  40. helm/benchmark/run_specs/finance_run_specs.py +33 -0
  41. helm/benchmark/run_specs/vlm_run_specs.py +444 -65
  42. helm/benchmark/scenarios/air_bench_scenario.py +50 -0
  43. helm/benchmark/scenarios/ci_mcqa_scenario.py +80 -0
  44. helm/benchmark/scenarios/entity_data_imputation_scenario.py +8 -2
  45. helm/benchmark/scenarios/fin_qa_scenario.py +117 -0
  46. helm/benchmark/scenarios/legalbench_scenario.py +6 -2
  47. helm/benchmark/scenarios/math_scenario.py +1 -1
  48. helm/benchmark/scenarios/test_air_bench_scenario.py +27 -0
  49. helm/benchmark/scenarios/vision_language/a_okvqa_scenario.py +83 -0
  50. helm/benchmark/scenarios/vision_language/bingo_scenario.py +3 -3
  51. helm/benchmark/scenarios/vision_language/crossmodal_3600_scenario.py +134 -0
  52. helm/benchmark/scenarios/vision_language/flickr30k_scenario.py +74 -0
  53. helm/benchmark/scenarios/vision_language/gqa_scenario.py +91 -0
  54. helm/benchmark/scenarios/vision_language/hateful_memes_scenario.py +4 -2
  55. helm/benchmark/scenarios/vision_language/image2structure/image2structure_scenario.py +13 -2
  56. helm/benchmark/scenarios/vision_language/image2structure/latex_scenario.py +1 -5
  57. helm/benchmark/scenarios/vision_language/image2structure/musicsheet_scenario.py +1 -5
  58. helm/benchmark/scenarios/vision_language/image2structure/webpage_scenario.py +5 -3
  59. helm/benchmark/scenarios/vision_language/math_vista_scenario.py +117 -0
  60. helm/benchmark/scenarios/vision_language/mm_safety_bench_scenario.py +103 -0
  61. helm/benchmark/scenarios/vision_language/mscoco_captioning_scenario.py +92 -0
  62. helm/benchmark/scenarios/vision_language/mscoco_categorization_scenario.py +117 -0
  63. helm/benchmark/scenarios/vision_language/originality_scenario.py +35 -0
  64. helm/benchmark/scenarios/vision_language/pairs_scenario.py +247 -0
  65. helm/benchmark/scenarios/vision_language/unicorn_scenario.py +3 -3
  66. helm/benchmark/scenarios/vision_language/vibe_eval_scenario.py +95 -0
  67. helm/benchmark/scenarios/vision_language/viz_wiz_scenario.py +2 -2
  68. helm/benchmark/scenarios/vision_language/vqa_scenario.py +4 -2
  69. helm/benchmark/static/schema_air_bench.yaml +3149 -0
  70. helm/benchmark/static/schema_classic.yaml +3 -59
  71. helm/benchmark/static/schema_finance.yaml +143 -0
  72. helm/benchmark/static/schema_image2structure.yaml +447 -0
  73. helm/benchmark/static/schema_instruction_following.yaml +3 -52
  74. helm/benchmark/static/schema_lite.yaml +3 -61
  75. helm/benchmark/static/schema_medical.yaml +255 -0
  76. helm/benchmark/static/schema_mmlu.yaml +3 -61
  77. helm/benchmark/static/schema_tables.yaml +200 -0
  78. helm/benchmark/static/schema_thai.yaml +223 -0
  79. helm/benchmark/static/schema_unitxt.yaml +3 -61
  80. helm/benchmark/static/schema_vhelm.yaml +824 -0
  81. helm/benchmark/static/schema_vhelm_lite.yaml +109 -0
  82. helm/benchmark/static_build/assets/air-overview-d2e6c49f.png +0 -0
  83. helm/benchmark/static_build/assets/index-30dbceba.js +10 -0
  84. helm/benchmark/static_build/assets/index-66b02d40.css +1 -0
  85. helm/benchmark/static_build/assets/overview-74aea3d8.png +0 -0
  86. helm/benchmark/static_build/assets/process-flow-bd2eba96.png +0 -0
  87. helm/benchmark/static_build/index.html +2 -2
  88. helm/clients/anthropic_client.py +78 -14
  89. helm/clients/auto_client.py +11 -0
  90. helm/clients/client.py +24 -7
  91. helm/clients/cohere_client.py +98 -3
  92. helm/clients/huggingface_client.py +71 -12
  93. helm/clients/openai_client.py +11 -5
  94. helm/clients/reka_client.py +189 -0
  95. helm/clients/test_client.py +3 -3
  96. helm/clients/test_huggingface_client.py +19 -3
  97. helm/clients/test_together_client.py +72 -2
  98. helm/clients/together_client.py +199 -2
  99. helm/clients/vertexai_client.py +117 -64
  100. helm/clients/vision_language/huggingface_vision2seq_client.py +145 -0
  101. helm/clients/vision_language/huggingface_vlm_client.py +12 -4
  102. helm/clients/vision_language/idefics_client.py +2 -2
  103. helm/clients/vision_language/paligemma_client.py +146 -0
  104. helm/clients/vision_language/palmyra_vision_client.py +84 -0
  105. helm/clients/yi_client.py +31 -0
  106. helm/common/critique_request.py +10 -1
  107. helm/common/images_utils.py +29 -3
  108. helm/config/model_deployments.yaml +504 -12
  109. helm/config/model_metadata.yaml +579 -52
  110. helm/config/tokenizer_configs.yaml +100 -1
  111. helm/proxy/critique/model_critique_client.py +32 -4
  112. helm/proxy/services/server_service.py +1 -1
  113. helm/tokenizers/auto_tokenizer.py +1 -1
  114. helm/tokenizers/cohere_tokenizer.py +44 -2
  115. helm/tokenizers/huggingface_tokenizer.py +36 -13
  116. helm/tokenizers/test_cohere_tokenizer.py +39 -0
  117. helm/tokenizers/test_huggingface_tokenizer.py +5 -1
  118. helm/benchmark/static/schema_vlm.yaml +0 -576
  119. helm/benchmark/static_build/assets/index-5088afcb.css +0 -1
  120. helm/benchmark/static_build/assets/index-d839df55.js +0 -9
  121. helm/benchmark/test_model_deployment_definition.py +0 -90
  122. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/LICENSE +0 -0
  123. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/WHEEL +0 -0
  124. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/entry_points.txt +0 -0
  125. {crfm_helm-0.5.0.dist-info → crfm_helm-0.5.2.dist-info}/top_level.txt +0 -0

helm/benchmark/metrics/evaluate_reference_metrics.py

@@ -10,6 +10,7 @@ from helm.benchmark.metrics.metric_service import MetricService
 from helm.benchmark.metrics.statistic import Stat
 from helm.benchmark.scenarios.code_scenario import CodeReference
 from helm.benchmark.scenarios.scenario import Reference
+from helm.common.optional_dependencies import handle_module_not_found_error
 from helm.common.request import GeneratedOutput
 from helm.benchmark.scenarios.math_scenario import is_equiv, is_equiv_chain_of_thought
 from nltk.metrics.scores import f_measure
@@ -21,6 +22,7 @@ import string
 from . import code_metrics_helper
 import nltk

+
 try:
     nltk.data.find("tokenizers/punkt")
 except LookupError:
@@ -188,6 +190,19 @@ def bleu_4(gold: str, pred: str) -> float:
     return sentence_bleu([word_tokenize(gold)], word_tokenize(pred), weights=(0, 0, 0, 1))


+def cider(gold: str, pred: str) -> float:
+    try:
+        from pycocoevalcap.cider.cider import Cider
+    except ModuleNotFoundError as e:
+        handle_module_not_found_error(e, ["vlm"])
+
+    cider_evaluator = Cider()
+    candidate = {"caption": [pred]}
+    reference = {"caption": [gold]}
+    average_score, _ = cider_evaluator.compute_score(reference, candidate)
+    return average_score
+
+
 def extract_set_from_text(
     set_str: str,
     set_start_str: str = " is ",
@@ -325,6 +340,7 @@ def compute_reference_metrics(
         "math_equiv_chain_of_thought": is_equiv_chain_of_thought,
         "code_eval_acc": code_eval,
         "pass": code_eval,
+        "cider": cider,
         "f1_score": f1_score,
         "rouge_1": get_rouge_function("rouge1"),
         "rouge_2": get_rouge_function("rouge2"),

helm/benchmark/metrics/fin_qa_metrics.py

@@ -0,0 +1,60 @@
+import math
+import json
+from typing import List, Union
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+from helm.benchmark.metrics.fin_qa_metrics_helper import (  # type: ignore
+    equal_program,
+    eval_program,
+    program_tokenization,
+)
+
+
+def _get_program_accuracy(reference_program: List[str], generated_program: List[str]) -> float:
+    return 1.0 if equal_program(reference_program, generated_program) else 0.0
+
+
+def _get_execution_accuracy(reference_execution: str, generated_program: List[str], table: List[List[str]]) -> float:
+    invalid_flag: int
+    generated_result: Union[str, float]
+    invalid_flag, generated_result = eval_program(generated_program, table)
+    if invalid_flag:
+        return 0.0
+    if reference_execution == "yes" or reference_execution == "no":
+        return 1.0 if reference_execution == generated_result else 0
+    else:
+        if not isinstance(generated_result, float):
+            return 0.0
+        return 1.0 if math.isclose(float(reference_execution), generated_result) else 0
+
+
+class FinQAMetric(Metric):
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert len(request_state.instance.references) == 3
+        reference_text = request_state.instance.references[0].output.text
+        reference_program = program_tokenization(reference_text)
+        reference_execution = request_state.instance.references[1].output.text
+        table: List[List[str]] = json.loads(request_state.instance.references[2].output.text)
+
+        assert request_state.result
+        assert len(request_state.result.completions) == 1
+        generated_text = request_state.result.completions[0].text.strip()
+        generated_program = program_tokenization(generated_text)
+
+        return [
+            Stat(MetricName("program_accuracy")).add(_get_program_accuracy(reference_program, generated_program)),
+            Stat(MetricName("execution_accuracy")).add(
+                _get_execution_accuracy(reference_execution, generated_program, table)
+            ),
+        ]
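
The metric above asserts a fixed three-reference layout per instance. A purely illustrative sketch of that layout and of the two stats it yields (the sample values are invented; only the field positions and stat names come from the code):

    # references[0]: gold reasoning program, e.g. "subtract(5325, 5735), divide(#0, 5735)"
    # references[1]: gold execution result as text, e.g. "-0.07149"
    # references[2]: the source table serialized as JSON, e.g. '[["net income", "5325", "5735"]]'
    #
    # evaluate_generation() then reports:
    #   program_accuracy   - 1.0 if the generated program is symbolically equal to the gold program
    #   execution_accuracy - 1.0 if executing the generated program reproduces references[1]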

helm/benchmark/metrics/fin_qa_metrics_helper.py

@@ -0,0 +1,398 @@
+# type: ignore
+# flake8: noqa
+# fmt: off
+"""Evaluation metrics for FinQA.
+
+This evaluation code is reproduced from the following URL with the following license.
+
+URL: https://github.com/czyssrs/FinQA/blob/0f16e2867befa6840783e58be38c9efb9229d742/code/evaluate/evaluate.py
+
+License: MIT License
+
+Copyright (c) 2021 Zhiyu Chen
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE."""
+
+from sympy import simplify
+
+
+all_ops = ["add", "subtract", "multiply", "divide", "exp", "greater", "table_max", \
+"table_min", "table_sum", "table_average"]
+
+
+def str_to_num(text):
+
+    text = text.replace(",", "")
+    try:
+        num = float(text)
+    except ValueError:
+        if "%" in text:
+            text = text.replace("%", "")
+            try:
+                num = float(text)
+                num = num / 100.0
+            except ValueError:
+                num = "n/a"
+        elif "const" in text:
+            text = text.replace("const_", "")
+            if text == "m1":
+                text = "-1"
+            num = float(text)
+        else:
+            num = "n/a"
+    return num
+
+def process_row(row_in):
+
+    row_out = []
+    invalid_flag = 0
+
+    for num in row_in:
+        num = num.replace("$", "").strip()
+        num = num.split("(")[0].strip()
+
+        num = str_to_num(num)
+
+        if num == "n/a":
+            invalid_flag = 1
+            break
+
+        row_out.append(num)
+
+    if invalid_flag:
+        return "n/a"
+
+    return row_out
+
+
+def eval_program(program, table):
+    '''
+    calculate the numerical results of the program
+    '''
+
+    invalid_flag = 0
+    this_res = "n/a"
+
+    try:
+        program = program[:-1] # remove EOF
+        # check structure
+        for ind, token in enumerate(program):
+            if ind % 4 == 0:
+                if token.strip("(") not in all_ops:
+                    return 1, "n/a"
+            if (ind + 1) % 4 == 0:
+                if token != ")":
+                    return 1, "n/a"
+
+
+        program = "|".join(program)
+        steps = program.split(")")[:-1]
+
+
+        res_dict = {}
+
+        # print(program)
+
+        for ind, step in enumerate(steps):
+            step = step.strip()
+
+            if len(step.split("(")) > 2:
+                invalid_flag = 1
+                break
+            op = step.split("(")[0].strip("|").strip()
+            args = step.split("(")[1].strip("|").strip()
+
+            # print(args)
+            # print(op)
+
+            arg1 = args.split("|")[0].strip()
+            arg2 = args.split("|")[1].strip()
+
+            if op == "add" or op == "subtract" or op == "multiply" or op == "divide" or op == "exp" or op == "greater":
+
+                if "#" in arg1:
+                    arg1 = res_dict[int(arg1.replace("#", ""))]
+                else:
+                    # print(arg1)
+                    arg1 = str_to_num(arg1)
+                    if arg1 == "n/a":
+                        invalid_flag = 1
+                        break
+
+                if "#" in arg2:
+                    arg2 = res_dict[int(arg2.replace("#", ""))]
+                else:
+                    arg2 = str_to_num(arg2)
+                    if arg2 == "n/a":
+                        invalid_flag = 1
+                        break
+
+                if op == "add":
+                    this_res = arg1 + arg2
+                elif op == "subtract":
+                    this_res = arg1 - arg2
+                elif op == "multiply":
+                    this_res = arg1 * arg2
+                elif op == "divide":
+                    this_res = arg1 / arg2
+                elif op == "exp":
+                    this_res = arg1 ** arg2
+                elif op == "greater":
+                    this_res = "yes" if arg1 > arg2 else "no"
+
+
+                # print("ind: ", ind)
+                # print(this_res)
+                res_dict[ind] = this_res
+
+
+            elif "table" in op:
+                table_dict = {}
+                for row in table:
+                    table_dict[row[0]] = row[1:]
+
+                if "#" in arg1:
+                    arg1 = res_dict[int(arg1.replace("#", ""))]
+                else:
+                    if arg1 not in table_dict:
+                        invalid_flag = 1
+                        break
+
+                    cal_row = table_dict[arg1]
+                    num_row = process_row(cal_row)
+
+                    if num_row == "n/a":
+                        invalid_flag = 1
+                        break
+                    if op == "table_max":
+                        this_res = max(num_row)
+                    elif op == "table_min":
+                        this_res = min(num_row)
+                    elif op == "table_sum":
+                        this_res = sum(num_row)
+                    elif op == "table_average":
+                        this_res = sum(num_row) / len(num_row)
+
+                # this_res = round(this_res, 5)
+
+                res_dict[ind] = this_res
+
+                # print(this_res)
+
+        if this_res != "yes" and this_res != "no" and this_res != "n/a":
+            # print(this_res)
+            this_res = round(this_res, 5)
+
+    except:
+        invalid_flag = 1
+
+
+    return invalid_flag, this_res
+
+
+def equal_program(program1, program2):
+    '''
+    symbolic program if equal
+    program1: gold
+    program2: pred
+    '''
+
+    sym_map = {}
+
+    program1 = program1[:-1] # remove EOF
+    program1 = "|".join(program1)
+    steps = program1.split(")")[:-1]
+
+    invalid_flag = 0
+    sym_ind = 0
+    step_dict_1 = {}
+
+    # symbolic map
+    for ind, step in enumerate(steps):
+
+        step = step.strip()
+
+        assert len(step.split("(")) <= 2
+
+        op = step.split("(")[0].strip("|").strip()
+        args = step.split("(")[1].strip("|").strip()
+
+        arg1 = args.split("|")[0].strip()
+        arg2 = args.split("|")[1].strip()
+
+        step_dict_1[ind] = step
+
+        if "table" in op:
+            if step not in sym_map:
+                sym_map[step] = "a" + str(sym_ind)
+                sym_ind += 1
+
+        else:
+            if "#" not in arg1:
+                if arg1 not in sym_map:
+                    sym_map[arg1] = "a" + str(sym_ind)
+                    sym_ind += 1
+
+            if "#" not in arg2:
+                if arg2 not in sym_map:
+                    sym_map[arg2] = "a" + str(sym_ind)
+                    sym_ind += 1
+
+
+    # check program 2
+    step_dict_2 = {}
+    try:
+        program2 = program2[:-1] # remove EOF
+        # check structure
+        for ind, token in enumerate(program2):
+            if ind % 4 == 0:
+                if token.strip("(") not in all_ops:
+                    print("structure error")
+                    return False
+            if (ind + 1) % 4 == 0:
+                if token != ")":
+                    print("structure error")
+                    return False
+
+        program2 = "|".join(program2)
+        steps = program2.split(")")[:-1]
+
+        for ind, step in enumerate(steps):
+            step = step.strip()
+
+            if len(step.split("(")) > 2:
+                return False
+            op = step.split("(")[0].strip("|").strip()
+            args = step.split("(")[1].strip("|").strip()
+
+            # print(args)
+            # print(op)
+
+            arg1 = args.split("|")[0].strip()
+            arg2 = args.split("|")[1].strip()
+
+            step_dict_2[ind] = step
+
+            if "table" in op:
+                if step not in sym_map:
+                    return False
+
+            else:
+                if "#" not in arg1:
+                    if arg1 not in sym_map:
+                        return False
+                else:
+                    if int(arg1.strip("#")) >= ind:
+                        return False
+
+                if "#" not in arg2:
+                    if arg2 not in sym_map:
+                        return False
+                else:
+                    if int(arg2.strip("#")) >= ind:
+                        return False
+    except:
+        return False
+
+    def symbol_recur(step, step_dict):
+
+        step = step.strip()
+        op = step.split("(")[0].strip("|").strip()
+        args = step.split("(")[1].strip("|").strip()
+
+        arg1 = args.split("|")[0].strip()
+        arg2 = args.split("|")[1].strip()
+
+        # print(op)
+        # print(arg1)
+        # print(arg2)
+
+        if "table" in op:
+            # as var
+            return sym_map[step]
+
+        if "#" in arg1:
+            arg1_ind = int(arg1.replace("#", ""))
+            arg1_part = symbol_recur(step_dict[arg1_ind], step_dict)
+        else:
+            arg1_part = sym_map[arg1]
+
+
+        if "#" in arg2:
+            arg2_ind = int(arg2.replace("#", ""))
+            arg2_part = symbol_recur(step_dict[arg2_ind], step_dict)
+        else:
+            arg2_part = sym_map[arg2]
+
+        if op == "add":
+            return "( " + arg1_part + " + " + arg2_part + " )"
+        elif op == "subtract":
+            return "( " + arg1_part + " - " + arg2_part + " )"
+        elif op == "multiply":
+            return "( " + arg1_part + " * " + arg2_part + " )"
+        elif op == "divide":
+            return "( " + arg1_part + " / " + arg2_part + " )"
+        elif op == "exp":
+            return "( " + arg1_part + " ** " + arg2_part + " )"
+        elif op == "greater":
+            return "( " + arg1_part + " > " + arg2_part + " )"
+
+
+    # # derive symbolic program 1
+    # print(program1)
+    steps = program1.split(")")[:-1]
+    # print(steps)
+    # print(steps)
+    # print(sym_map)
+    sym_prog1 = symbol_recur(steps[-1], step_dict_1)
+    sym_prog1 = simplify(sym_prog1, evaluate=False)
+    # print("########")
+    # print(sym_prog1)
+
+    try:
+        # derive symbolic program 2
+        steps = program2.split(")")[:-1]
+        sym_prog2 = symbol_recur(steps[-1], step_dict_2)
+        sym_prog2 = simplify(sym_prog2, evaluate=False)
+        # print(sym_prog2)
+    except:
+        return False
+
+    return sym_prog1 == sym_prog2
+
+
+def program_tokenization(original_program):
+    original_program = original_program.split(', ')
+    program = []
+    for tok in original_program:
+        cur_tok = ''
+        for c in tok:
+            if c == ')':
+                if cur_tok != '':
+                    program.append(cur_tok)
+                    cur_tok = ''
+            cur_tok += c
+            if c in ['(', ')']:
+                program.append(cur_tok)
+                cur_tok = ''
+        if cur_tok != '':
+            program.append(cur_tok)
+    program.append('EOF')
+    return program
+# fmt: on
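
To make the helper's conventions concrete, here is a small hand-traced example (illustrative, not output from a test run): program_tokenization() splits a FinQA program string into operator/argument tokens plus a trailing EOF, and eval_program() executes those tokens against a table.

    # program_tokenization("subtract(5, 3), divide(#0, 2)") produces:
    #   ["subtract(", "5", "3", ")", "divide(", "#0", "2", ")", "EOF"]
    tokens = program_tokenization("subtract(5, 3), divide(#0, 2)")
    # Step #0 computes 5 - 3 = 2.0; "#0" in the second step refers back to that result,
    # so step #1 computes 2.0 / 2 = 1.0. No table rows are needed for this program.
    invalid_flag, result = eval_program(tokens, table=[])
    # invalid_flag == 0 and result == 1.0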

helm/benchmark/metrics/gpt4v_originality_critique_metrics.py

@@ -0,0 +1,126 @@
+from typing import Dict, List
+
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.adaptation.scenario_state import ScenarioState
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.metrics.metric import MetricInterface, MetricResult, PerInstanceStats, add_context
+from helm.benchmark.metrics.metric_name import MetricContext, MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat, merge_stat
+from helm.common.critique_request import CritiqueTaskTemplate, CritiqueQuestionTemplate, CritiqueRequest, QuestionType
+from helm.common.hierarchical_logger import hlog
+from helm.common.request import RequestResult, Request, GeneratedOutput
+from helm.common.media_object import MultimediaObject, IMAGE_TYPE, MediaObject
+
+
+class GPT4VCritiqueMetric(MetricInterface):
+    """
+    Critique evaluation for evaluating how original the generated text are given the image by GPT4V.
+    """
+
+    # We can add more evaluation aspects here
+    ORIGINALITY_NAME: str = "originality_gpt4v"
+    ORIGINALITY_ANSWER_TO_SCORE: Dict[str, int] = {
+        "I’ve seen something like this before to the point it’s become tiresome.": 1,
+        "The text is not really original, but it has some originality to it.": 2,
+        "Neutral.": 3,
+        "I find the text to be fresh and original.": 4,
+        "I find the text to be extremely creative and out of this world.": 5,
+    }
+
+    def __init__(self, num_respondents: int):
+        self._num_respondents = num_respondents
+
+    def __repr__(self) -> str:
+        return "GPT4CritiqueMetric()"
+
+    def evaluate(
+        self,
+        scenario_state: ScenarioState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+        parallelism: int,
+    ) -> MetricResult:
+        request_states: List[RequestState] = scenario_state.request_states
+
+        all_stats: Dict[MetricName, Stat] = {}
+        per_instance_stats: List[PerInstanceStats] = []
+        for request_state in request_states:
+            context = MetricContext.from_instance(request_state.instance)
+            stats_without_context = self.evaluate_generation(
+                scenario_state.adapter_spec,
+                request_state,
+                metric_service,
+                eval_cache_path,
+            )
+            stats = [add_context(stat_without_context, context) for stat_without_context in stats_without_context]
+            for stat in stats:
+                merge_stat(all_stats, stat)
+            assert request_state.instance.id is not None
+            per_instance_stats.append(
+                PerInstanceStats(
+                    instance_id=request_state.instance.id,
+                    perturbation=request_state.instance.perturbation,
+                    train_trial_index=request_state.train_trial_index,
+                    stats=stats,
+                )
+            )
+        return MetricResult(aggregated_stats=list(all_stats.values()), per_instance_stats=per_instance_stats)
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        input_request: Request = request_state.request
+        # Predicted outputs and their originality scores
+        assert request_state.result is not None
+        request_result: RequestResult = request_state.result
+        # Get input image and generated response for the originality evaluation
+        assert input_request.multimodal_prompt is not None
+        completions: List[GeneratedOutput] = request_result.completions
+        input_text: str = completions[0].text
+        input_media: MultimediaObject = input_request.multimodal_prompt
+        image_objects: List[MediaObject] = [
+            item for item in input_media.media_objects if item.is_type(IMAGE_TYPE) and item.location
+        ]
+
+        template = CritiqueTaskTemplate(
+            name="vhelm_gpt4v_originality",
+            # TODO: Add proper instructions
+            instructions="Answer the multiple choice question by just giving the letter of the correct "
+            "answer.\n\n{{prompt}}",
+            num_respondents=self._num_respondents,
+            questions=[
+                CritiqueQuestionTemplate(
+                    name=self.ORIGINALITY_NAME,
+                    question_type=QuestionType.MULTIPLE_CHOICE,
+                    text="How original is the text, given it was created with the image?",
+                    options=list(self.ORIGINALITY_ANSWER_TO_SCORE.keys()),
+                    media_object=image_objects[0],  # we only take the first image as input
+                )
+            ],
+        )
+        request = CritiqueRequest(template=template, fields={"prompt": input_text})
+
+        # send to critique request
+        result = metric_service.make_critique_request(request)
+        if not result or not result.responses:
+            # Skip computing metrics if there aren't any responses yet
+            hlog("Waiting for responses to be generated.")
+            return []
+
+        stats: Dict[str, Stat] = {}
+        for question in template.questions:
+            stats[question.name] = Stat(MetricName(question.name))
+
+        for response in result.responses:
+            for answer_name, answer in response.answers.items():
+                assert isinstance(answer, str)
+                answer_value: float
+                answer_value = self.ORIGINALITY_ANSWER_TO_SCORE[answer]
+                stats[answer_name].add(answer_value)
+
+        return list(stats.values())
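
The metric turns each respondent's Likert-style answer into a number via ORIGINALITY_ANSWER_TO_SCORE and accumulates it in a Stat. A minimal sketch of that aggregation step, assuming HELM's Stat exposes the usual add()/mean interface (the sample answers are invented):

    stat = Stat(MetricName("originality_gpt4v"))
    for answer in [
        "I find the text to be fresh and original.",  # maps to 4
        "Neutral.",                                    # maps to 3
    ]:
        stat.add(GPT4VCritiqueMetric.ORIGINALITY_ANSWER_TO_SCORE[answer])
    # stat.mean is 3.5 across the two simulated respondents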

helm/benchmark/metrics/instruction_following_critique_metrics.py

@@ -1,3 +1,4 @@
+# noqa: E501
 from typing import Dict, List

 from helm.benchmark.adaptation.request_state import RequestState

helm/benchmark/metrics/live_qa_metrics.py

@@ -0,0 +1,23 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class LiveQAScoreMetric(Metric):
+    """Score metrics for LiveQA."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        score = request_state.annotations["live_qa"]["score"]
+        return [Stat(MetricName("live_qa_score")).add(score)]

helm/benchmark/metrics/medication_qa_metrics.py

@@ -0,0 +1,23 @@
+from typing import List
+
+from helm.benchmark.adaptation.adapter_spec import AdapterSpec
+from helm.benchmark.adaptation.request_state import RequestState
+from helm.benchmark.metrics.metric import Metric
+from helm.benchmark.metrics.metric_name import MetricName
+from helm.benchmark.metrics.metric_service import MetricService
+from helm.benchmark.metrics.statistic import Stat
+
+
+class MedicationQAScoreMetric(Metric):
+    """Score metrics for MedicationQA."""
+
+    def evaluate_generation(
+        self,
+        adapter_spec: AdapterSpec,
+        request_state: RequestState,
+        metric_service: MetricService,
+        eval_cache_path: str,
+    ) -> List[Stat]:
+        assert request_state.annotations
+        score = request_state.annotations["medication_qa"]["score"]
+        return [Stat(MetricName("medication_qa_score")).add(score)]
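
Both score metrics assume that an annotator (the new live_qa_annotator.py and medication_qa_annotator.py listed above) has already attached a score to each RequestState. A sketch of the annotation shape they read, with invented values:

    # Hypothetical request_state.annotations payload; the keys mirror the lookups above.
    annotations = {
        "live_qa": {"score": 3.0},        # consumed by LiveQAScoreMetric
        "medication_qa": {"score": 1.0},  # consumed by MedicationQAScoreMetric
    }
    Stat(MetricName("live_qa_score")).add(annotations["live_qa"]["score"])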