evalscope 0.7.1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (233)
  1. evalscope/__init__.py +1 -1
  2. evalscope/arguments.py +73 -0
  3. evalscope/backend/base.py +5 -1
  4. evalscope/backend/opencompass/api_meta_template.py +8 -14
  5. evalscope/backend/opencompass/backend_manager.py +24 -15
  6. evalscope/backend/opencompass/tasks/eval_api.py +1 -6
  7. evalscope/backend/opencompass/tasks/eval_datasets.py +26 -28
  8. evalscope/backend/rag_eval/__init__.py +3 -3
  9. evalscope/backend/rag_eval/backend_manager.py +21 -25
  10. evalscope/backend/rag_eval/clip_benchmark/__init__.py +1 -1
  11. evalscope/backend/rag_eval/clip_benchmark/arguments.py +6 -6
  12. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +62 -79
  13. evalscope/backend/rag_eval/clip_benchmark/task_template.py +29 -43
  14. evalscope/backend/rag_eval/clip_benchmark/tasks/image_caption.py +20 -22
  15. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +16 -23
  16. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_retrieval.py +14 -35
  17. evalscope/backend/rag_eval/clip_benchmark/utils/webdataset_convert.py +69 -90
  18. evalscope/backend/rag_eval/cmteb/__init__.py +3 -3
  19. evalscope/backend/rag_eval/cmteb/arguments.py +25 -27
  20. evalscope/backend/rag_eval/cmteb/base.py +22 -23
  21. evalscope/backend/rag_eval/cmteb/task_template.py +15 -17
  22. evalscope/backend/rag_eval/cmteb/tasks/Classification.py +98 -79
  23. evalscope/backend/rag_eval/cmteb/tasks/Clustering.py +17 -22
  24. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +17 -19
  25. evalscope/backend/rag_eval/cmteb/tasks/PairClassification.py +35 -29
  26. evalscope/backend/rag_eval/cmteb/tasks/Reranking.py +18 -5
  27. evalscope/backend/rag_eval/cmteb/tasks/Retrieval.py +163 -163
  28. evalscope/backend/rag_eval/cmteb/tasks/STS.py +126 -104
  29. evalscope/backend/rag_eval/cmteb/tasks/__init__.py +33 -34
  30. evalscope/backend/rag_eval/ragas/__init__.py +2 -2
  31. evalscope/backend/rag_eval/ragas/arguments.py +3 -8
  32. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +9 -9
  33. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +2 -2
  34. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +3 -3
  35. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +5 -5
  36. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +7 -0
  37. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +8 -8
  38. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +5 -5
  39. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +7 -5
  40. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +2 -2
  41. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  42. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  43. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +27 -4
  44. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +2 -2
  45. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +2 -2
  46. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +5 -5
  47. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +3 -3
  48. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +21 -4
  49. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +3 -3
  50. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +4 -4
  51. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +2 -2
  52. evalscope/backend/rag_eval/ragas/prompts/persona_prompt.py +0 -1
  53. evalscope/backend/rag_eval/ragas/task_template.py +10 -15
  54. evalscope/backend/rag_eval/ragas/tasks/__init__.py +1 -1
  55. evalscope/backend/rag_eval/ragas/tasks/build_distribution.py +45 -0
  56. evalscope/backend/rag_eval/ragas/tasks/build_transform.py +135 -0
  57. evalscope/backend/rag_eval/ragas/tasks/testset_generation.py +17 -133
  58. evalscope/backend/rag_eval/ragas/tasks/translate_prompt.py +8 -18
  59. evalscope/backend/rag_eval/utils/clip.py +46 -50
  60. evalscope/backend/rag_eval/utils/embedding.py +12 -11
  61. evalscope/backend/rag_eval/utils/llm.py +8 -6
  62. evalscope/backend/rag_eval/utils/tools.py +12 -11
  63. evalscope/backend/vlm_eval_kit/__init__.py +1 -1
  64. evalscope/backend/vlm_eval_kit/custom_dataset.py +7 -8
  65. evalscope/benchmarks/arc/__init__.py +3 -2
  66. evalscope/benchmarks/arc/ai2_arc.py +19 -16
  67. evalscope/benchmarks/arc/arc_adapter.py +32 -24
  68. evalscope/benchmarks/bbh/__init__.py +1 -2
  69. evalscope/benchmarks/bbh/bbh_adapter.py +28 -25
  70. evalscope/benchmarks/bbh/cot_prompts/boolean_expressions.txt +1 -1
  71. evalscope/benchmarks/bbh/cot_prompts/causal_judgement.txt +1 -1
  72. evalscope/benchmarks/bbh/cot_prompts/date_understanding.txt +1 -1
  73. evalscope/benchmarks/bbh/cot_prompts/disambiguation_qa.txt +1 -1
  74. evalscope/benchmarks/bbh/cot_prompts/dyck_languages.txt +1 -1
  75. evalscope/benchmarks/bbh/cot_prompts/formal_fallacies.txt +1 -1
  76. evalscope/benchmarks/bbh/cot_prompts/geometric_shapes.txt +1 -1
  77. evalscope/benchmarks/bbh/cot_prompts/hyperbaton.txt +1 -1
  78. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_five_objects.txt +1 -1
  79. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_seven_objects.txt +1 -1
  80. evalscope/benchmarks/bbh/cot_prompts/logical_deduction_three_objects.txt +1 -1
  81. evalscope/benchmarks/bbh/cot_prompts/movie_recommendation.txt +1 -1
  82. evalscope/benchmarks/bbh/cot_prompts/multistep_arithmetic_two.txt +1 -1
  83. evalscope/benchmarks/bbh/cot_prompts/navigate.txt +1 -1
  84. evalscope/benchmarks/bbh/cot_prompts/object_counting.txt +1 -1
  85. evalscope/benchmarks/bbh/cot_prompts/penguins_in_a_table.txt +1 -1
  86. evalscope/benchmarks/bbh/cot_prompts/reasoning_about_colored_objects.txt +1 -1
  87. evalscope/benchmarks/bbh/cot_prompts/ruin_names.txt +1 -1
  88. evalscope/benchmarks/bbh/cot_prompts/salient_translation_error_detection.txt +1 -1
  89. evalscope/benchmarks/bbh/cot_prompts/snarks.txt +1 -1
  90. evalscope/benchmarks/bbh/cot_prompts/sports_understanding.txt +1 -1
  91. evalscope/benchmarks/bbh/cot_prompts/temporal_sequences.txt +1 -1
  92. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_five_objects.txt +1 -1
  93. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_seven_objects.txt +1 -1
  94. evalscope/benchmarks/bbh/cot_prompts/tracking_shuffled_objects_three_objects.txt +1 -1
  95. evalscope/benchmarks/bbh/cot_prompts/web_of_lies.txt +1 -1
  96. evalscope/benchmarks/bbh/cot_prompts/word_sorting.txt +1 -1
  97. evalscope/benchmarks/benchmark.py +16 -16
  98. evalscope/benchmarks/ceval/__init__.py +3 -2
  99. evalscope/benchmarks/ceval/ceval_adapter.py +80 -69
  100. evalscope/benchmarks/ceval/ceval_exam.py +18 -31
  101. evalscope/benchmarks/cmmlu/__init__.py +3 -2
  102. evalscope/benchmarks/cmmlu/cmmlu.py +87 -92
  103. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +109 -155
  104. evalscope/benchmarks/cmmlu/samples.jsonl +1 -1
  105. evalscope/benchmarks/competition_math/__init__.py +3 -2
  106. evalscope/benchmarks/competition_math/competition_math.py +7 -16
  107. evalscope/benchmarks/competition_math/competition_math_adapter.py +32 -34
  108. evalscope/benchmarks/data_adapter.py +24 -24
  109. evalscope/benchmarks/general_qa/__init__.py +3 -2
  110. evalscope/benchmarks/general_qa/general_qa_adapter.py +34 -38
  111. evalscope/benchmarks/gsm8k/__init__.py +1 -1
  112. evalscope/benchmarks/gsm8k/gsm8k.py +6 -12
  113. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +26 -24
  114. evalscope/benchmarks/hellaswag/__init__.py +3 -2
  115. evalscope/benchmarks/hellaswag/hellaswag.py +15 -19
  116. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +27 -23
  117. evalscope/benchmarks/humaneval/__init__.py +1 -1
  118. evalscope/benchmarks/humaneval/humaneval.py +15 -18
  119. evalscope/benchmarks/humaneval/humaneval_adapter.py +0 -1
  120. evalscope/benchmarks/mmlu/__init__.py +3 -2
  121. evalscope/benchmarks/mmlu/mmlu.py +15 -29
  122. evalscope/benchmarks/mmlu/mmlu_adapter.py +85 -77
  123. evalscope/benchmarks/race/__init__.py +3 -2
  124. evalscope/benchmarks/race/race.py +21 -35
  125. evalscope/benchmarks/race/race_adapter.py +32 -29
  126. evalscope/benchmarks/race/samples.jsonl +1 -1
  127. evalscope/benchmarks/trivia_qa/__init__.py +3 -2
  128. evalscope/benchmarks/trivia_qa/samples.jsonl +1 -1
  129. evalscope/benchmarks/trivia_qa/trivia_qa.py +19 -34
  130. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +27 -22
  131. evalscope/benchmarks/truthful_qa/__init__.py +3 -2
  132. evalscope/benchmarks/truthful_qa/truthful_qa.py +25 -29
  133. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +36 -37
  134. evalscope/cli/cli.py +6 -5
  135. evalscope/cli/start_eval.py +31 -0
  136. evalscope/cli/start_perf.py +0 -3
  137. evalscope/cli/start_server.py +27 -41
  138. evalscope/config.py +119 -95
  139. evalscope/constants.py +61 -29
  140. evalscope/evaluator/__init__.py +1 -0
  141. evalscope/evaluator/evaluator.py +96 -377
  142. evalscope/evaluator/humaneval_evaluator.py +158 -0
  143. evalscope/evaluator/rating_eval.py +12 -33
  144. evalscope/evaluator/reviewer/auto_reviewer.py +47 -76
  145. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +10 -20
  146. evalscope/metrics/code_metric.py +3 -9
  147. evalscope/metrics/math_accuracy.py +3 -6
  148. evalscope/metrics/metrics.py +21 -21
  149. evalscope/metrics/rouge_metric.py +11 -25
  150. evalscope/models/__init__.py +1 -2
  151. evalscope/models/api/openai_api.py +40 -29
  152. evalscope/models/custom/__init__.py +0 -1
  153. evalscope/models/custom/custom_model.py +3 -3
  154. evalscope/models/dummy_chat_model.py +7 -8
  155. evalscope/models/model_adapter.py +89 -156
  156. evalscope/models/openai_model.py +20 -20
  157. evalscope/perf/arguments.py +15 -3
  158. evalscope/perf/benchmark.py +7 -9
  159. evalscope/perf/http_client.py +3 -8
  160. evalscope/perf/main.py +10 -0
  161. evalscope/perf/plugin/api/custom_api.py +1 -2
  162. evalscope/perf/plugin/api/dashscope_api.py +1 -2
  163. evalscope/perf/plugin/api/openai_api.py +3 -4
  164. evalscope/perf/plugin/datasets/base.py +1 -2
  165. evalscope/perf/plugin/datasets/flickr8k.py +1 -2
  166. evalscope/perf/plugin/datasets/longalpaca.py +1 -2
  167. evalscope/perf/plugin/datasets/openqa.py +1 -2
  168. evalscope/perf/utils/analysis_result.py +1 -2
  169. evalscope/perf/utils/benchmark_util.py +1 -2
  170. evalscope/perf/utils/db_util.py +11 -8
  171. evalscope/perf/utils/local_server.py +19 -13
  172. evalscope/registry/config/cfg_arena_zhihu.yaml +1 -1
  173. evalscope/registry/tasks/arc.yaml +2 -3
  174. evalscope/registry/tasks/bbh.yaml +3 -4
  175. evalscope/registry/tasks/bbh_mini.yaml +3 -4
  176. evalscope/registry/tasks/ceval.yaml +3 -3
  177. evalscope/registry/tasks/ceval_mini.yaml +3 -4
  178. evalscope/registry/tasks/cmmlu.yaml +3 -3
  179. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +1 -1
  180. evalscope/registry/tasks/general_qa.yaml +1 -1
  181. evalscope/registry/tasks/gsm8k.yaml +2 -2
  182. evalscope/registry/tasks/mmlu.yaml +3 -3
  183. evalscope/registry/tasks/mmlu_mini.yaml +3 -3
  184. evalscope/run.py +184 -375
  185. evalscope/run_arena.py +20 -25
  186. evalscope/summarizer.py +16 -17
  187. evalscope/third_party/longbench_write/README.md +99 -42
  188. evalscope/third_party/longbench_write/default_task.json +1 -1
  189. evalscope/third_party/longbench_write/default_task.yaml +8 -7
  190. evalscope/third_party/longbench_write/eval.py +29 -28
  191. evalscope/third_party/longbench_write/infer.py +16 -104
  192. evalscope/third_party/longbench_write/longbench_write.py +5 -5
  193. evalscope/third_party/longbench_write/resources/judge.txt +1 -1
  194. evalscope/third_party/longbench_write/tools/data_etl.py +4 -5
  195. evalscope/third_party/longbench_write/utils.py +0 -1
  196. evalscope/third_party/toolbench_static/eval.py +14 -15
  197. evalscope/third_party/toolbench_static/infer.py +48 -69
  198. evalscope/third_party/toolbench_static/llm/swift_infer.py +4 -12
  199. evalscope/third_party/toolbench_static/requirements.txt +1 -1
  200. evalscope/third_party/toolbench_static/toolbench_static.py +3 -3
  201. evalscope/tools/combine_reports.py +25 -30
  202. evalscope/tools/rewrite_eval_results.py +14 -46
  203. evalscope/utils/__init__.py +0 -1
  204. evalscope/utils/arena_utils.py +18 -48
  205. evalscope/{perf/utils → utils}/chat_service.py +3 -4
  206. evalscope/utils/completion_parsers.py +3 -8
  207. evalscope/utils/logger.py +9 -7
  208. evalscope/utils/model_utils.py +11 -0
  209. evalscope/utils/utils.py +12 -138
  210. evalscope/version.py +2 -2
  211. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/METADATA +125 -120
  212. evalscope-0.8.0.dist-info/RECORD +285 -0
  213. tests/cli/test_run.py +54 -15
  214. tests/perf/test_perf.py +4 -0
  215. tests/rag/test_clip_benchmark.py +38 -38
  216. tests/rag/test_mteb.py +3 -2
  217. tests/rag/test_ragas.py +5 -5
  218. tests/swift/test_run_swift_eval.py +2 -3
  219. tests/swift/test_run_swift_vlm_eval.py +2 -3
  220. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -3
  221. evalscope/backend/rag_eval/ragas/metrics/__init__.py +0 -2
  222. evalscope/backend/rag_eval/ragas/metrics/multi_modal_faithfulness.py +0 -91
  223. evalscope/backend/rag_eval/ragas/metrics/multi_modal_relevance.py +0 -99
  224. evalscope/cache.py +0 -98
  225. evalscope/models/template.py +0 -1446
  226. evalscope/run_ms.py +0 -140
  227. evalscope/utils/task_cfg_parser.py +0 -10
  228. evalscope/utils/task_utils.py +0 -22
  229. evalscope-0.7.1.dist-info/RECORD +0 -286
  230. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/LICENSE +0 -0
  231. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/WHEEL +0 -0
  232. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/entry_points.txt +0 -0
  233. {evalscope-0.7.1.dist-info → evalscope-0.8.0.dist-info}/top_level.txt +0 -0

evalscope/evaluator/humaneval_evaluator.py (new file, +158 -0)

@@ -0,0 +1,158 @@
+ import json
+ import os
+ import re
+ from tqdm import tqdm
+ from typing import List, Optional
+
+ from evalscope.constants import OutputsStructure
+ from evalscope.evaluator.evaluator import logger
+ from evalscope.models.model_adapter import BaseModelAdapter
+ from evalscope.tools.combine_reports import gen_table
+ from evalscope.utils import normalize_score
+
+
+ class HumanevalEvaluator(object):
+
+     def __init__(
+         self,
+         problem_file: str,
+         model_id: str,
+         model_revision: str,
+         model_adapter: BaseModelAdapter,
+         outputs: Optional[OutputsStructure] = None,
+         k: List[int] = [1, 10, 100],
+         n_workers: int = 4,
+         timeout: float = 3.0,
+     ):
+         try:
+             from human_eval.data import read_problems, write_jsonl
+             from human_eval.evaluation import evaluate_functional_correctness
+         except ImportError:
+             raise ImportError('Please install human_eval:'
+                               'https://github.com/openai/human-eval/tree/master#installation , '
+                               'Note that you need to enable the execution code in the human_eval/execution.py first.')
+
+         self.problem_file = problem_file
+         self.k = k
+         self.num_workers = n_workers
+         self.timeout = timeout
+         self.model_adapter = model_adapter
+
+         self.read_problems_func = read_problems
+         self.write_jsonl_func = write_jsonl
+         self.eval_func = evaluate_functional_correctness
+
+         # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
+         self.problems = self.read_problems_func(self.problem_file)
+
+         # Deal with the output paths
+         self.outputs_structure = OutputsStructure(outputs)
+
+     def get_answers(self, infer_cfg: dict) -> List[dict]:
+         ans_list: list = []
+         system_prompt: str = 'Complete the following python code:\n'
+         for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
+             prompt: str = system_prompt + data_d['prompt']
+             inputs: dict = {'data': [prompt]}
+             # pred_res: dict = self.model_adapter.predict(inputs)
+
+             pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
+
+             pred_ans: str = pred_res['choices'][0]['message']['content']
+             pred_ans = self._postprocess(pred_ans)
+
+             ans_list.append({'task_id': task_id, 'completion': pred_ans})
+
+         return ans_list
+
+     def eval(self, infer_cfg: dict, **kwargs):
+
+         # predict
+         ans_list: list = self.get_answers(infer_cfg)
+         ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
+
+         self.write_jsonl_func(filename=ans_out_file, data=ans_list)
+         # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
+         logger.info('** Dump predictions successfully.')
+
+         # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
+         results = self.eval_func(
+             sample_file=ans_out_file,
+             k=self.k,
+             n_workers=self.num_workers,
+             timeout=self.timeout,
+             problem_file=self.problem_file)
+
+         # output: report
+         report_map: dict = self.gen_report(results=results)
+         report_dir: str = self.outputs_structure.reports_dir
+         report_file: str = os.path.join(report_dir, 'human_eval_report.json')
+
+         with open(report_file, 'w') as f:
+             f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
+         # logger.info(f'** Dump report to {report_file} \n')
+         logger.info('** Dump report \n')
+
+         try:
+             # Make table
+             report_table: str = gen_table([report_dir])
+             logger.info(f'** Report table: \n {report_table} \n')
+         except Exception:
+             logger.error('Failed to generate report table.')
+
+     def gen_report(self, results: dict) -> dict:
+         """
+         Generate report from evaluation results.
+
+         Returns:
+         {
+             "name":"ARC-Challenge",
+             "metric":"WeightedAverageAccuracy",
+             "score":0.3389,
+             "category":[
+                 {
+                     "name":"DEFAULT",
+                     "score":0.3389,
+                     "subset":[
+                         {
+                             "name":"ARC-Challenge",
+                             "score":0.3389
+                         },
+                     ]
+                 }
+             ],
+             "total_num":100
+         }
+         """
+         results = {k: normalize_score(score=v) for k, v in results.items()}
+
+         category_d = dict(name='DEFAULT', score=results, subset=[])
+
+         res_map = dict(
+             name='HumanEval', metric='pass@k', score=results, category=[category_d], total_num=len(self.problems))
+
+         return res_map
+
+     @classmethod
+     def _postprocess(cls, text: str) -> str:
+         if '```' in text:
+             blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+             if len(blocks) == 0:
+                 text = text.split('```')[1]  # fall back to default strategy
+             else:
+                 text = blocks[0]  # fetch the first code block
+                 if not text.startswith('\n'):  # in case starting with ```python
+                     text = text[max(text.find('\n') + 1, 0):]
+         if text.strip().startswith('from') or text.strip().startswith('import'):
+             def_idx = text.find('def')
+             if def_idx != -1:
+                 text = text[max(text.find('\n', def_idx) + 1, 0):]
+         text = text.split('\n\n')[0]
+         if text.strip().startswith('def'):
+             text = '\n'.join(text.split('\n')[1:])
+         if not text.startswith('    '):
+             if text.startswith(' '):
+                 text = '    ' + text.lstrip()
+             else:
+                 text = '\n'.join(['    ' + line for line in text.split('\n')])
+         return text
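
The new evalscope/evaluator/humaneval_evaluator.py above wraps OpenAI's human-eval harness: get_answers() prompts the model once per problem, _postprocess() trims the raw completion down to a function body, and eval() runs the functional-correctness check and writes a JSON report. The following is only a minimal driver sketch, not something shipped in this release: it assumes human-eval is installed with its execution sandbox enabled, and the adapter, paths and identifiers are hypothetical placeholders. At runtime any object whose predict(inputs, infer_cfg) returns an OpenAI-style dict works, since ['choices'][0]['message']['content'] is the only field get_answers() reads.

    from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator


    class EchoAdapter:
        """Hypothetical stand-in for a BaseModelAdapter, used only for illustration."""

        def predict(self, inputs: dict, infer_cfg: dict) -> dict:
            # Return a fixed completion in the OpenAI response shape the evaluator expects.
            return {'choices': [{'message': {'content': '    return None\n'}}]}


    evaluator = HumanevalEvaluator(
        problem_file='HumanEval.jsonl.gz',  # placeholder path to the human-eval problem file
        model_id='dummy-model',             # placeholder identifiers
        model_revision='main',
        model_adapter=EchoAdapter(),
        k=[1],                              # report pass@1 only
        n_workers=4,
        timeout=3.0,
    )
    # Writes human_eval_predictions.jsonl and human_eval_report.json under the outputs dirs.
    evaluator.eval(infer_cfg={'max_new_tokens': 512, 'temperature': 0.0})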
evalscope/evaluator/rating_eval.py (+12 -33)

@@ -1,24 +1,17 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- from typing import List, Union
-
  import pandas as pd
  import pyarrow as pa
+ from typing import List, Union
 
  from evalscope.constants import MetricMembers
+ from evalscope.utils import jsonl_to_list
  from evalscope.utils.arena_utils import compute_elo
  from evalscope.utils.logger import get_logger
- from evalscope.utils import jsonl_to_list
 
  logger = get_logger()
 
- DEFAULT_COLUMNS_MAPPING = {
-     'model_a': 'model_a',
-     'model_b': 'model_b',
-     'win': 'win',
-     'tstamp': 'ts',
-     'language': 'lang'
- }
+ DEFAULT_COLUMNS_MAPPING = {'model_a': 'model_a', 'model_b': 'model_b', 'win': 'win', 'tstamp': 'ts', 'language': 'lang'}
 
 
  class RatingEvaluate(object):
@@ -41,10 +34,9 @@ class RatingEvaluate(object):
          elo_ratings = compute_elo(battles)
          col_model = 'Model'
          col_elo_rating = 'Elo_Rating'
-         elo_ratings_res = pd.DataFrame(
-             [[n, elo_ratings[n]] for n in elo_ratings.keys()],
-             columns=[col_model, col_elo_rating]).sort_values(
-                 col_elo_rating, ascending=False).reset_index(drop=True)
+         elo_ratings_res = pd.DataFrame([[n, elo_ratings[n]] for n in elo_ratings.keys()],
+                                        columns=[col_model, col_elo_rating]).sort_values(
+                                            col_elo_rating, ascending=False).reset_index(drop=True)
          elo_ratings_res = elo_ratings_res.round({col_elo_rating: 1})
          return elo_ratings_res
 
@@ -89,23 +81,11 @@ class RatingEvaluate(object):
                  'tie': 1
              }]
          else:
-             return [{
-                 'model': winner,
-                 'win': 1,
-                 'loss': 0,
-                 'tie': 0
-             }, {
-                 'model': loser,
-                 'win': 0,
-                 'loss': 1,
-                 'tie': 0
-             }]
+             return [{'model': winner, 'win': 1, 'loss': 0, 'tie': 0}, {'model': loser, 'win': 0, 'loss': 1, 'tie': 0}]
 
      def compute_pairwise_rating(self, raw_data):
          df_all = self.preprocess(raw_data_df=raw_data)
-         model_list = (
-             df_all['model_a'].unique().tolist()
-             + df_all['model_b'].unique().tolist())
+         model_list = (df_all['model_a'].unique().tolist() + df_all['model_b'].unique().tolist())
          model_list = list(set(model_list))
 
          list_res = []
@@ -114,8 +94,7 @@ class RatingEvaluate(object):
              if self.baseline_model is not None:
                  if self.baseline_model not in [row['model_a'], row['model_b']]:
                      logger.warning(
-                         f'One of the models in the battle should be the baseline model: {self.baseline_model}'
-                     )
+                         f'One of the models in the battle should be the baseline model: {self.baseline_model}')
                      continue
              rating = self.get_single_pairwise_rating(row)
              list_res = list_res + rating
@@ -149,15 +128,15 @@ class RatingEvaluate(object):
 
          for metric in self.metrics:
 
-             if metric == MetricMembers.ELO.value:
+             if metric == MetricMembers.ELO:
                  res = self.compute_elo_rating(raw_data)
                  res_all.append(res)
 
-             elif metric == MetricMembers.PAIRWISE.value:
+             elif metric == MetricMembers.PAIRWISE:
                  res = self.compute_pairwise_rating(raw_data)
                  res_all.append(res)
 
-             elif metric == MetricMembers.SCORE.value:
+             elif metric == MetricMembers.SCORE:
                  res = self.compute_score_rating(raw_data)
                  res_all.append(res)
 
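For context on the Elo path above: compute_elo (imported from evalscope.utils.arena_utils, not part of this diff) turns pairwise battle records into per-model ratings, which compute_elo_rating then sorts and rounds into a DataFrame. Below is a generic online-Elo sketch, not evalscope's implementation; the constants (K=4, scale 400, base 10, initial rating 1000) and the 'model_a'/'model_b'/'tie' convention for the win column are common arena defaults assumed here for illustration.

    from collections import defaultdict

    import pandas as pd


    def elo_from_battles(battles: pd.DataFrame, k: float = 4, scale: float = 400,
                         base: float = 10, init: float = 1000) -> dict:
        # Sequential Elo update over rows with columns model_a, model_b, win.
        rating = defaultdict(lambda: init)
        for _, row in battles.iterrows():
            ra, rb = rating[row['model_a']], rating[row['model_b']]
            expected_a = 1 / (1 + base**((rb - ra) / scale))  # expected score of model_a
            score_a = {'model_a': 1.0, 'model_b': 0.0, 'tie': 0.5}[row['win']]
            rating[row['model_a']] += k * (score_a - expected_a)
            rating[row['model_b']] -= k * (score_a - expected_a)
        return dict(rating)


    battles = pd.DataFrame([
        {'model_a': 'chatglm2-6b', 'model_b': 'llama2-7b', 'win': 'model_a'},
        {'model_a': 'chatglm2-6b', 'model_b': 'llama2-7b', 'win': 'tie'},
    ])
    print(elo_from_battles(battles))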
evalscope/evaluator/reviewer/auto_reviewer.py (+47 -76)

@@ -2,6 +2,7 @@
  # flake8: noqa
 
  import os
+ import pandas as pd
  import random
  import sys
  import time
@@ -9,15 +10,10 @@ from abc import ABC, abstractmethod
  from functools import partial
  from typing import Any, List
 
- import pandas as pd
-
  from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
  from evalscope.models.openai_model import OpenAIModel
- from evalscope.utils import completion_parsers
- from evalscope.utils.arena_utils import (get_battle_pairs,
-                                          merge_ques_ans,
-                                          shuffle_pairwise_preferences)
- from evalscope.utils import dump_jsonl_data, jsonl_to_list, random_seeded_choice
+ from evalscope.utils import completion_parsers, dump_jsonl_data, jsonl_to_list, random_seeded_choice
+ from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
@@ -33,8 +29,7 @@ class BaseReviewer(ABC):
          """
          Run pairwise battles with given models.
          """
-         raise NotImplementedError(
-             'run() method must be implemented in your subclass.')
+         raise NotImplementedError('run() method must be implemented in your subclass.')
 
 
  class AutoReviewerGpt4(BaseReviewer):
@@ -71,13 +66,9 @@ class AutoReviewerGpt4(BaseReviewer):
 
          self.review_result_file = review_result_file
         self.prompt_list = jsonl_to_list(prompt_file)
-         self.answer_list = [
-             jsonl_to_list(answer_file) for answer_file in answer_file_list
-         ]
-         self.reference_list = jsonl_to_list(
-             reference_file) if reference_file else []
-         self.cache_list = jsonl_to_list(
-             cache_file) if cache_file and os.path.isfile(cache_file) else []
+         self.answer_list = [jsonl_to_list(answer_file) for answer_file in answer_file_list]
+         self.reference_list = jsonl_to_list(reference_file) if reference_file else []
+         self.cache_list = jsonl_to_list(cache_file) if cache_file and os.path.isfile(cache_file) else []
 
          self.reviewer_args = reviewer_args if reviewer_args \
              else self._get_default_args()
@@ -88,24 +79,18 @@ class AutoReviewerGpt4(BaseReviewer):
              self.answer_list.append(jsonl_to_list(baseline_file))
              self.baseline_idx = len(self.answer_list) - 1
 
-         self.position_bias_mitigation = self.reviewer_args.pop(
-             EvalConfigKeys.POSITION_BIAS_MITIGATION,
-             PositionBiasMitigation.NONE)
+         self.position_bias_mitigation = self.reviewer_args.pop(EvalConfigKeys.POSITION_BIAS_MITIGATION,
+                                                                PositionBiasMitigation.NONE)
          if self.position_bias_mitigation == PositionBiasMitigation.RANDOMIZE_ORDER:
-             self.random_seed = self.reviewer_args.pop(
-                 EvalConfigKeys.RANDOM_SEED, 123)
-
-         fn_completion_parser = self.reviewer_args.pop(
-             EvalConfigKeys.FN_COMPLETION_PARSER,
-             FnCompletionParser.LMSYS_PARSER)
-         completion_parser_kwargs = self.reviewer_args.pop(
-             EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
+             self.random_seed = self.reviewer_args.pop(EvalConfigKeys.RANDOM_SEED, 123)
+
+         fn_completion_parser = self.reviewer_args.pop(EvalConfigKeys.FN_COMPLETION_PARSER,
+                                                       FnCompletionParser.LMSYS_PARSER)
+         completion_parser_kwargs = self.reviewer_args.pop(EvalConfigKeys.COMPLETION_PARSER_KWARGS, {})
          if isinstance(fn_completion_parser, str):
-             fn_completion_parser = getattr(completion_parsers,
-                                            fn_completion_parser)
+             fn_completion_parser = getattr(completion_parsers, fn_completion_parser)
 
-         self.fn_completion_parser = partial(fn_completion_parser,
-                                             **completion_parser_kwargs)
+         self.fn_completion_parser = partial(fn_completion_parser, **completion_parser_kwargs)
          self.gpt_predictor = OpenAIModel(model_cfg=self.reviewer_args)
 
      @staticmethod
@@ -133,45 +118,35 @@ class AutoReviewerGpt4(BaseReviewer):
          # Default to general category (idx 0)
          target_prompt_dict = prompts_list[0]
          for item in prompts_list:
-             is_category_match = category in item['category'] if isinstance(
-                 item['category'], list) else item['category'] == category
+             is_category_match = category in item['category'] if isinstance(item['category'],
+                                                                            list) else item['category'] == category
              is_type_match = item.get('type', ArenaMode.PAIRWISE) == type
              if is_category_match and is_type_match:
                  target_prompt_dict = item
                  break
-             elif is_type_match and target_prompt_dict.get('type',
-                                                           ArenaMode.PAIRWISE) != type:
+             elif is_type_match and target_prompt_dict.get('type', ArenaMode.PAIRWISE) != type:
                  target_prompt_dict = item  # fallback to type match
 
          sys_prompt = target_prompt_dict['system_prompt']
          prompt_template = target_prompt_dict['prompt_template']
          defaults = target_prompt_dict.get('defaults', dict({}))
-         output_format = target_prompt_dict.get('output_format',
-                                                '[[rating_a,rating_b]]')
+         output_format = target_prompt_dict.get('output_format', '[[rating_a,rating_b]]')
 
          if type == ArenaMode.SINGLE:
-             user_prompt = prompt_template.format(
-                 question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
+             user_prompt = prompt_template.format(question=ques, answer=ans1, ref_answer_1=ans_ref, **defaults)
          else:
              user_prompt = prompt_template.format(
-                 question=ques,
-                 answer_a=ans1,
-                 answer_b=ans2,
-                 ref_answer_1=ans_ref,
-                 **defaults)
+                 question=ques, answer_a=ans1, answer_b=ans2, ref_answer_1=ans_ref, **defaults)
 
          return sys_prompt, user_prompt, output_format
 
      def get_review_cache(self, model_a, model_b, question) -> list:
          if model_b:
-             cache_hit = next(
-                 (r for r in self.cache_list if r['model_a'] == model_a
-                  and r['model_b'] == model_b and r['question'] == question),
-                 None)
+             cache_hit = next((r for r in self.cache_list
+                               if r['model_a'] == model_a and r['model_b'] == model_b and r['question'] == question),
+                              None)
          else:
-             cache_hit = next(
-                 (r for r in self.cache_list
-                  if r['model'] == model_a and r['question'] == question), None)
+             cache_hit = next((r for r in self.cache_list if r['model'] == model_a and r['question'] == question), None)
          return cache_hit
 
      def get_review_pair(self, item: List[dict], dry_run=False, **kwargs) -> dict:
@@ -265,12 +240,10 @@ class AutoReviewerGpt4(BaseReviewer):
          return review_result
 
      def _get_review_pair(self, model_a, model_b, question, category, ans1, ans2, dry_run=False, **kwargs) -> (str, Any):
-         input_msg = dict(
-             ques=question, category=category, ans1=ans1, ans2=ans2)
+         input_msg = dict(ques=question, category=category, ans1=ans1, ans2=ans2)
 
          if self.reference_list:
-             ans_ref = next((ref for ref in self.reference_list
-                             if ref.get('text') == question), None)
+             ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
              assert ans_ref['answer']
              input_msg['ans_ref'] = ans_ref['answer']
 
@@ -284,8 +257,7 @@ class AutoReviewerGpt4(BaseReviewer):
          else:
              review_text = self._get_reviewer_prediction(sys_prompt, user_prompt, **kwargs)
 
-         result = self.fn_completion_parser(
-             review_text, output_format=output_format)
+         result = self.fn_completion_parser(review_text, output_format=output_format)
          if not isinstance(result, tuple):
              result = (result, None)
          return review_text, *result
@@ -294,8 +266,7 @@ class AutoReviewerGpt4(BaseReviewer):
          input_msg = dict(ques=question, category=category, ans1=answer)
 
          if self.reference_list:
-             ans_ref = next((ref for ref in self.reference_list
-                             if ref.get('text') == question), None)
+             ans_ref = next((ref for ref in self.reference_list if ref.get('text') == question), None)
              assert ans_ref['answer']
              input_msg['ans_ref'] = ans_ref['answer']
 
@@ -312,8 +283,7 @@ class AutoReviewerGpt4(BaseReviewer):
          score = self.fn_completion_parser(review_text, output_format)
          return review_text, score
 
-     def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str,
-                                        output_format) -> str:
+     def _get_reviewer_prediction_dummy(self, sys_prompt: str, user_prompt: str, output_format) -> str:
          logger.info('Get dummy scores for input prompt ...')
          if output_format == '[[rating]]':
              return f'[[{round(random.random(), 2)}]]'
@@ -359,8 +329,7 @@ class AutoReviewerGpt4(BaseReviewer):
          if self.review_mode == ArenaMode.PAIRWISE:
              battle_pairs = get_battle_pairs(merged_ans_df.columns)
          elif self.review_mode == ArenaMode.PAIRWISE_BASELINE:
-             battle_pairs = get_battle_pairs(merged_ans_df.columns,
-                                             self.baseline_idx)
+             battle_pairs = get_battle_pairs(merged_ans_df.columns, self.baseline_idx)
          elif self.review_mode == ArenaMode.SINGLE:
              battle_pairs = [(col, ) for col in merged_ans_df.columns]
          else:
@@ -373,14 +342,12 @@ class AutoReviewerGpt4(BaseReviewer):
              pair_df.columns = ['output_1', 'output_2']
              pair_df['is_switched_outputs'] = pair_df.apply(
                  lambda x: random_seeded_choice(
-                     seed='is_switched_outputs' + x[0]['text'] + str(
-                         self.random_seed),
+                     seed='is_switched_outputs' + x[0]['text'] + str(self.random_seed),
                      choices=[False, True],
                  ),
                  axis=1,
              )
-             pair_df = shuffle_pairwise_preferences(
-                 pair_df, pair_df['is_switched_outputs'])
+             pair_df = shuffle_pairwise_preferences(pair_df, pair_df['is_switched_outputs'])
 
              for index, row in pair_df.iterrows():
                  row_result = self.get_review_pair(row.to_list(), dry_run=dry_run, **kwargs) \
@@ -395,17 +362,21 @@ if __name__ == '__main__':
 
      work_path = os.path.join(Path(__file__).absolute().parent, '../../../')
      prompt_template_path = os.path.join(work_path, 'evalscope/registry/data/prompt_template/prompt_templates.jsonl')
-     answer_file_list = [os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
-                         os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')]
+     answer_file_list = [
+         os.path.join(work_path, 'outputs/arena/default/answers/answer_chatglm2-6b.jsonl'),
+         os.path.join(work_path, 'outputs/arena/default/answers/answer_llama2-7b.jsonl')
+     ]
      review_result_file_path = os.path.join(work_path, 'outputs/arena/default/reviews/review_gpt4.jsonl')
 
-     input_kwargs = dict(prompt_file=prompt_template_path,
-                         answer_file_list=answer_file_list,
-                         review_result_file=review_result_file_path,
-                         reviewer_args={},
-                         baseline_file='',
-                         reference_file='',
-                         cache_file='', )
+     input_kwargs = dict(
+         prompt_file=prompt_template_path,
+         answer_file_list=answer_file_list,
+         review_result_file=review_result_file_path,
+         reviewer_args={},
+         baseline_file='',
+         reference_file='',
+         cache_file='',
+     )
 
      auto_reviewer = AutoReviewerGpt4(**input_kwargs)
      auto_reviewer.run(dry_run=True)
evalscope/metrics/bundled_rouge_score/rouge_scorer.py (+10 -20)

@@ -29,16 +29,17 @@ In these examples settings.xml lists input files and formats.
  """
 
  from __future__ import absolute_import, division, print_function
- import collections
- import re
- import os
 
+ import collections
  import nltk
  import numpy as np
+ import os
+ import re
  import six
  from absl import logging
  from rouge_score import scoring, tokenizers
  from six.moves import map, range
+
  from evalscope.utils import get_logger
 
  logger = get_logger()
@@ -81,11 +82,7 @@ class RougeScorer(scoring.BaseScorer):
      ... 'The quick brown dog jumps on the log.')
      """
 
-     def __init__(self,
-                  rouge_types,
-                  use_stemmer=False,
-                  split_summaries=False,
-                  tokenizer=None):
+     def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
 
          self.rouge_types = rouge_types
          if tokenizer:
@@ -160,21 +157,15 @@ class RougeScorer(scoring.BaseScorer):
                      sents = [x for x in sents if len(x)]
                      return sents
 
-                 target_tokens_list = [
-                     self._tokenizer.tokenize(s) for s in get_sents(target)
-                 ]
-                 prediction_tokens_list = [
-                     self._tokenizer.tokenize(s) for s in get_sents(prediction)
-                 ]
+                 target_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(target)]
+                 prediction_tokens_list = [self._tokenizer.tokenize(s) for s in get_sents(prediction)]
 
-                 scores = _summary_level_lcs(target_tokens_list,
-                                             prediction_tokens_list)
+                 scores = _summary_level_lcs(target_tokens_list, prediction_tokens_list)
              elif re.match(r'rouge[0-9]$', six.ensure_str(rouge_type)):
                  # Rouge from n-grams.
                  n = int(rouge_type[5:])
                  if n <= 0:
-                     raise ValueError('rougen requires positive n: %s'
-                                      % rouge_type)
+                     raise ValueError('rougen requires positive n: %s' % rouge_type)
                  target_ngrams = _create_ngrams(target_tokens, n)
                  prediction_ngrams = _create_ngrams(prediction_tokens, n)
                  scores = _score_ngrams(target_ngrams, prediction_ngrams)
@@ -349,8 +340,7 @@ def _score_ngrams(target_ngrams, prediction_ngrams):
 
      intersection_ngrams_count = 0
      for ngram in six.iterkeys(target_ngrams):
-         intersection_ngrams_count += min(target_ngrams[ngram],
-                                          prediction_ngrams[ngram])
+         intersection_ngrams_count += min(target_ngrams[ngram], prediction_ngrams[ngram])
      target_ngrams_count = sum(target_ngrams.values())
      prediction_ngrams_count = sum(prediction_ngrams.values())
 
evalscope/metrics/code_metric.py (+3 -9)

@@ -4,7 +4,6 @@ import inspect
  import re
  import signal
  from collections import defaultdict
-
  from tqdm import tqdm
 
 
@@ -20,8 +19,7 @@ def check_input(text, arg):
      code_block = code_block_pattern.search(text)
      code_string = code_block.group(1)
 
-     function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(',
-                                        re.DOTALL)
+     function_name_pattern = re.compile(r'def\s+([a-zA-Z_][a-zA-Z0-9_]*)\(', re.DOTALL)
      function_name_block = function_name_pattern.search(code_string)
      function_name = function_name_block.group(1)
 
@@ -52,9 +50,7 @@ def exec_func(func, arr):
 
 
  def compute_pass_k_one_sample(predict, func_args, func_outputs, k=4):
-     assert len(
-         predict
-     ) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
+     assert len(predict) >= k, f'pass@k must have {k} generations, now have {len(predict)}'
      for predict_i in predict[:k]:
          try:
              for arg, gold in zip(func_args, func_outputs):
@@ -87,9 +83,7 @@ def compute_pass_k(predict_l, reference_l, func_args_l, k=4, lang='py'):
  def run_code_eval(data_l, k=4, md_level=2):
      print(f"{'#' * md_level} Code Eval(pass@{k})")
      for data in tqdm(data_l):
-         data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'],
-                                                       data['func_args'],
-                                                       data['func_outputs'], k)
+         data[f'pass@{k}'] = compute_pass_k_one_sample(data['gen'], data['func_args'], data['func_outputs'], k)
      task_data_d = defaultdict(list)
      for data in data_l:
          for task in data['task_tags']:
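
A note on the two pass@k computations touched by this release: compute_pass_k_one_sample above scores a sample by checking only the first k generations, whereas the human-eval harness invoked by the new HumanevalEvaluator reports the unbiased pass@k estimator from the Codex paper (Chen et al., 2021). A small standalone sketch of that estimator, for reference:

    import numpy as np


    def pass_at_k(n: int, c: int, k: int) -> float:
        # Unbiased estimate of the probability that at least one of k samples,
        # drawn without replacement from n generations (c of them correct), passes.
        if n - c < k:
            return 1.0
        return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


    print(pass_at_k(n=20, c=3, k=1))   # 0.15
    print(pass_at_k(n=20, c=3, k=10))  # ~0.895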
evalscope/metrics/math_accuracy.py (+3 -6)

@@ -2,7 +2,6 @@
 
  import re
  from collections import defaultdict
-
  from tqdm import tqdm
 
  from evalscope.constants import MetricsConstant
@@ -44,8 +43,7 @@ def compute_math_accuracy(predict_l, reference_l):
  def run_math_eval(data_l, md_level=2):
      print(f"{'#' * md_level} Math Eval(math accuracy)")
      for data in tqdm(data_l):
-         data['math_accuracy'] = compute_math_accuracy_one_sample(
-             data['gen'], data['target'])
+         data['math_accuracy'] = compute_math_accuracy_one_sample(data['gen'], data['target'])
      task_data_d = defaultdict(list)
      for data in data_l:
          for task in data['task_tags']:
@@ -54,7 +52,6 @@ def run_math_eval(data_l, md_level=2):
      print(f'[total], count: {len(data_l)}, math accuracy: '
            f'{correct_cnt / len(data_l) * 100:0.2f}%')
      for task in task_data_d.keys():
-         correct_cnt = sum(
-             [data['math_accuracy'] for data in task_data_d[task]])
+         correct_cnt = sum([data['math_accuracy'] for data in task_data_d[task]])
          print(f'[{task}], count: {len(task_data_d[task])}, math accuracy: '
-               f'{correct_cnt/len(task_data_d[task])*100:0.2f}%')
+               f'{correct_cnt / len(task_data_d[task]) * 100:0.2f}%')