evalscope 1.0.1__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff shows the content of publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release.


This version of evalscope might be problematic.

Files changed (87)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +6 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/arguments.py +6 -0
  17. evalscope/benchmarks/ai2d/__init__.py +0 -0
  18. evalscope/benchmarks/ai2d/ai2d_adapter.py +53 -0
  19. evalscope/benchmarks/amc/__init__.py +0 -0
  20. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  21. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  22. evalscope/benchmarks/bfcl/bfcl_adapter.py +141 -2
  23. evalscope/benchmarks/bfcl/generation.py +7 -7
  24. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  25. evalscope/benchmarks/healthbench/__init__.py +0 -0
  26. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  27. evalscope/benchmarks/healthbench/utils.py +102 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  29. evalscope/benchmarks/humaneval/utils.py +235 -0
  30. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  32. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  34. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  35. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  36. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  37. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  38. evalscope/benchmarks/mm_star/__init__.py +0 -0
  39. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  40. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  41. evalscope/benchmarks/multi_if/__init__.py +0 -0
  42. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  43. evalscope/benchmarks/multi_if/metrics.py +120 -0
  44. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  45. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  46. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  47. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  48. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  49. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  50. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  51. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  52. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  53. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  54. evalscope/config.py +24 -1
  55. evalscope/constants.py +3 -0
  56. evalscope/evaluator/evaluator.py +25 -7
  57. evalscope/metrics/metric.py +27 -2
  58. evalscope/models/model_apis.py +10 -8
  59. evalscope/models/utils/openai.py +1 -2
  60. evalscope/perf/arguments.py +2 -0
  61. evalscope/perf/plugin/api/base.py +2 -2
  62. evalscope/perf/plugin/api/default_api.py +7 -7
  63. evalscope/perf/plugin/api/openai_api.py +83 -19
  64. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  65. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  66. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  67. evalscope/perf/utils/benchmark_util.py +1 -2
  68. evalscope/report/combinator.py +0 -25
  69. evalscope/report/report.py +8 -4
  70. evalscope/run.py +1 -1
  71. evalscope/utils/function_utils.py +41 -0
  72. evalscope/utils/import_utils.py +63 -13
  73. evalscope/utils/io_utils.py +19 -11
  74. evalscope/utils/json_schema.py +23 -2
  75. evalscope/utils/logger.py +19 -0
  76. evalscope/utils/model_utils.py +1 -1
  77. evalscope/version.py +2 -2
  78. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/METADATA +6 -10
  79. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/RECORD +87 -59
  80. tests/benchmark/test_eval.py +51 -7
  81. tests/benchmark/test_sandbox.py +81 -0
  82. tests/benchmark/test_vlm.py +60 -3
  83. tests/perf/test_perf.py +40 -12
  84. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/LICENSE +0 -0
  85. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/WHEEL +0 -0
  86. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/entry_points.txt +0 -0
  87. {evalscope-1.0.1.dist-info → evalscope-1.0.2.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/multi_if/metrics.py
@@ -0,0 +1,120 @@
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from evalscope.utils import get_logger
+ from . import ifeval
+
+ logger = get_logger()
+
+
+ def gen_acc_strict(x: Dict[str, Any]) -> Dict[str, List]:
+     # reference: fbcode/gen_ai/github/fair_evals/evals/tasks/finetune/ifeval.py
+     response = str(x['response'])
+     instruction_list = x['instruction_id_list']
+     is_following_list = []
+     for index, instruction_id in enumerate(instruction_list):
+         instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+         instruction = instruction_cls(instruction_id)
+
+         instruction.build_description(**x['kwargs'][index])
+         if response and instruction.check_following(response):
+             is_following_list.append(True)
+         else:
+             is_following_list.append(False)
+
+     return {
+         'follow_instruction_list': is_following_list,
+         'instruction_id_list': instruction_list,
+     }
+
+
+ def gen_acc_loose(x: Dict[str, Any]) -> Dict[str, List]:
+     response = str(x['response'])
+     r = response.split('\n')
+     response_remove_first = '\n'.join(r[1:]).strip()
+     response_remove_last = '\n'.join(r[:-1]).strip()
+     response_remove_both = '\n'.join(r[1:-1]).strip()
+     revised_response = response.replace('*', '')
+     revised_response_remove_first = response_remove_first.replace('*', '')
+     revised_response_remove_last = response_remove_last.replace('*', '')
+     revised_response_remove_both = response_remove_both.replace('*', '')
+     all_responses = [
+         response,
+         revised_response,
+         response_remove_first,
+         response_remove_last,
+         response_remove_both,
+         revised_response_remove_first,
+         revised_response_remove_last,
+         revised_response_remove_both,
+     ]
+     instruction_list = x['instruction_id_list']
+     is_following_list = []
+     for index, instruction_id in enumerate(instruction_list):
+         instruction_cls = ifeval.INSTRUCTION_DICT[instruction_id]
+         instruction = instruction_cls(instruction_id)
+
+         instruction.build_description(**x['kwargs'][index])
+
+         is_following = False
+         for r in all_responses:  # type: ignore
+             if r.strip() and instruction.check_following(r):  # type: ignore
+                 is_following = True
+                 break
+
+         is_following_list.append(is_following)
+     return {
+         'follow_instruction_list': is_following_list,
+         'instruction_id_list': instruction_list,
+     }
+
+
+ def parse_result(outputs: List[Dict[str, Any]]) -> Tuple[float, float]:
+
+     prompt_total = 0
+     prompt_correct = 0
+     instruction_total = 0
+     instruction_correct = 0
+
+     for example in outputs:
+         follow_instruction_list = example['follow_instruction_list']
+         instruction_id_list = example['instruction_id_list']
+
+         prompt_total += 1
+         if all(follow_instruction_list):
+             prompt_correct += 1
+
+         instruction_total += len(instruction_id_list)
+         instruction_correct += sum(follow_instruction_list)
+
+     return prompt_correct / prompt_total if prompt_total > 0 else 0, \
+         instruction_correct / instruction_total if instruction_total > 0 else 0
+
+
+ def parse_result_no_reduce(outputs: List[Dict[str, Any]]) -> Tuple[List, List]:
+
+     prompt_res = []
+     inst_res = []
+
+     for example in outputs:
+         follow_instruction_list = example['follow_instruction_list']
+         instruction_id_list = example['instruction_id_list']
+         if all(follow_instruction_list):
+             prompt_res.append(1)
+         else:
+             prompt_res.append(0)
+         inst_res.append(sum(follow_instruction_list) / len(instruction_id_list) if instruction_id_list else 0.0)
+
+     return prompt_res, inst_res
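
For orientation, here is a minimal usage sketch (not part of the release) of how these helpers compose: gen_acc_strict and gen_acc_loose take a per-turn record (response, instruction_id_list, kwargs) and return per-instruction booleans, which parse_result then reduces to prompt-level and instruction-level accuracy. The instruction id used below is assumed to be one of the keys registered in ifeval.INSTRUCTION_DICT; substitute whatever ids the dataset actually carries.

```python
# Hypothetical usage sketch for the new multi_if metrics helpers; not part of the diff.
# 'punctuation:no_comma' is assumed to be a valid key in ifeval.INSTRUCTION_DICT.
from evalscope.benchmarks.multi_if.metrics import gen_acc_loose, gen_acc_strict, parse_result

record = {
    'response': 'Sure. Here is an answer without any commas at all.',
    'instruction_id_list': ['punctuation:no_comma'],
    'kwargs': [{}],
}

strict = gen_acc_strict(record)  # e.g. {'follow_instruction_list': [True], 'instruction_id_list': [...]}
loose = gen_acc_loose(record)    # same shape, but also checks relaxed variants of the response

# Reduce a list of per-turn outputs to (prompt-level accuracy, instruction-level accuracy).
prompt_acc, inst_acc = parse_result([strict, loose])
print(prompt_acc, inst_acc)
```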
evalscope/benchmarks/multi_if/multi_if_adapter.py
@@ -0,0 +1,161 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, DefaultDataAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, messages_pretty_str
+ from evalscope.api.metric import Score
+ from evalscope.api.model import Model
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.import_utils import check_import
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'Chinese',
+     'English',
+     'German',
+     'Italian',
+     'Vietnamese',
+     'Spanish',
+     'Hindi',
+     'Portuguese',
+     'French',
+     'Thai',
+     'Russian',
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='multi_if',
+         pretty_name='Multi-IF',
+         description=
+         'Multi-IF is a benchmark designed to evaluate the performance of LLM models\' capabilities in multi-turn instruction following within a multilingual environment.',  # noqa: E501
+         tags=[Tags.INSTRUCTION_FOLLOWING, Tags.MULTI_LINGUAL, Tags.MULTI_TURN],
+         dataset_id='facebook/Multi-IF',
+         subset_list=SUBSET_LIST,
+         metric_list=[
+             'prompt_level_strict',
+             'inst_level_strict',
+             'prompt_level_loose',
+             'inst_level_loose',
+         ],
+         few_shot_num=0,
+         train_split=None,
+         eval_split='train',
+         extra_params={
+             'max_turns': 3,  # maximum number of turns to evaluate
+         }
+     )
+ )
+ class MultiIFAdapter(DefaultDataAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+         # Ensure required packages are installed
+         check_import(
+             module_name=['nltk', 'langdetect'],
+             package=['nltk', 'langdetect'],
+             raise_error=True,
+             feature_name=self.pretty_name
+         )
+         if 'Chinese' in self.subset_list:
+             check_import(module_name='emoji', package='emoji', raise_error=True, feature_name='Chinese subset')
+         if 'Thai' in self.subset_list:
+             check_import(module_name='pythainlp', package='pythainlp', raise_error=True, feature_name='Thai subset')
+
+         self.reformat_subset = True
+         self.max_turns = self.extra_params.get('max_turns', 3)
+         if not isinstance(self.max_turns, int) or self.max_turns < 1 or self.max_turns > 3:
+             logger.warning(f'max_turns should be an integer between 1 and 3, got {self.max_turns}, clamping to 3.')
+             self.max_turns = 3
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         return Sample(
+             input=[ChatMessageUser(content='')],  # NOTE: we will build the multi turn conversation in the evaluator
+             target='',
+             subset_key=record['language'],
+             metadata=record,
+         )
+
+     def run_inference(self, model: Model, sample: Sample, output_dir: str, **kwargs) -> TaskState:
+         """
+         Run multi-turn inference with the model and sample.
+         """
+         record = sample.metadata
+         history = []
+         step_record = {}
+         for step in range(1, self.max_turns + 1):
+             current_prompt = json.loads(record[f'turn_{step}_prompt'])
+             history.append(ChatMessageUser(content=current_prompt['content']))
+             # Generate model output
+             model_output = model.generate(input=history, tools=sample.tools)
+
+             response = model_output.completion
+             instruction_id_list = json.loads(record[f'turn_{step}_instruction_id_list'])
+             kwargs_list = json.loads(record[f'turn_{step}_kwargs'])
+             _kwargs = [json.loads(kwarg) for kwarg in kwargs_list]
+
+             step_record[step] = {
+                 'prompt': messages_pretty_str(history),
+                 'response': response,
+                 'instruction_id_list': instruction_id_list,
+                 'kwargs': _kwargs
+             }
+
+             # Append model output to history for next turn
+             history.append(model_output.message)
+
+         sample.metadata['step_record'] = step_record
+         return TaskState(
+             model=model.name,
+             sample=sample,
+             messages=history,
+             output=model_output,
+             completed=True,
+         )
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: Dict, task_state: TaskState
+     ) -> Score:
+         """
+         Calculate evaluation scores by comparing prediction with reference.
+         """
+         from .metrics import gen_acc_loose, gen_acc_strict, parse_result
+
+         # Initialize the score object with prediction details
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         step_record = task_state.metadata['step_record']
+         results = {}
+         try:
+             for step, record in step_record.items():
+                 outputs_strict = gen_acc_strict(record)
+                 outputs_loose = gen_acc_loose(record)
+                 prompt_level_strict, inst_level_strict = parse_result([outputs_strict])
+                 prompt_level_loose, inst_level_loose = parse_result([outputs_loose])
+                 results.update({
+                     f'turn_{step}_prompt_level_strict': prompt_level_strict,
+                     f'turn_{step}_inst_level_strict': inst_level_strict,
+                     f'turn_{step}_prompt_level_loose': prompt_level_loose,
+                     f'turn_{step}_inst_level_loose': inst_level_loose,
+                 })
+             score.value.update(results)
+
+             # Set main score name
+             if results:
+                 score.main_score_name = f'turn_{step}_prompt_level_strict'
+
+         except Exception as e:
+             logger.error(f'Error calculating ifeval metrics: {e}')
+             score.value = {}
+
+         return score
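
As a quick orientation for the benchmark registration above, running Multi-IF end to end would look roughly like the sketch below. It assumes evalscope's existing TaskConfig/run_task entry points and per-dataset dataset_args; the model id and subset choices are placeholders.

```python
# Rough sketch of invoking the newly registered 'multi_if' benchmark.
# Assumes evalscope's TaskConfig/run_task entry points; the model id is a placeholder.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-7b-instruct',  # placeholder model identifier
    datasets=['multi_if'],        # name registered via @register_benchmark above
    dataset_args={
        'multi_if': {
            'subset_list': ['English', 'Chinese'],  # entries from SUBSET_LIST
            'extra_params': {'max_turns': 3},       # clamped to 1..3 by MultiIFAdapter
        }
    },
)
run_task(task_cfg=task_cfg)
```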
evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py
@@ -73,6 +73,7 @@ class NeedleHaystackAdapter(DefaultDataAdapter):
          super().__init__(**kwargs)

          self._use_llm_judge = True
+         self.add_aggregation_name = False  # Don't add aggregation name for needle haystack adapter
          # set extra params
          self.retrieval_question = self.extra_params.get(
              'retrieval_question', 'What is the best thing to do in San Francisco?'
@@ -359,10 +360,6 @@ class NeedleHaystackAdapter(DefaultDataAdapter):

          return score

-     def _on_generate_report(self, scores, model_name, add_aggregation_name=True):
-         # Don't add aggregation name for needle haystack adapter
-         return super()._on_generate_report(scores, model_name, False)
-
      def _on_generate_report_end(self, report: 'Report', output_dir: str, **kwargs):
          try:
              import os
evalscope/benchmarks/olympiad_bench/__init__.py (file without changes)
evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py
@@ -0,0 +1,163 @@
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.messages.content import Content, ContentImage, ContentText
+ from evalscope.api.metric.scorer import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'OE_MM_maths_en_COMP',
+     'OE_MM_maths_zh_CEE',
+     'OE_MM_maths_zh_COMP',
+     'OE_MM_physics_en_COMP',
+     'OE_MM_physics_zh_CEE',
+     'OE_TO_maths_en_COMP',
+     'OE_TO_maths_zh_CEE',
+     'OE_TO_maths_zh_COMP',
+     'OE_TO_physics_en_COMP',
+     'OE_TO_physics_zh_CEE',
+     'TP_MM_maths_en_COMP',
+     'TP_MM_maths_zh_CEE',
+     'TP_MM_maths_zh_COMP',
+     'TP_MM_physics_en_COMP',
+     'TP_TO_maths_en_COMP',
+     'TP_TO_maths_zh_CEE',
+     'TP_TO_maths_zh_COMP',
+     'TP_TO_physics_en_COMP',
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='olympiad_bench',
+         pretty_name='OlympiadBench',
+         tags=[Tags.MATH, Tags.REASONING],
+         description='OlympiadBench is an Olympiad-level bilingual multimodal '
+         'scientific benchmark, featuring 8,476 problems from '
+         'Olympiad-level mathematics and physics competitions, '
+         'including the Chinese college entrance exam. '
+         'In the subsets: `OE` stands for `Open-Ended`, '
+         '`TP` stands for `Theorem Proving`, '
+         '`MM` stands for `Multimodal`, '
+         '`TO` stands for `Text-Only`, '
+         '`CEE` stands for `Chinese Entrance Exam`, '
+         '`COMP` stands for `Comprehensive`. '
+         '**Note: The `TP` subsets can\'t be evaluated with auto-judge for now**.',
+         dataset_id='AI-ModelScope/OlympiadBench',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='train',
+         prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+     )
+ )
+ class OlympiadBenchAdapter(VisionLanguageAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Generate prompt for a single item."""
+         from .utils import OlympiadBenchPrompter
+
+         question = record.get('question', '')
+         language = record.get('language', 'English')
+         subject = record.get('subject', 'Math')
+         question_type = record.get('question_type', '')
+         answer_type = record.get('answer_type', '')
+         is_multiple_answer = record.get('is_multiple_answer', False)
+         unit = record.get('unit', '')
+         # Generate prompt
+         prompt = OlympiadBenchPrompter().make_prompt(
+             problem=question,
+             language=language,
+             subject=subject,
+             question_type=question_type,
+             answer_type=answer_type,
+             is_multiple_answer=is_multiple_answer,
+             unit=unit,
+         )
+         # Construct content list
+         content_list: List[Content] = []
+         # Add images if available
+         for i in range(9):
+             image = record.get(f'image_{i+1}')
+             if image:
+                 image_base64 = bytes_to_base64(image['bytes'], format='jpg', add_header=True)
+                 content_list.append(ContentImage(image=image_base64))
+                 prompt = prompt.replace(f'<image_{i+1}>', f'[image_{i+1}]')  # replace html tag
+         # Add text content
+         content_list.insert(0, ContentText(text=prompt))
+
+         final_answer = record.get('final_answer', [])
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=','.join(final_answer) if final_answer else '',
+             metadata={
+                 'id': record.get('id', ''),
+                 'subfield': record.get('subfield', ''),
+                 'context': record.get('context', ''),
+                 'solution': record.get('solution', []),
+                 'final_answer': record.get('final_answer', []),
+                 'is_multiple_answer': is_multiple_answer,
+                 'unit': unit,
+                 'answer_type': answer_type,
+                 'question_type': question_type,
+                 'language': language,
+                 'subject': subject,
+                 'error': record.get('error', None),
+             },
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState):
+         import re
+
+         if task_state.metadata['language'] == 'Chinese':
+             matches = re.findall('所以最终答案是(.*)', prediction)
+         else:
+             matches = re.findall('So the final answer is (.*)', prediction)
+
+         # If found matches, take the last one, otherwise return the whole text
+         if matches:
+             return matches[-1].strip()
+         return prediction
+
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         from .utils import MathJudger
+
+         judger = MathJudger()
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+         question = task_state.metadata
+         model_answer = filtered_prediction
+         # Get precision/error threshold from reference if available
+         answer_type = question['answer_type']
+         try:
+             if 'Tuple' in answer_type:  # currently there is no need_human_evaluate in the machine-evaluable data
+                 judge_result = judger.judge(model_answer, question['final_answer'][0])
+             else:
+                 if question['error']:
+                     if ',' in question['error']:
+                         precisions = question['error'].split(',')
+                         precisions = [float(p) if p else 1e-8 for p in precisions]
+                         judge_result = judger.judge(model_answer, question['final_answer'][0], precisions)
+                     else:
+                         precision = float(question['error'])
+                         judge_result = judger.judge(model_answer, question['final_answer'][0], precision)
+                 else:
+                     judge_result = judger.judge(model_answer, question['final_answer'][0])
+         except Exception as e:
+             logger.warning(f'Error in judging answer: {e}')
+             judge_result = False
+
+         score.value = {'acc': float(judge_result)}
+         return score
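
To illustrate the scoring path that OlympiadBenchAdapter wires together, the sketch below mirrors extract_answer and match_score on a made-up English prediction. MathJudger comes from the new olympiad_bench/utils.py and is called with the same arguments as in the adapter above; the prediction text, reference answer, and tolerance are assumed examples, not taken from the dataset.

```python
# Illustrative sketch of the OlympiadBench scoring path; the prediction text,
# reference answer, and tolerance below are made-up examples.
import re

from evalscope.benchmarks.olympiad_bench.utils import MathJudger

prediction = 'Expanding and simplifying gives x = 12. So the final answer is $12$'

# Same extraction rule used by OlympiadBenchAdapter.extract_answer for English items.
matches = re.findall('So the final answer is (.*)', prediction)
model_answer = matches[-1].strip() if matches else prediction

# Judge against the reference answer, optionally passing a numeric tolerance,
# as match_score does when the record carries an 'error' field.
judge_result = MathJudger().judge(model_answer, '$12$', 1e-8)
print(float(judge_result))  # 1.0 when the judger accepts the answer
```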