evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/visu_logic/visu_logic_adapter.py ADDED
@@ -0,0 +1,75 @@
+ # flake8: noqa: E501
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+ from evalscope.utils.multi_choices import parse_answers
+
+ logger = get_logger()
+
+ MULT_CHOICE_PROMPT = """
+ Answer the following multiple choice question. The last line of your response should be of the following format: 'ANSWER: $LETTER' (without quotes) where LETTER is one of A, B, C, D. Think step by step before answering.
+
+ {question}
+ """
+
+ SUBSET_LIST = [
+     'Quantitative Reasoning', 'Other', 'Positional Reasoning', 'Stylistic Reasoning', 'Spatial Reasoning',
+     'Attribute Reasoning'
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='visulogic',
+         pretty_name='VisuLogic',
+         dataset_id='evalscope/VisuLogic',
+         tags=[Tags.MATH, Tags.REASONING, Tags.MULTIPLE_CHOICE, Tags.MULTI_MODAL],
+         description=
+         'VisuLogic is a benchmark aimed at evaluating the visual reasoning capabilities of Multi-modal Large Language Models (MLLMs), independent of textual reasoning processes. It features carefully constructed visual reasoning tasks spanning multiple categories, divided into six types based on required reasoning skills (e.g., Quantitative Reasoning, which involves understanding and deducing changes in the quantity of elements in images). Unlike existing benchmarks, VisuLogic is a challenging visual reasoning benchmark that is inherently difficult to articulate using language, providing a more rigorous evaluation of the visual reasoning capabilities of MLLMs.',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template=MULT_CHOICE_PROMPT,
+     )
+ )
+ class VisuLogicAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.reformat_subset = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         question = record.get('question', '')
+         content_list: List[Content] = []
+         prompt_text = self.prompt_template.format(question=question).strip()
+         content_list.append(ContentText(text=prompt_text))
+
+         image = record.get('image')
+         if image and isinstance(image, dict):
+             image_bytes = image.get('bytes')
+             if image_bytes:
+                 image_base64 = bytes_to_base64(image_bytes, format='png', add_header=True)
+                 content_list.append(ContentImage(image=image_base64))
+
+         metadata = {
+             'id': record['id'],
+         }
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=record['label'],
+             choices=['A', 'B', 'C', 'D'],
+             subset_key=record['tag'],
+             metadata=metadata,
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         answers = parse_answers(task_state)
+         return ''.join(sorted(list(answers)))
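The adapter above registers the benchmark under the name 'visulogic'. For orientation, a minimal run sketch assuming evalscope's standard TaskConfig/run_task entry points; the model id and sample limit below are placeholders, not part of this release:

# Minimal smoke-test sketch, assuming the standard evalscope entry points.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-VL-7B-Instruct',  # placeholder; any VLM evalscope can serve or call
    datasets=['visulogic'],               # name registered by @register_benchmark above
    limit=5,                              # evaluate only a few samples
)
run_task(task_cfg=task_cfg)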
evalscope/benchmarks/zerobench/zerobench_adapter.py ADDED
@@ -0,0 +1,64 @@
+ # flake8: noqa: E501
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64, compress_image_to_limit
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ # Define the prompt template
+ PROMPT_TEMPLATE = """{question}
+ \n\n\nLet's think step by step and give the final answer in curly braces,
+ like this: {{final answer}}"
+ """
+
+ SUBSET_LIST = ['default']
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='zerobench',
+         pretty_name='ZeroBench',
+         dataset_id='evalscope/zerobench',
+         tags=[Tags.KNOWLEDGE, Tags.QA, Tags.MULTI_MODAL],
+         description=
+         'ZeroBench is a challenging visual reasoning benchmark for Large Multimodal Models (LMMs). It consists of a main set of 100 high-quality, manually curated questions covering numerous domains, reasoning types and image type. Questions in ZeroBench have been designed and calibrated to be beyond the capabilities of current frontier models. As such, none of the evaluated models achieves a non-zero pass@1 (with greedy decoding) or 5/5 reliability score.',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='zerobench',
+         train_split='zerobench_subquestions',
+         prompt_template=PROMPT_TEMPLATE,
+     )
+ )
+ class ZeroBenchAdapter(VisionLanguageAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+         self._use_llm_judge = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         question = record['question_text']
+         content_list: List[Content] = [ContentText(text=self.prompt_template.format(question=question))]
+         image = record['question_images_decoded']
+         if len(image) > 0:
+             for img in image:
+                 # Ensure image is under OpenAI's 10MB data-URI limit by compressing if needed
+                 processed_bytes, fmt = compress_image_to_limit(img['bytes'], 10_000_000)
+                 image_base64 = bytes_to_base64(processed_bytes, format=fmt, add_header=True)
+                 content_list.append(ContentImage(image=image_base64))
+
+         metadata = {
+             'question_id': record['question_id'],
+             'question_images': record['question_images'],
+             'image_attribution': record['image_attribution']
+         }
+
+         return Sample(
+             input=[ChatMessageUser(content=content_list)], target=record['question_answer'], metadata=metadata
+         )
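This adapter relies on the new `compress_image_to_limit` helper in `evalscope/utils/io_utils.py` (+74 lines in this release, not shown in this section). From the call site it takes raw image bytes plus a byte limit and returns `(processed_bytes, format)`. A rough Pillow-based sketch of a helper with that contract; the shipped implementation may differ:

# Hedged sketch only: compatible with compress_image_to_limit(img_bytes, 10_000_000)
# as used above; not the actual evalscope implementation.
import io

from PIL import Image


def compress_image_to_limit(image_bytes: bytes, limit: int = 10_000_000):
    if len(image_bytes) <= limit:
        return image_bytes, 'png'  # assumption: keep original bytes and report a default format
    img = Image.open(io.BytesIO(image_bytes)).convert('RGB')
    # Re-encode as JPEG with decreasing quality until the payload fits the limit.
    for quality in (95, 85, 75, 60, 45, 30):
        buf = io.BytesIO()
        img.save(buf, format='JPEG', quality=quality)
        if buf.tell() <= limit:
            break
    return buf.getvalue(), 'jpeg'  # best effort if still above the limit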
evalscope/constants.py CHANGED
@@ -16,6 +16,7 @@ DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
      os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
  ) # ~/.cache/evalscope
  IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1' # To avoid some heavy dependencies when building doc
+ HEARTBEAT_INTERVAL_SEC = 60 # 60 seconds
 
 
  class HubType:
@@ -121,6 +122,7 @@ class Tags:
      CHINESE = 'Chinese'
      COMMONSENSE = 'Commonsense'
      QA = 'QA'
+     NER = 'NER'
      READING_COMPREHENSION = 'ReadingComprehension'
      CUSTOM = 'Custom'
      INSTRUCTION_FOLLOWING = 'InstructionFollowing'
@@ -133,6 +135,8 @@ class Tags:
      MULTI_MODAL = 'MultiModal'
      MULTI_LINGUAL = 'MultiLingual'
      MULTI_TURN = 'MultiTurn'
+     YES_NO = 'Yes/No'
+     HALLUCINATION = 'Hallucination'
 
 
  class FileConstants:
evalscope/evaluator/evaluator.py CHANGED
@@ -10,14 +10,14 @@ and report generation.
  import os
  import traceback
  from collections import defaultdict
- from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
- from tqdm import tqdm
- from typing import TYPE_CHECKING, Dict, List, Tuple, Union
+ from typing import TYPE_CHECKING, Dict, List
 
  from evalscope.api.dataset import Dataset, DatasetDict, Sample
  from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
  from evalscope.api.metric import AggScore, SampleScore
+ from evalscope.constants import HEARTBEAT_INTERVAL_SEC
  from evalscope.report import Report, gen_table
+ from evalscope.utils.function_utils import run_in_threads_with_progress
  from evalscope.utils.logger import get_logger
 
  if TYPE_CHECKING:
@@ -91,22 +91,27 @@ class DefaultEvaluator(Evaluator):
              Report: The complete evaluation report containing all metrics and results.
          """
          # Load the dataset and evaluate each subset
+         logger.info(f'Start evaluating benchmark: {self.benchmark_name}')
          dataset_dict = self.benchmark.load_dataset()
          agg_score_dict = defaultdict(list)
 
          # Process each subset (e.g., test, validation) independently
+         logger.info('Evaluating all subsets of the dataset...')
          for subset, dataset in dataset_dict.items():
              if len(dataset) == 0:
                  logger.info(f'No samples found in subset: {subset}, skipping.')
                  continue
+             logger.info(f'Evaluating subset: {subset}')
              subset_score = self.evaluate_subset(subset, dataset)
              agg_score_dict[subset] = subset_score
 
          # Generate the report based on aggregated scores
+         logger.info('Generating report...')
          report = self.get_report(agg_score_dict)
 
          # Finalize the evaluation process
          self.finalize()
+         logger.info(f'Benchmark {self.benchmark_name} evaluation finished.')
          return report
 
      def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -126,12 +131,15 @@
              List[AggScore]: Aggregated scores for this subset.
          """
          # Get model predictions for all samples in the subset
+         logger.info(f'Getting predictions for subset: {subset}')
          task_states = self.get_answers(subset, dataset)
 
          # Calculate evaluation metrics for each prediction
+         logger.info(f'Getting reviews for subset: {subset}')
          sample_scores = self.get_reviews(subset, task_states)
 
          # Aggregate individual sample scores into subset-level metrics
+         logger.info(f'Aggregating scores for subset: {subset}')
          agg_scores = self.benchmark.aggregate_scores(sample_scores=sample_scores)
          return agg_scores
 
@@ -162,44 +170,38 @@
 
          # Convert dataset to list for parallel processing
          dataset_list = list(dataset)
-
          if not dataset_list:
              return task_state_list
 
-         # Process samples in parallel using ThreadPoolExecutor
-         with ThreadPoolExecutor(max_workers=min(len(dataset_list), self.task_config.eval_batch_size)) as executor:
-             # Submit all prediction tasks
-             future_to_sample = {
-                 executor.submit(self._predict_sample, sample, model_prediction_dir): sample
-                 for sample in dataset_list
-             }
-
-             # Process completed tasks with progress bar
-             with tqdm(total=len(dataset_list), desc=f'Predicting[{self.benchmark_name}@{subset}]: ') as pbar:
-                 for future in as_completed(future_to_sample):
-                     sample = future_to_sample[future]
-                     try:
-                         task_state = future.result()
-                         task_state_list.append(task_state)
-
-                         # Save the prediction result to cache for future use
-                         model_result = self.cache_manager.save_prediction_cache(
-                             subset, task_state, self.benchmark.save_metadata
-                         )
-                         logger.debug(f'Model result: \n{model_result.pretty_print()}')
-
-                     except Exception as exc:
-                         tb_str = traceback.format_exc()
-                         logger.error(
-                             f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}'
-                         )
-                         if self.task_config.ignore_errors:
-                             logger.warning('Error ignored, continuing with next sample.')
-                         else:
-                             raise exc
-                     finally:
-                         pbar.update(1)
+         logger.info(f'Processing {len(dataset_list)} samples, if data is large, it may take a while.')
+
+         def worker(sample: Sample) -> TaskState:
+             return self._predict_sample(sample, model_prediction_dir)
+
+         def on_result(sample: Sample, task_state: TaskState) -> None:
+             model_result = self.cache_manager.save_prediction_cache(subset, task_state, self.benchmark.save_metadata)
+             logger.debug(f'Model result: \n{model_result.pretty_print()}')
+
+         def on_error(sample: Sample, exc: Exception) -> None:
+             tb_str = traceback.format_exc()
+             logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}')
+             if self.task_config.ignore_errors:
+                 logger.warning('Error ignored, continuing with next sample.')
+                 return
+             raise exc
+
+         new_task_states = run_in_threads_with_progress(
+             dataset_list,
+             worker,
+             desc=f'Predicting[{self.benchmark_name}@{subset}]: ',
+             max_workers=self.task_config.eval_batch_size,
+             heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+             on_result=on_result,
+             on_error=on_error,
+         )
+         task_state_list.extend(new_task_states)
 
+         logger.info(f'Finished getting predictions for subset: {subset}.')
          return task_state_list
 
      def _predict_sample(self, sample: Sample, model_prediction_dir: str) -> TaskState:
@@ -246,49 +248,40 @@
          if not task_states:
              return sample_score_list
 
-         # Process task states in parallel using ThreadPoolExecutor
-         with ThreadPoolExecutor(max_workers=min(len(task_states), self.task_config.judge_worker_num)) as executor:
-             # Submit all review tasks
-             future_to_task_state = {
-                 executor.submit(self._review_task_state, task_state): task_state
-                 for task_state in task_states
-             }
-
-             # Process completed tasks with progress bar
-             with tqdm(total=len(task_states), desc=f'Reviewing[{self.benchmark_name}@{subset}]: ') as pbar:
-                 for future in as_completed(future_to_task_state):
-                     task_state = future_to_task_state[future]
-                     try:
-                         try:
-                             sample_score = future.result()
-                         except TimeoutError:
-                             logger.warning(
-                                 f'Timeout when reviewing sample {task_state.sample_id}, setting score to zero.'
-                             )
-                             sample_score = SampleScore(sample_id=task_state.sample_id, scores={})
-                         sample_score_list.append(sample_score)
-
-                         # Save the review result to cache for future use
-                         review_result = self.cache_manager.save_review_cache(
-                             subset=subset,
-                             task_state=task_state,
-                             sample_score=sample_score,
-                             save_metadata=self.benchmark.save_metadata
-                         )
-                         logger.debug(f'Review result: \n{review_result.pretty_print()}')
-
-                     except Exception as exc:
-                         tb_str = traceback.format_exc()
-                         logger.error(
-                             f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}'
-                         )
-                         if self.task_config.ignore_errors:
-                             logger.warning('Error ignored, continuing with next sample.')
-                         else:
-                             raise exc
-                     finally:
-                         pbar.update(1)
+         logger.info(f'Reviewing {len(task_states)} samples, if data is large, it may take a while.')
+
+         def worker(task_state: TaskState) -> SampleScore:
+             return self._review_task_state(task_state)
+
+         def on_result(task_state: TaskState, sample_score: SampleScore) -> None:
+             review_result = self.cache_manager.save_review_cache(
+                 subset=subset,
+                 task_state=task_state,
+                 sample_score=sample_score,
+                 save_metadata=self.benchmark.save_metadata
+             )
+             logger.debug(f'Review result: \n{review_result.pretty_print()}')
+
+         def on_error(task_state: TaskState, exc: Exception) -> None:
+             tb_str = traceback.format_exc()
+             logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}')
+             if self.task_config.ignore_errors:
+                 logger.warning('Error ignored, continuing with next sample.')
+                 return
+             raise exc
+
+         new_scores = run_in_threads_with_progress(
+             task_states,
+             worker,
+             desc=f'Reviewing[{self.benchmark_name}@{subset}]: ',
+             max_workers=self.task_config.judge_worker_num,
+             heartbeat_sec=HEARTBEAT_INTERVAL_SEC,
+             on_result=on_result,
+             on_error=on_error,
+         )
+         sample_score_list.extend(new_scores)
 
+         logger.info(f'Finished reviewing subset: {subset}. Total reviewed: {len(sample_score_list)}')
          return sample_score_list
 
      def _review_task_state(self, task_state: TaskState) -> SampleScore:
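The refactor above moves the ThreadPoolExecutor/tqdm plumbing into a shared `run_in_threads_with_progress` helper in `evalscope/utils/function_utils.py` (+195 lines, not shown in this section). From the two call sites it accepts a list of items, a worker callable, a progress description, a worker count, a heartbeat interval, and `on_result`/`on_error` callbacks, and returns the successful results. A minimal sketch with that contract; the shipped helper presumably also emits periodic heartbeat logs:

# Hedged sketch matching the call sites above; not the actual evalscope implementation.
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable, List, Optional, TypeVar

from tqdm import tqdm

T = TypeVar('T')
R = TypeVar('R')


def run_in_threads_with_progress(
    items: List[T],
    worker: Callable[[T], R],
    desc: str = '',
    max_workers: int = 8,
    heartbeat_sec: int = 60,  # the real helper likely logs a heartbeat at this interval; omitted here
    on_result: Optional[Callable[[T, R], None]] = None,
    on_error: Optional[Callable[[T, Exception], None]] = None,
) -> List[R]:
    results: List[R] = []
    if not items:
        return results
    with ThreadPoolExecutor(max_workers=min(len(items), max_workers)) as executor:
        future_to_item = {executor.submit(worker, item): item for item in items}
        with tqdm(total=len(items), desc=desc) as pbar:
            for future in as_completed(future_to_item):
                item = future_to_item[future]
                try:
                    result = future.result()
                    results.append(result)
                    if on_result is not None:
                        on_result(item, result)
                except Exception as exc:
                    # on_error decides whether to swallow or re-raise the failure
                    if on_error is not None:
                        on_error(item, exc)
                    else:
                        raise
                finally:
                    pbar.update(1)
    return results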
evalscope/metrics/math_parser.py CHANGED
@@ -211,6 +211,11 @@ def strip_answer_string(string):
      # Remove grade level (e.g., 12th grade) and just maintain the integer
      string = re.sub(r'thgrade$', '', string)
 
+     # Normalize thousands-formatted numbers (e.g., 70,000 or -1,234,567.89) by removing commas
+     # This must run before the "list of integers" sorting to avoid misclassifying numbers with thousand separators.
+     if re.fullmatch(r'\s*-?\d{1,3}(?:,\d{3})+(?:\.\d+)?\s*', string):
+         string = string.replace(',', '')
+
      # If the answer is a list of integers (without parenthesis), sort them
      if re.fullmatch(r'(\s*-?\d+\s*,)*\s*-?\d+\s*', string):
          # Split the string into a list of integers
@@ -262,6 +267,8 @@ def extract_answer(pred_str, use_last_number=True):
      elif '答案是' in pred_str:
          # Handle Chinese few-shot multiple choice problem answer extraction
          pred = pred_str.split('答案是')[1].strip().split('\n\n')[0].strip()
+     elif 'ANSWER:' in pred_str:
+         pred = pred_str.split('ANSWER:')[-1].strip()
      else: # use the last number
          if use_last_number:
              pattern = '-?\d*\.?\d+'
@@ -529,3 +536,10 @@ def symbolic_equal(a, b):
          pass
 
      return False
+
+
+ if __name__ == '__main__':
+     print(math_equal('\n\\boxed{70,\\!000}\n', '70000'))
+     print(extract_answer('The answer is \\boxed{70,\\!000}'))
+     print(strip_answer_string(extract_answer('The answer is \\boxed{70,\\!000}')))
+     print(math_equal(extract_answer('The answer is \\boxed{70,\\!000}'), '70000'))
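The comma-normalization guard only fires on well-formed thousands separators, so comma-separated integer lists still reach the sorting branch right after it. A quick illustration of the pattern added above:

# Illustration of the thousands-separator check added to strip_answer_string.
import re

pattern = r'\s*-?\d{1,3}(?:,\d{3})+(?:\.\d+)?\s*'
print(bool(re.fullmatch(pattern, '70,000')))         # True  -> commas are stripped, giving '70000'
print(bool(re.fullmatch(pattern, '-1,234,567.89')))  # True  -> '-1234567.89'
print(bool(re.fullmatch(pattern, '1, 2, 3')))        # False -> handled by the list-of-integers branch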
evalscope/metrics/metric.py CHANGED
@@ -1,3 +1,4 @@
+ import json
  from collections import defaultdict
  from typing import List
 
@@ -42,7 +43,7 @@ class Accuracy(ExactMatch):
 
          results = []
          for prediction, reference in zip(predictions, references):
-             pred_answer = strip_answer_string(extract_answer(prediction))
+             pred_answer = extract_answer(prediction)
              ref_answer = strip_answer_string(reference)
              results.append(float(math_equal(pred_answer, ref_answer)))
 
@@ -100,6 +101,56 @@ class MultiChoiceAcc(Metric):
          return res
 
 
+ @register_metric(name='anls')
+ class ANLS(Metric):
+
+     def __init__(self, thresh_hold=0.5):
+         self.thresh_hold = thresh_hold
+
+     def apply(self, predictions, references):
+         """
+         Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
+         This implementation is adapted from
+         https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
+
+         Args:
+             references (List[str]): List of correct answers. Each answer can be a string of json.
+             predictions (List[str]): List of predicted answers.
+         """
+         from .metrics import levenshtein_distance
+
+         res = []
+         # Unwrap predictions if it's a nested list
+         for prediction, reference in zip(predictions, references):
+             # Parse the reference which is a json string
+             try:
+                 answer = json.loads(reference)
+             except json.JSONDecodeError:
+                 answer = reference
+             if isinstance(answer, str):
+                 answer = [answer]
+             assert isinstance(answer, list), 'The reference answer should be a list of answers.'
+
+             # Calculate ANLS for each reference answer
+             values = []
+             for ans in answer:
+                 # preprocess both the answers - gt and prediction
+                 gt_answer = ' '.join(ans.strip().lower().split())
+                 det_answer = ' '.join(prediction.strip().lower().split())
+
+                 dist = levenshtein_distance(gt_answer, det_answer)
+                 length = max(len(ans.upper()), len(prediction.upper()))
+                 values.append(0.0 if length == 0 else float(dist) / float(length))
+
+             question_result = 0.0
+             if values:
+                 question_result = 1 - min(values)
+                 if question_result < self.thresh_hold:
+                     question_result = 0.0
+             res.append(question_result)
+         return res
+
+
  # ##################
  # T2I Metrics ######
  ####################
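A short usage sketch of the new ANLS metric defined above. The import path is assumed from this file's location; references may be plain strings or JSON-encoded lists of acceptable answers:

# Hedged usage sketch of the ANLS metric added above.
from evalscope.metrics.metric import ANLS  # import path assumed from this diff

anls = ANLS(thresh_hold=0.5)
scores = anls.apply(
    predictions=['The Eiffel Tower', 'Paris France'],
    references=['["eiffel tower", "la tour eiffel"]', 'paris, france'],  # JSON list or plain string
)
print(scores)  # one score per sample in [0, 1]; values below the threshold collapse to 0.0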
evalscope/metrics/metrics.py CHANGED
@@ -467,3 +467,19 @@ def calculate_pass_at_k(
      num_samples_it = iter(num_samples)
 
      return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
+
+ def levenshtein_distance(s1, s2):
+     if len(s1) > len(s2):
+         s1, s2 = s2, s1
+
+     distances = range(len(s1) + 1)
+     for i2, c2 in enumerate(s2):
+         distances_ = [i2 + 1]
+         for i1, c1 in enumerate(s1):
+             if c1 == c2:
+                 distances_.append(distances[i1])
+             else:
+                 distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
+         distances = distances_
+     return distances[-1]
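A quick sanity check of the helper above (standard Wagner-Fischer edit distance, used by the ANLS metric in metric.py):

# Quick check; import path assumed from this diff.
from evalscope.metrics.metrics import levenshtein_distance

assert levenshtein_distance('kitten', 'sitting') == 3  # two substitutions + one insertion
assert levenshtein_distance('', 'abc') == 3            # insertions only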
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py CHANGED
@@ -30,13 +30,9 @@ from transformers.modeling_outputs import (
      SequenceClassifierOutput,
      TokenClassifierOutput,
  )
- from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
+ from transformers.modeling_utils import PreTrainedModel
  from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
  from transformers.utils import logging
  from typing import Any, Dict, Optional, Tuple
 
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py CHANGED
@@ -14,13 +14,9 @@ from transformers.modeling_outputs import (
      BaseModelOutputWithPastAndCrossAttentions,
      BaseModelOutputWithPoolingAndCrossAttentions,
  )
- from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
+ from transformers.modeling_utils import PreTrainedModel
  from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
  from transformers.utils import logging
  from typing import Tuple
 
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py CHANGED
@@ -31,13 +31,9 @@ from transformers.modeling_outputs import (
      SequenceClassifierOutput,
      TokenClassifierOutput,
  )
- from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
+ from transformers.modeling_utils import PreTrainedModel
  from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
  from transformers.utils import logging
  from typing import Optional, Tuple
 
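The three hunks above switch `apply_chunking_to_forward`, `find_pruneable_heads_and_indices`, and `prune_linear_layer` to `transformers.pytorch_utils`, where recent transformers releases expose them after their removal from `transformers.modeling_utils`. For code that must also run on older transformers versions, a hedged compatibility pattern (not part of this diff) would be:

# Optional compatibility shim: prefer the new location, fall back for old transformers.
try:
    from transformers.pytorch_utils import (
        apply_chunking_to_forward,
        find_pruneable_heads_and_indices,
        prune_linear_layer,
    )
except ImportError:
    from transformers.modeling_utils import (  # transformers releases predating pytorch_utils
        apply_chunking_to_forward,
        find_pruneable_heads_and_indices,
        prune_linear_layer,
    )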
evalscope/models/utils/openai.py CHANGED
@@ -204,6 +204,10 @@ def openai_completion_params(model: str, config: GenerateConfig, tools: bool) ->
      )
      if config.extra_body:
          params['extra_body'] = config.extra_body
+     if config.extra_query:
+         params['extra_query'] = config.extra_query
+     if config.extra_headers:
+         params['extra_headers'] = config.extra_headers
 
      return params
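The two new fields mirror `extra_body`: whatever is set on `GenerateConfig` is forwarded in the request parameters, and the openai-python client accepts `extra_query`/`extra_headers` per request. A hedged sketch of how the params might be built and consumed; import paths, the `GenerateConfig` construction, and the endpoint are assumptions:

# Hedged sketch: forwarding extra_query / extra_headers to an OpenAI-compatible endpoint.
from openai import OpenAI

from evalscope.api.model.generate_config import GenerateConfig  # path assumed from this diff
from evalscope.models.utils.openai import openai_completion_params

config = GenerateConfig(
    extra_headers={'X-Request-Id': 'demo'},     # forwarded as HTTP headers
    extra_query={'api-version': '2024-06-01'},  # forwarded as URL query parameters
)
params = openai_completion_params(model='my-model', config=config, tools=False)

client = OpenAI(base_url='http://localhost:8000/v1', api_key='EMPTY')  # placeholder endpoint
response = client.chat.completions.create(
    messages=[{'role': 'user', 'content': 'Hello'}],
    **params,  # carries the generation params plus any extra_body / extra_query / extra_headers set above
)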