evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (155)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/app/ui/multi_model.py +6 -1
  17. evalscope/app/ui/single_model.py +8 -2
  18. evalscope/app/utils/data_utils.py +3 -2
  19. evalscope/app/utils/visualization.py +2 -2
  20. evalscope/arguments.py +6 -0
  21. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  22. evalscope/benchmarks/amc/__init__.py +0 -0
  23. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  24. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  25. evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
  26. evalscope/benchmarks/bfcl/generation.py +7 -7
  27. evalscope/benchmarks/blink/__init__.py +0 -0
  28. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  29. evalscope/benchmarks/chartqa/__init__.py +0 -0
  30. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  31. evalscope/benchmarks/chartqa/utils.py +38 -0
  32. evalscope/benchmarks/docvqa/__init__.py +0 -0
  33. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  34. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  35. evalscope/benchmarks/general_arena/utils.py +2 -1
  36. evalscope/benchmarks/healthbench/__init__.py +0 -0
  37. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  38. evalscope/benchmarks/healthbench/utils.py +102 -0
  39. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  40. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  41. evalscope/benchmarks/humaneval/utils.py +235 -0
  42. evalscope/benchmarks/infovqa/__init__.py +0 -0
  43. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  44. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  45. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  46. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  47. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  48. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  49. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  50. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  51. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  52. evalscope/benchmarks/mm_star/__init__.py +0 -0
  53. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  54. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  55. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  56. evalscope/benchmarks/multi_if/__init__.py +0 -0
  57. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  58. evalscope/benchmarks/multi_if/metrics.py +120 -0
  59. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  60. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  61. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  62. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  63. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  64. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  65. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  66. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  67. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  68. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  69. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  74. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  75. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  76. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  77. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  78. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  79. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  80. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  81. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  82. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  83. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  84. evalscope/config.py +24 -1
  85. evalscope/constants.py +3 -0
  86. evalscope/evaluator/evaluator.py +25 -7
  87. evalscope/metrics/metric.py +78 -2
  88. evalscope/metrics/metrics.py +16 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  102. evalscope/models/model_apis.py +10 -8
  103. evalscope/models/utils/openai.py +1 -2
  104. evalscope/perf/arguments.py +2 -0
  105. evalscope/perf/plugin/api/base.py +2 -2
  106. evalscope/perf/plugin/api/default_api.py +7 -7
  107. evalscope/perf/plugin/api/openai_api.py +83 -19
  108. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  109. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  110. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  111. evalscope/perf/utils/benchmark_util.py +1 -2
  112. evalscope/report/__init__.py +9 -1
  113. evalscope/report/combinator.py +45 -20
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +1 -1
  116. evalscope/utils/function_utils.py +41 -0
  117. evalscope/utils/import_utils.py +63 -13
  118. evalscope/utils/io_utils.py +19 -11
  119. evalscope/utils/json_schema.py +25 -2
  120. evalscope/utils/logger.py +19 -0
  121. evalscope/utils/model_utils.py +1 -1
  122. evalscope/utils/multi_choices.py +16 -1
  123. evalscope/version.py +2 -2
  124. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
  125. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
  126. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  127. tests/__init__.py +0 -1
  128. tests/benchmark/__init__.py +0 -1
  129. tests/benchmark/test_eval.py +0 -385
  130. tests/benchmark/test_image_edit.py +0 -65
  131. tests/benchmark/test_t2i.py +0 -142
  132. tests/benchmark/test_vlm.py +0 -80
  133. tests/cli/__init__.py +0 -1
  134. tests/cli/test_all.py +0 -269
  135. tests/cli/test_collection.py +0 -99
  136. tests/cli/test_custom.py +0 -268
  137. tests/cli/test_reasoning.py +0 -81
  138. tests/common.py +0 -73
  139. tests/perf/__init__.py +0 -1
  140. tests/perf/test_perf.py +0 -178
  141. tests/rag/test_clip_benchmark.py +0 -87
  142. tests/rag/test_mteb.py +0 -213
  143. tests/rag/test_ragas.py +0 -128
  144. tests/swift/__init__.py +0 -1
  145. tests/swift/test_run_swift_eval.py +0 -146
  146. tests/swift/test_run_swift_vlm_eval.py +0 -128
  147. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  148. tests/test_run_all.py +0 -12
  149. tests/utils.py +0 -13
  150. tests/vlm/__init__.py +0 -1
  151. tests/vlm/test_vlmeval.py +0 -102
  152. {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
  153. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  154. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  155. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py ADDED
@@ -0,0 +1,64 @@
+ import re
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = ['default']
+
+ OPEN_PROMPT = (
+     'Read the picture and solve the following problem step by step.'
+     'The last line of your response should be of the form'
+     ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem.\n\n'
+     '{question}\n\n'
+     'Remember to put your answer on its own line at the end in the form'
+     ' "ANSWER: $ANSWER" (without quotes) where $ANSWER is the answer to the problem,'
+     ' and you do not need to use a \\boxed command.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='real_world_qa',
+         pretty_name='RealWorldQA',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'RealWorldQA is a benchmark designed to evaluate the real-world spatial understanding capabilities of multimodal AI models, contributed by XAI. It assesses how well these models comprehend physical environments. The benchmark consists of 700+ images, each accompanied by a question and a verifiable answer. These images are drawn from real-world scenarios, including those captured from vehicles. The goal is to advance AI models\' understanding of our physical world.', # noqa: E501
+         dataset_id='lmms-lab/RealWorldQA',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template=OPEN_PROMPT,
+     )
+ )
+ class RealWorldQAAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         content_list: list[Content] = [ContentText(text=OPEN_PROMPT.format(question=record['question']))]
+         image = record.get('image')
+         if image:
+             image_base64 = bytes_to_base64(image['bytes'], format='webp', add_header=True)
+             content_list.append(ContentImage(image=image_base64))
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=record['answer'],
+             metadata={'image_path': record['image_path']}
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState) -> str:
+         pattern = r'ANSWER:\s*(.*)'
+         match = re.search(pattern, prediction)
+         if match:
+             return match.group(1).strip()
+         return ''
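The new adapter recovers the final answer from the model output with the regex r'ANSWER:\s*(.*)'. A minimal sketch of that extraction logic in isolation (the sample strings are invented for illustration):

import re

def extract_answer(prediction: str) -> str:
    # Mirrors RealWorldQAAdapter.extract_answer: keep whatever follows the first
    # "ANSWER:" marker, up to the end of that line.
    match = re.search(r'ANSWER:\s*(.*)', prediction)
    return match.group(1).strip() if match else ''

print(extract_answer('The stop sign is on the right.\nANSWER: right'))  # -> 'right'
print(extract_answer('no marker in this response'))                     # -> ''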
evalscope/benchmarks/tau_bench/tau_bench_adapter.py CHANGED
@@ -47,7 +47,12 @@ class TauBenchAdapter(DefaultDataAdapter):
      def __init__(self, **kwargs):
          super().__init__(**kwargs)
 
-         check_import('tau_bench', package='git+https://github.com/sierra-research/tau-bench', raise_error=True)
+         check_import(
+             'tau_bench',
+             package='git+https://github.com/sierra-research/tau-bench',
+             raise_error=True,
+             feature_name=self.pretty_name
+         )
 
          # setup user model args
          self.user_model = self.extra_params.get('user_model', 'qwen-plus')
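The feature_name argument threads the benchmark's display name into the import error message, while the user model still comes from extra_params. A hedged sketch of how those extra parameters are typically supplied through dataset_args; the key layout and model ids below are assumptions for illustration, not taken from this diff:

from evalscope import TaskConfig

task = TaskConfig(
    model='my-agent-model',                              # placeholder model id
    datasets=['tau_bench'],
    dataset_args={
        'tau_bench': {
            'extra_params': {'user_model': 'qwen-plus'}  # read by TauBenchAdapter.__init__
        }
    },
)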
evalscope/config.py CHANGED
@@ -18,6 +18,7 @@ from evalscope.constants import (
  )
  from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
  from evalscope.utils.deprecation_utils import deprecated_warning
+ from evalscope.utils.import_utils import check_import
  from evalscope.utils.io_utils import dict_to_yaml, gen_hash, safe_filename
  from evalscope.utils.logger import get_logger
 
@@ -124,6 +125,19 @@ class TaskConfig(BaseArgument):
      analysis_report: bool = False
      """Whether to generate detailed analysis reports after evaluation."""
 
+     # Sandbox configuration arguments
+     use_sandbox: bool = False
+     """Whether to execute code in a sandboxed environment."""
+
+     sandbox_type: Optional[str] = 'docker'
+     """Type of sandbox environment for code execution (e.g., docker). Default is 'docker'."""
+
+     sandbox_manager_config: Optional[Dict] = field(default_factory=dict)
+     """Configuration for the sandbox manager. Default is local manager. If url is provided, it will use remote manager."""
+
+     sandbox_config: Optional[Dict] = field(default_factory=dict)
+     """Configuration for sandboxed code execution environments."""
+
      def __post_init__(self):
          self.__init_model_and_id()
@@ -132,6 +146,7 @@ class TaskConfig(BaseArgument):
          # Set default generation_config and model_args
          self.__init_default_generation_config()
          self.__init_default_model_args()
+         self.__init_default_sandbox_config()
 
      def __init_model_and_id(self):
          # Set model to DummyCustomModel if not provided
@@ -223,6 +238,14 @@ class TaskConfig(BaseArgument):
              'precision': 'torch.float16',
          }
 
+     def __init_default_sandbox_config(self):
+         if not self.use_sandbox:
+             return
+         check_import('ms_enclave', 'ms_enclave[docker]', raise_error=True)
+
+         if not self.sandbox_type:
+             self.sandbox_type = 'docker'
+
      def update(self, other: Union['TaskConfig', dict]):
          if isinstance(other, TaskConfig):
              other = other.to_dict()
@@ -238,7 +261,7 @@ class TaskConfig(BaseArgument):
              logger.warning(f'Failed to dump overall task config: {e}')
 
      def to_dict(self):
-         result = copy.deepcopy(self.__dict__)
+         result = copy.copy(self.__dict__)
          del result['api_key'] # Do not expose api_key in the config
 
          if isinstance(self.model, (Model, ModelAPI)):
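The four new fields make sandboxed code execution an ordinary TaskConfig option, and __init_default_sandbox_config only checks the ms_enclave[docker] dependency when the feature is switched on. A hedged sketch of how a run might opt in; the model id and dataset choice are placeholders, not values taken from this diff:

from evalscope import TaskConfig, run_task

task = TaskConfig(
    model='qwen-plus',             # placeholder model identifier
    datasets=['live_code_bench'],  # placeholder code-execution benchmark
    use_sandbox=True,              # triggers the ms_enclave[docker] import check
    sandbox_type='docker',         # the default applied when left unset
    sandbox_manager_config={},     # local manager; pass a url for a remote manager
)
run_task(task)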
evalscope/constants.py CHANGED
@@ -15,6 +15,7 @@ DEFAULT_ROOT_CACHE_DIR = DEFAULT_DATASET_CACHE_DIR # compatible with old versio
  DEFAULT_EVALSCOPE_CACHE_DIR = os.path.expanduser(
      os.getenv('EVALSCOPE_CACHE', '~/.cache/evalscope')
  ) # ~/.cache/evalscope
+ IS_BUILD_DOC = os.getenv('BUILD_DOC', '0') == '1' # To avoid some heavy dependencies when building doc
 
 
  class HubType:
@@ -130,6 +131,8 @@ class Tags:
      TEXT_TO_IMAGE = 'TextToImage'
      IMAGE_EDITING = 'ImageEditing'
      MULTI_MODAL = 'MultiModal'
+     MULTI_LINGUAL = 'MultiLingual'
+     MULTI_TURN = 'MultiTurn'
 
 
  class FileConstants:
evalscope/evaluator/evaluator.py CHANGED
@@ -8,8 +8,9 @@ and report generation.
  """
 
  import os
+ import traceback
  from collections import defaultdict
- from concurrent.futures import ThreadPoolExecutor, as_completed
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed
  from tqdm import tqdm
  from typing import TYPE_CHECKING, Dict, List, Tuple, Union
 
@@ -17,6 +18,7 @@ from evalscope.api.dataset import Dataset, DatasetDict, Sample
  from evalscope.api.evaluator import CacheManager, Evaluator, TaskState
  from evalscope.api.metric import AggScore, SampleScore
  from evalscope.report import Report, gen_table
+ from evalscope.utils.logger import get_logger
 
  if TYPE_CHECKING:
      from evalscope.api.benchmark import DataAdapter
@@ -24,8 +26,6 @@ if TYPE_CHECKING:
      from evalscope.config import TaskConfig
      from evalscope.utils.io_utils import OutputsStructure
 
- from evalscope.utils.logger import get_logger
-
  logger = get_logger()
 
 
@@ -104,6 +104,9 @@ class DefaultEvaluator(Evaluator):
 
          # Generate the report based on aggregated scores
          report = self.get_report(agg_score_dict)
+
+         # Finalize the evaluation process
+         self.finalize()
          return report
 
      def evaluate_subset(self, subset: str, dataset: Dataset) -> List[AggScore]:
@@ -186,7 +189,10 @@ class DefaultEvaluator(Evaluator):
              logger.debug(f'Model result: \n{model_result.pretty_print()}')
 
          except Exception as exc:
-             logger.error(f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}')
+             tb_str = traceback.format_exc()
+             logger.error(
+                 f'{sample.model_dump_json(indent=2)} prediction failed: due to {exc}\nTraceback:\n{tb_str}'
+             )
              if self.task_config.ignore_errors:
                  logger.warning('Error ignored, continuing with next sample.')
              else:
@@ -253,7 +259,13 @@ class DefaultEvaluator(Evaluator):
          for future in as_completed(future_to_task_state):
              task_state = future_to_task_state[future]
              try:
-                 sample_score = future.result()
+                 try:
+                     sample_score = future.result()
+                 except TimeoutError:
+                     logger.warning(
+                         f'Timeout when reviewing sample {task_state.sample_id}, setting score to zero.'
+                     )
+                     sample_score = SampleScore(sample_id=task_state.sample_id, scores={})
                  sample_score_list.append(sample_score)
 
                  # Save the review result to cache for future use
@@ -266,7 +278,10 @@ class DefaultEvaluator(Evaluator):
                  logger.debug(f'Review result: \n{review_result.pretty_print()}')
 
              except Exception as exc:
-                 logger.error(f'Error when review sample {task_state.sample_id}: due to {exc}')
+                 tb_str = traceback.format_exc()
+                 logger.error(
+                     f'Error when review sample {task_state.sample_id}: due to {exc}\nTraceback:\n{tb_str}'
+                 )
                  if self.task_config.ignore_errors:
                      logger.warning('Error ignored, continuing with next sample.')
                  else:
@@ -319,7 +334,7 @@ class DefaultEvaluator(Evaluator):
 
          # Generate and display a summary table of results
          try:
-             report_table = gen_table(report_list=[report], add_overall_metric=True)
+             report_table = gen_table(report_list=[report], add_overall_metric=self.benchmark.add_overall_metric)
              logger.info(f'\n{self.benchmark_name} report table:'
                          f'\n{report_table} \n')
          except Exception:
@@ -337,3 +352,6 @@ class DefaultEvaluator(Evaluator):
          report.to_json(report_file)
          logger.info(f'Dump report to: {report_file} \n')
          return report
+
+     def finalize(self, *args, **kwargs):
+         self.benchmark.finalize(*args, **kwargs)
evalscope/metrics/metric.py CHANGED
@@ -1,3 +1,4 @@
+ import json
  from collections import defaultdict
  from typing import List
 
@@ -6,11 +7,19 @@ from evalscope.api.registry import register_aggregation, register_metric
  from .metrics import mean
 
 
+ def normalize_text(text: str) -> str:
+     """Normalize text by lowering case and stripping whitespace."""
+     return text.strip().lower()
+
+
  @register_metric(name='exact_match')
  class ExactMatch(Metric):
 
      def apply(self, predictions, references):
-         return [float(prediction == reference) for prediction, reference in zip(predictions, references)]
+         return [
+             float(normalize_text(prediction) == normalize_text(reference))
+             for prediction, reference in zip(predictions, references)
+         ]
 
 
  @register_metric(name='acc')
@@ -92,6 +101,56 @@ class MultiChoiceAcc(Metric):
          return res
 
 
+ @register_metric(name='anls')
+ class ANLS(Metric):
+
+     def __init__(self, thresh_hold=0.5):
+         self.thresh_hold = thresh_hold
+
+     def apply(self, predictions, references):
+         """
+         Calculate ANLS (Average Normalized Levenshtein Similarity) for a list of predictions and references.
+         This implementation is adapted from
+         https://github.com/QwenLM/Qwen-VL/blob/master/eval_mm/infographicsvqa_eval.py
+
+         Args:
+             references (List[str]): List of correct answers. Each answer can be a string of json.
+             predictions (List[str]): List of predicted answers.
+         """
+         from .metrics import levenshtein_distance
+
+         res = []
+         # Unwrap predictions if it's a nested list
+         for prediction, reference in zip(predictions, references):
+             # Parse the reference which is a json string
+             try:
+                 answer = json.loads(reference)
+             except json.JSONDecodeError:
+                 answer = reference
+             if isinstance(answer, str):
+                 answer = [answer]
+             assert isinstance(answer, list), 'The reference answer should be a list of answers.'
+
+             # Calculate ANLS for each reference answer
+             values = []
+             for ans in answer:
+                 # preprocess both the answers - gt and prediction
+                 gt_answer = ' '.join(ans.strip().lower().split())
+                 det_answer = ' '.join(prediction.strip().lower().split())
+
+                 dist = levenshtein_distance(gt_answer, det_answer)
+                 length = max(len(ans.upper()), len(prediction.upper()))
+                 values.append(0.0 if length == 0 else float(dist) / float(length))
+
+             question_result = 0.0
+             if values:
+                 question_result = 1 - min(values)
+                 if question_result < self.thresh_hold:
+                     question_result = 0.0
+             res.append(question_result)
+         return res
+
+
  # ##################
  # T2I Metrics ######
  ####################
@@ -202,6 +261,9 @@ class Mean(Aggregator):
 
      name = 'mean'
 
+     def agg_func(self, values: List[float]) -> float:
+         return mean(values)
+
      def __call__(self, scores: List[SampleScore]) -> List[AggScore]:
          """Aggregate scores by computing the mean for each metric.
 
@@ -230,7 +292,7 @@ class Mean(Aggregator):
              if values: # Only process non-empty value lists
                  aggregated_scores.append(
                      AggScore(
-                         score=mean(values),
+                         score=self.agg_func(values),
                          metric_name=metric_name,
                          aggregation_name=self.name,
                          num=len(values),
@@ -241,6 +303,20 @@ class Mean(Aggregator):
          return aggregated_scores
 
 
+ @register_aggregation(name='clipped_mean')
+ class ClippedMean(Mean):
+
+     name = 'clipped_mean'
+
+     def __init__(self, clip_min: float = 0.0, clip_max: float = 1.0):
+         self.clip_min = clip_min
+         self.clip_max = clip_max
+
+     def agg_func(self, values: List[float]) -> float:
+         clipped_values = min(max(mean(values), self.clip_min), self.clip_max)
+         return clipped_values
+
+
  @register_aggregation(name='pass_at_k')
  class PassAtK(Aggregator):
 
evalscope/metrics/metrics.py CHANGED
@@ -467,3 +467,19 @@ def calculate_pass_at_k(
      num_samples_it = iter(num_samples)
 
      return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
+
+
+ def levenshtein_distance(s1, s2):
+     if len(s1) > len(s2):
+         s1, s2 = s2, s1
+
+     distances = range(len(s1) + 1)
+     for i2, c2 in enumerate(s2):
+         distances_ = [i2 + 1]
+         for i1, c1 in enumerate(s1):
+             if c1 == c2:
+                 distances_.append(distances[i1])
+             else:
+                 distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
+         distances = distances_
+     return distances[-1]
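The helper added here is the standard single-row dynamic-programming edit distance, and the ANLS metric above reduces, for one prediction/reference pair, to 1 - dist/max_len, zeroed below the 0.5 threshold. A small worked example against the new function (the strings are invented for illustration):

from evalscope.metrics.metrics import levenshtein_distance

assert levenshtein_distance('kitten', 'sitting') == 3   # two substitutions and one insertion
assert levenshtein_distance('', 'abc') == 3             # three insertions

gt, pred = 'hello world', 'hello word'        # one edit apart
dist = levenshtein_distance(gt, pred)         # 1
score = 1 - dist / max(len(gt), len(pred))    # 1 - 1/11 ≈ 0.909
final = score if score >= 0.5 else 0.0        # above the ANLS threshold, so it is kept
print(round(final, 3))                        # 0.909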
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py CHANGED
@@ -30,13 +30,9 @@ from transformers.modeling_outputs import (
      SequenceClassifierOutput,
      TokenClassifierOutput,
  )
- from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
+ from transformers.modeling_utils import PreTrainedModel
  from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
  from transformers.utils import logging
  from typing import Any, Dict, Optional, Tuple
 
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py CHANGED
@@ -14,13 +14,9 @@ from transformers.modeling_outputs import (
      BaseModelOutputWithPastAndCrossAttentions,
      BaseModelOutputWithPoolingAndCrossAttentions,
  )
- from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
+ from transformers.modeling_utils import PreTrainedModel
  from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
  from transformers.utils import logging
  from typing import Tuple
 
evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py CHANGED
@@ -31,13 +31,9 @@ from transformers.modeling_outputs import (
      SequenceClassifierOutput,
      TokenClassifierOutput,
  )
- from transformers.modeling_utils import (
-     PreTrainedModel,
-     apply_chunking_to_forward,
-     find_pruneable_heads_and_indices,
-     prune_linear_layer,
- )
+ from transformers.modeling_utils import PreTrainedModel
  from transformers.models.bert.configuration_bert import BertConfig
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
  from transformers.utils import logging
  from typing import Optional, Tuple
 
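All three LAVIS modules now import the pruning and chunking helpers from transformers.pytorch_utils, where current transformers releases keep them, instead of transformers.modeling_utils. If an environment also had to support older transformers versions, a tolerant import along these lines would work; this fallback is an illustration, not code shipped in the package:

try:
    # Current transformers releases expose the helpers here.
    from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
except ImportError:
    # Older releases kept them in modeling_utils.
    from transformers.modeling_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer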
evalscope/models/model_apis.py CHANGED
@@ -28,7 +28,7 @@ def server() -> type[ModelAPI]:
 
  @register_model_api(name='llm_ckpt')
  def llm_ckpt() -> type[ModelAPI]:
-     check_import('torch', package='torch', raise_error=True)
+     check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
 
      from .modelscope import ModelScopeAPI
 
@@ -38,7 +38,7 @@ def llm_ckpt() -> type[ModelAPI]:
  @register_model_api(name='checkpoint')
  @deprecated(since='1.0.0', remove_in='1.1.0', alternative='llm_ckpt')
  def checkpoint() -> type[ModelAPI]:
-     check_import('torch', package='torch', raise_error=True)
+     check_import('torch', package='torch', raise_error=True, feature_name='llm_ckpt')
 
      from .modelscope import ModelScopeAPI
 
@@ -47,9 +47,10 @@ def checkpoint() -> type[ModelAPI]:
 
  @register_model_api(name='text2image')
  def text2image() -> type[ModelAPI]:
-     check_import('torch', package='evalscope[aigc]', raise_error=True)
-     check_import('torchvision', package='evalscope[aigc]', raise_error=True)
-     check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+     check_import(['torch', 'torchvision', 'diffusers'],
+                  package='evalscope[aigc]',
+                  raise_error=True,
+                  feature_name='text2image')
 
      from .text2image_model import Text2ImageAPI
 
@@ -58,9 +59,10 @@ def text2image() -> type[ModelAPI]:
 
  @register_model_api(name='image_editing')
  def image_editing() -> type[ModelAPI]:
-     check_import('torch', package='evalscope[aigc]', raise_error=True)
-     check_import('torchvision', package='evalscope[aigc]', raise_error=True)
-     check_import('diffusers', package='evalscope[aigc]', raise_error=True)
+     check_import(['torch', 'torchvision', 'diffusers'],
+                  package='evalscope[aigc]',
+                  raise_error=True,
+                  feature_name='image_editing')
 
      from .image_edit_model import ImageEditAPI
 
evalscope/models/utils/openai.py CHANGED
@@ -104,10 +104,9 @@ def openai_chat_completion_part(content: Content) -> ChatCompletionContentPartPa
          )
      elif content.type == 'audio':
          audio_data_uri = file_as_data_uri(content.audio)
-         audio_data = audio_data_uri.split('base64,')[1]
 
          return ChatCompletionContentPartInputAudioParam(
-             type='input_audio', input_audio=dict(data=audio_data, format=content.format)
+             type='input_audio', input_audio=dict(data=audio_data_uri, format=content.format)
          )
 
      else:
evalscope/perf/arguments.py CHANGED
@@ -55,6 +55,7 @@ class Arguments(BaseArgument):
      image_height: int = 224 # Height of the image for random VL dataset
      image_format: str = 'RGB' # Image format for random VL dataset
      image_num: int = 1 # Number of images for random VL dataset
+     image_patch_size: int = 28 # Patch size for image tokenizer, only for local image token calculation
 
      # Dataset settings
      dataset: str = 'openqa' # Dataset type (default: 'line_by_line')
@@ -171,6 +172,7 @@ def add_argument(parser: argparse.ArgumentParser):
      parser.add_argument('--image-height', type=int, default=224, help='Height of the image for random VL dataset')
      parser.add_argument('--image-format', type=str, default='RGB', help='Image format for random VL dataset')
      parser.add_argument('--image-num', type=int, default=1, help='Number of images for random VL dataset')
+     parser.add_argument('--image-patch-size', type=int, default=28, help='Patch size for image tokenizer, only for local image token calculation') # noqa: E501
 
      # Output settings
      parser.add_argument('--outputs-dir', help='Outputs dir.', default='outputs')
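The new --image-patch-size flag feeds the local estimate of how many visual tokens a randomly generated VL image contributes. The exact accounting lives in the perf plugins; below is a sketch of the usual patch-grid approximation, under the assumption of one token per patch with no merging or capping (the helper name is made up):

import math

def estimate_image_tokens(width: int, height: int, patch_size: int = 28) -> int:
    # Simple ViT-style grid: one token per patch_size x patch_size tile.
    return math.ceil(width / patch_size) * math.ceil(height / patch_size)

print(estimate_image_tokens(224, 224))  # 8 * 8 = 64 tokens for the default image size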
evalscope/perf/plugin/api/base.py CHANGED
@@ -43,7 +43,7 @@ class ApiPluginBase:
 
      @abstractmethod
      async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                               body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+                               body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
          """Process the HTTP request and handle the response.
 
          Args:
@@ -53,7 +53,7 @@ class ApiPluginBase:
              body: The request body
 
          Yields:
-             Tuple[bool, int, str]: (is_error, status_code, response_data)
+             Tuple[bool, int, Any]: (is_error, status_code, response_data)
          """
          raise NotImplementedError
 
evalscope/perf/plugin/api/default_api.py CHANGED
@@ -18,7 +18,7 @@ class DefaultApiPlugin(ApiPluginBase):
          super().__init__(param)
 
      async def process_request(self, client_session: aiohttp.ClientSession, url: str, headers: Dict,
-                               body: Dict) -> AsyncGenerator[Tuple[bool, int, str], None]:
+                               body: Dict) -> AsyncGenerator[Tuple[bool, int, Any], None]:
          """Process the HTTP request and handle the response.
 
          Args:
@@ -28,7 +28,7 @@ class DefaultApiPlugin(ApiPluginBase):
              body: The request body
 
          Yields:
-             Tuple[bool, int, str]: (is_error, status_code, response_data)
+             Tuple[bool, int, Any]: (is_error, status_code, response_data)
          """
          try:
              headers = {'Content-Type': 'application/json', **headers}
@@ -40,7 +40,7 @@ class DefaultApiPlugin(ApiPluginBase):
              logger.error(f'Error in process_request: {e}')
              yield (True, None, str(e))
 
-     async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+     async def _handle_stream(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
          """Handle streaming response from server-sent events.
 
          Args:
@@ -71,14 +71,14 @@ class DefaultApiPlugin(ApiPluginBase):
              logger.error(f'Error in _handle_stream: {e}')
              yield True, response.status, str(e)
 
-     async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, str], None]:
+     async def _handle_response(self, response: aiohttp.ClientResponse) -> AsyncGenerator[Tuple[bool, int, Any], None]:
          """Handle the HTTP response based on content type and status.
 
          Args:
              response: The aiohttp response object
 
          Yields:
-             Tuple[bool, int, str]: (is_error, status_code, response_data)
+             Tuple[bool, int, Any]: (is_error, status_code, response_data)
          """
          response_status = response.status
          response_content_type = response.content_type
@@ -94,7 +94,7 @@ class DefaultApiPlugin(ApiPluginBase):
              # Handle successful response with 'application/json' content type
              elif content_type_json in response_content_type:
                  content = await response.json()
-                 yield (False, response_status, json.dumps(content, ensure_ascii=False))
+                 yield (False, response_status, content)
              # Handle other successful responses
              else:
                  content = await response.read()
@@ -102,4 +102,4 @@ class DefaultApiPlugin(ApiPluginBase):
          else:
              # error is always in JSON format
              error = await response.json()
-             yield (True, response_status, json.dumps(error, ensure_ascii=False))
+             yield (True, response_status, error)