evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (155)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/app/ui/multi_model.py +6 -1
  17. evalscope/app/ui/single_model.py +8 -2
  18. evalscope/app/utils/data_utils.py +3 -2
  19. evalscope/app/utils/visualization.py +2 -2
  20. evalscope/arguments.py +6 -0
  21. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  22. evalscope/benchmarks/amc/__init__.py +0 -0
  23. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  24. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  25. evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
  26. evalscope/benchmarks/bfcl/generation.py +7 -7
  27. evalscope/benchmarks/blink/__init__.py +0 -0
  28. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  29. evalscope/benchmarks/chartqa/__init__.py +0 -0
  30. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  31. evalscope/benchmarks/chartqa/utils.py +38 -0
  32. evalscope/benchmarks/docvqa/__init__.py +0 -0
  33. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  34. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  35. evalscope/benchmarks/general_arena/utils.py +2 -1
  36. evalscope/benchmarks/healthbench/__init__.py +0 -0
  37. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  38. evalscope/benchmarks/healthbench/utils.py +102 -0
  39. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  40. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  41. evalscope/benchmarks/humaneval/utils.py +235 -0
  42. evalscope/benchmarks/infovqa/__init__.py +0 -0
  43. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  44. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  45. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  46. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  47. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  48. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  49. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  50. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  51. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  52. evalscope/benchmarks/mm_star/__init__.py +0 -0
  53. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  54. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  55. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  56. evalscope/benchmarks/multi_if/__init__.py +0 -0
  57. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  58. evalscope/benchmarks/multi_if/metrics.py +120 -0
  59. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  60. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  61. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  62. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  63. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  64. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  65. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  66. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  67. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  68. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  69. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  74. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  75. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  76. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  77. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  78. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  79. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  80. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  81. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  82. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  83. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  84. evalscope/config.py +24 -1
  85. evalscope/constants.py +3 -0
  86. evalscope/evaluator/evaluator.py +25 -7
  87. evalscope/metrics/metric.py +78 -2
  88. evalscope/metrics/metrics.py +16 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  102. evalscope/models/model_apis.py +10 -8
  103. evalscope/models/utils/openai.py +1 -2
  104. evalscope/perf/arguments.py +2 -0
  105. evalscope/perf/plugin/api/base.py +2 -2
  106. evalscope/perf/plugin/api/default_api.py +7 -7
  107. evalscope/perf/plugin/api/openai_api.py +83 -19
  108. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  109. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  110. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  111. evalscope/perf/utils/benchmark_util.py +1 -2
  112. evalscope/report/__init__.py +9 -1
  113. evalscope/report/combinator.py +45 -20
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +1 -1
  116. evalscope/utils/function_utils.py +41 -0
  117. evalscope/utils/import_utils.py +63 -13
  118. evalscope/utils/io_utils.py +19 -11
  119. evalscope/utils/json_schema.py +25 -2
  120. evalscope/utils/logger.py +19 -0
  121. evalscope/utils/model_utils.py +1 -1
  122. evalscope/utils/multi_choices.py +16 -1
  123. evalscope/version.py +2 -2
  124. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
  125. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
  126. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  127. tests/__init__.py +0 -1
  128. tests/benchmark/__init__.py +0 -1
  129. tests/benchmark/test_eval.py +0 -385
  130. tests/benchmark/test_image_edit.py +0 -65
  131. tests/benchmark/test_t2i.py +0 -142
  132. tests/benchmark/test_vlm.py +0 -80
  133. tests/cli/__init__.py +0 -1
  134. tests/cli/test_all.py +0 -269
  135. tests/cli/test_collection.py +0 -99
  136. tests/cli/test_custom.py +0 -268
  137. tests/cli/test_reasoning.py +0 -81
  138. tests/common.py +0 -73
  139. tests/perf/__init__.py +0 -1
  140. tests/perf/test_perf.py +0 -178
  141. tests/rag/test_clip_benchmark.py +0 -87
  142. tests/rag/test_mteb.py +0 -213
  143. tests/rag/test_ragas.py +0 -128
  144. tests/swift/__init__.py +0 -1
  145. tests/swift/test_run_swift_eval.py +0 -146
  146. tests/swift/test_run_swift_vlm_eval.py +0 -128
  147. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  148. tests/test_run_all.py +0 -12
  149. tests/utils.py +0 -13
  150. tests/vlm/__init__.py +0 -1
  151. tests/vlm/test_vlmeval.py +0 -102
  152. {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
  153. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  154. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  155. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -1,10 +1,13 @@
  import json
+ import math
  import os
+ from collections import defaultdict
  from typing import Any, Dict, List, Tuple, Union

  from evalscope.perf.arguments import Arguments
  from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
  from evalscope.perf.plugin.registry import register_api
+ from evalscope.utils.io_utils import base64_to_PIL
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -113,7 +116,7 @@ class OpenaiPlugin(DefaultApiPlugin):
              return input_tokens, output_tokens

          # no usage information in the response, parse the response to get the tokens
-         delta_contents = {}
+         delta_contents = defaultdict(list)
          for response in responses:
              if 'object' in response:
                  self.__process_response_object(response, delta_contents)
@@ -123,41 +126,46 @@ class OpenaiPlugin(DefaultApiPlugin):
          input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
          return input_tokens, output_tokens

-     def __process_response_object(self, js, delta_contents):
-         if js['object'] == 'chat.completion':
-             for choice in js['choices']:
+     def __process_response_object(self, response, delta_contents):
+         if not response.get('choices'):
+             return
+         if response['object'] == 'chat.completion':
+             for choice in response['choices']:
                  delta_contents[choice['index']] = [choice['message']['content']]
-         elif js['object'] == 'text_completion':
-             for choice in js['choices']:
-                 delta_contents[choice['index']] = [choice['text']]
-         elif js['object'] == 'chat.completion.chunk':
-             for choice in js.get('choices', []):
+         elif response['object'] == 'text_completion':
+             for choice in response['choices']:
+                 if 'text' in choice and 'index' in choice:
+                     delta_contents[choice['index']].append(choice['text'])
+         elif response['object'] == 'chat.completion.chunk':
+             for choice in response['choices']:
                  if 'delta' in choice and 'index' in choice:
                      delta = choice['delta']
                      idx = choice['index']
                      if 'content' in delta:
-                         delta_content = delta['content']
-                         delta_contents.setdefault(idx, []).append(delta_content)
+                         delta_contents[idx].append(delta['content'])

-     def __process_no_object(self, js, delta_contents):
+     def __process_no_object(self, response, delta_contents):
          # assume the response is a single choice
-         for choice in js['choices']:
+         if not response.get('choices'):
+             return
+         for choice in response['choices']:
              if 'delta' in choice:
                  delta = choice['delta']
                  idx = choice['index']
                  if 'content' in delta:
-                     delta_content = delta['content']
-                     delta_contents.setdefault(idx, []).append(delta_content)
+                     delta_contents[idx].append(delta['content'])
              else:
                  delta_contents[choice['index']] = [choice['message']['content']]

-     def __calculate_tokens_from_content(self, request, delta_contents):
+     def __calculate_tokens_from_content(self, request, content):
          input_tokens = output_tokens = 0
          if self.tokenizer is not None:
-             for idx, choice_contents in delta_contents.items():
+             # Calculate input tokens
+             input_tokens += self._count_input_tokens(request)
+             for idx, choice_contents in content.items():
                  full_response_content = ''.join(choice_contents)
-                 input_tokens += len(self.tokenizer.encode(request['messages'][0]['content']))
-                 output_tokens += len(self.tokenizer.encode(full_response_content))
+                 # Calculate output tokens
+                 output_tokens += self._count_output_tokens(full_response_content)
          else:
              raise ValueError(
                  'Error: Unable to retrieve usage information\n\n'
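
Besides renaming js to response and guarding against empty choices, the hunks above move accumulation onto defaultdict(list), which keeps the logic identical while dropping the per-key setdefault boilerplate. A standalone sketch of the pattern (sample strings only):

    from collections import defaultdict

    delta_contents = defaultdict(list)
    delta_contents[0].append('Hello')   # missing keys start as an empty list
    delta_contents[0].append(' world')
    ''.join(delta_contents[0])          # 'Hello world'
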
@@ -171,3 +179,59 @@ class OpenaiPlugin(DefaultApiPlugin):
                  'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
              )
          return input_tokens, output_tokens
+
+     def _count_input_tokens(self, request: Dict) -> int:
+         """Count the number of input tokens in the request.
+
+         This method handles different types of requests and calculates tokens for:
+         - Text content in messages or prompts
+         - Images in multimodal messages (converted to patch tokens)
+
+         Args:
+             request (Dict): The request dictionary containing either 'messages' for chat
+                 completion or 'prompt' for text completion.
+
+         Returns:
+             int: The total number of input tokens including text and image tokens.
+         """
+         input_tokens = 0
+         if 'messages' in request:
+             input_content = self.tokenizer.apply_chat_template(
+                 request['messages'], tokenize=True, add_generation_prompt=True
+             )
+             input_tokens += len(input_content)
+             # handle image tokens if any
+             for message in request['messages']:
+                 content = message.get('content', '')
+                 if isinstance(content, str):
+                     continue
+                 for cont in content:
+                     if cont['type'] == 'image_url':
+                         try:
+                             # assuming image_url is base64 string
+                             image_base64 = cont['image_url']['url']
+                             image = base64_to_PIL(image_base64)
+                             # Use math.ceil for more accurate token count when image dimensions
+                             # aren't perfectly divisible by patch size
+                             n_patches = (
+                                 math.ceil(image.height / self.param.image_patch_size)
+                                 * math.ceil(image.width / self.param.image_patch_size)
+                             )
+                             input_tokens += n_patches
+                         except Exception as e:
+                             logger.warning(f'Failed to process image for token counting: {e}')
+                             # Continue processing other content without failing
+         elif 'prompt' in request:
+             input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+         return input_tokens
+
+     def _count_output_tokens(self, response: str) -> int:
+         """Count the number of output tokens in the response. Only string response is supported.
+
+         Args:
+             response (str): The API response text.
+
+         Returns:
+             int: The number of output tokens.
+         """
+         return len(self.tokenizer.encode(response, add_special_tokens=False))
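
The image cost added in _count_input_tokens is plain patch counting. A worked example of the same arithmetic, assuming a hypothetical 336x512 image and a 14-pixel patch size (the real value comes from self.param.image_patch_size):

    import math

    height, width, patch_size = 336, 512, 14   # assumed values for illustration
    n_patches = math.ceil(height / patch_size) * math.ceil(width / patch_size)
    # ceil(336/14) * ceil(512/14) = 24 * 37 = 888 estimated image tokens
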
evalscope/perf/plugin/datasets/flickr8k.py CHANGED
@@ -22,7 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
          for item in dataset:
              pil_image = item['jpg']
              text = item['txt']
-             base64_image = PIL_to_base64(pil_image)
+             base64_image = PIL_to_base64(pil_image, add_header=True)

-             message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
+             message = self.create_message(text=text, image_urls=base64_image)
              yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py CHANGED
@@ -22,7 +22,7 @@ class KontextDatasetPlugin(DatasetPluginBase):
          for item in dataset:
              pil_image = item['image']
              text = item['instruction']
-             base64_image = PIL_to_base64(pil_image)
+             base64_image = PIL_to_base64(pil_image, add_header=True)

-             message = self.create_message(text=text, image_urls=f'data:image/jpeg;base64,{base64_image}')
+             message = self.create_message(text=text, image_urls=base64_image)
              yield [message]
evalscope/perf/plugin/datasets/random_vl_dataset.py CHANGED
@@ -31,7 +31,7 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
          # Generate random images based on image_num
          images_b64 = []
          for _ in range(self.image_num):
-             images_b64.append(f'data:image/png;base64,{self._generate_random_image_b64()}')
+             images_b64.append(self._generate_random_image_b64())

          message = self.create_message(text=prompt, image_urls=images_b64)
          yield [message]
@@ -77,4 +77,4 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
              draw.line(coords, fill=shape_color, width=random.randint(1, 5))

          # Convert to base64
-         return PIL_to_base64(image, format='PNG')
+         return PIL_to_base64(image, format='PNG', add_header=True)
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -44,8 +44,7 @@ class BenchmarkData:
          api_plugin.parse_responses(self.response_messages, request=self.request)

      def update_gpu_usage(self):
-         if check_import('torch'):
-
+         if check_import('torch', raise_warning=False):
              import torch
              total_memory = 0
              for i in range(torch.cuda.device_count()):
evalscope/report/__init__.py CHANGED
@@ -4,7 +4,13 @@ from typing import TYPE_CHECKING
  from evalscope.utils.import_utils import _LazyModule

  if TYPE_CHECKING:
-     from .combinator import gen_table, get_data_frame, get_report_list
+     from .combinator import (
+         gen_table,
+         get_data_frame,
+         get_report_list,
+         unweighted_average_from_subsets,
+         weighted_average_from_subsets,
+     )
      from .generator import ReportGenerator
      from .report import Category, Report, ReportKey, Subset

@@ -14,6 +20,8 @@ else:
          'gen_table',
          'get_data_frame',
          'get_report_list',
+         'weighted_average_from_subsets',
+         'unweighted_average_from_subsets',
      ],
      'generator': [
          'ReportGenerator',
evalscope/report/combinator.py CHANGED
@@ -4,9 +4,9 @@ import glob
  import os
  import pandas as pd
  from tabulate import tabulate
- from typing import List, Tuple
+ from typing import Dict, List, Tuple, Union

- from evalscope.report.report import Report
+ from evalscope.report.report import Report, Subset
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -88,26 +88,51 @@ def gen_table(
      return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)


- class ReportsRecorder:
-     COMMON_DATASET_PATH = []
-     CUSTOM_DATASET_PATH = []
+ def weighted_average_from_subsets(
+     subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+ ) -> Subset:
+     """Calculate weighted average for given subsets.

-     def __init__(self, oss_url: str = '', endpoint: str = ''):
-         pass
+     Args:
+         subset_names (List[str]): List of subset names to include in the average.
+         subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+         new_name (str): Name for the resulting Subset object.
+
+     Returns:
+         Subset: A new Subset object with weighted average score
+     """
+     total_score = 0
+     total_count = 0
+     for name in subset_names:
+         if name in subset_dict:
+             subset = subset_dict[name]
+             total_score += subset.score * subset.num
+             total_count += subset.num

+     weighted_avg = total_score / total_count if total_count > 0 else 0
+     return Subset(name=new_name, score=weighted_avg, num=total_count)

- if __name__ == '__main__':
-     report_dir_1 = './outputs/20250117_151926'
-     # report_dir_2 = './outputs/20250107_204445/reports'

-     report_table = gen_table(reports_path_list=[report_dir_1])
-     print(report_table)
+ def unweighted_average_from_subsets(
+     subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+ ) -> Subset:
+     """Calculate unweighted average for given subsets.

-     # ALL VALUES ONLY FOR EXAMPLE
-     # +--------------------------+-------------------+-------------+
-     # | Model                    | CompetitionMath   | GSM8K       |
-     # +==========================+===================+=============+
-     # | ZhipuAI_chatglm2-6b-base | 25.0 (acc)        | 30.50 (acc) |
-     # +--------------------------+-------------------+-------------+
-     # | ZhipuAI_chatglm2-6b      | 30.5 (acc)        | 40.50 (acc) |
-     # +--------------------------+-------------------+-------------+
+     Args:
+         subset_names (List[str]): List of subset names to include in the average.
+         subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+         new_name (str): Name for the resulting Subset object.
+
+     Returns:
+         Subset: A new Subset object with unweighted average score
+     """
+     scores = []
+     total_count = 0
+     for name in subset_names:
+         if name in subset_dict:
+             subset = subset_dict[name]
+             scores.append(subset.score)
+             total_count += subset.num
+
+     unweighted_avg = sum(scores) / len(scores) if scores else 0
+     return Subset(name=new_name, score=unweighted_avg, num=total_count)
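
The two new helpers differ only in whether subset sizes weight the mean. A small sketch with made-up subsets, assuming the re-exports added to evalscope/report/__init__.py above:

    from evalscope.report import Subset, unweighted_average_from_subsets, weighted_average_from_subsets

    subsets = {
        'easy': Subset(name='easy', score=0.9, num=80),
        'hard': Subset(name='hard', score=0.5, num=20),
    }
    weighted = weighted_average_from_subsets(['easy', 'hard'], subsets, new_name='OVERALL')
    unweighted = unweighted_average_from_subsets(['easy', 'hard'], subsets, new_name='OVERALL')
    # weighted.score   -> (0.9 * 80 + 0.5 * 20) / 100 = 0.82
    # unweighted.score -> (0.9 + 0.5) / 2 = 0.70
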
evalscope/report/report.py CHANGED
@@ -22,7 +22,7 @@ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分
  """


- def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+ def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
      """
      Normalize score.

@@ -37,9 +37,10 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
          score = round(score, keep_num)
      elif isinstance(score, dict):
          score = {k: round(v, keep_num) for k, v in score.items()}
+     elif isinstance(score, int):
+         score = float(score)
      else:
          logger.warning(f'Unknown score type: {type(score)}')
-
      return score


@@ -103,6 +104,7 @@ class ReportKey:
      subset_name = 'Subset'
      num = 'Num'
      score = 'Score'
+     overall_score = 'OVERALL'


  @dataclass
@@ -181,12 +183,14 @@ class Report:
                  table[ReportKey.num].append(subset.num)
                  table[ReportKey.score].append(subset.score)
              # add overall metric when there are multiple subsets
-             if metric_count > 1 and add_overall_metric:
+             if metric_count > 1 and add_overall_metric and (
+                 ReportKey.overall_score not in table[ReportKey.subset_name]
+             ):
                  table[ReportKey.model_name].append(self.model_name)
                  table[ReportKey.dataset_name].append(self.dataset_name)
                  table[ReportKey.metric_name].append(metric.name)
                  table[ReportKey.category_name].append(('-', ))
-                 table[ReportKey.subset_name].append('OVERALL')
+                 table[ReportKey.subset_name].append(ReportKey.overall_score)
                  table[ReportKey.num].append(metric.num)
                  table[ReportKey.score].append(metric.score)
          # NOTE: only flatten metrics if needed, use the first metric by default
evalscope/run.py CHANGED
@@ -159,7 +159,7 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
      gc.collect()

      from evalscope.utils.import_utils import check_import
-     if check_import('torch'):
+     if check_import('torch', raise_warning=False):
          import torch
          if torch.cuda.is_available():
              torch.cuda.empty_cache()
evalscope/utils/function_utils.py CHANGED
@@ -1,4 +1,6 @@
  import threading
+ import time
+ from contextlib import contextmanager
  from functools import wraps


@@ -27,3 +29,42 @@
          return func(*args, **kwargs)

      return wrapper
+
+
+ def retry_func(retries=3, sleep_interval=0):
+     """A decorator that retries a function call up to `retries` times if an exception occurs."""
+
+     def decorator(func):
+
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             last_exception = None
+             for attempt in range(retries):
+                 try:
+                     return func(*args, **kwargs)
+                 except Exception as e:
+                     last_exception = e
+                     if sleep_interval > 0:
+                         time.sleep(sleep_interval)
+             raise last_exception
+
+         return wrapper
+
+     return decorator
+
+
+ @contextmanager
+ def retry_context(retries=3, sleep_interval=0):
+     """A context manager that retries the code block up to `retries` times if an exception occurs."""
+     last_exception = None
+     for attempt in range(retries):
+         try:
+             yield
+             return  # If no exception, exit successfully
+         except Exception as e:
+             last_exception = e
+             if sleep_interval > 0:
+                 time.sleep(sleep_interval)
+             if attempt == retries - 1:  # Last attempt
+                 break
+     raise last_exception
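
A usage sketch for the two new retry helpers; the flaky function is hypothetical and the behavior is as described by the docstrings above:

    from evalscope.utils.function_utils import retry_context, retry_func

    @retry_func(retries=3, sleep_interval=1)
    def fetch_result():
        ...  # hypothetical call that may raise transiently

    # Context-manager form for wrapping an arbitrary block with the same policy:
    with retry_context(retries=3, sleep_interval=1):
        fetch_result()
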
evalscope/utils/import_utils.py CHANGED
@@ -7,32 +7,82 @@ from itertools import chain
  from types import ModuleType
  from typing import Any, Optional, Union

+ from evalscope.constants import IS_BUILD_DOC
  from .logger import get_logger

  logger = get_logger()  # pylint: disable=invalid-name


- def check_import(module_name: str, package: Optional[str] = None, raise_error: bool = False) -> bool:
-     """Check if a module can be imported.
+ def check_import(
+     module_name: Union[str, list[str]],
+     package: Optional[Union[str, list[str]]] = None,
+     raise_warning: bool = True,
+     raise_error: bool = False,
+     feature_name: Optional[str] = 'this feature',
+ ) -> bool:
+     """Check if a module or list of modules can be imported.

      Args:
-         module_name (str): The name of the module to check.
-         package (str, optional): The package to install if the module is not found. Defaults to None.
-         raise_error (bool, optional): Whether to raise an error if the module is not found. Defaults to False.
+         module_name (Union[str, list[str]]): The name(s) of the module(s) to check.
+         package (Union[str, list[str]], optional): The package(s) to install if the module(s) are not found.
+             Defaults to None.
+         raise_error (bool, optional): Whether to raise an error if any module is not found. Defaults to False.
+         raise_warning (bool, optional): Whether to log a warning if any module is not found. Defaults to True.
+         feature_name (str, optional): The feature name that requires the module(s). Used in the warning/error message.
+             Defaults to 'this feature'.
+
+     Returns:
+         bool: True if all modules can be imported, False otherwise.
      """
-     try:
-         importlib.import_module(module_name)
-         return True
-     except ImportError:
-         error_msg = f'`{module_name}` not found.'
-         if package:
-             error_msg += f' Please run `pip install {package}` to use this feature.'
+     # Convert single strings to lists for uniform processing
+     if isinstance(module_name, str):
+         module_names = [module_name]
+     else:
+         module_names = module_name
+
+     if package is None:
+         packages = [None] * len(module_names)
+     elif isinstance(package, str):
+         packages = [package] * len(module_names)
+     else:
+         packages = package
+         # Ensure packages list has same length as module_names
+         if len(packages) < len(module_names):
+             packages.extend([None] * (len(module_names) - len(packages)))
+
+     missing_modules = []
+     missing_packages = []
+
+     for i, mod_name in enumerate(module_names):
+         try:
+             importlib.import_module(mod_name)
+         except ImportError:
+             missing_modules.append(mod_name)
+             if i < len(packages) and packages[i]:
+                 missing_packages.append(packages[i])
+
+     if missing_modules:
+         if len(missing_modules) == 1:
+             error_msg = f'`{missing_modules[0]}` not found.'
+         else:
+             error_msg = f'The following modules are not found: {", ".join(f"`{mod}`" for mod in missing_modules)}.'
+
+         if missing_packages:
+             if len(missing_packages) == 1:
+                 error_msg += f' Please run `pip install {missing_packages[0]}` to use {feature_name}.'
+             else:
+                 unique_packages = list(dict.fromkeys(missing_packages))  # Remove duplicates while preserving order
+                 error_msg += f' Please run `pip install {" ".join(unique_packages)}` to use {feature_name}.'
+
+         if raise_warning:
              logger.warning(error_msg)

-         if raise_error:
+         if not IS_BUILD_DOC and raise_error:
              raise ImportError(error_msg)
          return False

+     return True
+

  class _LazyModule(ModuleType):
      """
evalscope/utils/io_utils.py CHANGED
@@ -9,6 +9,7 @@ import re
  import string
  import unicodedata
  import yaml
+ from datetime import datetime
  from io import BytesIO
  from PIL import Image

@@ -123,6 +124,9 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
      if not isinstance(data_list, list):
          data_list = [data_list]

+     # Convert non-serializable types to serializable ones
+     data_list = convert_normal_types(data_list)
+
      if dump_mode == DumpMode.OVERWRITE:
          dump_mode = 'w'
      elif dump_mode == DumpMode.APPEND:
@@ -304,20 +308,22 @@ def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = F
      return img_str


- def bytes_to_base64(bytes_data: bytes, format: str = 'png', add_header: bool = False) -> str:
-     """Convert image bytes to a base64 encoded string.
+ def bytes_to_base64(bytes_data: bytes, *, format: str = 'png', add_header: bool = False, content_type='image') -> str:
+     """Convert bytes to a base64 encoded string.

      Args:
          bytes_data (bytes): The bytes to convert.
+         format (str): The format of the image. Default is 'png'.
          add_header (bool): Whether to add the base64 header. Default is False.
+         content_type (str): The type of the data, 'image' or 'audio'. Default is 'image'.

      Returns:
          str: Base64 encoded string of the bytes.
      """
-     img_str = base64.b64encode(bytes_data).decode('utf-8')
+     base64_str = base64.b64encode(bytes_data).decode('utf-8')
      if add_header:
-         img_str = f'data:image/{format};base64,{img_str}'
-     return img_str
+         base64_str = f'data:{content_type}/{format};base64,{base64_str}'
+     return base64_str


  def base64_to_PIL(base64_str):
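
With the keyword-only arguments, callers now get the data-URI header from the helper instead of formatting it themselves (as the perf dataset plugins above now do). A small sketch with placeholder bytes:

    from evalscope.utils.io_utils import bytes_to_base64

    audio_bytes = b'RIFF....WAVE'   # placeholder, not real audio data
    uri = bytes_to_base64(audio_bytes, format='wav', add_header=True, content_type='audio')
    # -> 'data:audio/wav;base64,<encoded bytes>'
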
@@ -392,11 +398,13 @@ def safe_filename(s: str, max_length: int = 255) -> str:
      return s


- def convert_numpy_types(obj):
-     """Recursively convert numpy types to native Python types for JSON serialization."""
+ def convert_normal_types(obj):
+     """Recursively convert numpy types and datetime objects to native Python types for JSON serialization."""
      import numpy as np

-     if isinstance(obj, np.bool_):
+     if isinstance(obj, datetime):
+         return obj.isoformat()
+     elif isinstance(obj, np.bool_):
          return bool(obj)
      elif isinstance(obj, np.integer):
          return int(obj)
@@ -405,10 +413,10 @@
      elif isinstance(obj, np.ndarray):
          return obj.tolist()
      elif isinstance(obj, dict):
-         return {key: convert_numpy_types(value) for key, value in obj.items()}
+         return {key: convert_normal_types(value) for key, value in obj.items()}
      elif isinstance(obj, list):
-         return [convert_numpy_types(item) for item in obj]
+         return [convert_normal_types(item) for item in obj]
      elif isinstance(obj, tuple):
-         return tuple(convert_numpy_types(item) for item in obj)
+         return tuple(convert_normal_types(item) for item in obj)
      else:
          return obj
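
A sketch of what dump_jsonl_data now receives after conversion (the values are made up):

    import json
    from datetime import datetime

    import numpy as np

    from evalscope.utils.io_utils import convert_normal_types

    record = {'num': np.int64(100), 'passed': np.bool_(True), 'finished_at': datetime(2025, 1, 17, 15, 19)}
    json.dumps(convert_normal_types(record))
    # '{"num": 100, "passed": true, "finished_at": "2025-01-17T15:19:00"}'
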
evalscope/utils/json_schema.py CHANGED
@@ -4,7 +4,7 @@ from copy import deepcopy
  from dataclasses import is_dataclass
  from datetime import date, datetime, time
  from enum import EnumMeta
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel, Field, field_validator, model_validator
  from typing import (
      Any,
      Dict,
@@ -59,6 +59,28 @@ class JSONSchema(BaseModel):
      required: Optional[List[str]] = Field(default=None)
      """Required fields for object parameters."""

+     @model_validator(mode='before')
+     def convert_type_before_validation(cls, values):
+         values = deepcopy(values)
+
+         def recursive_convert_type(obj):
+             if isinstance(obj, dict):
+                 # Convert 'type' field if it's a string
+                 if 'type' in obj and isinstance(obj['type'], str):
+                     try:
+                         obj['type'] = python_type_to_json_type(obj['type'])
+                     except ValueError:
+                         # If conversion fails, leave it as is
+                         pass
+                 # Recursively process nested structures
+                 for k, v in obj.items():
+                     obj[k] = recursive_convert_type(v)
+             elif isinstance(obj, list):
+                 return [recursive_convert_type(item) for item in obj]
+             return obj
+
+         return recursive_convert_type(values)
+


  def json_schema(t: Type[Any]) -> JSONSchema:
@@ -152,6 +174,8 @@ def cls_json_schema(cls: Type[Any]) -> JSONSchema:


  def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+     if python_type is not None and python_type in get_args(JSONType):
+         return python_type
      if python_type == 'str':
          return 'string'
      elif python_type == 'int':
@@ -205,4 +229,3 @@ def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
          return obj

      return cast(Dict[str, Any], _resolve_refs(schema))
-     return cast(Dict[str, Any], _resolve_refs(schema))
evalscope/utils/logger.py CHANGED
@@ -28,6 +28,25 @@ logging.getLogger('datasets').setLevel(logging.WARNING)
  logging.getLogger('httpx').setLevel(logging.WARNING)
  logging.getLogger('modelscope').setLevel(logging.ERROR)

+ info_set = set()
+ warning_set = set()
+
+
+ def info_once(self, msg, *args, **kwargs):
+     hash_id = kwargs.get('hash_id') or msg
+     if hash_id in info_set:
+         return
+     info_set.add(hash_id)
+     self.info(msg)
+
+
+ def warning_once(self, msg, *args, **kwargs):
+     hash_id = kwargs.get('hash_id') or msg
+     if hash_id in warning_set:
+         return
+     warning_set.add(hash_id)
+     self.warning(msg)
+

  def get_logger(
      log_file: Optional[str] = None,
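
The once-only helpers deduplicate on the message text, or on an explicit hash_id when one is passed. The hunk does not show how they are bound to the logger, so this sketch calls them directly with a logger instance as self, which is an assumption about the intended use:

    from evalscope.utils.logger import get_logger, info_once

    logger = get_logger()
    info_once(logger, 'loading tokenizer...')                         # logged
    info_once(logger, 'loading tokenizer...')                         # suppressed: same message already seen
    info_once(logger, 'falling back to CPU', hash_id='cpu-fallback')  # dedup keyed on hash_id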