evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
- evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
- evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
- evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
- evalscope/api/benchmark/benchmark.py +27 -2
- evalscope/api/benchmark/meta.py +3 -0
- evalscope/api/evaluator/evaluator.py +5 -0
- evalscope/api/evaluator/state.py +5 -0
- evalscope/api/messages/chat_message.py +6 -1
- evalscope/api/mixin/__init__.py +1 -0
- evalscope/api/mixin/llm_judge_mixin.py +2 -0
- evalscope/api/mixin/sandbox_mixin.py +204 -0
- evalscope/api/model/generate_config.py +0 -3
- evalscope/api/model/model.py +1 -1
- evalscope/api/tool/tool_info.py +1 -1
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/arguments.py +6 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
- evalscope/benchmarks/amc/__init__.py +0 -0
- evalscope/benchmarks/amc/amc_adapter.py +46 -0
- evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
- evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
- evalscope/benchmarks/bfcl/generation.py +7 -7
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/drop/drop_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/healthbench/__init__.py +0 -0
- evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
- evalscope/benchmarks/healthbench/utils.py +102 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
- evalscope/benchmarks/humaneval/utils.py +235 -0
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
- evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
- evalscope/benchmarks/minerva_math/__init__.py +0 -0
- evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
- evalscope/benchmarks/mm_bench/__init__.py +0 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
- evalscope/benchmarks/mm_star/__init__.py +0 -0
- evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
- evalscope/benchmarks/multi_if/__init__.py +0 -0
- evalscope/benchmarks/multi_if/ifeval.py +3354 -0
- evalscope/benchmarks/multi_if/metrics.py +120 -0
- evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
- evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
- evalscope/benchmarks/olympiad_bench/utils.py +565 -0
- evalscope/benchmarks/omni_bench/__init__.py +0 -0
- evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
- evalscope/benchmarks/real_world_qa/__init__.py +0 -0
- evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
- evalscope/config.py +24 -1
- evalscope/constants.py +3 -0
- evalscope/evaluator/evaluator.py +25 -7
- evalscope/metrics/metric.py +78 -2
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/model_apis.py +10 -8
- evalscope/models/utils/openai.py +1 -2
- evalscope/perf/arguments.py +2 -0
- evalscope/perf/plugin/api/base.py +2 -2
- evalscope/perf/plugin/api/default_api.py +7 -7
- evalscope/perf/plugin/api/openai_api.py +83 -19
- evalscope/perf/plugin/datasets/flickr8k.py +2 -2
- evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
- evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
- evalscope/perf/utils/benchmark_util.py +1 -2
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +45 -20
- evalscope/report/report.py +8 -4
- evalscope/run.py +1 -1
- evalscope/utils/function_utils.py +41 -0
- evalscope/utils/import_utils.py +63 -13
- evalscope/utils/io_utils.py +19 -11
- evalscope/utils/json_schema.py +25 -2
- evalscope/utils/logger.py +19 -0
- evalscope/utils/model_utils.py +1 -1
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -385
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -80
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -178
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/perf/plugin/api/openai_api.py CHANGED

@@ -1,10 +1,13 @@
 import json
+import math
 import os
+from collections import defaultdict
 from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.perf.arguments import Arguments
 from evalscope.perf.plugin.api.default_api import DefaultApiPlugin
 from evalscope.perf.plugin.registry import register_api
+from evalscope.utils.io_utils import base64_to_PIL
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -113,7 +116,7 @@ class OpenaiPlugin(DefaultApiPlugin):
             return input_tokens, output_tokens
 
         # no usage information in the response, parse the response to get the tokens
-        delta_contents =
+        delta_contents = defaultdict(list)
        for response in responses:
            if 'object' in response:
                self.__process_response_object(response, delta_contents)
@@ -123,41 +126,46 @@ class OpenaiPlugin(DefaultApiPlugin):
        input_tokens, output_tokens = self.__calculate_tokens_from_content(request, delta_contents)
        return input_tokens, output_tokens
 
-    def __process_response_object(self,
-        if
-
+    def __process_response_object(self, response, delta_contents):
+        if not response.get('choices'):
+            return
+        if response['object'] == 'chat.completion':
+            for choice in response['choices']:
                delta_contents[choice['index']] = [choice['message']['content']]
-        elif
-            for choice in
-
-
-
+        elif response['object'] == 'text_completion':
+            for choice in response['choices']:
+                if 'text' in choice and 'index' in choice:
+                    delta_contents[choice['index']].append(choice['text'])
+        elif response['object'] == 'chat.completion.chunk':
+            for choice in response['choices']:
                if 'delta' in choice and 'index' in choice:
                    delta = choice['delta']
                    idx = choice['index']
                    if 'content' in delta:
-
-                        delta_contents.setdefault(idx, []).append(delta_content)
+                        delta_contents[idx].append(delta['content'])
 
-    def __process_no_object(self,
+    def __process_no_object(self, response, delta_contents):
        # assume the response is a single choice
-
+        if not response.get('choices'):
+            return
+        for choice in response['choices']:
            if 'delta' in choice:
                delta = choice['delta']
                idx = choice['index']
                if 'content' in delta:
-
-                    delta_contents.setdefault(idx, []).append(delta_content)
+                    delta_contents[idx].append(delta['content'])
            else:
                delta_contents[choice['index']] = [choice['message']['content']]
 
-    def __calculate_tokens_from_content(self, request,
+    def __calculate_tokens_from_content(self, request, content):
        input_tokens = output_tokens = 0
        if self.tokenizer is not None:
-
+            # Calculate input tokens
+            input_tokens += self._count_input_tokens(request)
+            for idx, choice_contents in content.items():
                full_response_content = ''.join(choice_contents)
-
-                output_tokens +=
+                # Calculate output tokens
+                output_tokens += self._count_output_tokens(full_response_content)
        else:
            raise ValueError(
                'Error: Unable to retrieve usage information\n\n'
@@ -171,3 +179,59 @@ class OpenaiPlugin(DefaultApiPlugin):
                'please open an issue on our GitHub repository https://github.com/modelscope/evalscope .'
            )
        return input_tokens, output_tokens
+
+    def _count_input_tokens(self, request: Dict) -> int:
+        """Count the number of input tokens in the request.
+
+        This method handles different types of requests and calculates tokens for:
+        - Text content in messages or prompts
+        - Images in multimodal messages (converted to patch tokens)
+
+        Args:
+            request (Dict): The request dictionary containing either 'messages' for chat
+                completion or 'prompt' for text completion.
+
+        Returns:
+            int: The total number of input tokens including text and image tokens.
+        """
+        input_tokens = 0
+        if 'messages' in request:
+            input_content = self.tokenizer.apply_chat_template(
+                request['messages'], tokenize=True, add_generation_prompt=True
+            )
+            input_tokens += len(input_content)
+            # handle image tokens if any
+            for message in request['messages']:
+                content = message.get('content', '')
+                if isinstance(content, str):
+                    continue
+                for cont in content:
+                    if cont['type'] == 'image_url':
+                        try:
+                            # assuming image_url is base64 string
+                            image_base64 = cont['image_url']['url']
+                            image = base64_to_PIL(image_base64)
+                            # Use math.ceil for more accurate token count when image dimensions
+                            # aren't perfectly divisible by patch size
+                            n_patches = (
+                                math.ceil(image.height / self.param.image_patch_size)
+                                * math.ceil(image.width / self.param.image_patch_size)
+                            )
+                            input_tokens += n_patches
+                        except Exception as e:
+                            logger.warning(f'Failed to process image for token counting: {e}')
+                            # Continue processing other content without failing
+        elif 'prompt' in request:
+            input_tokens += len(self.tokenizer.encode(request['prompt'], add_special_tokens=False))
+        return input_tokens
+
+    def _count_output_tokens(self, response: str) -> int:
+        """Count the number of output tokens in the response. Only string response is supported.
+
+        Args:
+            response (str): The API response text.
+
+        Returns:
+            int: The number of output tokens.
+        """
+        return len(self.tokenizer.encode(response, add_special_tokens=False))
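The image branch of _count_input_tokens above prices each image as a grid of patches. A worked example of that arithmetic, not part of the diff; the image dimensions and patch size are made up (in evalscope the divisor comes from the perf argument image_patch_size):

import math

height, width, patch_size = 1024, 768, 28  # hypothetical image and patch size
n_patches = math.ceil(height / patch_size) * math.ceil(width / patch_size)
print(n_patches)  # 37 * 28 = 1036 input tokens attributed to this image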
evalscope/perf/plugin/datasets/flickr8k.py CHANGED

@@ -22,7 +22,7 @@ class FlickrDatasetPlugin(DatasetPluginBase):
        for item in dataset:
            pil_image = item['jpg']
            text = item['txt']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
            yield [message]
evalscope/perf/plugin/datasets/kontext_bench.py CHANGED

@@ -22,7 +22,7 @@ class KontextDatasetPlugin(DatasetPluginBase):
        for item in dataset:
            pil_image = item['image']
            text = item['instruction']
-            base64_image = PIL_to_base64(pil_image)
+            base64_image = PIL_to_base64(pil_image, add_header=True)
 
-            message = self.create_message(text=text, image_urls=
+            message = self.create_message(text=text, image_urls=base64_image)
            yield [message]
evalscope/perf/plugin/datasets/random_vl_dataset.py CHANGED

@@ -31,7 +31,7 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
        # Generate random images based on image_num
        images_b64 = []
        for _ in range(self.image_num):
-            images_b64.append(
+            images_b64.append(self._generate_random_image_b64())
 
        message = self.create_message(text=prompt, image_urls=images_b64)
        yield [message]
@@ -77,4 +77,4 @@ class RandomVLDatasetPlugin(RandomDatasetPlugin):
            draw.line(coords, fill=shape_color, width=random.randint(1, 5))
 
        # Convert to base64
-        return PIL_to_base64(image, format='PNG')
+        return PIL_to_base64(image, format='PNG', add_header=True)
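All three perf dataset plugins above now pass add_header=True when encoding images. A small sketch of what that flag is expected to change, not part of the diff; it assumes the header follows the same data:<content_type>/<format>;base64, pattern used by the new bytes_to_base64 in io_utils, since the exact string produced by PIL_to_base64 is not shown in these hunks:

from PIL import Image
from evalscope.utils.io_utils import PIL_to_base64

img = Image.new('RGB', (64, 64), color='white')          # stand-in image
raw = PIL_to_base64(img, format='PNG')                    # bare base64 payload
url = PIL_to_base64(img, format='PNG', add_header=True)   # expected to start with a 'data:image/...;base64,' header
print(raw.startswith('data:'), url.startswith('data:'))   # expect: False True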
evalscope/perf/utils/benchmark_util.py CHANGED

@@ -44,8 +44,7 @@ class BenchmarkData:
        api_plugin.parse_responses(self.response_messages, request=self.request)
 
    def update_gpu_usage(self):
-        if check_import('torch'):
-
+        if check_import('torch', raise_warning=False):
            import torch
            total_memory = 0
            for i in range(torch.cuda.device_count()):
evalscope/report/__init__.py CHANGED

@@ -4,7 +4,13 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .combinator import
+    from .combinator import (
+        gen_table,
+        get_data_frame,
+        get_report_list,
+        unweighted_average_from_subsets,
+        weighted_average_from_subsets,
+    )
     from .generator import ReportGenerator
     from .report import Category, Report, ReportKey, Subset
 
@@ -14,6 +20,8 @@ else:
        'gen_table',
        'get_data_frame',
        'get_report_list',
+        'weighted_average_from_subsets',
+        'unweighted_average_from_subsets',
    ],
    'generator': [
        'ReportGenerator',
evalscope/report/combinator.py CHANGED

@@ -4,9 +4,9 @@ import glob
 import os
 import pandas as pd
 from tabulate import tabulate
-from typing import List, Tuple
+from typing import Dict, List, Tuple, Union
 
-from evalscope.report.report import Report
+from evalscope.report.report import Report, Subset
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
@@ -88,26 +88,51 @@ def gen_table(
    return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
 
 
-
-
-
+def weighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate weighted average for given subsets.
 
-
-
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with weighted average score
+    """
+    total_score = 0
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            total_score += subset.score * subset.num
+            total_count += subset.num
 
+    weighted_avg = total_score / total_count if total_count > 0 else 0
+    return Subset(name=new_name, score=weighted_avg, num=total_count)
 
-if __name__ == '__main__':
-    report_dir_1 = './outputs/20250117_151926'
-    # report_dir_2 = './outputs/20250107_204445/reports'
 
-
-
+def unweighted_average_from_subsets(
+    subset_names: List[str], subset_dict: Dict[str, Subset], new_name: str = ''
+) -> Subset:
+    """Calculate unweighted average for given subsets.
 
-
-
-
-
-
-
-
-
+    Args:
+        subset_names (List[str]): List of subset names to include in the average.
+        subset_dict (Dict[str, Subset]): Dictionary mapping subset names to Subset objects.
+        new_name (str): Name for the resulting Subset object.
+
+    Returns:
+        Subset: A new Subset object with unweighted average score
+    """
+    scores = []
+    total_count = 0
+    for name in subset_names:
+        if name in subset_dict:
+            subset = subset_dict[name]
+            scores.append(subset.score)
+            total_count += subset.num
+
+    unweighted_avg = sum(scores) / len(scores) if scores else 0
+    return Subset(name=new_name, score=unweighted_avg, num=total_count)
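A usage sketch for the two new helpers, not part of the diff; the Subset fields and import paths come from the hunks above, the scores and counts are made up:

from evalscope.report import unweighted_average_from_subsets, weighted_average_from_subsets
from evalscope.report.report import Subset

subsets = {
    'easy': Subset(name='easy', score=0.9, num=100),
    'hard': Subset(name='hard', score=0.5, num=300),
}
overall_w = weighted_average_from_subsets(['easy', 'hard'], subsets, new_name='OVERALL')
overall_u = unweighted_average_from_subsets(['easy', 'hard'], subsets, new_name='OVERALL')
print(overall_w.score, overall_w.num)  # 0.6 400 -> (0.9*100 + 0.5*300) / 400
print(overall_u.score, overall_u.num)  # 0.7 400 -> plain mean of the two subset scores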
evalscope/report/report.py CHANGED

@@ -22,7 +22,7 @@ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分
 """
 
 
-def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float, dict]:
+def normalize_score(score: Union[float, dict, int], keep_num: int = 4) -> Union[float, dict]:
    """
    Normalize score.
 
@@ -37,9 +37,10 @@ def normalize_score(score: Union[float, dict], keep_num: int = 4) -> Union[float
        score = round(score, keep_num)
    elif isinstance(score, dict):
        score = {k: round(v, keep_num) for k, v in score.items()}
+    elif isinstance(score, int):
+        score = float(score)
    else:
        logger.warning(f'Unknown score type: {type(score)}')
-
    return score
 
 
@@ -103,6 +104,7 @@ class ReportKey:
    subset_name = 'Subset'
    num = 'Num'
    score = 'Score'
+    overall_score = 'OVERALL'
 
 
 @dataclass
@@ -181,12 +183,14 @@ class Report:
                table[ReportKey.num].append(subset.num)
                table[ReportKey.score].append(subset.score)
            # add overall metric when there are multiple subsets
-            if metric_count > 1 and add_overall_metric
+            if metric_count > 1 and add_overall_metric and (
+                ReportKey.overall_score not in table[ReportKey.subset_name]
+            ):
                table[ReportKey.model_name].append(self.model_name)
                table[ReportKey.dataset_name].append(self.dataset_name)
                table[ReportKey.metric_name].append(metric.name)
                table[ReportKey.category_name].append(('-', ))
-                table[ReportKey.subset_name].append(
+                table[ReportKey.subset_name].append(ReportKey.overall_score)
                table[ReportKey.num].append(metric.num)
                table[ReportKey.score].append(metric.score)
        # NOTE: only flatten metrics if needed, use the first metric by default
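A quick sketch of the widened normalize_score, not part of the diff; it assumes the first branch (visible above only as score = round(score, keep_num)) handles floats:

from evalscope.report.report import normalize_score

print(normalize_score(3))                 # 3.0, ints are now cast to float instead of hitting the warning branch
print(normalize_score(0.123456))          # 0.1235, rounded to keep_num=4 decimal places
print(normalize_score({'acc': 0.98763}))  # {'acc': 0.9876}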
evalscope/run.py CHANGED

@@ -159,7 +159,7 @@ def evaluate_model(task_config: TaskConfig, outputs: OutputsStructure) -> dict:
    gc.collect()
 
    from evalscope.utils.import_utils import check_import
-    if check_import('torch'):
+    if check_import('torch', raise_warning=False):
        import torch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
evalscope/utils/function_utils.py CHANGED

@@ -1,4 +1,6 @@
 import threading
+import time
+from contextlib import contextmanager
 from functools import wraps
 
 
@@ -27,3 +29,42 @@ def thread_safe(func):
        return func(*args, **kwargs)
 
    return wrapper
+
+
+def retry_func(retries=3, sleep_interval=0):
+    """A decorator that retries a function call up to `retries` times if an exception occurs."""
+
+    def decorator(func):
+
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            last_exception = None
+            for attempt in range(retries):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    last_exception = e
+                    if sleep_interval > 0:
+                        time.sleep(sleep_interval)
+            raise last_exception
+
+        return wrapper
+
+    return decorator
+
+
+@contextmanager
+def retry_context(retries=3, sleep_interval=0):
+    """A context manager that retries the code block up to `retries` times if an exception occurs."""
+    last_exception = None
+    for attempt in range(retries):
+        try:
+            yield
+            return  # If no exception, exit successfully
+        except Exception as e:
+            last_exception = e
+            if sleep_interval > 0:
+                time.sleep(sleep_interval)
+            if attempt == retries - 1:  # Last attempt
+                break
+    raise last_exception
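A usage sketch for the new retry decorator, not part of the diff; the flaky function is hypothetical:

import random
from evalscope.utils.function_utils import retry_func

@retry_func(retries=3, sleep_interval=1)
def flaky_request():
    # Hypothetical call that fails roughly half of the time.
    if random.random() < 0.5:
        raise ConnectionError('transient failure')
    return 'ok'

print(flaky_request())  # retried up to 3 times, 1s apart; the last exception is re-raised if all attempts fail

retry_context exposes the same retry loop as a context manager, for code that is awkward to factor into a function.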
evalscope/utils/import_utils.py CHANGED

@@ -7,32 +7,82 @@ from itertools import chain
 from types import ModuleType
 from typing import Any, Optional, Union
 
+from evalscope.constants import IS_BUILD_DOC
 from .logger import get_logger
 
 logger = get_logger()  # pylint: disable=invalid-name
 
 
-def check_import(
-
+def check_import(
+    module_name: Union[str, list[str]],
+    package: Optional[Union[str, list[str]]] = None,
+    raise_warning: bool = True,
+    raise_error: bool = False,
+    feature_name: Optional[str] = 'this feature',
+) -> bool:
+    """Check if a module or list of modules can be imported.
 
    Args:
-        module_name (str): The name of the module to check.
-        package (str, optional): The package to install if the module
-
+        module_name (Union[str, list[str]]): The name(s) of the module(s) to check.
+        package (Union[str, list[str]], optional): The package(s) to install if the module(s) are not found.
+            Defaults to None.
+        raise_error (bool, optional): Whether to raise an error if any module is not found. Defaults to False.
+        raise_warning (bool, optional): Whether to log a warning if any module is not found. Defaults to True.
+        feature_name (str, optional): The feature name that requires the module(s). Used in the warning/error message.
+            Defaults to 'this feature'.
+
+    Returns:
+        bool: True if all modules can be imported, False otherwise.
    """
-
-
-
-
-
-
-
+    # Convert single strings to lists for uniform processing
+    if isinstance(module_name, str):
+        module_names = [module_name]
+    else:
+        module_names = module_name
+
+    if package is None:
+        packages = [None] * len(module_names)
+    elif isinstance(package, str):
+        packages = [package] * len(module_names)
+    else:
+        packages = package
+        # Ensure packages list has same length as module_names
+        if len(packages) < len(module_names):
+            packages.extend([None] * (len(module_names) - len(packages)))
+
+    missing_modules = []
+    missing_packages = []
+
+    for i, mod_name in enumerate(module_names):
+        try:
+            importlib.import_module(mod_name)
+        except ImportError:
+            missing_modules.append(mod_name)
+            if i < len(packages) and packages[i]:
+                missing_packages.append(packages[i])
+
+    if missing_modules:
+        if len(missing_modules) == 1:
+            error_msg = f'`{missing_modules[0]}` not found.'
+        else:
+            error_msg = f'The following modules are not found: {", ".join(f"`{mod}`" for mod in missing_modules)}.'
+
+        if missing_packages:
+            if len(missing_packages) == 1:
+                error_msg += f' Please run `pip install {missing_packages[0]}` to use {feature_name}.'
+            else:
+                unique_packages = list(dict.fromkeys(missing_packages))  # Remove duplicates while preserving order
+                error_msg += f' Please run `pip install {" ".join(unique_packages)}` to use {feature_name}.'
+
+        if raise_warning:
            logger.warning(error_msg)
 
-        if raise_error:
+        if not IS_BUILD_DOC and raise_error:
            raise ImportError(error_msg)
        return False
 
+    return True
+
 
 class _LazyModule(ModuleType):
    """
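A usage sketch for the extended check_import, not part of the diff; the module, package, and feature names below are only examples:

from evalscope.utils.import_utils import check_import

# Single module, warn-only, as run.py and benchmark_util.py now call it for torch:
has_torch = check_import('torch', raise_warning=False)

# Several modules at once, with the pip packages to suggest and the feature that needs them:
check_import(
    ['cv2', 'decord'],
    package=['opencv-python', 'decord'],
    raise_error=True,
    feature_name='video benchmarks',
)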
evalscope/utils/io_utils.py CHANGED

@@ -9,6 +9,7 @@ import re
 import string
 import unicodedata
 import yaml
+from datetime import datetime
 from io import BytesIO
 from PIL import Image
 
@@ -123,6 +124,9 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
    if not isinstance(data_list, list):
        data_list = [data_list]
 
+    # Convert non-serializable types to serializable ones
+    data_list = convert_normal_types(data_list)
+
    if dump_mode == DumpMode.OVERWRITE:
        dump_mode = 'w'
    elif dump_mode == DumpMode.APPEND:
@@ -304,20 +308,22 @@ def PIL_to_base64(image: Image.Image, format: str = 'JPEG', add_header: bool = False) -> str:
    return img_str
 
 
-def bytes_to_base64(bytes_data: bytes, format: str = 'png', add_header: bool = False) -> str:
-    """Convert
+def bytes_to_base64(bytes_data: bytes, *, format: str = 'png', add_header: bool = False, content_type='image') -> str:
+    """Convert bytes to a base64 encoded string.
 
    Args:
        bytes_data (bytes): The bytes to convert.
+        format (str): The format of the image. Default is 'png'.
        add_header (bool): Whether to add the base64 header. Default is False.
+        content_type (str): The type of the data, 'image' or 'audio'. Default is 'image'.
 
    Returns:
        str: Base64 encoded string of the bytes.
    """
-
+    base64_str = base64.b64encode(bytes_data).decode('utf-8')
    if add_header:
-
-        return
+        base64_str = f'data:{content_type}/{format};base64,{base64_str}'
+    return base64_str
 
 
 def base64_to_PIL(base64_str):
@@ -392,11 +398,13 @@ def safe_filename(s: str, max_length: int = 255) -> str:
    return s
 
 
-def
-    """Recursively convert numpy types to native Python types for JSON serialization."""
+def convert_normal_types(obj):
+    """Recursively convert numpy types and datetime objects to native Python types for JSON serialization."""
    import numpy as np
 
-    if isinstance(obj,
+    if isinstance(obj, datetime):
+        return obj.isoformat()
+    elif isinstance(obj, np.bool_):
        return bool(obj)
    elif isinstance(obj, np.integer):
        return int(obj)
@@ -405,10 +413,10 @@ def convert_numpy_types(obj):
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, dict):
-        return {key:
+        return {key: convert_normal_types(value) for key, value in obj.items()}
    elif isinstance(obj, list):
-        return [
+        return [convert_normal_types(item) for item in obj]
    elif isinstance(obj, tuple):
-        return tuple(
+        return tuple(convert_normal_types(item) for item in obj)
    else:
        return obj
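Two quick sketches of the io_utils changes, not part of the diff; values are illustrative:

from datetime import datetime
import numpy as np
from evalscope.utils.io_utils import bytes_to_base64, convert_normal_types

# Audio bytes can now carry an audio data-URL header instead of an image one.
url = bytes_to_base64(b'\x00\x01', format='wav', add_header=True, content_type='audio')
print(url[:22])  # data:audio/wav;base64,

# dump_jsonl_data now routes records through convert_normal_types first,
# so numpy scalars and datetimes become JSON-friendly values.
record = {'count': np.int64(3), 'ts': datetime(2025, 1, 1)}
print(convert_normal_types(record))  # {'count': 3, 'ts': '2025-01-01T00:00:00'}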
evalscope/utils/json_schema.py CHANGED

@@ -4,7 +4,7 @@ from copy import deepcopy
 from dataclasses import is_dataclass
 from datetime import date, datetime, time
 from enum import EnumMeta
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, field_validator, model_validator
 from typing import (
    Any,
    Dict,
@@ -59,6 +59,28 @@ class JSONSchema(BaseModel):
    required: Optional[List[str]] = Field(default=None)
    """Required fields for object parameters."""
 
+    @model_validator(mode='before')
+    def convert_type_before_validation(cls, values):
+        values = deepcopy(values)
+
+        def recursive_convert_type(obj):
+            if isinstance(obj, dict):
+                # Convert 'type' field if it's a string
+                if 'type' in obj and isinstance(obj['type'], str):
+                    try:
+                        obj['type'] = python_type_to_json_type(obj['type'])
+                    except ValueError:
+                        # If conversion fails, leave it as is
+                        pass
+                # Recursively process nested structures
+                for k, v in obj.items():
+                    obj[k] = recursive_convert_type(v)
+            elif isinstance(obj, list):
+                return [recursive_convert_type(item) for item in obj]
+            return obj
+
+        return recursive_convert_type(values)
+
 
 def json_schema(t: Type[Any]) -> JSONSchema:
    """Provide a JSON Schema for the specified type.
@@ -152,6 +174,8 @@ def cls_json_schema(cls: Type[Any]) -> JSONSchema:
 
 
 def python_type_to_json_type(python_type: Optional[str]) -> JSONType:
+    if python_type is not None and python_type in get_args(JSONType):
+        return python_type
    if python_type == 'str':
        return 'string'
    elif python_type == 'int':
@@ -205,4 +229,3 @@ def resolve_schema_references(schema: Dict[str, Any]) -> Dict[str, Any]:
        return obj
 
    return cast(Dict[str, Any], _resolve_refs(schema))
-    return cast(Dict[str, Any], _resolve_refs(schema))
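A small sketch of what the added passthrough in python_type_to_json_type changes, not part of the diff: names that are already valid JSON Schema types are now returned unchanged, while Python type names keep being translated. This is also what lets the new JSONSchema model validator call the function on values that may already be JSON types:

from evalscope.utils.json_schema import python_type_to_json_type

print(python_type_to_json_type('str'))     # 'string' (translated, as before)
print(python_type_to_json_type('string'))  # 'string' (new: already a JSON type, returned as-is)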
evalscope/utils/logger.py CHANGED

@@ -28,6 +28,25 @@ logging.getLogger('datasets').setLevel(logging.WARNING)
 logging.getLogger('httpx').setLevel(logging.WARNING)
 logging.getLogger('modelscope').setLevel(logging.ERROR)
 
+info_set = set()
+warning_set = set()
+
+
+def info_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in info_set:
+        return
+    info_set.add(hash_id)
+    self.info(msg)
+
+
+def warning_once(self, msg, *args, **kwargs):
+    hash_id = kwargs.get('hash_id') or msg
+    if hash_id in warning_set:
+        return
+    warning_set.add(hash_id)
+    self.warning(msg)
+
 
 def get_logger(
    log_file: Optional[str] = None,