evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic (see the registry page for details).
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +10 -45
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/metrics/metric.py +51 -0
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/report/__init__.py +9 -1
- evalscope/report/combinator.py +52 -2
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/multi_choices.py +16 -1
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/METADATA +6 -32
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/RECORD +51 -54
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/blink}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
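
Most of the churn in this release is new vision-language benchmark adapters (BLINK, ChartQA, DocVQA, InfoVQA, OCRBench, OCRBench v2 and their metric helpers), while the bundled tests/ tree is dropped from the wheel. For orientation, here is a minimal sketch of how one of the new datasets might be invoked; it assumes evalscope's TaskConfig/run_task entry point and an OpenAI-compatible model endpoint, and the exact field values should be checked against the 1.1.0 documentation.

# Hedged sketch: running the new OCRBench adapter through evalscope's task API.
# The model name, endpoint, and eval_type value are placeholders/assumptions.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='qwen2.5-vl-7b-instruct',        # placeholder: any OpenAI-compatible VLM
    api_url='http://127.0.0.1:8000/v1',    # assumed locally served endpoint
    api_key='EMPTY',
    eval_type='service',                   # remote API mode (assumption)
    datasets=['ocr_bench'],                # name registered in the hunk below
    limit=20,                              # small smoke-test run
)
run_task(task_cfg=task_cfg)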
evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py

@@ -0,0 +1,101 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Regular Text Recognition', 'Irregular Text Recognition', 'Artistic Text Recognition', 'Handwriting Recognition',
+    'Digit String Recognition', 'Non-Semantic Text Recognition', 'Scene Text-centric VQA', 'Doc-oriented VQA',
+    'Key Information Extraction', 'Handwritten Mathematical Expression Recognition'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ocr_bench',
+        pretty_name='OCRBench',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'OCRBench is a comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models. It comprises five components: Text Recognition, SceneText-Centric VQA, Document-Oriented VQA, Key Information Extraction, and Handwritten Mathematical Expression Recognition. The benchmark includes 1000 question-answer pairs, and all the answers undergo manual verification and correction to ensure a more precise evaluation.',  # noqa: E501
+        dataset_id='evalscope/OCRBench',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template='{question}',
+    )
+)
+class OCRBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = self.prompt_template.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answer'), ensure_ascii=False),  # answers is a list
+            subset_key=record.get('question_type'),
+            metadata={
+                'dataset': record.get('dataset'),
+                'question_type': record.get('question_type'),
+            }
+        )
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        pred = filtered_prediction.lower().strip()
+        gt_ans = json.loads(reference)
+        dataset_name = task_state.metadata['dataset']
+
+        score_value = 0
+        if dataset_name == 'HME100k':
+            if isinstance(gt_ans, list):
+                for j in range(len(gt_ans)):
+                    answer = gt_ans[j].strip().replace('\n', ' ').replace(' ', '')
+                    predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                    if answer in predict:
+                        score_value = 1
+            else:
+                answer = gt_ans.strip().replace('\n', ' ').replace(' ', '')
+                predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                if answer in predict:
+                    score_value = 1
+        else:
+            if isinstance(gt_ans, list):
+                for j in range(len(gt_ans)):
+                    answer = gt_ans[j].lower().strip().replace('\n', ' ')
+                    predict = pred.lower().strip().replace('\n', ' ')
+                    if answer in predict:
+                        score_value = 1
+            else:
+                answer = gt_ans.lower().strip().replace('\n', ' ')
+                predict = pred.lower().strip().replace('\n', ' ')
+                if answer in predict:
+                    score_value = 1
+        score.value = {'acc': score_value}
+        return score
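
The match_score above is a plain containment check: the prediction is lowercased and stripped, each reference answer is normalized similarly (with all whitespace removed for HME100k formula answers), and the sample scores 1 on the 'acc' metric if any reference answer occurs as a substring of the prediction. A slightly simplified standalone restatement of that rule follows; the helper is written for this note and is not part of evalscope, and it lowercases the gold answer in both branches, whereas the adapter leaves HME100k answers in their original case.

# Standalone illustration of the containment rule used by OCRBenchAdapter.match_score.
# Hypothetical helper written for this note, not an evalscope API.
from typing import List


def ocrbench_contains(prediction: str, answers: List[str], strip_all_spaces: bool = False) -> int:
    """Return 1 if any ground-truth answer appears inside the prediction."""
    pred = prediction.lower().strip().replace('\n', ' ')
    if strip_all_spaces:  # HME100k (handwritten math) ignores spacing entirely
        pred = pred.replace(' ', '')
    for ans in answers:
        gold = ans.lower().strip().replace('\n', ' ')
        if strip_all_spaces:
            gold = gold.replace(' ', '')
        if gold in pred:
            return 1
    return 0


print(ocrbench_contains('The text reads: HELLO world', ['hello world']))                  # -> 1
print(ocrbench_contains('x^2 + y ^ 2 = 1', ['x^2+y^2=1'], strip_all_spaces=True))         # -> 1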
evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py

@@ -0,0 +1,87 @@
+# flake8: noqa
+import ast
+import re
+
+from .vqa_metric import vqa_evaluation
+
+
+def calculate_iou(box1, box2):
+    try:
+        box1 = [int(coordinate) for coordinate in box1]
+        box2 = [int(coordinate) for coordinate in box2]
+    except:
+        return 0
+
+    x1_inter = max(box1[0], box2[0])
+    y1_inter = max(box1[1], box2[1])
+    x2_inter = min(box1[2], box2[2])
+    y2_inter = min(box1[3], box2[3])
+
+    inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)
+
+    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+    union_area = box1_area + box2_area - inter_area
+
+    iou = inter_area / union_area if union_area != 0 else 0
+
+    return iou
+
+
+def vqa_with_position_evaluation(predict, img_metas):
+    score_content, score_bbox = 0.0, 0.0
+    if 'answer' in predict.keys():
+        score_content = vqa_evaluation(predict['answer'], img_metas['answers'])
+    if 'bbox' in predict.keys():
+        gt_bbox = img_metas['bbox']
+        try:
+            predict_bbox_list = ast.literal_eval(predict['bbox'])
+            score_bbox = calculate_iou(predict_bbox_list, gt_bbox)
+        except:
+            score_bbox = 0
+    return 0.5 * score_content + 0.5 * score_bbox
+
+
+def extract_coordinates(text):
+    # Regex pattern to match coordinates in either (x1, y1, x2, y2) or [x1, y1, x2, y2] format
+
+    pattern = r'[\(\[]\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*[\)\]]'
+
+    matches = list(re.finditer(pattern, text))
+    coords_list = []
+    coords_set = set()
+    for match in matches:
+        x1, y1, x2, y2 = map(int, match.groups())
+
+        if all(0 <= n <= 1000 for n in [x1, y1, x2, y2]):
+            coords = (x1, y1, x2, y2)
+
+            if coords in coords_set:
+                coords_list = [c for c in coords_list if c != coords]
+
+            coords_list.append(coords)
+            coords_set.add(coords)
+    if coords_list:
+        last_coords = coords_list[-1]
+        return list(last_coords)
+    else:
+        return None
+
+
+if __name__ == '__main__':
+    print('Example for Text Grounding task.')
+    box1 = [50, 50, 150, 150]
+    box2 = [60, 60, 140, 140]
+    iou_score = calculate_iou(box1, box2)
+    print(f'IoU score: {iou_score}')
+
+    print('Example for VQA with position task.')
+    pred = {'content': 'The content is Hello Buddies', 'bbox': box1}
+    gt = {'content': 'Hello Buddies', 'bbox': box2}
+
+    vqa_score = vqa_evaluation(pred['content'], gt['content'])
+    iou_score = calculate_iou(pred['bbox'], gt['bbox'])
+
+    print(f'VQA score: {vqa_score}')
+    print(f'IoU score: {iou_score}')
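
For the example boxes in the __main__ block the arithmetic works out as follows: box2 lies entirely inside box1, so the intersection is 80 * 80 = 6400, the union is 100 * 100 + 80 * 80 - 6400 = 10000, and the IoU is 0.64. A quick sanity check of calculate_iou on those boxes; the import path is inferred from the file layout shown in this diff.

# Sanity check of calculate_iou on the boxes from the __main__ example.
# The module path below is assumed from the wheel's file layout.
from evalscope.benchmarks.ocr_bench_v2.IoUscore_metric import calculate_iou

box1 = [50, 50, 150, 150]   # area 100 * 100 = 10000
box2 = [60, 60, 140, 140]   # area 80 * 80 = 6400, fully inside box1

iou = calculate_iou(box1, box2)
assert abs(iou - 0.64) < 1e-6   # intersection 6400 / union 10000
print(iou)                      # 0.64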