evalscope 1.0.2__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



Files changed (87)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  2. evalscope/app/ui/multi_model.py +6 -1
  3. evalscope/app/ui/single_model.py +8 -2
  4. evalscope/app/utils/data_utils.py +3 -2
  5. evalscope/app/utils/visualization.py +2 -2
  6. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  7. evalscope/benchmarks/bfcl/bfcl_adapter.py +10 -45
  8. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  9. evalscope/benchmarks/chartqa/__init__.py +0 -0
  10. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  11. evalscope/benchmarks/chartqa/utils.py +38 -0
  12. evalscope/benchmarks/docvqa/__init__.py +0 -0
  13. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  14. evalscope/benchmarks/general_arena/utils.py +2 -1
  15. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  16. evalscope/benchmarks/infovqa/__init__.py +0 -0
  17. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  18. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  19. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  20. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  21. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  22. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  23. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  24. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  25. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  26. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  27. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  28. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  29. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  30. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  31. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  32. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  33. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  34. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  35. evalscope/metrics/metric.py +51 -0
  36. evalscope/metrics/metrics.py +16 -0
  37. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  38. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  39. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  40. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  41. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  42. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  43. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  44. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  45. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  46. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  47. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  48. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  49. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  50. evalscope/report/__init__.py +9 -1
  51. evalscope/report/combinator.py +52 -2
  52. evalscope/utils/json_schema.py +8 -6
  53. evalscope/utils/multi_choices.py +16 -1
  54. evalscope/version.py +2 -2
  55. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/METADATA +6 -32
  56. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/RECORD +51 -54
  57. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  58. tests/__init__.py +0 -1
  59. tests/benchmark/__init__.py +0 -1
  60. tests/benchmark/test_eval.py +0 -429
  61. tests/benchmark/test_image_edit.py +0 -65
  62. tests/benchmark/test_sandbox.py +0 -81
  63. tests/benchmark/test_t2i.py +0 -142
  64. tests/benchmark/test_vlm.py +0 -137
  65. tests/cli/__init__.py +0 -1
  66. tests/cli/test_all.py +0 -269
  67. tests/cli/test_collection.py +0 -99
  68. tests/cli/test_custom.py +0 -268
  69. tests/cli/test_reasoning.py +0 -81
  70. tests/common.py +0 -73
  71. tests/perf/__init__.py +0 -1
  72. tests/perf/test_perf.py +0 -206
  73. tests/rag/test_clip_benchmark.py +0 -87
  74. tests/rag/test_mteb.py +0 -213
  75. tests/rag/test_ragas.py +0 -128
  76. tests/swift/__init__.py +0 -1
  77. tests/swift/test_run_swift_eval.py +0 -146
  78. tests/swift/test_run_swift_vlm_eval.py +0 -128
  79. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  80. tests/test_run_all.py +0 -12
  81. tests/utils.py +0 -13
  82. tests/vlm/__init__.py +0 -1
  83. tests/vlm/test_vlmeval.py +0 -102
  84. {tests/rag → evalscope/benchmarks/blink}/__init__.py +0 -0
  85. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  86. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  87. {evalscope-1.0.2.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
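The new adapters below register themselves under the benchmark names declared in their BenchmarkMeta (for example 'ocr_bench' in the first hunk), so they can be selected like any other dataset. A minimal invocation sketch, assuming the TaskConfig/run_task entry point documented for evalscope; the model id is only a placeholder:

from evalscope import TaskConfig, run_task

# Select the newly added OCRBench benchmark by its registered name.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-VL-7B-Instruct',  # placeholder; any multimodal model id
    datasets=['ocr_bench'],
)
run_task(task_cfg=task_cfg)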
evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py (new file)
@@ -0,0 +1,101 @@
+ import json
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+ from evalscope.api.metric.scorer import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'Regular Text Recognition', 'Irregular Text Recognition', 'Artistic Text Recognition', 'Handwriting Recognition',
+     'Digit String Recognition', 'Non-Semantic Text Recognition', 'Scene Text-centric VQA', 'Doc-oriented VQA',
+     'Key Information Extraction', 'Handwritten Mathematical Expression Recognition'
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='ocr_bench',
+         pretty_name='OCRBench',
+         tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+         description=
+         'OCRBench is a comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models. It comprises five components: Text Recognition, SceneText-Centric VQA, Document-Oriented VQA, Key Information Extraction, and Handwritten Mathematical Expression Recognition. The benchmark includes 1000 question-answer pairs, and all the answers undergo manual verification and correction to ensure a more precise evaluation.',  # noqa: E501
+         dataset_id='evalscope/OCRBench',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='test',
+         prompt_template='{question}',
+     )
+ )
+ class OCRBenchAdapter(VisionLanguageAdapter):
+
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+         self.add_aggregation_name = False
+         self.reformat_subset = True
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+         input_text = self.prompt_template.format(question=record['question'])
+         content_list: List[Content] = [ContentText(text=input_text)]
+         image = record.get('image')
+         if image:
+             image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+             content_list.append(ContentImage(image=image_base64))
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=json.dumps(record.get('answer'), ensure_ascii=False),  # answers is a list
+             subset_key=record.get('question_type'),
+             metadata={
+                 'dataset': record.get('dataset'),
+                 'question_type': record.get('question_type'),
+             }
+         )
+
+     def match_score(
+         self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+     ) -> Score:
+
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+
+         pred = filtered_prediction.lower().strip()
+         gt_ans = json.loads(reference)
+         dataset_name = task_state.metadata['dataset']
+
+         score_value = 0
+         if dataset_name == 'HME100k':
+             if isinstance(gt_ans, list):
+                 for j in range(len(gt_ans)):
+                     answer = gt_ans[j].strip().replace('\n', ' ').replace(' ', '')
+                     predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                     if answer in predict:
+                         score_value = 1
+             else:
+                 answer = gt_ans.strip().replace('\n', ' ').replace(' ', '')
+                 predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                 if answer in predict:
+                     score_value = 1
+         else:
+             if isinstance(gt_ans, list):
+                 for j in range(len(gt_ans)):
+                     answer = gt_ans[j].lower().strip().replace('\n', ' ')
+                     predict = pred.lower().strip().replace('\n', ' ')
+                     if answer in predict:
+                         score_value = 1
+             else:
+                 answer = gt_ans.lower().strip().replace('\n', ' ')
+                 predict = pred.lower().strip().replace('\n', ' ')
+                 if answer in predict:
+                     score_value = 1
+         score.value = {'acc': score_value}
+         return score
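The scoring rule in match_score above reduces to a substring check: the sample scores 1 on 'acc' if any ground-truth answer appears inside the prediction, with the prediction lower-cased and, for the HME100k handwritten-math subset, all whitespace removed on both sides before comparison. A minimal framework-free restatement of that rule; ocr_bench_acc is a hypothetical helper written for illustration, not part of evalscope's API:

import json


def ocr_bench_acc(prediction: str, reference_json: str, dataset: str) -> int:
    """Return 1 if any ground-truth answer is contained in the prediction."""
    answers = json.loads(reference_json)  # the adapter stores the answers as a JSON list
    if not isinstance(answers, list):
        answers = [answers]
    pred = prediction.lower().strip()
    for ans in answers:
        if dataset == 'HME100k':
            # Handwritten math: remove all whitespace on both sides before matching
            target = ans.strip().replace('\n', ' ').replace(' ', '')
            candidate = pred.replace('\n', ' ').replace(' ', '')
        else:
            target = ans.lower().strip().replace('\n', ' ')
            candidate = pred.replace('\n', ' ')
        if target in candidate:
            return 1
    return 0


print(ocr_bench_acc('the answer is 42', '["42"]', 'scene_text_vqa'))  # -> 1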
evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py (new file)
@@ -0,0 +1,87 @@
+ # flake8: noqa
+ import ast
+ import re
+
+ from .vqa_metric import vqa_evaluation
+
+
+ def calculate_iou(box1, box2):
+     try:
+         box1 = [int(coordinate) for coordinate in box1]
+         box2 = [int(coordinate) for coordinate in box2]
+     except:
+         return 0
+
+     x1_inter = max(box1[0], box2[0])
+     y1_inter = max(box1[1], box2[1])
+     x2_inter = min(box1[2], box2[2])
+     y2_inter = min(box1[3], box2[3])
+
+     inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)
+
+     box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+     box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+     union_area = box1_area + box2_area - inter_area
+
+     iou = inter_area / union_area if union_area != 0 else 0
+
+     return iou
+
+
+ def vqa_with_position_evaluation(predict, img_metas):
+     score_content, score_bbox = 0.0, 0.0
+     if 'answer' in predict.keys():
+         score_content = vqa_evaluation(predict['answer'], img_metas['answers'])
+     if 'bbox' in predict.keys():
+         gt_bbox = img_metas['bbox']
+         try:
+             predict_bbox_list = ast.literal_eval(predict['bbox'])
+             score_bbox = calculate_iou(predict_bbox_list, gt_bbox)
+         except:
+             score_bbox = 0
+     return 0.5 * score_content + 0.5 * score_bbox
+
+
+ def extract_coordinates(text):
+     # Regex pattern to match coordinates in either (x1, y1, x2, y2) or [x1, y1, x2, y2] format
+
+     pattern = r'[\(\[]\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*[\)\]]'
+
+     matches = list(re.finditer(pattern, text))
+     coords_list = []
+     coords_set = set()
+     for match in matches:
+         x1, y1, x2, y2 = map(int, match.groups())
+
+         if all(0 <= n <= 1000 for n in [x1, y1, x2, y2]):
+             coords = (x1, y1, x2, y2)
+
+             if coords in coords_set:
+                 coords_list = [c for c in coords_list if c != coords]
+
+             coords_list.append(coords)
+             coords_set.add(coords)
+     if coords_list:
+         last_coords = coords_list[-1]
+         return list(last_coords)
+     else:
+         return None
+
+
+ if __name__ == '__main__':
+     print('Example for Text Grounding task.')
+     box1 = [50, 50, 150, 150]
+     box2 = [60, 60, 140, 140]
+     iou_score = calculate_iou(box1, box2)
+     print(f'IoU score: {iou_score}')
+
+     print('Example for VQA with position task.')
+     pred = {'content': 'The content is Hello Buddies', 'bbox': box1}
+     gt = {'content': 'Hello Buddies', 'bbox': box2}
+
+     vqa_score = vqa_evaluation(pred['content'], gt['content'])
+     iou_score = calculate_iou(pred['bbox'], gt['bbox'])
+
+     print(f'VQA score: {vqa_score}')
+     print(f'IoU score: {iou_score}')
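For the example boxes in the __main__ block, the intersection is the 80 x 80 square [60, 60, 140, 140] (area 6400) and the union is 100*100 + 80*80 - 6400 = 10000, so calculate_iou returns 0.64. A short usage sketch of the two helpers, assuming an installed evalscope 1.1.0 so that the module path shown in the file list above resolves:

from evalscope.benchmarks.ocr_bench_v2.IoUscore_metric import calculate_iou, extract_coordinates

# Standard intersection-over-union for [x1, y1, x2, y2] boxes.
print(calculate_iou([50, 50, 150, 150], [60, 60, 140, 140]))  # 0.64

# extract_coordinates pulls (x1, y1, x2, y2) or [x1, y1, x2, y2] spans out of model text,
# keeps only values in [0, 1000], and returns the last distinct box mentioned.
text = 'The region is at (10, 20, 110, 220). Confirmed: [10, 20, 110, 220].'
print(extract_coordinates(text))  # [10, 20, 110, 220]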