evalscope 1.0.1__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (155)
  1. evalscope/api/benchmark/adapters/default_data_adapter.py +18 -4
  2. evalscope/api/benchmark/adapters/multi_choice_adapter.py +5 -2
  3. evalscope/api/benchmark/adapters/text2image_adapter.py +5 -4
  4. evalscope/api/benchmark/adapters/vision_language_adapter.py +3 -1
  5. evalscope/api/benchmark/benchmark.py +27 -2
  6. evalscope/api/benchmark/meta.py +3 -0
  7. evalscope/api/evaluator/evaluator.py +5 -0
  8. evalscope/api/evaluator/state.py +5 -0
  9. evalscope/api/messages/chat_message.py +6 -1
  10. evalscope/api/mixin/__init__.py +1 -0
  11. evalscope/api/mixin/llm_judge_mixin.py +2 -0
  12. evalscope/api/mixin/sandbox_mixin.py +204 -0
  13. evalscope/api/model/generate_config.py +0 -3
  14. evalscope/api/model/model.py +1 -1
  15. evalscope/api/tool/tool_info.py +1 -1
  16. evalscope/app/ui/multi_model.py +6 -1
  17. evalscope/app/ui/single_model.py +8 -2
  18. evalscope/app/utils/data_utils.py +3 -2
  19. evalscope/app/utils/visualization.py +2 -2
  20. evalscope/arguments.py +6 -0
  21. evalscope/benchmarks/ai2d/ai2d_adapter.py +54 -0
  22. evalscope/benchmarks/amc/__init__.py +0 -0
  23. evalscope/benchmarks/amc/amc_adapter.py +46 -0
  24. evalscope/benchmarks/bbh/bbh_adapter.py +43 -17
  25. evalscope/benchmarks/bfcl/bfcl_adapter.py +106 -2
  26. evalscope/benchmarks/bfcl/generation.py +7 -7
  27. evalscope/benchmarks/blink/__init__.py +0 -0
  28. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  29. evalscope/benchmarks/chartqa/__init__.py +0 -0
  30. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  31. evalscope/benchmarks/chartqa/utils.py +38 -0
  32. evalscope/benchmarks/docvqa/__init__.py +0 -0
  33. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  34. evalscope/benchmarks/drop/drop_adapter.py +1 -1
  35. evalscope/benchmarks/general_arena/utils.py +2 -1
  36. evalscope/benchmarks/healthbench/__init__.py +0 -0
  37. evalscope/benchmarks/healthbench/healthbench_adapter.py +282 -0
  38. evalscope/benchmarks/healthbench/utils.py +102 -0
  39. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  40. evalscope/benchmarks/humaneval/humaneval_adapter.py +19 -35
  41. evalscope/benchmarks/humaneval/utils.py +235 -0
  42. evalscope/benchmarks/infovqa/__init__.py +0 -0
  43. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  44. evalscope/benchmarks/live_code_bench/evaluate_utils.py +13 -6
  45. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +60 -37
  46. evalscope/benchmarks/live_code_bench/sandbox_evaluate_utils.py +220 -0
  47. evalscope/benchmarks/math_500/math_500_adapter.py +0 -1
  48. evalscope/benchmarks/minerva_math/__init__.py +0 -0
  49. evalscope/benchmarks/minerva_math/minerva_math_adapter.py +48 -0
  50. evalscope/benchmarks/mm_bench/__init__.py +0 -0
  51. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +99 -0
  52. evalscope/benchmarks/mm_star/__init__.py +0 -0
  53. evalscope/benchmarks/mm_star/mm_star_adapter.py +73 -0
  54. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  55. evalscope/benchmarks/mmmu_pro/mmmu_pro_adapter.py +4 -9
  56. evalscope/benchmarks/multi_if/__init__.py +0 -0
  57. evalscope/benchmarks/multi_if/ifeval.py +3354 -0
  58. evalscope/benchmarks/multi_if/metrics.py +120 -0
  59. evalscope/benchmarks/multi_if/multi_if_adapter.py +161 -0
  60. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -4
  61. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  62. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  63. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  64. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  65. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  66. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  67. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  68. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  69. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  74. evalscope/benchmarks/ocr_bench_v2/utils.py +432 -0
  75. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  76. evalscope/benchmarks/olympiad_bench/__init__.py +0 -0
  77. evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py +163 -0
  78. evalscope/benchmarks/olympiad_bench/utils.py +565 -0
  79. evalscope/benchmarks/omni_bench/__init__.py +0 -0
  80. evalscope/benchmarks/omni_bench/omni_bench_adapter.py +86 -0
  81. evalscope/benchmarks/real_world_qa/__init__.py +0 -0
  82. evalscope/benchmarks/real_world_qa/real_world_qa_adapter.py +64 -0
  83. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +6 -1
  84. evalscope/config.py +24 -1
  85. evalscope/constants.py +3 -0
  86. evalscope/evaluator/evaluator.py +25 -7
  87. evalscope/metrics/metric.py +78 -2
  88. evalscope/metrics/metrics.py +16 -0
  89. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  90. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  91. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  92. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  93. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  94. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  95. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  96. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  97. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  98. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  99. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  100. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  101. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  102. evalscope/models/model_apis.py +10 -8
  103. evalscope/models/utils/openai.py +1 -2
  104. evalscope/perf/arguments.py +2 -0
  105. evalscope/perf/plugin/api/base.py +2 -2
  106. evalscope/perf/plugin/api/default_api.py +7 -7
  107. evalscope/perf/plugin/api/openai_api.py +83 -19
  108. evalscope/perf/plugin/datasets/flickr8k.py +2 -2
  109. evalscope/perf/plugin/datasets/kontext_bench.py +2 -2
  110. evalscope/perf/plugin/datasets/random_vl_dataset.py +2 -2
  111. evalscope/perf/utils/benchmark_util.py +1 -2
  112. evalscope/report/__init__.py +9 -1
  113. evalscope/report/combinator.py +45 -20
  114. evalscope/report/report.py +8 -4
  115. evalscope/run.py +1 -1
  116. evalscope/utils/function_utils.py +41 -0
  117. evalscope/utils/import_utils.py +63 -13
  118. evalscope/utils/io_utils.py +19 -11
  119. evalscope/utils/json_schema.py +25 -2
  120. evalscope/utils/logger.py +19 -0
  121. evalscope/utils/model_utils.py +1 -1
  122. evalscope/utils/multi_choices.py +16 -1
  123. evalscope/version.py +2 -2
  124. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/METADATA +10 -40
  125. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/RECORD +120 -95
  126. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/top_level.txt +0 -1
  127. tests/__init__.py +0 -1
  128. tests/benchmark/__init__.py +0 -1
  129. tests/benchmark/test_eval.py +0 -385
  130. tests/benchmark/test_image_edit.py +0 -65
  131. tests/benchmark/test_t2i.py +0 -142
  132. tests/benchmark/test_vlm.py +0 -80
  133. tests/cli/__init__.py +0 -1
  134. tests/cli/test_all.py +0 -269
  135. tests/cli/test_collection.py +0 -99
  136. tests/cli/test_custom.py +0 -268
  137. tests/cli/test_reasoning.py +0 -81
  138. tests/common.py +0 -73
  139. tests/perf/__init__.py +0 -1
  140. tests/perf/test_perf.py +0 -178
  141. tests/rag/test_clip_benchmark.py +0 -87
  142. tests/rag/test_mteb.py +0 -213
  143. tests/rag/test_ragas.py +0 -128
  144. tests/swift/__init__.py +0 -1
  145. tests/swift/test_run_swift_eval.py +0 -146
  146. tests/swift/test_run_swift_vlm_eval.py +0 -128
  147. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  148. tests/test_run_all.py +0 -12
  149. tests/utils.py +0 -13
  150. tests/vlm/__init__.py +0 -1
  151. tests/vlm/test_vlmeval.py +0 -102
  152. {tests/rag → evalscope/benchmarks/ai2d}/__init__.py +0 -0
  153. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/LICENSE +0 -0
  154. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/WHEEL +0 -0
  155. {evalscope-1.0.1.dist-info → evalscope-1.1.0.dist-info}/entry_points.txt +0 -0
evalscope/benchmarks/olympiad_bench/olympiad_bench_adapter.py
@@ -0,0 +1,163 @@
+ from typing import Any, Dict, List
+
+ from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+ from evalscope.api.dataset import Sample
+ from evalscope.api.evaluator.state import TaskState
+ from evalscope.api.messages.chat_message import ChatMessageUser
+ from evalscope.api.messages.content import Content, ContentImage, ContentText
+ from evalscope.api.metric.scorer import Score
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.io_utils import bytes_to_base64
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBSET_LIST = [
+     'OE_MM_maths_en_COMP',
+     'OE_MM_maths_zh_CEE',
+     'OE_MM_maths_zh_COMP',
+     'OE_MM_physics_en_COMP',
+     'OE_MM_physics_zh_CEE',
+     'OE_TO_maths_en_COMP',
+     'OE_TO_maths_zh_CEE',
+     'OE_TO_maths_zh_COMP',
+     'OE_TO_physics_en_COMP',
+     'OE_TO_physics_zh_CEE',
+     'TP_MM_maths_en_COMP',
+     'TP_MM_maths_zh_CEE',
+     'TP_MM_maths_zh_COMP',
+     'TP_MM_physics_en_COMP',
+     'TP_TO_maths_en_COMP',
+     'TP_TO_maths_zh_CEE',
+     'TP_TO_maths_zh_COMP',
+     'TP_TO_physics_en_COMP',
+ ]
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='olympiad_bench',
+         pretty_name='OlympiadBench',
+         tags=[Tags.MATH, Tags.REASONING],
+         description='OlympiadBench is an Olympiad-level bilingual multimodal '
+         'scientific benchmark, featuring 8,476 problems from '
+         'Olympiad-level mathematics and physics competitions, '
+         'including the Chinese college entrance exam. '
+         'In the subsets: `OE` stands for `Open-Ended`, '
+         '`TP` stands for `Theorem Proving`, '
+         '`MM` stands for `Multimodal`, '
+         '`TO` stands for `Text-Only`, '
+         '`CEE` stands for `Chinese Entrance Exam`, '
+         '`COMP` stands for `Comprehensive`. '
+         '**Note: The `TP` subsets can\'t be evaluated with auto-judge for now**.',
+         dataset_id='AI-ModelScope/OlympiadBench',
+         subset_list=SUBSET_LIST,
+         metric_list=['acc'],
+         eval_split='train',
+         prompt_template='{question}\nPlease reason step by step, and put your final answer within \\boxed{{}}.',
+     )
+ )
+ class OlympiadBenchAdapter(VisionLanguageAdapter):
+
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+
+     def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+         """Generate prompt for a single item."""
+         from .utils import OlympiadBenchPrompter
+
+         question = record.get('question', '')
+         language = record.get('language', 'English')
+         subject = record.get('subject', 'Math')
+         question_type = record.get('question_type', '')
+         answer_type = record.get('answer_type', '')
+         is_multiple_answer = record.get('is_multiple_answer', False)
+         unit = record.get('unit', '')
+         # Generate prompt
+         prompt = OlympiadBenchPrompter().make_prompt(
+             problem=question,
+             language=language,
+             subject=subject,
+             question_type=question_type,
+             answer_type=answer_type,
+             is_multiple_answer=is_multiple_answer,
+             unit=unit,
+         )
+         # Construct content list
+         content_list: List[Content] = []
+         # Add images if available
+         for i in range(9):
+             image = record.get(f'image_{i+1}')
+             if image:
+                 image_base64 = bytes_to_base64(image['bytes'], format='jpg', add_header=True)
+                 content_list.append(ContentImage(image=image_base64))
+                 prompt = prompt.replace(f'<image_{i+1}>', f'[image_{i+1}]')  # replace html tag
+         # Add text content
+         content_list.insert(0, ContentText(text=prompt))
+
+         final_answer = record.get('final_answer', [])
+         return Sample(
+             input=[ChatMessageUser(content=content_list)],
+             target=','.join(final_answer) if final_answer else '',
+             metadata={
+                 'id': record.get('id', ''),
+                 'subfield': record.get('subfield', ''),
+                 'context': record.get('context', ''),
+                 'solution': record.get('solution', []),
+                 'final_answer': record.get('final_answer', []),
+                 'is_multiple_answer': is_multiple_answer,
+                 'unit': unit,
+                 'answer_type': answer_type,
+                 'question_type': question_type,
+                 'language': language,
+                 'subject': subject,
+                 'error': record.get('error', None),
+             },
+         )
+
+     def extract_answer(self, prediction: str, task_state: TaskState):
+         import re
+
+         if task_state.metadata['language'] == 'Chinese':
+             matches = re.findall('所以最终答案是(.*)', prediction)
+         else:
+             matches = re.findall('So the final answer is (.*)', prediction)
+
+         # If found matches, take the last one, otherwise return the whole text
+         if matches:
+             return matches[-1].strip()
+         return prediction
+
+     def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+         from .utils import MathJudger
+
+         judger = MathJudger()
+         score = Score(
+             extracted_prediction=filtered_prediction,
+             prediction=original_prediction,
+         )
+         question = task_state.metadata
+         model_answer = filtered_prediction
+         # Get precision/error threshold from reference if available
+         answer_type = question['answer_type']
+         try:
+             if 'Tuple' in answer_type:  # the auto-judgeable data currently contains no need_human_evaluate cases
+                 judge_result = judger.judge(model_answer, question['final_answer'][0])
+             else:
+                 if question['error']:
+                     if ',' in question['error']:
+                         precisions = question['error'].split(',')
+                         precisions = [float(p) if p else 1e-8 for p in precisions]
+                         judge_result = judger.judge(model_answer, question['final_answer'][0], precisions)
+                     else:
+                         precision = float(question['error'])
+                         judge_result = judger.judge(model_answer, question['final_answer'][0], precision)
+                 else:
+                     judge_result = judger.judge(model_answer, question['final_answer'][0])
+         except Exception as e:
+             logger.warning(f'Error in judging answer: {e}')
+             judge_result = False
+
+         score.value = {'acc': float(judge_result)}
+         return score
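
For reference, a minimal smoke test of the adapter added above might look like the sketch below. It assumes the `TaskConfig` and `run_task` entry points in `evalscope/config.py` and `evalscope/run.py` (both touched in this release) keep their 1.0.x behaviour; the model id, subset choice, and `limit` value are illustrative placeholders rather than part of this diff.

# Sketch only (assumed API): run the olympiad_bench benchmark registered above.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task = TaskConfig(
    model='qwen2.5-vl-7b-instruct',  # placeholder model id
    datasets=['olympiad_bench'],  # the name passed to @register_benchmark
    dataset_args={'olympiad_bench': {
        'subset_list': ['OE_TO_maths_en_COMP'],  # a text-only subset; TP subsets lack auto-judge
    }},
    limit=10,  # small sample for a quick check
)
run_task(task)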