evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py
@@ -0,0 +1,481 @@
+ # flake8: noqa
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+ # encoding=utf8
+ # File: E2E_iou_1_1.py
+ # Version: 1.1
+ # Version info: changes for Python 3
+ # Date: 2019-12-29
+ # Description: Evaluation script that computes End to End Recognition. For Text Localization the Intersection over Union criterion is used.
+ # Average Precision is also calculated when the 'CONFIDENCES' parameter is True
+ # There are 2 modes to determine if a detection is correct or not:
+ # with Word Spotting: the detected word must coincide (ignoring case) with a filtered Ground Truth containing only dictionary words (see include_in_dictionary and include_in_dictionary_transcription functions)
+ # without Word Spotting: words must be equal excluding a set of special characters
+
+ import importlib
+ from collections import namedtuple
+
+ from . import rrc_evaluation_funcs_1_1 as rrc_evaluation_funcs
+
+
+ def evaluation_imports():
+     """
+     evaluation_imports: Dictionary ( key = module name , value = alias ) with python modules used in the evaluation.
+     """
+     return {'Polygon': 'plg', 'numpy': 'np'}
+
+
+ def default_evaluation_params():
+     """
+     default_evaluation_params: Default parameters to use for the validation and evaluation.
+     """
+     return {
+         'IOU_CONSTRAINT': 0.5,
+         'AREA_PRECISION_CONSTRAINT': 0.5,
+         'WORD_SPOTTING': False,
+         'MIN_LENGTH_CARE_WORD': 3,
+         'GT_SAMPLE_NAME_2_ID': 'gt_img_([0-9]+).txt',
+         'DET_SAMPLE_NAME_2_ID': 'res_img_([0-9]+).txt',
+         'LTRB': False,  # LTRB: 2 points (left,top,right,bottom) or 4 points (x1,y1,x2,y2,x3,y3,x4,y4)
+         'CRLF': False,  # Lines are delimited by Windows CRLF format
+         'CONFIDENCES': False,  # Detections must include confidence value. AP will be calculated
+         'SPECIAL_CHARACTERS': "!?.:,*\"()·[]/'",
+         'ONLY_REMOVE_FIRST_LAST_CHARACTER': True,
+     }
+
+
+ def validate_data(gtFilePath, submFilePath, evaluationParams):
+     """
+     Method validate_data: validates that all files in the results folder are correct (have the correct name and contents).
+     Also validates that there are no missing files in the folder.
+     If an error is detected, the method raises it.
+     """
+     gt = rrc_evaluation_funcs.load_zip_file(gtFilePath, evaluationParams['GT_SAMPLE_NAME_2_ID'])
+
+     subm = rrc_evaluation_funcs.load_zip_file(submFilePath, evaluationParams['DET_SAMPLE_NAME_2_ID'], True)
+
+     # Validate format of GroundTruth
+     for k in gt:
+         rrc_evaluation_funcs.validate_lines_in_file(k, gt[k], evaluationParams['CRLF'], evaluationParams['LTRB'], True)
+
+     # Validate format of results
+     for k in subm:
+         if (k in gt) == False:
+             raise Exception('The sample %s not present in GT' % k)
+
+         rrc_evaluation_funcs.validate_lines_in_file(
+             k, subm[k], evaluationParams['CRLF'], evaluationParams['LTRB'], True, evaluationParams['CONFIDENCES']
+         )
+
+
+ def evaluate_method(gtFilePath, submFilePath, evaluationParams):
+     """
+     Method evaluate_method: evaluates the method and returns the results
+     Results. Dictionary with the following values:
+     - method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 }
+     - samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 } }
+     """
+     for module, alias in evaluation_imports().items():
+         globals()[alias] = importlib.import_module(module)
+
+     def polygon_from_points(points, correctOffset=False):
+         """
+         Returns a Polygon object to use with the Polygon2 class from a list of 8 points: x1,y1,x2,y2,x3,y3,x4,y4
+         """
+
+         if correctOffset:  # this will subtract 1 from the coordinates that correspond to the xmax and ymax
+             points[2] -= 1
+             points[4] -= 1
+             points[5] -= 1
+             points[7] -= 1
+
+         resBoxes = np.empty([1, 8], dtype='int32')
+         resBoxes[0, 0] = int(points[0])
+         resBoxes[0, 4] = int(points[1])
+         resBoxes[0, 1] = int(points[2])
+         resBoxes[0, 5] = int(points[3])
+         resBoxes[0, 2] = int(points[4])
+         resBoxes[0, 6] = int(points[5])
+         resBoxes[0, 3] = int(points[6])
+         resBoxes[0, 7] = int(points[7])
+         pointMat = resBoxes[0].reshape([2, 4]).T
+         return plg.Polygon(pointMat)
+
+     def rectangle_to_polygon(rect):
+         resBoxes = np.empty([1, 8], dtype='int32')
+         resBoxes[0, 0] = int(rect.xmin)
+         resBoxes[0, 4] = int(rect.ymax)
+         resBoxes[0, 1] = int(rect.xmin)
+         resBoxes[0, 5] = int(rect.ymin)
+         resBoxes[0, 2] = int(rect.xmax)
+         resBoxes[0, 6] = int(rect.ymin)
+         resBoxes[0, 3] = int(rect.xmax)
+         resBoxes[0, 7] = int(rect.ymax)
+
+         pointMat = resBoxes[0].reshape([2, 4]).T
+
+         return plg.Polygon(pointMat)
+
+     def rectangle_to_points(rect):
+         points = [
+             int(rect.xmin),
+             int(rect.ymax),
+             int(rect.xmax),
+             int(rect.ymax),
+             int(rect.xmax),
+             int(rect.ymin),
+             int(rect.xmin),
+             int(rect.ymin)
+         ]
+         return points
+
+     def get_union(pD, pG):
+         areaA = pD.area()
+         areaB = pG.area()
+         return areaA + areaB - get_intersection(pD, pG)
+
+     def get_intersection_over_union(pD, pG):
+         try:
+             return get_intersection(pD, pG) / get_union(pD, pG)
+         except:
+             return 0
+
+     def get_intersection(pD, pG):
+         pInt = pD & pG
+         if len(pInt) == 0:
+             return 0
+         return pInt.area()
+
+     def compute_ap(confList, matchList, numGtCare):
+         correct = 0
+         AP = 0
+         if len(confList) > 0:
+             confList = np.array(confList)
+             matchList = np.array(matchList)
+             sorted_ind = np.argsort(-confList)
+             confList = confList[sorted_ind]
+             matchList = matchList[sorted_ind]
+             for n in range(len(confList)):
+                 match = matchList[n]
+                 if match:
+                     correct += 1
+                     AP += float(correct) / (n + 1)
+
+             if numGtCare > 0:
+                 AP /= numGtCare
+
+         return AP
+
+     def transcription_match(
+         transGt, transDet, specialCharacters="!?.:,*\"()·[]/'", onlyRemoveFirstLastCharacterGT=True
+     ):
+         if onlyRemoveFirstLastCharacterGT:
+             # special characters in GT are allowed only at initial or final position
+             if transGt == transDet:
+                 return True
+
+             if specialCharacters.find(transGt[0]) > -1:
+                 if transGt[1:] == transDet:
+                     return True
+
+             if specialCharacters.find(transGt[-1]) > -1:
+                 if transGt[0:len(transGt) - 1] == transDet:
+                     return True
+
+             if specialCharacters.find(transGt[0]) > -1 and specialCharacters.find(transGt[-1]) > -1:
+                 if transGt[1:len(transGt) - 1] == transDet:
+                     return True
+             return False
+         else:
+             # Special characters are removed from the beginning and the end of both Detection and GroundTruth
+             while len(transGt) > 0 and specialCharacters.find(transGt[0]) > -1:
+                 transGt = transGt[1:]
+
+             while len(transDet) > 0 and specialCharacters.find(transDet[0]) > -1:
+                 transDet = transDet[1:]
+
+             while len(transGt) > 0 and specialCharacters.find(transGt[-1]) > -1:
+                 transGt = transGt[0:len(transGt) - 1]
+
+             while len(transDet) > 0 and specialCharacters.find(transDet[-1]) > -1:
+                 transDet = transDet[0:len(transDet) - 1]
+
+             return transGt == transDet
+
+     def include_in_dictionary(transcription):
+         """
+         Function used in Word Spotting that finds if the Ground Truth transcription meets the rules to enter into the dictionary. If not, the transcription will be treated as don't care
+         """
+         # special case 's at final
+         if transcription[len(transcription) - 2:] == "'s" or transcription[len(transcription) - 2:] == "'S":
+             transcription = transcription[0:len(transcription) - 2]
+
+         # hyphens at init or final of the word
+         transcription = transcription.strip('-')
+
+         specialCharacters = "'!?.:,*\"()·[]/"
+         for character in specialCharacters:
+             transcription = transcription.replace(character, ' ')
+
+         transcription = transcription.strip()
+
+         if len(transcription) != len(transcription.replace(' ', '')):
+             return False
+
+         if len(transcription) < evaluationParams['MIN_LENGTH_CARE_WORD']:
+             return False
+
+         notAllowed = '×÷·'
+
+         range1 = [ord('a'), ord('z')]
+         range2 = [ord('A'), ord('Z')]
+         range3 = [ord('À'), ord('ƿ')]
+         range4 = [ord('DŽ'), ord('ɿ')]
+         range5 = [ord('Ά'), ord('Ͽ')]
+         range6 = [ord('-'), ord('-')]
+
+         for char in transcription:
+             charCode = ord(char)
+             if notAllowed.find(char) != -1:
+                 return False
+
+             valid = ((charCode >= range1[0] and charCode <= range1[1])
+                      or (charCode >= range2[0] and charCode <= range2[1])
+                      or (charCode >= range3[0] and charCode <= range3[1])
+                      or (charCode >= range4[0] and charCode <= range4[1])
+                      or (charCode >= range5[0] and charCode <= range5[1])
+                      or (charCode >= range6[0] and charCode <= range6[1]))
+             if valid == False:
+                 return False
+
+         return True
+
+     def include_in_dictionary_transcription(transcription):
+         """
+         Function applied to the Ground Truth transcriptions used in Word Spotting. It removes special characters or terminations
+         """
+         # special case 's at final
+         if transcription[len(transcription) - 2:] == "'s" or transcription[len(transcription) - 2:] == "'S":
+             transcription = transcription[0:len(transcription) - 2]
+
+         # hyphens at init or final of the word
+         transcription = transcription.strip('-')
+
+         specialCharacters = "'!?.:,*\"()·[]/"
+         for character in specialCharacters:
+             transcription = transcription.replace(character, ' ')
+
+         transcription = transcription.strip()
+
+         return transcription
+
+     perSampleMetrics = {}
+
+     matchedSum = 0
+
+     Rectangle = namedtuple('Rectangle', 'xmin ymin xmax ymax')
+
+     gt = rrc_evaluation_funcs.load_zip_file(gtFilePath, evaluationParams['GT_SAMPLE_NAME_2_ID'])
+     subm = rrc_evaluation_funcs.load_zip_file(submFilePath, evaluationParams['DET_SAMPLE_NAME_2_ID'], True)
+
+     numGlobalCareGt = 0
+     numGlobalCareDet = 0
+
+     arrGlobalConfidences = []
+     arrGlobalMatches = []
+
+     for resFile in gt:
+         gtFile = rrc_evaluation_funcs.decode_utf8(gt[resFile])
+         if gtFile is None:
+             raise Exception('The file %s is not UTF-8' % resFile)
+
+         recall = 0
+         precision = 0
+         hmean = 0
+         detCorrect = 0
+         iouMat = np.empty([1, 1])
+         gtPols = []
+         detPols = []
+         gtTrans = []
+         detTrans = []
+         gtPolPoints = []
+         detPolPoints = []
+         gtDontCarePolsNum = []  # Array of Ground Truth Polygons' keys marked as don't Care
+         detDontCarePolsNum = []  # Array of Detected Polygons' matched with a don't Care GT
+         detMatchedNums = []
+         pairs = []
+
+         arrSampleConfidences = []
+         arrSampleMatch = []
+         sampleAP = 0
+
+         evaluationLog = ''
+
+         pointsList, _, transcriptionsList = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
+             gtFile, evaluationParams['CRLF'], evaluationParams['LTRB'], True, False
+         )
+         for n in range(len(pointsList)):
+             points = pointsList[n]
+             transcription = transcriptionsList[n]
+             dontCare = transcription == '###'
+             if evaluationParams['LTRB']:
+                 gtRect = Rectangle(*points)
+                 gtPol = rectangle_to_polygon(gtRect)
+             else:
+                 gtPol = polygon_from_points(points)
+             gtPols.append(gtPol)
+             gtPolPoints.append(points)
+
+             # On word spotting we will filter some transcriptions with special characters
+             if evaluationParams['WORD_SPOTTING']:
+                 if dontCare == False:
+                     if include_in_dictionary(transcription) == False:
+                         dontCare = True
+                     else:
+                         transcription = include_in_dictionary_transcription(transcription)
+
+             gtTrans.append(transcription)
+             if dontCare:
+                 gtDontCarePolsNum.append(len(gtPols) - 1)
+
+         evaluationLog += 'GT polygons: ' + str(
+             len(gtPols)
+         ) + (' (' + str(len(gtDontCarePolsNum)) + " don't care)\n" if len(gtDontCarePolsNum) > 0 else '\n')
+
+         if resFile in subm:
+             detFile = rrc_evaluation_funcs.decode_utf8(subm[resFile])
+
+             pointsList, confidencesList, transcriptionsList = rrc_evaluation_funcs.get_tl_line_values_from_file_contents(
+                 detFile, evaluationParams['CRLF'], evaluationParams['LTRB'], True, evaluationParams['CONFIDENCES']
+             )
+
+             for n in range(len(pointsList)):
+                 points = pointsList[n]
+                 transcription = transcriptionsList[n]
+
+                 if evaluationParams['LTRB']:
+                     detRect = Rectangle(*points)
+                     detPol = rectangle_to_polygon(detRect)
+                 else:
+                     detPol = polygon_from_points(points)
+                 detPols.append(detPol)
+                 detPolPoints.append(points)
+                 detTrans.append(transcription)
+
+                 if len(gtDontCarePolsNum) > 0:
+                     for dontCarePol in gtDontCarePolsNum:
+                         dontCarePol = gtPols[dontCarePol]
+                         intersected_area = get_intersection(dontCarePol, detPol)
+                         pdDimensions = detPol.area()
+                         precision = 0 if pdDimensions == 0 else intersected_area / pdDimensions
+                         if precision > evaluationParams['AREA_PRECISION_CONSTRAINT']:
+                             detDontCarePolsNum.append(len(detPols) - 1)
+                             break
+
+             evaluationLog += 'DET polygons: ' + str(
+                 len(detPols)
+             ) + (' (' + str(len(detDontCarePolsNum)) + " don't care)\n" if len(detDontCarePolsNum) > 0 else '\n')
+
+             if len(gtPols) > 0 and len(detPols) > 0:
+                 # Calculate IoU and precision matrices
+                 outputShape = [len(gtPols), len(detPols)]
+                 iouMat = np.empty(outputShape)
+                 gtRectMat = np.zeros(len(gtPols), np.int8)
+                 detRectMat = np.zeros(len(detPols), np.int8)
+                 for gtNum in range(len(gtPols)):
+                     for detNum in range(len(detPols)):
+                         pG = gtPols[gtNum]
+                         pD = detPols[detNum]
+                         iouMat[gtNum, detNum] = get_intersection_over_union(pD, pG)
+
+                 for gtNum in range(len(gtPols)):
+                     for detNum in range(len(detPols)):
+                         if gtRectMat[gtNum] == 0 and detRectMat[
+                                 detNum] == 0 and gtNum not in gtDontCarePolsNum and detNum not in detDontCarePolsNum:
+                             if iouMat[gtNum, detNum] > evaluationParams['IOU_CONSTRAINT']:
+                                 gtRectMat[gtNum] = 1
+                                 detRectMat[detNum] = 1
+                                 # detection matched only if transcription is equal
+                                 if evaluationParams['WORD_SPOTTING']:
+                                     correct = gtTrans[gtNum].upper() == detTrans[detNum].upper()
+                                 else:
+                                     correct = transcription_match(
+                                         gtTrans[gtNum].upper(), detTrans[detNum].upper(),
+                                         evaluationParams['SPECIAL_CHARACTERS'],
+                                         evaluationParams['ONLY_REMOVE_FIRST_LAST_CHARACTER']
+                                     ) == True
+                                 detCorrect += 1 if correct else 0
+                                 if correct:
+                                     detMatchedNums.append(detNum)
+                                 pairs.append({'gt': gtNum, 'det': detNum, 'correct': correct})
+                                 evaluationLog += 'Match GT #' + str(gtNum) + ' with Det #' + str(
+                                     detNum
+                                 ) + ' trans. correct: ' + str(correct) + '\n'
+
+             if evaluationParams['CONFIDENCES']:
+                 for detNum in range(len(detPols)):
+                     if detNum not in detDontCarePolsNum:
+                         # we exclude the don't care detections
+                         match = detNum in detMatchedNums
+
+                         arrSampleConfidences.append(confidencesList[detNum])
+                         arrSampleMatch.append(match)
+
+                         arrGlobalConfidences.append(confidencesList[detNum])
+                         arrGlobalMatches.append(match)
+
+         numGtCare = len(gtPols) - len(gtDontCarePolsNum)
+         numDetCare = len(detPols) - len(detDontCarePolsNum)
+         if numGtCare == 0:
+             recall = float(1)
+             precision = float(0) if numDetCare > 0 else float(1)
+             sampleAP = precision
+         else:
+             recall = float(detCorrect) / numGtCare
+             precision = 0 if numDetCare == 0 else float(detCorrect) / numDetCare
+             if evaluationParams['CONFIDENCES']:
+                 sampleAP = compute_ap(arrSampleConfidences, arrSampleMatch, numGtCare)
+
+         hmean = 0 if (precision + recall) == 0 else 2.0 * precision * recall / (precision + recall)
+
+         matchedSum += detCorrect
+         numGlobalCareGt += numGtCare
+         numGlobalCareDet += numDetCare
+
+         perSampleMetrics[resFile] = {
+             'precision': precision,
+             'recall': recall,
+             'hmean': hmean,
+             'pairs': pairs,
+             'AP': sampleAP,
+             'iouMat': [] if len(detPols) > 100 else iouMat.tolist(),
+             'gtPolPoints': gtPolPoints,
+             'detPolPoints': detPolPoints,
+             'gtTrans': gtTrans,
+             'detTrans': detTrans,
+             'gtDontCare': gtDontCarePolsNum,
+             'detDontCare': detDontCarePolsNum,
+             'evaluationParams': evaluationParams,
+             'evaluationLog': evaluationLog,
+         }
+
+     # Compute AP
+     AP = 0
+     if evaluationParams['CONFIDENCES']:
+         AP = compute_ap(arrGlobalConfidences, arrGlobalMatches, numGlobalCareGt)
+
+     methodRecall = 0 if numGlobalCareGt == 0 else float(matchedSum) / numGlobalCareGt
+     methodPrecision = 0 if numGlobalCareDet == 0 else float(matchedSum) / numGlobalCareDet
+     methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * methodRecall * methodPrecision / (
+         methodRecall + methodPrecision
+     )
+
+     methodMetrics = {'precision': methodPrecision, 'recall': methodRecall, 'hmean': methodHmean, 'AP': AP}
+
+     resDict = {'calculated': True, 'Message': '', 'method': methodMetrics, 'per_sample': perSampleMetrics}
+
+     return resDict
+
+
+ if __name__ == '__main__':
+     rrc_evaluation_funcs.main_evaluation(None, default_evaluation_params, validate_data, evaluate_method)
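
The script above is normally driven end to end through rrc_evaluation_funcs.main_evaluation, but default_evaluation_params, validate_data and evaluate_method can also be called directly. A minimal sketch, assuming two hypothetical archives gt.zip and submit.zip whose members match the GT_SAMPLE_NAME_2_ID and DET_SAMPLE_NAME_2_ID patterns (gt_img_0.txt, res_img_0.txt, ...):

    from evalscope.benchmarks.ocr_bench_v2.spotting_eval.script import (
        default_evaluation_params, evaluate_method, validate_data)

    params = default_evaluation_params()
    params['WORD_SPOTTING'] = True  # match against dictionary-filtered GT instead of raw transcriptions

    # 'gt.zip' and 'submit.zip' are placeholder paths for this sketch
    validate_data('gt.zip', 'submit.zip', params)  # raises if either archive is malformed
    res = evaluate_method('gt.zip', 'submit.zip', params)
    print(res['method'])  # {'precision': ..., 'recall': ..., 'hmean': ..., 'AP': ...}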
evalscope/benchmarks/ocr_bench_v2/spotting_metric.py
@@ -0,0 +1,179 @@
+ import ast
+ import os
+ import re
+ import shutil
+ import zipfile
+
+ from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR
+ from evalscope.utils.function_utils import thread_safe
+ from .spotting_eval import rrc_evaluation_funcs_1_1 as rrc_evaluation_funcs
+ from .spotting_eval.script import default_evaluation_params, evaluate_method, validate_data
+
+
+ def extract_bounding_boxes_robust(predict_str):
+     """
+     Extract coordinates and text content from the given prediction string,
+     handling potential format issues.
+
+     Args:
+         predict_str (str): Model prediction output as a string.
+
+     Returns:
+         list: Extracted data in the format [[x1, y1, x2, y2, text_content], ...].
+             Returns None if no valid data is extracted.
+     """
+     results = []
+     seen = set()
+
+     # try parsing with ast.literal_eval
+     try:
+         data = ast.literal_eval(predict_str)
+     except Exception:
+         data = None
+
+     if data is not None:
+         if isinstance(data, (list, tuple)):
+             for item in data:
+                 if isinstance(item, (list, tuple)) and len(item) >= 5:
+                     x1_str, y1_str, x2_str, y2_str = item[:4]
+                     text_content = item[4]
+
+                     x1_str = str(x1_str).strip()
+                     y1_str = str(y1_str).strip()
+                     x2_str = str(x2_str).strip()
+                     y2_str = str(y2_str).strip()
+                     text_content = str(text_content).replace('\n', '').strip().strip('"').strip("'")
+
+                     try:
+                         x1 = int(x1_str)
+                         y1 = int(y1_str)
+                         x2 = int(x2_str)
+                         y2 = int(y2_str)
+
+                         if not (0 <= x1 <= 1000 and 0 <= y1 <= 1000 and 0 <= x2 <= 1000 and 0 <= y2 <= 1000):
+                             continue
+
+                         key = (x1, y1, x2, y2, text_content)
+                         if key in seen:
+                             continue
+
+                         seen.add(key)
+                         results.append([x1, y1, x2, y2, text_content])
+                     except ValueError:
+                         continue
+     else:
+         # try parsing with regular expression
+
+         list_content = predict_str
+         items = re.findall(r'[\[\(]\s*([^\[\]\(\)]*?)\s*[\]\)]', list_content)
+
+         if not items:
+             return None
+
+         for item in items:
+             parts = item.split(',', 4)
+             if len(parts) < 5:
+                 continue
+
+             x1_str, y1_str, x2_str, y2_str, text_content = parts
+
+             x1_str = x1_str.strip()
+             y1_str = y1_str.strip()
+             x2_str = x2_str.strip()
+             y2_str = y2_str.strip()
+             text_content = text_content.replace('\n', '').strip().strip('"').strip("'")
+
+             try:
+                 x1 = int(x1_str)
+                 y1 = int(y1_str)
+                 x2 = int(x2_str)
+                 y2 = int(y2_str)
+
+                 if not (0 <= x1 <= 1000 and 0 <= y1 <= 1000 and 0 <= x2 <= 1000 and 0 <= y2 <= 1000):
+                     continue
+
+                 key = (x1, y1, x2, y2, text_content)
+                 if key in seen:
+                     continue
+
+                 seen.add(key)
+                 results.append([x1, y1, x2, y2, text_content])
+             except ValueError:
+                 continue
+
+     if not results:
+         return None
+
+     return results
+
+
+ def zip_folder(source_folder, destination_zip):
+     abs_source = os.path.abspath(source_folder)
+     abs_destination = os.path.abspath(destination_zip)
+
+     with zipfile.ZipFile(abs_destination, 'w', zipfile.ZIP_DEFLATED) as zf:
+         for root, _, files in os.walk(abs_source):
+             for file in files:
+                 abs_file_path = os.path.join(root, file)
+
+                 relative_path = os.path.relpath(abs_file_path, abs_source)
+                 zf.write(abs_file_path, relative_path)
+
+
+ @thread_safe
+ def spotting_evaluation(prediction_list, img_metas):
+     score = 0
+
+     submit_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'submit')
+     gt_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'gt')
+     submit_zip_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'submit.zip')
+     gt_zip_path = os.path.join(DEFAULT_EVALSCOPE_CACHE_DIR, 'eval', 'ocrbench_v2', 'gt.zip')
+     for file_path in [submit_path, gt_path, submit_zip_path, gt_zip_path]:
+         if 'zip' in file_path:
+             if os.path.exists(file_path):
+                 os.remove(file_path)
+         else:
+             if os.path.exists(file_path):
+                 shutil.rmtree(file_path)
+             os.makedirs(file_path, exist_ok=True)
+
+     res_submit_list = []
+     for item in prediction_list:
+         x1, y1, x2, y2, rec = item
+         if x1 >= x2 or y1 >= y2:
+             continue
+
+         res_submit_list.append(','.join([str(x1), str(y1), str(x2), str(y1), str(x2), str(y2), str(x1), str(y2), rec]))
+
+     res_gt_list = []
+     for bbox, rec in zip(img_metas['bbox_list'], img_metas['content']):
+         x_coords = bbox[0::2]
+         y_coords = bbox[1::2]
+
+         x1, y1 = min(x_coords), min(y_coords)
+         x2, y2 = max(x_coords), max(y_coords)
+
+         res_gt_list.append(','.join([str(x1), str(y1), str(x2), str(y1), str(x2), str(y2), str(x1), str(y2), rec]))
+
+     if len(res_submit_list) == 0 or len(res_gt_list) == 0:
+         return 0
+
+     with open(os.path.join(submit_path, 'res_img_0.txt'), 'w') as f:
+         for item in res_submit_list[:-1]:
+             f.write(item + '\n')
+         f.write(res_submit_list[-1])
+
+     with open(os.path.join(gt_path, 'gt_img_0.txt'), 'w') as f:
+         for item in res_gt_list[:-1]:
+             f.write(item + '\n')
+         f.write(res_gt_list[-1])
+
+     zip_folder(submit_path, submit_zip_path)
+     zip_folder(gt_path, gt_zip_path)
+
+     command = {'g': gt_zip_path, 's': submit_zip_path, 'o': DEFAULT_EVALSCOPE_CACHE_DIR, 'p': '{"IOU_CONSTRAINT":0.5}'}
+
+     # run rrc_evaluation_funcs
+     result = rrc_evaluation_funcs.main_evaluation(command, default_evaluation_params, validate_data, evaluate_method)
+     score = result['method']['hmean']
+     return score
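
A minimal sketch of how spotting_metric.py is meant to be used; the prediction string and img_metas below are made up for illustration, with 'bbox_list' holding flattened x,y polygon coordinates and 'content' the matching transcriptions, as the code above expects:

    from evalscope.benchmarks.ocr_bench_v2.spotting_metric import (
        extract_bounding_boxes_robust, spotting_evaluation)

    pred = "[[100, 200, 300, 240, 'HELLO'], [400, 500, 550, 540, 'WORLD']]"
    boxes = extract_bounding_boxes_robust(pred)  # [[100, 200, 300, 240, 'HELLO'], ...] or None

    img_metas = {
        'bbox_list': [[100, 200, 300, 200, 300, 240, 100, 240]],  # one axis-aligned quad
        'content': ['HELLO'],
    }
    score = spotting_evaluation(boxes, img_metas) if boxes else 0  # hmean in [0, 1]

Note that spotting_evaluation writes temporary gt/submit archives under DEFAULT_EVALSCOPE_CACHE_DIR before delegating to the ICDAR-style zip evaluation above, which is why it is wrapped in @thread_safe.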