evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +52 -1
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +11 -2
- evalscope/report/combinator.py +52 -2
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/logger.py +49 -17
- evalscope/utils/multi_choices.py +16 -1
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/benchmarks/omnidoc_bench/end2end_eval.py (new file)
@@ -0,0 +1,349 @@
# flake8: noqa
import numpy as np
import os
from collections import defaultdict
from typing import Dict, List, Optional

from evalscope.utils import get_logger

logger = get_logger()


class End2EndEvaluator:
    """Match predicted page content against ground-truth annotations and compute end-to-end metrics."""

    def __init__(
        self,
        prediction: List,
        reference: List,
        metrics: Dict,
        match_method: str = 'quick_match',
        filter_types: Optional[Dict] = None
    ):
        self.match_method = match_method
        self.references = reference
        self.predictions = prediction
        self.default_metrics_dict = metrics

        # Optionally keep only ground-truth pages whose page attributes match all filter criteria.
        filtered_gt_samples = []
        if filter_types:
            for gt_sample in self.references:
                select_flag = True
                for k, v in filter_types.items():
                    if gt_sample['page_info']['page_attribute'][k] != v:
                        select_flag = False
                if select_flag:
                    filtered_gt_samples.append(gt_sample)
        else:
            filtered_gt_samples = self.references  # [{}, {}, {}]
        self.references = filtered_gt_samples

    def score(self) -> dict:
        samples = self.get_matched_elements(self.references, self.predictions)
        metrics = self.process_generated_metric_results(samples)
        return metrics

    def get_page_elements(self, selected_annos):
        saved_element_dict = defaultdict(list)
        related_truncated = []
        truncated_all = {}
        for relation in selected_annos['extra']['relation']:  # Handle truncated text issues
            if relation['relation_type'] == 'truncated':
                truncated_all[relation['source_anno_id']] = ''
                truncated_all[relation['target_anno_id']] = ''
                exist_flag = False
                for merge_list in related_truncated:
                    # Consider cases where three or more text blocks may need to be merged.
                    if relation['source_anno_id'] in merge_list or relation['target_anno_id'] in merge_list:
                        merge_list.append(relation['source_anno_id'])
                        merge_list.append(relation['target_anno_id'])
                        exist_flag = True
                if not exist_flag:
                    related_truncated.append([relation['source_anno_id'], relation['target_anno_id']])

        for item in selected_annos['layout_dets']:
            if item['anno_id'] not in truncated_all.keys():
                saved_element_dict[item['category_type']].append(item)
            else:
                truncated_all[item['anno_id']] = item

        for merge_list in related_truncated:
            text_block_list = [truncated_all[key] for key in merge_list]
            sorted_block = sorted(text_block_list, key=lambda x: x['order'])
            text = ''
            for block in sorted_block:
                text += block['text']
            merged_block = {
                'category_type': sorted_block[0]['category_type'],  # Directly use information from the first block
                'order': sorted_block[0]['order'],
                'anno_id': sorted_block[0]['anno_id'],
                'text': text,
                'merge_list': sorted_block
            }
            saved_element_dict[sorted_block[0]['category_type']].append(merged_block)

        return saved_element_dict

    def get_page_elements_list(self, gt_page_elements, category_list):
        element_list = []
        for category_type in category_list:
            if gt_page_elements.get(category_type):
                element_list.extend(gt_page_elements[category_type])
        return element_list

    def get_sorted_text_list(self, selected_annos):
        # txt_type: text, latex, html
        text_list = []
        for item in selected_annos:
            if item.get('order'):
                order = item['order']
            else:
                order = 0
            text_list.append((order, item))  # (order, annotation) pairs for sorting
        sorted_text_list = sorted(text_list, key=lambda x: x[0])
        return [_[1] for _ in sorted_text_list]

    def filtered_out_ignore(self, items, ignore_category_list):
        filtered_items = []
        for item in items:
            if item['gt_category_type'] not in ignore_category_list:
                filtered_items.append(item)
        return filtered_items

    def get_order_paired(self, order_match_s, img_name):
        matched = [(item['gt_position'], item['pred_position'])
                   for item in order_match_s
                   if (item['gt_position'] != [''] and item['pred_position'] != '')]
        gt_idx_all = [item['gt_position'] for item in order_match_s if (item['gt_position'] != [''])]
        read_order_pred = [i[0] for i in sorted(matched, key=lambda x: x[1])]
        read_order_gt = sum(gt_idx_all, [])  # Flatten to a one-dimensional list
        read_order_gt = [x for x in read_order_gt if x]
        gt = sorted(read_order_gt)
        pred = sum(read_order_pred, [])
        pred = [x for x in pred if x]
        if len(pred) > 0 or len(gt) > 0:
            import Levenshtein
            edit = Levenshtein.distance(gt, pred) / max(len(pred), len(gt))
            return {'gt': gt, 'pred': pred, 'img_id': img_name, 'edit': edit}
        else:
            return {}  # If both GT and pred are empty for the page, return empty

    def formula_format(self, formula_matches, img_name):
        for i, item in enumerate(formula_matches):
            item['img_id'] = img_name + '_' + str(i)
        return formula_matches

    def get_matched_elements(self, references: list, predictions: list) -> dict:
        from .metrics import recogition_end2end_base_dataset, recogition_end2end_table_dataset

        plain_text_match = []
        display_formula_match = []
        html_table_match = []
        latex_table_match = []
        order_match = []

        for i, sample in enumerate(references):
            img_name = os.path.basename(sample['page_info']['image_path'])
            pred_content = predictions[i]
            result = self.process_get_matched_elements(sample, pred_content, img_name)
            (plain_text_match_clean, formatted_display_formula, latex_table_match_s, html_table_match_s,
             order_match_single) = result

            if order_match_single:
                order_match.append(order_match_single)
            if plain_text_match_clean:
                plain_text_match.extend(plain_text_match_clean)
            if formatted_display_formula:
                display_formula_match.extend(formatted_display_formula)
            if latex_table_match_s:
                latex_table_match.extend(latex_table_match_s)
            if html_table_match_s:
                html_table_match.extend(html_table_match_s)

        # Score tables in whichever format (LaTeX or HTML) produced more matches.
        if len(latex_table_match) > len(html_table_match):
            table_match = latex_table_match
            table_format = 'latex'
        else:
            table_match = html_table_match
            table_format = 'html'

        matched_samples_all = {
            'text_block': recogition_end2end_base_dataset(plain_text_match),
            'display_formula': recogition_end2end_base_dataset(display_formula_match),
            'table': recogition_end2end_table_dataset(table_match, table_format),
            'reading_order': recogition_end2end_base_dataset(order_match)
        }

        return matched_samples_all

    def process_get_matched_elements(self, sample, pred_content, img_name):
        from func_timeout import FunctionTimedOut, func_timeout

        from .utils import match_gt2pred_no_split, match_gt2pred_quick, match_gt2pred_simple, md_tex_filter

        if self.match_method == 'simple_match':
            match_gt2pred = match_gt2pred_simple
        elif self.match_method == 'quick_match':
            match_gt2pred = match_gt2pred_quick
        elif self.match_method == 'no_split':
            match_gt2pred = match_gt2pred_no_split
        else:  # Fall back to quick match for unknown match methods
            match_gt2pred = match_gt2pred_quick

        pred_dataset = md_tex_filter(pred_content)
        gt_page_elements = self.get_page_elements(sample)

        text_all = self.get_page_elements_list(
            gt_page_elements, [
                'text_block', 'title', 'code_txt', 'code_txt_caption', 'reference', 'equation_caption',
                'figure_caption', 'figure_footnote', 'table_caption', 'table_footnote', 'code_algorithm',
                'code_algorithm_caption', 'header', 'footer', 'page_footnote', 'page_number'
            ]
        )

        display_formula_match_s = []
        plain_text_match_clean = []
        latex_table_match_s = []
        html_table_match_s = []
        order_match_single = []
        if text_all:
            gt_text_list = self.get_sorted_text_list(text_all)
            try:
                plain_text_match_s = func_timeout(
                    30, match_gt2pred, args=(gt_text_list, pred_dataset['text_all'], 'text', img_name)
                )
            except FunctionTimedOut as e:
                # Fall back to the simple matcher when the configured matcher times out.
                logger.warning(f'Timed out on plain text match of {img_name} ({e}); match_gt2pred_simple will be used.')
                plain_text_match_s = match_gt2pred_simple(gt_text_list, pred_dataset['text_all'], 'text', img_name)

            if not plain_text_match_s:
                logger.warning(f'No text match for {img_name}. The plain text match will be empty.')
            else:
                plain_text_match_clean = self.filtered_out_ignore(
                    plain_text_match_s, [
                        'figure_caption', 'figure_footnote', 'table_caption', 'table_footnote', 'code_algorithm',
                        'code_algorithm_caption', 'header', 'footer', 'page_footnote', 'page_number', 'equation_caption'
                    ]
                )

        if gt_page_elements.get('equation_isolated'):
            gt_display_list = self.get_sorted_text_list(gt_page_elements['equation_isolated'])
            display_formula_match_s = match_gt2pred(
                gt_display_list, pred_dataset['equation_isolated'], 'formula', img_name
            )
            display_formula_match_s = [x for x in display_formula_match_s if x['gt_idx'] != ['']]
            if not display_formula_match_s:
                logger.warning(f'No display_formula_match for {img_name}. The display_formula_match will be empty.')

        if gt_page_elements.get('table'):
            gt_table_list = self.get_sorted_text_list(gt_page_elements['table'])
            if pred_dataset['latex_table']:
                latex_table_match_s = match_gt2pred_simple(
                    gt_table_list, pred_dataset['latex_table'], 'latex_table', img_name
                )
                latex_table_match_s = [x for x in latex_table_match_s if x['gt_idx'] != ['']]
            if pred_dataset['html_table']:
                html_table_match_s = match_gt2pred_simple(
                    gt_table_list, pred_dataset['html_table'], 'html_table', img_name
                )
                html_table_match_s = [x for x in html_table_match_s if x['gt_idx'] != ['']]
            else:
                html_table_match_s = match_gt2pred_simple(gt_table_list, [], 'html_table', img_name)
                html_table_match_s = [x for x in html_table_match_s if x['gt_idx'] != ['']]

        order_match_s = plain_text_match_clean
        if order_match_s:
            order_match_single = self.get_order_paired(order_match_s, img_name)

        return [
            plain_text_match_clean, display_formula_match_s, latex_table_match_s, html_table_match_s, order_match_single
        ]

    def process_generated_metric_results(self, samples, save_name: str = 'end2end_quick_match'):
        from .metrics import METRIC_REGISTRY, get_full_labels_results, get_page_split, show_result

        result_all = {}
        page_info = {}
        metrics_dict = self.default_metrics_dict
        pages = self.references  # List of ground-truth samples

        for page in pages:
            img_path = os.path.basename(page['page_info']['image_path'])
            page_info[img_path] = page['page_info']['page_attribute']

        for element in metrics_dict.keys():
            result = {}
            group_info = metrics_dict[element].get('group', [])
            cur_samples = samples[element]

            for metric in metrics_dict[element]['metric']:
                metric_val = METRIC_REGISTRY.get(metric)
                cur_samples, result_s = metric_val(cur_samples).evaluate(group_info, f'{save_name}_{element}')
                if result_s:
                    result.update(result_s)

            if result:
                logger.info(f'{element}')
                show_result(result)

            group_result = get_full_labels_results(cur_samples)
            page_result = get_page_split(cur_samples, page_info)

            result_all[element] = {'all': result, 'group': group_result, 'page': page_result}

        save_dict = {}
        en_overall = []
        ch_overall = []
        for category_type, metric in [('text_block', 'Edit_dist'), ('display_formula', 'Edit_dist'),
                                      ('display_formula', 'CDM'), ('table', 'TEDS'), ('table', 'Edit_dist'),
                                      ('reading_order', 'Edit_dist')]:
            page_result = result_all.get(category_type, {}).get('page', {})
            if metric in page_result:
                en_score = page_result[metric].get('language: english', np.nan)
                ch_score = page_result[metric].get('language: simplified_chinese', np.nan)
            else:
                en_score = np.nan
                ch_score = np.nan
            save_dict[category_type + '_' + metric + '_EN'] = en_score
            save_dict[category_type + '_' + metric + '_CH'] = ch_score

            if metric == 'Edit_dist':
                en_overall.append(en_score)
                ch_overall.append(ch_score)

        # Average the per-category edit distances, ignoring missing (NaN) entries.
        en_overall_filtered = [x for x in en_overall if not np.isnan(x)]
        ch_overall_filtered = [x for x in ch_overall if not np.isnan(x)]
        save_dict['overall_EN'] = sum(en_overall_filtered) / len(en_overall_filtered) if en_overall_filtered else np.nan
        save_dict['overall_CH'] = sum(ch_overall_filtered) / len(ch_overall_filtered) if ch_overall_filtered else np.nan

        return save_dict
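
For context, a minimal sketch of how this new evaluator might be driven follows. It is an illustration, not part of the diff: the import path matches the file listed above, but the shape of the reference annotations, the 'language' page-attribute key, and the metric configuration are assumptions inferred from the attribute accesses in the code, not a documented evalscope API.

# Minimal usage sketch (illustration only; shapes are inferred, not documented).
from evalscope.benchmarks.omnidoc_bench.end2end_eval import End2EndEvaluator

references = [{
    'page_info': {
        'image_path': 'images/page_0001.jpg',       # hypothetical path
        'page_attribute': {'language': 'english'},  # hypothetical attribute key
    },
    'layout_dets': [
        {'anno_id': 0, 'category_type': 'text_block', 'order': 1, 'text': 'Hello world.'},
    ],
    'extra': {'relation': []},  # no truncated-text relations on this page
}]
predictions = ['Hello world.']  # one Markdown/LaTeX string per reference page

# Mirrors the metrics_dict[element]['metric'] / .get('group', []) accesses above;
# metric names ('Edit_dist', 'TEDS') are the ones the code itself reads back.
metrics = {
    'text_block': {'metric': ['Edit_dist']},
    'display_formula': {'metric': ['Edit_dist']},
    'table': {'metric': ['TEDS', 'Edit_dist']},
    'reading_order': {'metric': ['Edit_dist']},
}

evaluator = End2EndEvaluator(
    prediction=predictions,
    reference=references,
    metrics=metrics,
    match_method='quick_match',  # one of: 'simple_match', 'quick_match', 'no_split'
    filter_types=None,           # or e.g. {'language': 'english'} to filter pages
)
print(evaluator.score())  # e.g. {'text_block_Edit_dist_EN': ..., 'overall_EN': ...}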