evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of evalscope has been flagged as potentially problematic.

Files changed (100)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  4. evalscope/api/benchmark/benchmark.py +14 -0
  5. evalscope/api/dataset/dataset.py +21 -0
  6. evalscope/api/dataset/loader.py +6 -2
  7. evalscope/api/mixin/sandbox_mixin.py +32 -54
  8. evalscope/api/model/generate_config.py +6 -0
  9. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  10. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  11. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  13. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  16. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  17. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  18. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  20. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  21. evalscope/benchmarks/math_verse/__init__.py +0 -0
  22. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  23. evalscope/benchmarks/math_vision/__init__.py +0 -0
  24. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  25. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  27. evalscope/benchmarks/ner/__init__.py +0 -0
  28. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  29. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  30. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  31. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  32. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  33. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  34. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  35. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  36. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  37. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  38. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  39. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  40. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  41. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  42. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  43. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  44. evalscope/benchmarks/ocr_bench_v2/utils.py +1 -0
  45. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  46. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  47. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  48. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  49. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  50. evalscope/benchmarks/poly_math/__init__.py +0 -0
  51. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  52. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  53. evalscope/benchmarks/pope/__init__.py +0 -0
  54. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  55. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  56. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  57. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  58. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  59. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  60. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  61. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  62. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  63. evalscope/benchmarks/zerobench/__init__.py +0 -0
  64. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  65. evalscope/constants.py +4 -0
  66. evalscope/evaluator/evaluator.py +72 -79
  67. evalscope/metrics/math_parser.py +14 -0
  68. evalscope/metrics/metric.py +1 -1
  69. evalscope/models/utils/openai.py +4 -0
  70. evalscope/perf/arguments.py +24 -4
  71. evalscope/perf/benchmark.py +74 -89
  72. evalscope/perf/http_client.py +31 -16
  73. evalscope/perf/main.py +15 -2
  74. evalscope/perf/plugin/api/base.py +9 -7
  75. evalscope/perf/plugin/api/custom_api.py +13 -58
  76. evalscope/perf/plugin/api/default_api.py +179 -79
  77. evalscope/perf/plugin/api/openai_api.py +4 -3
  78. evalscope/perf/plugin/datasets/base.py +21 -0
  79. evalscope/perf/plugin/datasets/custom.py +2 -3
  80. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  81. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  82. evalscope/perf/plugin/datasets/openqa.py +2 -4
  83. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  84. evalscope/perf/utils/benchmark_util.py +36 -22
  85. evalscope/perf/utils/db_util.py +14 -19
  86. evalscope/perf/utils/local_server.py +0 -44
  87. evalscope/perf/utils/log_utils.py +21 -6
  88. evalscope/report/__init__.py +2 -1
  89. evalscope/run.py +4 -0
  90. evalscope/utils/function_utils.py +195 -12
  91. evalscope/utils/io_utils.py +74 -0
  92. evalscope/utils/logger.py +49 -17
  93. evalscope/utils/ner.py +377 -0
  94. evalscope/version.py +2 -2
  95. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/METADATA +235 -363
  96. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/RECORD +100 -55
  97. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  98. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  99. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
  100. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/ner/mit_movie_trivia_adapter.py
@@ -0,0 +1,74 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'The MIT-Movie-Trivia dataset, originally created for slot filling, is modified by '
+     'ignoring some slot types (e.g. genre, rating) and merging others (e.g. director '
+     'and actor in person, and song and movie title in title) in order to keep '
+     'consistent named entity types across all datasets.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mit-movie-trivia',
+         pretty_name='MIT-Movie-Trivia',
+         dataset_id='extraordinarylab/mit-movie-trivia',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class MITMovieTriviaAdapter(NERAdapter):
+     """
+     Adapter for the MIT-Movie-Trivia Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the MIT-Movie-Trivia dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define MIT-Movie-Trivia-specific entity mappings
+         self.entity_type_map = {
+             'ACTOR': 'actor',
+             'AWARD': 'award',
+             'CHARACTER_NAME': 'character_name',
+             'DIRECTOR': 'director',
+             'GENRE': 'genre',
+             'OPINION': 'opinion',
+             'ORIGIN': 'origin',
+             'PLOT': 'plot',
+             'QUOTE': 'quote',
+             'RELATIONSHIP': 'relationship',
+             'SOUNDTRACK': 'soundtrack',
+             'YEAR': 'year'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'ACTOR': 'The name of an actor or actress starring in the movie.',
+             'AWARD': 'An award the movie won or was nominated for.',
+             'CHARACTER_NAME': 'The name of a character in the movie.',
+             'DIRECTOR': 'The name of the person who directed the movie.',
+             'GENRE': 'The category or style of the movie.',
+             'OPINION': 'A subjective review or personal opinion about the movie.',
+             'ORIGIN': 'The source material or basis for the movie.',
+             'PLOT': 'A description or summary of the movie\'s storyline.',
+             'QUOTE': 'A memorable line or phrase spoken in the movie.',
+             'RELATIONSHIP': 'The connection or relationship between characters.',
+             'SOUNDTRACK': 'The music or a specific song from the movie.',
+             'YEAR': 'The release year of the movie.'
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
evalscope/benchmarks/ner/mit_restaurant_adapter.py
@@ -0,0 +1,66 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'The MIT-Restaurant dataset is a collection of restaurant review text specifically '
+     'curated for training and testing Natural Language Processing (NLP) models, '
+     'particularly for Named Entity Recognition (NER). It contains sentences from real '
+     'reviews, along with corresponding labels in the BIO format.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='mit-restaurant',
+         pretty_name='MIT-Restaurant',
+         dataset_id='extraordinarylab/mit-restaurant',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class MITRestaurantAdapter(NERAdapter):
+     """
+     Adapter for the MIT-Restaurant Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the MIT-Restaurant dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define MIT-Restaurant-specific entity mappings
+         self.entity_type_map = {
+             'AMENITY': 'amenity',
+             'CUISINE': 'cuisine',
+             'DISH': 'dish',
+             'HOURS': 'hours',
+             'LOCATION': 'location',
+             'PRICE': 'price',
+             'RATING': 'rating',
+             'RESTAURANT_NAME': 'restaurant_name'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'AMENITY': 'A feature or service offered by the restaurant.',
+             'CUISINE': 'The type of food a restaurant serves.',
+             'DISH': 'A specific food or drink item.',
+             'HOURS': 'The operating hours of a restaurant.',
+             'LOCATION': 'The address or general location of a restaurant.',
+             'PRICE': 'The price range of a restaurant.',
+             'RATING': 'A rating or review of the restaurant.',
+             'RESTAURANT_NAME': 'The name of a restaurant.',
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
evalscope/benchmarks/ner/ontonotes5_adapter.py
@@ -0,0 +1,87 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'OntoNotes Release 5.0 is a large, multilingual corpus containing text in English, '
+     'Chinese, and Arabic across various genres like news, weblogs, and broadcast '
+     'conversations. It is richly annotated with multiple layers of linguistic information, '
+     'including syntax, predicate-argument structure, word sense, named entities, and '
+     'coreference to support research and development in natural language processing.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='ontonotes5',
+         pretty_name='OntoNotes5',
+         dataset_id='extraordinarylab/ontonotes5',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class OntoNotes5Adapter(NERAdapter):
+     """
+     Adapter for the OntoNotes5 Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the OntoNotes5 dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define OntoNotes5-specific entity mappings
+         self.entity_type_map = {
+             'CARDINAL': 'cardinal',
+             'DATE': 'date',
+             'EVENT': 'event',
+             'FAC': 'facility',
+             'GPE': 'geopolitical_entity',
+             'LANGUAGE': 'language',
+             'LAW': 'law',
+             'LOC': 'location',
+             'MONEY': 'money',
+             'NORP': 'nationalities_or_religious_or_political_groups',
+             'ORDINAL': 'ordinal',
+             'ORG': 'organization',
+             'PERCENT': 'percent',
+             'PERSON': 'person',
+             'PRODUCT': 'product',
+             'QUANTITY': 'quantity',
+             'TIME': 'time',
+             'WORK_OF_ART': 'work_of_art'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'PERSON': 'People, including fictional',
+             'NORP': 'Nationalities or religious or political groups',
+             'FAC': 'Buildings, airports, highways, bridges, etc.',
+             'ORG': 'Companies, agencies, institutions, etc.',
+             'GPE': 'Countries, cities, states',
+             'LOC': 'Non-GPE locations, mountain ranges, bodies of water',
+             'PRODUCT': 'Vehicles, weapons, foods, etc. (Not services)',
+             'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
+             'WORK_OF_ART': 'Titles of books, songs, etc.',
+             'LAW': 'Named documents made into laws',
+             'LANGUAGE': 'Any named language',
+             'DATE': 'Absolute or relative dates or periods',
+             'TIME': 'Times smaller than a day',
+             'PERCENT': 'Percentage (including "%")',
+             'MONEY': 'Monetary values, including unit',
+             'QUANTITY': 'Measurements, as of weight or distance',
+             'ORDINAL': '"first", "second"',
+             'CARDINAL': 'Numerals that do not fall under another type'
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
evalscope/benchmarks/ner/wnut2017_adapter.py
@@ -0,0 +1,61 @@
+ from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+ from evalscope.api.registry import register_benchmark
+ from evalscope.constants import Tags
+ from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+ DESCRIPTION = (
+     'The WNUT2017 dataset is a collection of user-generated text from various social '
+     'media platforms, like Twitter and YouTube, specifically designed for a named-entity '
+     'recognition task.'
+ )
+
+
+ @register_benchmark(
+     BenchmarkMeta(
+         name='wnut2017',
+         pretty_name='WNUT2017',
+         dataset_id='extraordinarylab/wnut2017',
+         tags=[Tags.KNOWLEDGE, Tags.NER],
+         description=DESCRIPTION.strip(),
+         few_shot_num=5,
+         train_split='train',
+         eval_split='test',
+         metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+         prompt_template=PROMPT_TEMPLATE,
+         few_shot_prompt_template=FEWSHOT_TEMPLATE,
+     )
+ )
+ class WNUT2017Adapter(NERAdapter):
+     """
+     Adapter for the WNUT2017 Named Entity Recognition dataset.
+
+     This adapter inherits the NER functionality from NERAdapter and
+     configures it specifically for the WNUT2017 dataset's entity types.
+     """
+
+     def __init__(self, **kwargs):
+         # Initialize the parent class first
+         super().__init__(**kwargs)
+
+         # Define WNUT2017-specific entity mappings
+         self.entity_type_map = {
+             'CORPORATION': 'corporation',
+             'CREATIVE-WORK': 'creative_work',
+             'GROUP': 'group',
+             'LOCATION': 'location',
+             'PERSON': 'person',
+             'PRODUCT': 'product'
+         }
+
+         # Add descriptions for each entity type
+         self.entity_descriptions = {
+             'CORPORATION': 'Named companies, businesses, agencies, and other institutions.',
+             'CREATIVE-WORK': 'Named books, songs, movies, paintings, and other works of art.',
+             'GROUP': 'Named groups of people, such as sports teams, bands, or political groups.',
+             'LOCATION': 'Named geographical locations, such as cities, countries, and natural landmarks.',
+             'PERSON': 'Named individuals, including both real and fictional people.',
+             'PRODUCT': 'Named commercial products, including vehicles, software, and other goods.'
+         }
+
+         # Setup entity mappings based on the defined entity types
+         self.setup_entity_mappings()
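The four adapters shown above follow one pattern: each registers its benchmark name via BenchmarkMeta ('mit-movie-trivia', 'mit-restaurant', 'ontonotes5', 'wnut2017'), then only overrides the tag-to-name map and the per-tag descriptions before calling setup_entity_mappings(). As a minimal usage sketch, assuming evalscope's usual TaskConfig/run_task entry point (the model id and the limit value below are placeholders, not part of this diff), one of the newly registered benchmarks could be run like this:

from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['wnut2017'],             # any benchmark name registered above
    limit=10,                          # placeholder: small smoke-test subset
)
run_task(task_cfg=task_cfg)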
evalscope/benchmarks/ocr_bench_v2/utils.py
@@ -1,5 +1,6 @@
  # flake8: noqa
  import ast
+ import os
  import re

  from .IoUscore_metric import calculate_iou, extract_coordinates, vqa_with_position_evaluation
evalscope/benchmarks/omnidoc_bench/__init__.py
File without changes
evalscope/benchmarks/omnidoc_bench/end2end_eval.py
@@ -0,0 +1,349 @@
+ # flake8: noqa
+ import numpy as np
+ import os
+ import sys
+ from collections import defaultdict
+ from typing import Dict, List
+
+ from evalscope.utils import get_logger
+
+ logger = get_logger()
+
+
+ class End2EndEvaluator():
+
+     def __init__(
+         self,
+         prediction: List,
+         reference: List,
+         metrics: Dict,
+         match_method: str = 'quick_match',
+         filter_types: dict = None
+     ):
+
+         self.match_method = match_method
+         self.references = reference
+         self.predictions = prediction
+         self.dafault_metircs_dict = metrics
+
+         filtered_gt_samples = []
+         if filter_types:
+             for gt_sample in self.references:
+                 select_flag = True
+                 for k, v in filter_types.items():
+                     if gt_sample['page_info']['page_attribute'][k] != v:
+                         select_flag = False
+                 if select_flag:
+                     filtered_gt_samples.append(gt_sample)
+         else:
+             filtered_gt_samples = self.references  #[{},{},{}]
+         self.references = filtered_gt_samples
+
+     def score(self) -> dict:
+         samples = self.get_matched_elements(self.references, self.predictions)
+         metrics = self.process_generated_metric_results(samples)
+         return metrics
+
+     def get_page_elements(self, selected_annos):
+         saved_element_dict = defaultdict(list)
+         related_truncated = []
+         truncated_all = {}
+         for relation in selected_annos['extra']['relation']:  # Handle truncated text issues
+             if relation['relation_type'] == 'truncated':
+                 truncated_all[relation['source_anno_id']] = ''
+                 truncated_all[relation['target_anno_id']] = ''
+                 exist_flag = False
+                 for merge_list in related_truncated:
+                     if relation['source_anno_id'] in merge_list or relation[
+                             'target_anno_id'] in merge_list:  # Consider cases where three text blocks may need to be merged
+                         merge_list.append(relation['source_anno_id'])
+                         merge_list.append(relation['target_anno_id'])
+                         exist_flag = True
+                 if not exist_flag:
+                     related_truncated.append([relation['source_anno_id'], relation['target_anno_id']])
+
+         for item in selected_annos['layout_dets']:
+             if item['anno_id'] not in truncated_all.keys():
+                 saved_element_dict[item['category_type']].append(item)
+             else:
+                 truncated_all[item['anno_id']] = item
+
+         for merge_list in related_truncated:
+             text_block_list = [truncated_all[key] for key in merge_list]
+             sorted_block = sorted(text_block_list, key=lambda x: x['order'])
+             text = ''
+             for block in sorted_block:
+                 text += block['text']
+             merged_block = {
+                 'category_type': sorted_block[0]['category_type'],  # Directly use information from the first block
+                 'order': sorted_block[0]['order'],
+                 'anno_id': sorted_block[0]['anno_id'],
+                 'text': text,
+                 'merge_list': sorted_block
+             }
+             saved_element_dict[sorted_block[0]['category_type']].append(merged_block)
+
+         return saved_element_dict
+
+     def get_page_elements_list(self, gt_page_elements, category_list):
+         element_list = []
+         for category_type in category_list:
+             if gt_page_elements.get(category_type):
+                 element_list.extend(gt_page_elements[category_type])
+         return element_list
+
+     def get_sorted_text_list(self, selected_annos):
+         # txt_type: text, latex, html
+         text_list = []
+         for item in selected_annos:
+             if item.get('order'):
+                 order = item['order']
+             else:
+                 order = 0
+             # 【txt_type,selecte_annos]
+             text_list.append((order, item))
+         sorted_text_list = sorted(text_list, key=lambda x: x[0])
+         return [_[1] for _ in sorted_text_list]
+
+     def filtered_out_ignore(self, items, ignore_category_list):
+         filted_items = []
+         for item in items:
+             if item['gt_category_type'] not in ignore_category_list:
+                 filted_items.append(item)
+         return filted_items
+
+     def get_order_paired(self, order_match_s, img_name):
+         matched = [(item['gt_position'], item['pred_position'])
+                    for item in order_match_s
+                    if (item['gt_position'] != [''] and item['pred_position'] != '')]
+         gt_idx_all = [item['gt_position'] for item in order_match_s if (item['gt_position'] != [''])]
+         read_order_pred = [i[0] for i in sorted(matched, key=lambda x: x[1])]
+         read_order_gt = sum(gt_idx_all, [])  # Convert to one-dimensional list
+         read_order_gt = [x for x in read_order_gt if x]
+         gt = sorted(read_order_gt)
+         pred = sum(read_order_pred, [])
+         pred = [x for x in pred if x]
+         if len(pred) > 0 or len(gt) > 0:
+             import Levenshtein
+             edit = Levenshtein.distance(gt, pred) / max(len(pred), len(gt))
+             return {'gt': gt, 'pred': pred, 'img_id': img_name, 'edit': edit}
+         else:
+             return {}  # If both GT and pred are empty for the page, return empty
+
+     def formula_format(self, formula_matches, img_name):
+         # formated_list = []
+         for i, item in enumerate(formula_matches):
+             item['img_id'] = img_name + '_' + str(i)
+         return formula_matches
+
+     def get_matched_elements(self, references: list, predictions: list) -> dict:
+         from .metrics import recogition_end2end_base_dataset, recogition_end2end_table_dataset
+
+         plain_text_match = []
+         display_formula_match = []
+         html_table_match = []
+         latex_table_match = []
+         order_match = []
+
+         for i, sample in enumerate(references):
+             img_name = os.path.basename(sample['page_info']['image_path'])
+             pred_content = predictions[i]
+             result = self.process_get_matched_elements(sample, pred_content, img_name)
+             [
+                 plain_text_match_clean, formated_display_formula, latex_table_match_s, html_table_match_s,
+                 order_match_single
+             ] = result
+
+             if order_match_single:
+                 order_match.append(order_match_single)
+             if plain_text_match_clean:
+                 plain_text_match.extend(plain_text_match_clean)
+             if formated_display_formula:
+                 display_formula_match.extend(formated_display_formula)
+             if latex_table_match_s:
+                 latex_table_match.extend(latex_table_match_s)
+             if html_table_match_s:
+                 html_table_match.extend(html_table_match_s)
+
+         if len(latex_table_match) > len(html_table_match):
+             table_match = latex_table_match
+             table_format = 'latex'
+         else:
+             table_match = html_table_match
+             table_format = 'html'
+
+         matched_samples_all = {
+             'text_block': recogition_end2end_base_dataset(plain_text_match),
+             'display_formula': recogition_end2end_base_dataset(display_formula_match),
+             'table': recogition_end2end_table_dataset(table_match, table_format),
+             'reading_order': recogition_end2end_base_dataset(order_match)
+         }
+
+         return matched_samples_all
+
+     def process_get_matched_elements(self, sample, pred_content, img_name):
+         from func_timeout import FunctionTimedOut, func_timeout
+
+         from .utils import match_gt2pred_no_split, match_gt2pred_quick, match_gt2pred_simple, md_tex_filter
+
+         if self.match_method == 'simple_match':  # add match choice
+             match_gt2pred = match_gt2pred_simple
+         elif self.match_method == 'quick_match':
+             match_gt2pred = match_gt2pred_quick
+         elif self.match_method == 'no_split':
+             match_gt2pred = match_gt2pred_no_split
+         else:
+             match_gt2pred = match_gt2pred_quick
+
+         pred_dataset = md_tex_filter(pred_content)
+         gt_page_elements = self.get_page_elements(sample)
+
+         text_all = self.get_page_elements_list(
+             gt_page_elements, [
+                 'text_block', 'title', 'code_txt', 'code_txt_caption', 'reference', 'equation_caption',
+                 'figure_caption', 'figure_footnote', 'table_caption', 'table_footnote', 'code_algorithm',
+                 'code_algorithm_caption', 'header', 'footer', 'page_footnote', 'page_number'
+             ]
+         )
+
+         display_formula_match_s = []
+         plain_text_match_clean = []
+         latex_table_match_s = []
+         html_table_match_s = []
+         order_match_single = []
+         if text_all:
+             gt_text_list = self.get_sorted_text_list(text_all)
+             try:
+                 plain_text_match_s = func_timeout(
+                     30, match_gt2pred, args=(gt_text_list, pred_dataset['text_all'], 'text', img_name)
+                 )
+             except FunctionTimedOut as e:
+                 logger.warning(f'Time out for plain text match of {img_name}, match_gt2pred_simple will be used.')
+                 plain_text_match_s = match_gt2pred_simple(gt_text_list, pred_dataset['text_all'], 'text', img_name)
+                 logger.error(str(e))
+                 raise e
+
+             if not plain_text_match_s:
+                 logger.warning(f'No text match of {img_name}. The plain text match will be empty.')
+             else:
+                 plain_text_match_clean = self.filtered_out_ignore(
+                     plain_text_match_s, [
+                         'figure_caption', 'figure_footnote', 'table_caption', 'table_footnote', 'code_algorithm',
+                         'code_algorithm_caption', 'header', 'footer', 'page_footnote', 'page_number', 'equation_caption'
+                     ]
+                 )
+
+         if gt_page_elements.get('equation_isolated'):
+             gt_display_list = self.get_sorted_text_list(gt_page_elements['equation_isolated'])
+             display_formula_match_s = match_gt2pred(
+                 gt_display_list, pred_dataset['equation_isolated'], 'formula', img_name
+             )
+             display_formula_match_s = [x for x in display_formula_match_s if x['gt_idx'] != ['']]
+             if not display_formula_match_s:
+                 logger.warning(f'No display_formula_match of {img_name}. The display_formula_match will be empty.')
+
+         if gt_page_elements.get('table'):
+             gt_table_list = self.get_sorted_text_list(gt_page_elements['table'])
+             if pred_dataset['latex_table']:
+                 latex_table_match_s = match_gt2pred_simple(
+                     gt_table_list, pred_dataset['latex_table'], 'latex_table', img_name
+                 )
+                 latex_table_match_s = [x for x in latex_table_match_s if x['gt_idx'] != ['']]
+             if pred_dataset['html_table']:
+                 html_table_match_s = match_gt2pred_simple(
+                     gt_table_list, pred_dataset['html_table'], 'html_table', img_name
+                 )
+                 html_table_match_s = [x for x in html_table_match_s if x['gt_idx'] != ['']]
+             else:
+                 html_table_match_s = match_gt2pred_simple(gt_table_list, [], 'html_table', img_name)
+                 html_table_match_s = [x for x in html_table_match_s if x['gt_idx'] != ['']]
+
+         order_match_s = plain_text_match_clean
+         if order_match_s:
+             order_match_single = self.get_order_paired(order_match_s, img_name)
+
+         return [
+             plain_text_match_clean, display_formula_match_s, latex_table_match_s, html_table_match_s, order_match_single
+         ]
+
+     def process_generated_metric_results(self, samples, save_name: str = 'end2end_quick_match'):
+         from .metrics import METRIC_REGISTRY, get_full_labels_results, get_page_split, show_result
+
+         result_all = {}
+         page_info = {}
+         metircs_dict = self.dafault_metircs_dict
+         pages = self.references  #gt_samples list
+
+         for page in pages:
+             img_path = os.path.basename(page['page_info']['image_path'])
+             page_info[img_path] = page['page_info']['page_attribute']
+
+         for element in metircs_dict.keys():
+
+             result = {}
+             group_info = metircs_dict[element].get('group', [])
+             # samples = samples.get(element) ##
+             cur_samples = samples[element]
+
+             for metric in metircs_dict[element]['metric']:
+                 metric_val = METRIC_REGISTRY.get(metric)
+
+                 cur_samples, result_s = metric_val(cur_samples).evaluate(group_info, f'{save_name}_{element}')
+                 if result_s:
+                     result.update(result_s)
+
+             if result:
+                 logger.info(f'{element}')
+                 show_result(result)
+                 result_all[element] = {}
+
+             group_result = get_full_labels_results(cur_samples)
+             page_result = get_page_split(cur_samples, page_info)
+
+             result_all[element] = {'all': result, 'group': group_result, 'page': page_result}
+
+         save_dict = {}
+         en_overall = []
+         ch_overall = []
+         for category_type, metric in [('text_block', 'Edit_dist'), ('display_formula', 'Edit_dist'),
+                                       ('display_formula', 'CDM'), ('table', 'TEDS'), ('table', 'Edit_dist'),
+                                       ('reading_order', 'Edit_dist')]:
+             if metric == 'TEDS':
+                 if category_type in result_all and 'page' in result_all[category_type] and metric in result_all[
+                         category_type]['page']:
+                     save_dict[category_type + '_' + metric
+                               + '_EN'] = result_all[category_type]['page'][metric]['language: english']
+                     save_dict[category_type + '_' + metric
+                               + '_CH'] = result_all[category_type]['page'][metric]['language: simplified_chinese']
+                 else:
+                     save_dict[category_type + '_' + metric + '_EN'] = np.nan
+                     save_dict[category_type + '_' + metric + '_CH'] = np.nan
+             else:
+                 if category_type in result_all and 'page' in result_all[category_type] and metric in result_all[
+                         category_type]['page']:
+                     save_dict[category_type + '_' + metric
+                               + '_EN'] = result_all[category_type]['page'][metric].get('language: english', np.nan)
+                     save_dict[category_type + '_' + metric + '_CH'] = result_all[category_type]['page'][metric].get(
+                         'language: simplified_chinese', np.nan
+                     )
+                 else:
+                     save_dict[category_type + '_' + metric + '_EN'] = np.nan
+                     save_dict[category_type + '_' + metric + '_CH'] = np.nan
+
+             if metric == 'Edit_dist':
+                 if category_type in result_all and 'page' in result_all[category_type] and metric in result_all[
+                         category_type]['page']:
+                     en_overall.append(result_all[category_type]['page'][metric].get('language: english', np.nan))
+                     ch_overall.append(
+                         result_all[category_type]['page'][metric].get('language: simplified_chinese', np.nan)
+                     )
+                 else:
+                     en_overall.append(np.nan)
+                     ch_overall.append(np.nan)
+
+         en_overall_filtered = [x for x in en_overall if not np.isnan(x)]
+         ch_overall_filtered = [x for x in ch_overall if not np.isnan(x)]
+         save_dict['overall_EN'] = sum(en_overall_filtered) / len(en_overall_filtered) if en_overall_filtered else np.nan
+         save_dict['overall_CH'] = sum(ch_overall_filtered) / len(ch_overall_filtered) if ch_overall_filtered else np.nan
+
+         return save_dict
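As a closing note on the reading-order metric: get_order_paired above reports a Levenshtein edit distance normalized by the longer of the two sequences, so 0.0 means the predicted reading order matches the ground truth exactly and 1.0 means nothing lines up. A standalone sketch of that normalization, using made-up index lists and assuming a Levenshtein package that accepts arbitrary sequences (the evaluator above calls it on lists the same way):

import Levenshtein

gt = [1, 2, 3, 4, 5]    # made-up ground-truth reading order
pred = [1, 3, 2, 4, 5]  # made-up prediction with two blocks out of place

# Same normalization as get_order_paired: distance / max(len(pred), len(gt))
edit = Levenshtein.distance(gt, pred) / max(len(pred), len(gt))
print(edit)  # 0.4 -> two substitutions across five positions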