evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope might be problematic.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +1 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +1 -1
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +2 -1
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/logger.py +49 -17
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/METADATA +235 -363
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/RECORD +100 -55
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -0

evalscope/benchmarks/ner/mit_movie_trivia_adapter.py
@@ -0,0 +1,74 @@
from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE

DESCRIPTION = (
    'The MIT-Movie-Trivia dataset, originally created for slot filling, is modified by '
    'ignoring some slot types (e.g. genre, rating) and merging others (e.g. director '
    'and actor in person, and song and movie title in title) in order to keep '
    'consistent named entity types across all datasets.'
)


@register_benchmark(
    BenchmarkMeta(
        name='mit-movie-trivia',
        pretty_name='MIT-Movie-Trivia',
        dataset_id='extraordinarylab/mit-movie-trivia',
        tags=[Tags.KNOWLEDGE, Tags.NER],
        description=DESCRIPTION.strip(),
        few_shot_num=5,
        train_split='train',
        eval_split='test',
        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
        prompt_template=PROMPT_TEMPLATE,
        few_shot_prompt_template=FEWSHOT_TEMPLATE,
    )
)
class MITMovieTriviaAdapter(NERAdapter):
    """
    Adapter for the MIT-Movie-Trivia Named Entity Recognition dataset.

    This adapter inherits the NER functionality from NERAdapter and
    configures it specifically for the MIT-Movie-Trivia dataset's entity types.
    """

    def __init__(self, **kwargs):
        # Initialize the parent class first
        super().__init__(**kwargs)

        # Define MIT-Movie-Trivia-specific entity mappings
        self.entity_type_map = {
            'ACTOR': 'actor',
            'AWARD': 'award',
            'CHARACTER_NAME': 'character_name',
            'DIRECTOR': 'director',
            'GENRE': 'genre',
            'OPINION': 'opinion',
            'ORIGIN': 'origin',
            'PLOT': 'plot',
            'QUOTE': 'quote',
            'RELATIONSHIP': 'relationship',
            'SOUNDTRACK': 'soundtrack',
            'YEAR': 'year'
        }

        # Add descriptions for each entity type
        self.entity_descriptions = {
            'ACTOR': 'The name of an actor or actress starring in the movie.',
            'AWARD': 'An award the movie won or was nominated for.',
            'CHARACTER_NAME': 'The name of a character in the movie.',
            'DIRECTOR': 'The name of the person who directed the movie.',
            'GENRE': 'The category or style of the movie.',
            'OPINION': 'A subjective review or personal opinion about the movie.',
            'ORIGIN': 'The source material or basis for the movie.',
            'PLOT': 'A description or summary of the movie\'s storyline.',
            'QUOTE': 'A memorable line or phrase spoken in the movie.',
            'RELATIONSHIP': 'The connection or relationship between characters.',
            'SOUNDTRACK': 'The music or a specific song from the movie.',
            'YEAR': 'The release year of the movie.'
        }

        # Setup entity mappings based on the defined entity types
        self.setup_entity_mappings()
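
The new NER adapters in this release all follow the registration pattern above: @register_benchmark wraps a BenchmarkMeta whose name field makes the dataset addressable in an ordinary evaluation run. A minimal usage sketch, assuming evalscope's documented TaskConfig/run_task entry point (the model id and sample limit are illustrative placeholders, not part of this diff):

from evalscope import TaskConfig, run_task

# Evaluate the benchmark registered above by its 'name' field.
# Model id and limit are placeholders for illustration only.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',
    datasets=['mit-movie-trivia'],
    limit=10,
)
run_task(task_cfg=task_cfg)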

evalscope/benchmarks/ner/mit_restaurant_adapter.py
@@ -0,0 +1,66 @@
from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE

DESCRIPTION = (
    'The MIT-Restaurant dataset is a collection of restaurant review text specifically '
    'curated for training and testing Natural Language Processing (NLP) models, '
    'particularly for Named Entity Recognition (NER). It contains sentences from real '
    'reviews, along with corresponding labels in the BIO format.'
)


@register_benchmark(
    BenchmarkMeta(
        name='mit-restaurant',
        pretty_name='MIT-Restaurant',
        dataset_id='extraordinarylab/mit-restaurant',
        tags=[Tags.KNOWLEDGE, Tags.NER],
        description=DESCRIPTION.strip(),
        few_shot_num=5,
        train_split='train',
        eval_split='test',
        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
        prompt_template=PROMPT_TEMPLATE,
        few_shot_prompt_template=FEWSHOT_TEMPLATE,
    )
)
class MITRestaurantAdapter(NERAdapter):
    """
    Adapter for the MIT-Restaurant Named Entity Recognition dataset.

    This adapter inherits the NER functionality from NERAdapter and
    configures it specifically for the MIT-Restaurant dataset's entity types.
    """

    def __init__(self, **kwargs):
        # Initialize the parent class first
        super().__init__(**kwargs)

        # Define MIT-Restaurant-specific entity mappings
        self.entity_type_map = {
            'AMENITY': 'amenity',
            'CUISINE': 'cuisine',
            'DISH': 'dish',
            'HOURS': 'hours',
            'LOCATION': 'location',
            'PRICE': 'price',
            'RATING': 'rating',
            'RESTAURANT_NAME': 'restaurant_name'
        }

        # Add descriptions for each entity type
        self.entity_descriptions = {
            'AMENITY': 'A feature or service offered by the restaurant.',
            'CUISINE': 'The type of food a restaurant serves.',
            'DISH': 'A specific food or drink item.',
            'HOURS': 'The operating hours of a restaurant.',
            'LOCATION': 'The address or general location of a restaurant.',
            'PRICE': 'The price range of a restaurant.',
            'RATING': 'A rating or review of the restaurant.',
            'RESTAURANT_NAME': 'The name of a restaurant.',
        }

        # Setup entity mappings based on the defined entity types
        self.setup_entity_mappings()

evalscope/benchmarks/ner/ontonotes5_adapter.py
@@ -0,0 +1,87 @@
from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE

DESCRIPTION = (
    'OntoNotes Release 5.0 is a large, multilingual corpus containing text in English, '
    'Chinese, and Arabic across various genres like news, weblogs, and broadcast '
    'conversations. It is richly annotated with multiple layers of linguistic information, '
    'including syntax, predicate-argument structure, word sense, named entities, and '
    'coreference to support research and development in natural language processing.'
)


@register_benchmark(
    BenchmarkMeta(
        name='ontonotes5',
        pretty_name='OntoNotes5',
        dataset_id='extraordinarylab/ontonotes5',
        tags=[Tags.KNOWLEDGE, Tags.NER],
        description=DESCRIPTION.strip(),
        few_shot_num=5,
        train_split='train',
        eval_split='test',
        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
        prompt_template=PROMPT_TEMPLATE,
        few_shot_prompt_template=FEWSHOT_TEMPLATE,
    )
)
class OntoNotes5Adapter(NERAdapter):
    """
    Adapter for the OntoNotes5 Named Entity Recognition dataset.

    This adapter inherits the NER functionality from NERAdapter and
    configures it specifically for the OntoNotes5 dataset's entity types.
    """

    def __init__(self, **kwargs):
        # Initialize the parent class first
        super().__init__(**kwargs)

        # Define OntoNotes5-specific entity mappings
        self.entity_type_map = {
            'CARDINAL': 'cardinal',
            'DATE': 'date',
            'EVENT': 'event',
            'FAC': 'facility',
            'GPE': 'geopolitical_entity',
            'LANGUAGE': 'language',
            'LAW': 'law',
            'LOC': 'location',
            'MONEY': 'money',
            'NORP': 'nationalities_or_religious_or_political_groups',
            'ORDINAL': 'ordinal',
            'ORG': 'organization',
            'PERCENT': 'percent',
            'PERSON': 'person',
            'PRODUCT': 'product',
            'QUANTITY': 'quantity',
            'TIME': 'time',
            'WORK_OF_ART': 'work_of_art'
        }

        # Add descriptions for each entity type
        self.entity_descriptions = {
            'PERSON': 'People, including fictional',
            'NORP': 'Nationalities or religious or political groups',
            'FAC': 'Buildings, airports, highways, bridges, etc.',
            'ORG': 'Companies, agencies, institutions, etc.',
            'GPE': 'Countries, cities, states',
            'LOC': 'Non-GPE locations, mountain ranges, bodies of water',
            'PRODUCT': 'Vehicles, weapons, foods, etc. (Not services)',
            'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
            'WORK_OF_ART': 'Titles of books, songs, etc.',
            'LAW': 'Named documents made into laws',
            'LANGUAGE': 'Any named language',
            'DATE': 'Absolute or relative dates or periods',
            'TIME': 'Times smaller than a day',
            'PERCENT': 'Percentage (including "%")',
            'MONEY': 'Monetary values, including unit',
            'QUANTITY': 'Measurements, as of weight or distance',
            'ORDINAL': '"first", "second"',
            'CARDINAL': 'Numerals that do not fall under another type'
        }

        # Setup entity mappings based on the defined entity types
        self.setup_entity_mappings()

evalscope/benchmarks/ner/wnut2017_adapter.py
@@ -0,0 +1,61 @@
from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
from evalscope.api.registry import register_benchmark
from evalscope.constants import Tags
from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE

DESCRIPTION = (
    'The WNUT2017 dataset is a collection of user-generated text from various social '
    'media platforms, like Twitter and YouTube, specifically designed for a named-entity '
    'recognition task.'
)


@register_benchmark(
    BenchmarkMeta(
        name='wnut2017',
        pretty_name='WNUT2017',
        dataset_id='extraordinarylab/wnut2017',
        tags=[Tags.KNOWLEDGE, Tags.NER],
        description=DESCRIPTION.strip(),
        few_shot_num=5,
        train_split='train',
        eval_split='test',
        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
        prompt_template=PROMPT_TEMPLATE,
        few_shot_prompt_template=FEWSHOT_TEMPLATE,
    )
)
class WNUT2017Adapter(NERAdapter):
    """
    Adapter for the WNUT2017 Named Entity Recognition dataset.

    This adapter inherits the NER functionality from NERAdapter and
    configures it specifically for the WNUT2017 dataset's entity types.
    """

    def __init__(self, **kwargs):
        # Initialize the parent class first
        super().__init__(**kwargs)

        # Define WNUT2017-specific entity mappings
        self.entity_type_map = {
            'CORPORATION': 'corporation',
            'CREATIVE-WORK': 'creative_work',
            'GROUP': 'group',
            'LOCATION': 'location',
            'PERSON': 'person',
            'PRODUCT': 'product'
        }

        # Add descriptions for each entity type
        self.entity_descriptions = {
            'CORPORATION': 'Named companies, businesses, agencies, and other institutions.',
            'CREATIVE-WORK': 'Named books, songs, movies, paintings, and other works of art.',
            'GROUP': 'Named groups of people, such as sports teams, bands, or political groups.',
            'LOCATION': 'Named geographical locations, such as cities, countries, and natural landmarks.',
            'PERSON': 'Named individuals, including both real and fictional people.',
            'PRODUCT': 'Named commercial products, including vehicles, software, and other goods.'
        }

        # Setup entity mappings based on the defined entity types
        self.setup_entity_mappings()
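
All four adapters report the same metric_list of precision, recall, f1_score, and accuracy; the scoring itself is added in evalscope/utils/ner.py (+377 lines, not excerpted here). As a point of reference only, entity-level NER metrics are conventionally computed over exact (type, span) matches; the following self-contained sketch illustrates that convention and is not the evalscope implementation:

from typing import Set, Tuple

Entity = Tuple[str, str]  # (entity_type, surface_text)

def ner_prf(gold: Set[Entity], pred: Set[Entity]) -> dict:
    # Count exact (type, text) matches as true positives.
    tp = len(gold & pred)
    precision = tp / len(pred) if pred else 0.0
    recall = tp / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return {'precision': precision, 'recall': recall, 'f1_score': f1}

print(ner_prf(
    gold={('person', 'Nolan'), ('creative_work', 'Inception')},
    pred={('person', 'Nolan'), ('creative_work', 'Memento')},
))  # {'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5}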
File without changes

evalscope/benchmarks/omnidoc_bench/end2end_eval.py
@@ -0,0 +1,349 @@
# flake8: noqa
import numpy as np
import os
import sys
from collections import defaultdict
from typing import Dict, List

from evalscope.utils import get_logger

logger = get_logger()


class End2EndEvaluator():

    def __init__(
        self,
        prediction: List,
        reference: List,
        metrics: Dict,
        match_method: str = 'quick_match',
        filter_types: dict = None
    ):

        self.match_method = match_method
        self.references = reference
        self.predictions = prediction
        self.dafault_metircs_dict = metrics

        filtered_gt_samples = []
        if filter_types:
            for gt_sample in self.references:
                select_flag = True
                for k, v in filter_types.items():
                    if gt_sample['page_info']['page_attribute'][k] != v:
                        select_flag = False
                if select_flag:
                    filtered_gt_samples.append(gt_sample)
        else:
            filtered_gt_samples = self.references  # [{},{},{}]
        self.references = filtered_gt_samples

    def score(self) -> dict:
        samples = self.get_matched_elements(self.references, self.predictions)
        metrics = self.process_generated_metric_results(samples)
        return metrics

    def get_page_elements(self, selected_annos):
        saved_element_dict = defaultdict(list)
        related_truncated = []
        truncated_all = {}
        for relation in selected_annos['extra']['relation']:  # Handle truncated text issues
            if relation['relation_type'] == 'truncated':
                truncated_all[relation['source_anno_id']] = ''
                truncated_all[relation['target_anno_id']] = ''
                exist_flag = False
                for merge_list in related_truncated:
                    if relation['source_anno_id'] in merge_list or relation[
                            'target_anno_id'] in merge_list:  # Consider cases where three text blocks may need to be merged
                        merge_list.append(relation['source_anno_id'])
                        merge_list.append(relation['target_anno_id'])
                        exist_flag = True
                if not exist_flag:
                    related_truncated.append([relation['source_anno_id'], relation['target_anno_id']])

        for item in selected_annos['layout_dets']:
            if item['anno_id'] not in truncated_all.keys():
                saved_element_dict[item['category_type']].append(item)
            else:
                truncated_all[item['anno_id']] = item

        for merge_list in related_truncated:
            text_block_list = [truncated_all[key] for key in merge_list]
            sorted_block = sorted(text_block_list, key=lambda x: x['order'])
            text = ''
            for block in sorted_block:
                text += block['text']
            merged_block = {
                'category_type': sorted_block[0]['category_type'],  # Directly use information from the first block
                'order': sorted_block[0]['order'],
                'anno_id': sorted_block[0]['anno_id'],
                'text': text,
                'merge_list': sorted_block
            }
            saved_element_dict[sorted_block[0]['category_type']].append(merged_block)

        return saved_element_dict

    def get_page_elements_list(self, gt_page_elements, category_list):
        element_list = []
        for category_type in category_list:
            if gt_page_elements.get(category_type):
                element_list.extend(gt_page_elements[category_type])
        return element_list

    def get_sorted_text_list(self, selected_annos):
        # txt_type: text, latex, html
        text_list = []
        for item in selected_annos:
            if item.get('order'):
                order = item['order']
            else:
                order = 0
            # 【txt_type,selecte_annos]
            text_list.append((order, item))
        sorted_text_list = sorted(text_list, key=lambda x: x[0])
        return [_[1] for _ in sorted_text_list]

    def filtered_out_ignore(self, items, ignore_category_list):
        filted_items = []
        for item in items:
            if item['gt_category_type'] not in ignore_category_list:
                filted_items.append(item)
        return filted_items

    def get_order_paired(self, order_match_s, img_name):
        matched = [(item['gt_position'], item['pred_position'])
                   for item in order_match_s
                   if (item['gt_position'] != [''] and item['pred_position'] != '')]
        gt_idx_all = [item['gt_position'] for item in order_match_s if (item['gt_position'] != [''])]
        read_order_pred = [i[0] for i in sorted(matched, key=lambda x: x[1])]
        read_order_gt = sum(gt_idx_all, [])  # Convert to one-dimensional list
        read_order_gt = [x for x in read_order_gt if x]
        gt = sorted(read_order_gt)
        pred = sum(read_order_pred, [])
        pred = [x for x in pred if x]
        if len(pred) > 0 or len(gt) > 0:
            import Levenshtein
            edit = Levenshtein.distance(gt, pred) / max(len(pred), len(gt))
            return {'gt': gt, 'pred': pred, 'img_id': img_name, 'edit': edit}
        else:
            return {}  # If both GT and pred are empty for the page, return empty

    def formula_format(self, formula_matches, img_name):
        # formated_list = []
        for i, item in enumerate(formula_matches):
            item['img_id'] = img_name + '_' + str(i)
        return formula_matches

    def get_matched_elements(self, references: list, predictions: list) -> dict:
        from .metrics import recogition_end2end_base_dataset, recogition_end2end_table_dataset

        plain_text_match = []
        display_formula_match = []
        html_table_match = []
        latex_table_match = []
        order_match = []

        for i, sample in enumerate(references):
            img_name = os.path.basename(sample['page_info']['image_path'])
            pred_content = predictions[i]
            result = self.process_get_matched_elements(sample, pred_content, img_name)
            [
                plain_text_match_clean, formated_display_formula, latex_table_match_s, html_table_match_s,
                order_match_single
            ] = result

            if order_match_single:
                order_match.append(order_match_single)
            if plain_text_match_clean:
                plain_text_match.extend(plain_text_match_clean)
            if formated_display_formula:
                display_formula_match.extend(formated_display_formula)
            if latex_table_match_s:
                latex_table_match.extend(latex_table_match_s)
            if html_table_match_s:
                html_table_match.extend(html_table_match_s)

        if len(latex_table_match) > len(html_table_match):
            table_match = latex_table_match
            table_format = 'latex'
        else:
            table_match = html_table_match
            table_format = 'html'

        matched_samples_all = {
            'text_block': recogition_end2end_base_dataset(plain_text_match),
            'display_formula': recogition_end2end_base_dataset(display_formula_match),
            'table': recogition_end2end_table_dataset(table_match, table_format),
            'reading_order': recogition_end2end_base_dataset(order_match)
        }

        return matched_samples_all

    def process_get_matched_elements(self, sample, pred_content, img_name):
        from func_timeout import FunctionTimedOut, func_timeout

        from .utils import match_gt2pred_no_split, match_gt2pred_quick, match_gt2pred_simple, md_tex_filter

        if self.match_method == 'simple_match':  # add match choice
            match_gt2pred = match_gt2pred_simple
        elif self.match_method == 'quick_match':
            match_gt2pred = match_gt2pred_quick
        elif self.match_method == 'no_split':
            match_gt2pred = match_gt2pred_no_split
        else:
            match_gt2pred = match_gt2pred_quick

        pred_dataset = md_tex_filter(pred_content)
        gt_page_elements = self.get_page_elements(sample)

        text_all = self.get_page_elements_list(
            gt_page_elements, [
                'text_block', 'title', 'code_txt', 'code_txt_caption', 'reference', 'equation_caption',
                'figure_caption', 'figure_footnote', 'table_caption', 'table_footnote', 'code_algorithm',
                'code_algorithm_caption', 'header', 'footer', 'page_footnote', 'page_number'
            ]
        )

        display_formula_match_s = []
        plain_text_match_clean = []
        latex_table_match_s = []
        html_table_match_s = []
        order_match_single = []
        if text_all:
            gt_text_list = self.get_sorted_text_list(text_all)
            try:
                plain_text_match_s = func_timeout(
                    30, match_gt2pred, args=(gt_text_list, pred_dataset['text_all'], 'text', img_name)
                )
            except FunctionTimedOut as e:
                logger.warning(f'Time out for plain text match of {img_name}, match_gt2pred_simple will be used.')
                plain_text_match_s = match_gt2pred_simple(gt_text_list, pred_dataset['text_all'], 'text', img_name)
                logger.error(str(e))
                raise e

            if not plain_text_match_s:
                logger.warning(f'No text match of {img_name}. The plain text match will be empty.')
            else:
                plain_text_match_clean = self.filtered_out_ignore(
                    plain_text_match_s, [
                        'figure_caption', 'figure_footnote', 'table_caption', 'table_footnote', 'code_algorithm',
                        'code_algorithm_caption', 'header', 'footer', 'page_footnote', 'page_number', 'equation_caption'
                    ]
                )

        if gt_page_elements.get('equation_isolated'):
            gt_display_list = self.get_sorted_text_list(gt_page_elements['equation_isolated'])
            display_formula_match_s = match_gt2pred(
                gt_display_list, pred_dataset['equation_isolated'], 'formula', img_name
            )
            display_formula_match_s = [x for x in display_formula_match_s if x['gt_idx'] != ['']]
            if not display_formula_match_s:
                logger.warning(f'No display_formula_match of {img_name}. The display_formula_match will be empty.')

        if gt_page_elements.get('table'):
            gt_table_list = self.get_sorted_text_list(gt_page_elements['table'])
            if pred_dataset['latex_table']:
                latex_table_match_s = match_gt2pred_simple(
                    gt_table_list, pred_dataset['latex_table'], 'latex_table', img_name
                )
                latex_table_match_s = [x for x in latex_table_match_s if x['gt_idx'] != ['']]
            if pred_dataset['html_table']:
                html_table_match_s = match_gt2pred_simple(
                    gt_table_list, pred_dataset['html_table'], 'html_table', img_name
                )
                html_table_match_s = [x for x in html_table_match_s if x['gt_idx'] != ['']]
            else:
                html_table_match_s = match_gt2pred_simple(gt_table_list, [], 'html_table', img_name)
                html_table_match_s = [x for x in html_table_match_s if x['gt_idx'] != ['']]

        order_match_s = plain_text_match_clean
        if order_match_s:
            order_match_single = self.get_order_paired(order_match_s, img_name)

        return [
            plain_text_match_clean, display_formula_match_s, latex_table_match_s, html_table_match_s, order_match_single
        ]

    def process_generated_metric_results(self, samples, save_name: str = 'end2end_quick_match'):
        from .metrics import METRIC_REGISTRY, get_full_labels_results, get_page_split, show_result

        result_all = {}
        page_info = {}
        metircs_dict = self.dafault_metircs_dict
        pages = self.references  # gt_samples list

        for page in pages:
            img_path = os.path.basename(page['page_info']['image_path'])
            page_info[img_path] = page['page_info']['page_attribute']

        for element in metircs_dict.keys():

            result = {}
            group_info = metircs_dict[element].get('group', [])
            # samples = samples.get(element) ##
            cur_samples = samples[element]

            for metric in metircs_dict[element]['metric']:
                metric_val = METRIC_REGISTRY.get(metric)

                cur_samples, result_s = metric_val(cur_samples).evaluate(group_info, f'{save_name}_{element}')
                if result_s:
                    result.update(result_s)

            if result:
                logger.info(f'{element}')
                show_result(result)
            result_all[element] = {}

            group_result = get_full_labels_results(cur_samples)
            page_result = get_page_split(cur_samples, page_info)

            result_all[element] = {'all': result, 'group': group_result, 'page': page_result}

        save_dict = {}
        en_overall = []
        ch_overall = []
        for category_type, metric in [('text_block', 'Edit_dist'), ('display_formula', 'Edit_dist'),
                                      ('display_formula', 'CDM'), ('table', 'TEDS'), ('table', 'Edit_dist'),
                                      ('reading_order', 'Edit_dist')]:
            if metric == 'TEDS':
                if category_type in result_all and 'page' in result_all[category_type] and metric in result_all[
                        category_type]['page']:
                    save_dict[category_type + '_' + metric
                              + '_EN'] = result_all[category_type]['page'][metric]['language: english']
                    save_dict[category_type + '_' + metric
                              + '_CH'] = result_all[category_type]['page'][metric]['language: simplified_chinese']
                else:
                    save_dict[category_type + '_' + metric + '_EN'] = np.nan
                    save_dict[category_type + '_' + metric + '_CH'] = np.nan
            else:
                if category_type in result_all and 'page' in result_all[category_type] and metric in result_all[
                        category_type]['page']:
                    save_dict[category_type + '_' + metric
                              + '_EN'] = result_all[category_type]['page'][metric].get('language: english', np.nan)
                    save_dict[category_type + '_' + metric + '_CH'] = result_all[category_type]['page'][metric].get(
                        'language: simplified_chinese', np.nan
                    )
                else:
                    save_dict[category_type + '_' + metric + '_EN'] = np.nan
                    save_dict[category_type + '_' + metric + '_CH'] = np.nan

            if metric == 'Edit_dist':
                if category_type in result_all and 'page' in result_all[category_type] and metric in result_all[
                        category_type]['page']:
                    en_overall.append(result_all[category_type]['page'][metric].get('language: english', np.nan))
                    ch_overall.append(
                        result_all[category_type]['page'][metric].get('language: simplified_chinese', np.nan)
                    )
                else:
                    en_overall.append(np.nan)
                    ch_overall.append(np.nan)

        en_overall_filtered = [x for x in en_overall if not np.isnan(x)]
        ch_overall_filtered = [x for x in ch_overall if not np.isnan(x)]
        save_dict['overall_EN'] = sum(en_overall_filtered) / len(en_overall_filtered) if en_overall_filtered else np.nan
        save_dict['overall_CH'] = sum(ch_overall_filtered) / len(ch_overall_filtered) if ch_overall_filtered else np.nan

        return save_dict