evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +52 -1
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +11 -2
- evalscope/report/combinator.py +52 -2
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/logger.py +49 -17
- evalscope/utils/multi_choices.py +16 -1
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
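
Most of this release is new benchmark adapters (the NER datasets, OCRBench, OCRBench-v2, OmniDocBench, POPE, and several VQA suites), shown in the hunks below. As a rough usage sketch, assuming the `TaskConfig`/`run_task` entry points from the 1.0.x releases are unchanged, one of the newly registered benchmarks could be invoked like this (the model id and limit are placeholders):

```python
# Sketch only: assumes evalscope's existing TaskConfig/run_task API; adjust to your setup.
from evalscope import TaskConfig, run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['mit-restaurant'],       # one of the NER benchmarks registered below
    limit=10,                          # small smoke-test run
)
run_task(task_cfg=task_cfg)
```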
evalscope/benchmarks/ner/mit_movie_trivia_adapter.py
@@ -0,0 +1,74 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'The MIT-Movie-Trivia dataset, originally created for slot filling, is modified by '
+    'ignoring some slot types (e.g. genre, rating) and merging others (e.g. director '
+    'and actor in person, and song and movie title in title) in order to keep '
+    'consistent named entity types across all datasets.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mit-movie-trivia',
+        pretty_name='MIT-Movie-Trivia',
+        dataset_id='extraordinarylab/mit-movie-trivia',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class MITMovieTriviaAdapter(NERAdapter):
+    """
+    Adapter for the MIT-Movie-Trivia Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the MIT-Movie-Trivia dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define MIT-Movie-Trivia-specific entity mappings
+        self.entity_type_map = {
+            'ACTOR': 'actor',
+            'AWARD': 'award',
+            'CHARACTER_NAME': 'character_name',
+            'DIRECTOR': 'director',
+            'GENRE': 'genre',
+            'OPINION': 'opinion',
+            'ORIGIN': 'origin',
+            'PLOT': 'plot',
+            'QUOTE': 'quote',
+            'RELATIONSHIP': 'relationship',
+            'SOUNDTRACK': 'soundtrack',
+            'YEAR': 'year'
+        }
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'ACTOR': 'The name of an actor or actress starring in the movie.',
+            'AWARD': 'An award the movie won or was nominated for.',
+            'CHARACTER_NAME': 'The name of a character in the movie.',
+            'DIRECTOR': 'The name of the person who directed the movie.',
+            'GENRE': 'The category or style of the movie.',
+            'OPINION': 'A subjective review or personal opinion about the movie.',
+            'ORIGIN': 'The source material or basis for the movie.',
+            'PLOT': 'A description or summary of the movie\'s storyline.',
+            'QUOTE': 'A memorable line or phrase spoken in the movie.',
+            'RELATIONSHIP': 'The connection or relationship between characters.',
+            'SOUNDTRACK': 'The music or a specific song from the movie.',
+            'YEAR': 'The release year of the movie.'
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
evalscope/benchmarks/ner/mit_restaurant_adapter.py
@@ -0,0 +1,66 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'The MIT-Restaurant dataset is a collection of restaurant review text specifically '
+    'curated for training and testing Natural Language Processing (NLP) models, '
+    'particularly for Named Entity Recognition (NER). It contains sentences from real '
+    'reviews, along with corresponding labels in the BIO format.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='mit-restaurant',
+        pretty_name='MIT-Restaurant',
+        dataset_id='extraordinarylab/mit-restaurant',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class MITRestaurantAdapter(NERAdapter):
+    """
+    Adapter for the MIT-Restaurant Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the MIT-Restaurant dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define MIT-Restaurant-specific entity mappings
+        self.entity_type_map = {
+            'AMENITY': 'amenity',
+            'CUISINE': 'cuisine',
+            'DISH': 'dish',
+            'HOURS': 'hours',
+            'LOCATION': 'location',
+            'PRICE': 'price',
+            'RATING': 'rating',
+            'RESTAURANT_NAME': 'restaurant_name'
+        }
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'AMENITY': 'A feature or service offered by the restaurant.',
+            'CUISINE': 'The type of food a restaurant serves.',
+            'DISH': 'A specific food or drink item.',
+            'HOURS': 'The operating hours of a restaurant.',
+            'LOCATION': 'The address or general location of a restaurant.',
+            'PRICE': 'The price range of a restaurant.',
+            'RATING': 'A rating or review of the restaurant.',
+            'RESTAURANT_NAME': 'The name of a restaurant.',
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
evalscope/benchmarks/ner/ontonotes5_adapter.py
@@ -0,0 +1,87 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'OntoNotes Release 5.0 is a large, multilingual corpus containing text in English, '
+    'Chinese, and Arabic across various genres like news, weblogs, and broadcast '
+    'conversations. It is richly annotated with multiple layers of linguistic information, '
+    'including syntax, predicate-argument structure, word sense, named entities, and '
+    'coreference to support research and development in natural language processing.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ontonotes5',
+        pretty_name='OntoNotes5',
+        dataset_id='extraordinarylab/ontonotes5',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class OntoNotes5Adapter(NERAdapter):
+    """
+    Adapter for the OntoNotes5 Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the OntoNotes5 dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define OntoNotes5-specific entity mappings
+        self.entity_type_map = {
+            'CARDINAL': 'cardinal',
+            'DATE': 'date',
+            'EVENT': 'event',
+            'FAC': 'facility',
+            'GPE': 'geopolitical_entity',
+            'LANGUAGE': 'language',
+            'LAW': 'law',
+            'LOC': 'location',
+            'MONEY': 'money',
+            'NORP': 'nationalities_or_religious_or_political_groups',
+            'ORDINAL': 'ordinal',
+            'ORG': 'organization',
+            'PERCENT': 'percent',
+            'PERSON': 'person',
+            'PRODUCT': 'product',
+            'QUANTITY': 'quantity',
+            'TIME': 'time',
+            'WORK_OF_ART': 'work_of_art'
+        }
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'PERSON': 'People, including fictional',
+            'NORP': 'Nationalities or religious or political groups',
+            'FAC': 'Buildings, airports, highways, bridges, etc.',
+            'ORG': 'Companies, agencies, institutions, etc.',
+            'GPE': 'Countries, cities, states',
+            'LOC': 'Non-GPE locations, mountain ranges, bodies of water',
+            'PRODUCT': 'Vehicles, weapons, foods, etc. (Not services)',
+            'EVENT': 'Named hurricanes, battles, wars, sports events, etc.',
+            'WORK_OF_ART': 'Titles of books, songs, etc.',
+            'LAW': 'Named documents made into laws',
+            'LANGUAGE': 'Any named language',
+            'DATE': 'Absolute or relative dates or periods',
+            'TIME': 'Times smaller than a day',
+            'PERCENT': 'Percentage (including "%")',
+            'MONEY': 'Monetary values, including unit',
+            'QUANTITY': 'Measurements, as of weight or distance',
+            'ORDINAL': '"first", "second"',
+            'CARDINAL': 'Numerals that do not fall under another type'
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
evalscope/benchmarks/ner/wnut2017_adapter.py
@@ -0,0 +1,61 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'The WNUT2017 dataset is a collection of user-generated text from various social '
+    'media platforms, like Twitter and YouTube, specifically designed for a named-entity '
+    'recognition task.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='wnut2017',
+        pretty_name='WNUT2017',
+        dataset_id='extraordinarylab/wnut2017',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class WNUT2017Adapter(NERAdapter):
+    """
+    Adapter for the WNUT2017 Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the WNUT2017 dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define WNUT2017-specific entity mappings
+        self.entity_type_map = {
+            'CORPORATION': 'corporation',
+            'CREATIVE-WORK': 'creative_work',
+            'GROUP': 'group',
+            'LOCATION': 'location',
+            'PERSON': 'person',
+            'PRODUCT': 'product'
+        }
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'CORPORATION': 'Named companies, businesses, agencies, and other institutions.',
+            'CREATIVE-WORK': 'Named books, songs, movies, paintings, and other works of art.',
+            'GROUP': 'Named groups of people, such as sports teams, bands, or political groups.',
+            'LOCATION': 'Named geographical locations, such as cities, countries, and natural landmarks.',
+            'PERSON': 'Named individuals, including both real and fictional people.',
+            'PRODUCT': 'Named commercial products, including vehicles, software, and other goods.'
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
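
Each of the four adapters above reports precision, recall, f1_score, and accuracy over the entities the model extracts. The actual scoring lives in evalscope/utils/ner.py (added in this release); as a rough illustration of entity-level precision/recall/F1 under the usual set-comparison convention (not necessarily the package's exact logic):

```python
# Illustrative only: standard entity-level P/R/F1 over (type, text) pairs,
# not a verbatim copy of evalscope/utils/ner.py.
from typing import Dict, Set, Tuple

Entity = Tuple[str, str]  # (entity_type, surface_text)

def entity_prf(predicted: Set[Entity], gold: Set[Entity]) -> Dict[str, float]:
    true_positives = len(predicted & gold)
    precision = true_positives / len(predicted) if predicted else 0.0
    recall = true_positives / len(gold) if gold else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    return {'precision': precision, 'recall': recall, 'f1_score': f1}

print(entity_prf({('person', 'Tim Burton'), ('year', '1989')},
                 {('person', 'Tim Burton'), ('title', 'Batman')}))
# {'precision': 0.5, 'recall': 0.5, 'f1_score': 0.5}
```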
evalscope/benchmarks/ocr_bench/__init__.py — file without changes

evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py
@@ -0,0 +1,101 @@
+import json
+from typing import Any, Dict, List
+
+from evalscope.api.benchmark import BenchmarkMeta, VisionLanguageAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.evaluator.state import TaskState
+from evalscope.api.messages import ChatMessageUser, Content, ContentImage, ContentText
+from evalscope.api.metric.scorer import Score
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.io_utils import bytes_to_base64
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
+
+SUBSET_LIST = [
+    'Regular Text Recognition', 'Irregular Text Recognition', 'Artistic Text Recognition', 'Handwriting Recognition',
+    'Digit String Recognition', 'Non-Semantic Text Recognition', 'Scene Text-centric VQA', 'Doc-oriented VQA',
+    'Key Information Extraction', 'Handwritten Mathematical Expression Recognition'
+]
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='ocr_bench',
+        pretty_name='OCRBench',
+        tags=[Tags.MULTI_MODAL, Tags.KNOWLEDGE, Tags.QA],
+        description=
+        'OCRBench is a comprehensive evaluation benchmark designed to assess the OCR capabilities of Large Multimodal Models. It comprises five components: Text Recognition, SceneText-Centric VQA, Document-Oriented VQA, Key Information Extraction, and Handwritten Mathematical Expression Recognition. The benchmark includes 1000 question-answer pairs, and all the answers undergo manual verification and correction to ensure a more precise evaluation.',  # noqa: E501
+        dataset_id='evalscope/OCRBench',
+        subset_list=SUBSET_LIST,
+        metric_list=['acc'],
+        eval_split='test',
+        prompt_template='{question}',
+    )
+)
+class OCRBenchAdapter(VisionLanguageAdapter):
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.add_aggregation_name = False
+        self.reformat_subset = True
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+
+        input_text = self.prompt_template.format(question=record['question'])
+        content_list: List[Content] = [ContentText(text=input_text)]
+        image = record.get('image')
+        if image:
+            image_base64 = bytes_to_base64(image['bytes'], format='jpeg', add_header=True)
+            content_list.append(ContentImage(image=image_base64))
+        return Sample(
+            input=[ChatMessageUser(content=content_list)],
+            target=json.dumps(record.get('answer'), ensure_ascii=False),  # answers is a list
+            subset_key=record.get('question_type'),
+            metadata={
+                'dataset': record.get('dataset'),
+                'question_type': record.get('question_type'),
+            }
+        )
+
+    def match_score(
+        self, original_prediction: str, filtered_prediction: str, reference: str, task_state: TaskState
+    ) -> Score:
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        pred = filtered_prediction.lower().strip()
+        gt_ans = json.loads(reference)
+        dataset_name = task_state.metadata['dataset']
+
+        score_value = 0
+        if dataset_name == 'HME100k':
+            if isinstance(gt_ans, list):
+                for j in range(len(gt_ans)):
+                    answer = gt_ans[j].strip().replace('\n', ' ').replace(' ', '')
+                    predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                    if answer in predict:
+                        score_value = 1
+            else:
+                answer = gt_ans.strip().replace('\n', ' ').replace(' ', '')
+                predict = pred.strip().replace('\n', ' ').replace(' ', '')
+                if answer in predict:
+                    score_value = 1
+        else:
+            if isinstance(gt_ans, list):
+                for j in range(len(gt_ans)):
+                    answer = gt_ans[j].lower().strip().replace('\n', ' ')
+                    predict = pred.lower().strip().replace('\n', ' ')
+                    if answer in predict:
+                        score_value = 1
+            else:
+                answer = gt_ans.lower().strip().replace('\n', ' ')
+                predict = pred.lower().strip().replace('\n', ' ')
+                if answer in predict:
+                    score_value = 1
+        score.value = {'acc': score_value}
+        return score
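
The match_score method above awards 1 point when any normalized ground-truth answer appears as a substring of the normalized prediction, with all whitespace removed for the HME100k handwritten-math subset. A simplified, self-contained illustration of that containment rule (the adapter itself also distinguishes string vs. list references and skips lowercasing on the HME100k branch):

```python
# Illustration of the substring-containment rule used by OCRBenchAdapter.match_score.
from typing import List

def contains_answer(prediction: str, answers: List[str], strip_spaces: bool = False) -> int:
    pred = prediction.lower().strip().replace('\n', ' ')
    if strip_spaces:  # HME100k-style normalization: drop all whitespace
        pred = pred.replace(' ', '')
    for ans in answers:
        ans_norm = ans.lower().strip().replace('\n', ' ')
        if strip_spaces:
            ans_norm = ans_norm.replace(' ', '')
        if ans_norm in pred:
            return 1
    return 0

print(contains_answer('The sign reads: HELLO world', ['hello world']))  # 1
print(contains_answer('x^2 + 1', ['x ^ 2+1'], strip_spaces=True))       # 1
```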
evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py
@@ -0,0 +1,87 @@
+# flake8: noqa
+import ast
+import re
+
+from .vqa_metric import vqa_evaluation
+
+
+def calculate_iou(box1, box2):
+    try:
+        box1 = [int(coordinate) for coordinate in box1]
+        box2 = [int(coordinate) for coordinate in box2]
+    except:
+        return 0
+
+    x1_inter = max(box1[0], box2[0])
+    y1_inter = max(box1[1], box2[1])
+    x2_inter = min(box1[2], box2[2])
+    y2_inter = min(box1[3], box2[3])
+
+    inter_area = max(0, x2_inter - x1_inter) * max(0, y2_inter - y1_inter)
+
+    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+    union_area = box1_area + box2_area - inter_area
+
+    iou = inter_area / union_area if union_area != 0 else 0
+
+    return iou
+
+
+def vqa_with_position_evaluation(predict, img_metas):
+    score_content, score_bbox = 0.0, 0.0
+    if 'answer' in predict.keys():
+        score_content = vqa_evaluation(predict['answer'], img_metas['answers'])
+    if 'bbox' in predict.keys():
+        gt_bbox = img_metas['bbox']
+        try:
+            predict_bbox_list = ast.literal_eval(predict['bbox'])
+            score_bbox = calculate_iou(predict_bbox_list, gt_bbox)
+        except:
+            score_bbox = 0
+    return 0.5 * score_content + 0.5 * score_bbox
+
+
+def extract_coordinates(text):
+    # Regex pattern to match coordinates in either (x1, y1, x2, y2) or [x1, y1, x2, y2] format
+
+    pattern = r'[\(\[]\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*[\)\]]'
+
+    matches = list(re.finditer(pattern, text))
+    coords_list = []
+    coords_set = set()
+    for match in matches:
+        x1, y1, x2, y2 = map(int, match.groups())
+
+        if all(0 <= n <= 1000 for n in [x1, y1, x2, y2]):
+            coords = (x1, y1, x2, y2)
+
+            if coords in coords_set:
+                coords_list = [c for c in coords_list if c != coords]
+
+            coords_list.append(coords)
+            coords_set.add(coords)
+    if coords_list:
+        last_coords = coords_list[-1]
+        return list(last_coords)
+    else:
+        return None
+
+
+if __name__ == '__main__':
+    print('Example for Text Grounding task.')
+    box1 = [50, 50, 150, 150]
+    box2 = [60, 60, 140, 140]
+    iou_score = calculate_iou(box1, box2)
+    print(f'IoU score: {iou_score}')
+
+    print('Example for VQA with position task.')
+    pred = {'content': 'The content is Hello Buddies', 'bbox': box1}
+    gt = {'content': 'Hello Buddies', 'bbox': box2}
+
+    vqa_score = vqa_evaluation(pred['content'], gt['content'])
+    iou_score = calculate_iou(pred['bbox'], gt['bbox'])
+
+    print(f'VQA score: {vqa_score}')
+    print(f'IoU score: {iou_score}')