evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/benchmarks/aa_lcr/__init__.py +0 -0
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +1 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +1 -1
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +2 -1
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/logger.py +49 -17
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/METADATA +235 -363
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/RECORD +100 -55
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
- {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -0
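The bulk of this release is the new benchmark adapters, in particular the NER family under evalscope/benchmarks/ner/, whose diffs follow below. As a hedged sketch of how one of the newly registered benchmarks might be invoked, assuming the public TaskConfig/run_task entry points carried over from 1.1.0 (the model id and limit here are placeholders, not from this diff):

from evalscope import TaskConfig, run_task

# Sketch only: runs the 'conll2003' benchmark registered in this release.
task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-7B-Instruct',  # placeholder model id
    datasets=['conll2003'],            # name registered via @register_benchmark below
    limit=10,                          # small smoke-test subset
)
run_task(task_cfg=task_cfg)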
evalscope/benchmarks/ner/conll2003_adapter.py
@@ -0,0 +1,48 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='conll2003',
+        pretty_name='CoNLL2003',
+        dataset_id='evalscope/conll2003',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description='The CoNLL-2003 dataset is for the Named Entity Recognition (NER) task. It was introduced as part '
+        'of the CoNLL-2003 Shared Task conference and contains texts annotated with entities such as '
+        'people, organizations, places, and various names.',
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CoNLL2003Adapter(NERAdapter):
+    """
+    Adapter for the CoNLL2003 Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the CoNLL2003 dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define CoNLL2003-specific entity mappings
+        self.entity_type_map = {'PER': 'person', 'ORG': 'organization', 'LOC': 'location', 'MISC': 'miscellaneous'}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'PER': 'Names of people, including first and last names',
+            'ORG': 'Names of companies, institutions, organizations, etc.',
+            'LOC': 'Names of locations, cities, states, countries, etc.',
+            'MISC': 'Miscellaneous entities not in the above categories'
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
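The shared helper module evalscope/utils/ner.py (+377 lines) is not shown in this diff, but the adapters' use of <entity>-style tags and of create_target_text(tokens, ner_tags, entity_type_map) suggests targets are built by wrapping BIO-tagged spans in inline tags. A hedged sketch of that conversion (the shipped helper may differ):

from typing import Dict, List

def create_target_text(tokens: List[str], ner_tags: List[str],
                       entity_type_map: Dict[str, str]) -> str:
    """Sketch: wrap BIO-tagged spans in inline <type>...</type> tags."""
    out, open_type = [], None
    for token, tag in zip(tokens, ner_tags):
        if tag.startswith('B-') or (tag.startswith('I-') and open_type is None):
            if open_type:  # close the previous span before opening a new one
                out.append(f'</{open_type}>')
            open_type = entity_type_map.get(tag[2:], tag[2:]).lower()
            out.append(f'<{open_type}>')
        elif tag == 'O' and open_type:
            out.append(f'</{open_type}>')
            open_type = None
        out.append(token)
    if open_type:  # close a span that runs to the end of the sentence
        out.append(f'</{open_type}>')
    return ' '.join(out)

print(create_target_text(
    ['John', 'Smith', 'visited', 'Paris'],
    ['B-PER', 'I-PER', 'O', 'B-LOC'],
    {'PER': 'person', 'LOC': 'location'},
))
# <person> John Smith </person> visited <location> Paris </location>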
evalscope/benchmarks/ner/copious_adapter.py
@@ -0,0 +1,85 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'Copious corpus is a gold standard corpus that covers a wide range of biodiversity '
+    'entities, consisting of 668 documents downloaded from the Biodiversity Heritage '
+    'Library with over 26K sentences and more than 28K entities.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='copious',
+        pretty_name='Copious',
+        dataset_id='extraordinarylab/copious',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CopiousAdapter(NERAdapter):
+    """
+    Adapter for the Copious Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the Copious dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define Copious-specific entity mappings
+        self.entity_type_map = {
+            'TAXON': 'taxon',
+            'GEOGRAPHICAL_LOCATION': 'geographical_location',
+            'HABITAT': 'habitat',
+            'PERSON': 'person',
+            'TEMPORAL_EXPRESSION': 'temporal_expression'
+        }
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'TAXON': (
+                'Mentions of taxonomic ranks such as species, genus, and family. '
+                'This includes scientific names (e.g., "Salvelinus alpinus") and '
+                'vernacular names (e.g., "flying fox"), but excludes general terms '
+                'like "fish" or "birds" and microorganism names.'
+            ),
+            'GEOGRAPHICAL_LOCATION': (
+                'Identifiable points or areas on the planet, including continents, '
+                'countries, cities, landforms, and bodies of water (e.g., "East coast '
+                'of Mindoro", "Balayan Bay"). This also includes geographical '
+                'coordinates (e.g., "13o 36\' 11\\" N.").'
+            ),
+            'HABITAT': (
+                'Descriptions of environments where organisms live. This includes '
+                'natural environments (e.g., "Lowland forest", "subalpine calcareous '
+                'pastures") and places where parasites or epiphytes reside (e.g., '
+                '"parasitic on Achillea holosericea"). It excludes habitat attributes '
+                'like altitude or depth.'
+            ),
+            'PERSON': (
+                'Proper nouns referring to person names, including those in historical '
+                'accounts or citations related to a species observation (e.g., "In 1905, '
+                '[Tattersall] follows..."). It excludes titles, general references like '
+                '"the researcher", and names that are part of a taxon\'s authority.'
+            ),
+            'TEMPORAL_EXPRESSION': (
+                'Spans of text referring to points in time. This includes specific dates '
+                '(e.g., "10 June 2013"), years, decades, seasons, and geochronological ages '
+                '(e.g., "late Pleistocene"). It excludes time-of-day information and dates '
+                'within a taxon name\'s authority.'
+            )
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
evalscope/benchmarks/ner/cross_ner_adapter.py
@@ -0,0 +1,120 @@
+from typing import Any, Dict, List, Set, Tuple
+
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.dataset import Sample
+from evalscope.api.registry import register_benchmark
+from evalscope.benchmarks.ner.cross_ner_entities import ai, literature, music, politics, science
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE, create_target_text
+
+DESCRIPTION = (
+    'CrossNER is a fully-labelled collection of named entity recognition (NER) data '
+    'spanning over five diverse domains (AI, Literature, Music, Politics, Science).'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='cross-ner',
+        pretty_name='CrossNER',
+        dataset_id='extraordinarylab/cross-ner',
+        subset_list=['ai', 'literature', 'music', 'politics', 'science'],
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class CrossNERAdapter(NERAdapter):
+    """
+    Adapter for the CrossNER Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the CrossNER dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define CrossNER-specific entity mappings
+        self.entity_type_map = {}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {}
+
+    def setup_entity_mappings(self):
+        """
+        Setup entity mappings and descriptions for prompt formatting.
+        This should be called after entity_type_map and entity_descriptions are defined.
+        """
+        if self.current_subset_name == 'ai':
+            self.entity_type_map, self.entity_descriptions = ai.get_entity_mappings()
+        elif self.current_subset_name == 'literature':
+            self.entity_type_map, self.entity_descriptions = literature.get_entity_mappings()
+        elif self.current_subset_name == 'music':
+            self.entity_type_map, self.entity_descriptions = music.get_entity_mappings()
+        elif self.current_subset_name == 'politics':
+            self.entity_type_map, self.entity_descriptions = politics.get_entity_mappings()
+        elif self.current_subset_name == 'science':
+            self.entity_type_map, self.entity_descriptions = science.get_entity_mappings()
+
+        # Reverse mapping for converting back from prediction to evaluation
+        self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+        # Create list of tags for prompt formatting
+        self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+        # Create description of entities for prompt
+        self.entities_description = ', '.join([
+            f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+        ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a record with tokens and NER tags into a Sample.
+        Creates both the raw text input and annotated text target.
+        """
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
+
+        tokens: List[str] = record['tokens']
+        ner_tags: List[str] = record['ner_tags']
+
+        # Create the input text by joining tokens
+        input_text = ' '.join(tokens)
+
+        # Process tokens and tags to create annotated target text
+        target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+        # Store tokens and tags in metadata for evaluation
+        metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+        return Sample(input=input_text, target=target_text, metadata=metadata)
+
+    def format_prompt_template(self, sample):
+        """
+        Format the prompt with entity types, available tags, and text to annotate.
+        """
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
+        return self.prompt_template.format(
+            entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+        )
+
+    def format_fewshot_template(self, fewshot, sample):
+        """
+        Format the few-shot prompt with all required parameters.
+        """
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
+        return self.few_shot_prompt_template.format(
+            fewshot=fewshot,
+            entities=self.entities_description,
+            entity_list=', '.join(self.entity_list),
+            text=sample.input
+        )
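Unlike its siblings, CrossNERAdapter resolves its entity mappings lazily: setup_entity_mappings is re-run on every record_to_sample and prompt-formatting call so that the mappings track current_subset_name. The if/elif chain could equally be table-driven; a sketch of that variant, shown only as a possible simplification rather than what ships:

from evalscope.benchmarks.ner.cross_ner_entities import ai, literature, music, politics, science

# Hypothetical table-driven equivalent of the if/elif chain above.
SUBSET_MODULES = {
    'ai': ai,
    'literature': literature,
    'music': music,
    'politics': politics,
    'science': science,
}

def resolve_entity_mappings(subset_name: str):
    """Return (entity_type_map, entity_descriptions) for a CrossNER subset."""
    return SUBSET_MODULES[subset_name].get_entity_mappings()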
evalscope/benchmarks/ner/cross_ner_entities/__init__.py
File without changes
evalscope/benchmarks/ner/cross_ner_entities/ai.py
@@ -0,0 +1,54 @@
+def get_entity_mappings():
+    entity_type_map = {
+        'ALGORITHM': 'algorithm',
+        'CONFERENCE': 'conference',
+        'COUNTRY': 'country',
+        'FIELD': 'field',
+        'LOCATION': 'location',
+        'METRICS': 'metrics',
+        'MISC': 'misc',
+        'ORGANISATION': 'organisation',
+        'PERSON': 'person',
+        'PRODUCT': 'product',
+        'PROGRAMLANG': 'programming_language',
+        'RESEARCHER': 'researcher',
+        'TASK': 'task',
+        'UNIVERSITY': 'university'
+    }
+    entity_descriptions = {
+        'ALGORITHM':
+        ('A specific algorithm or model architecture in AI (e.g., "Transformer", '
+         '"gradient descent", "ResNet").'),
+        'CONFERENCE': ('An academic conference related to AI (e.g., "NeurIPS", "ICML", "CVPR").'),
+        'COUNTRY': ('A country mentioned in the context of AI research or development '
+                    '(e.g., "USA", "China").'),
+        'FIELD':
+        ('A sub-field or area of study within AI (e.g., "Natural Language Processing", '
+         '"Computer Vision").'),
+        'LOCATION':
+        ('A specific geographical location relevant to AI, other than countries '
+         '(e.g., "Silicon Valley").'),
+        'METRICS': ('A performance metric used to evaluate AI models (e.g., "F1-score", '
+                    '"BLEU", "accuracy").'),
+        'MISC': ('Miscellaneous AI-related terms that don\'t fit other categories '
+                 '(e.g., "Turing Award").'),
+        'ORGANISATION':
+        ('An organization, company, or lab involved in AI (e.g., "Google AI", '
+         '"OpenAI", "DeepMind").'),
+        'PERSON':
+        ('A person mentioned in the context of AI, who is not a researcher '
+         '(e.g., a CEO or public figure).'),
+        'PRODUCT': ('An AI-related product, framework, or software (e.g., "TensorFlow", '
+                    '"PyTorch", "AlphaGo").'),
+        'PROGRAMLANG': ('A programming language used in AI (e.g., "Python", "C++", "Julia").'),
+        'RESEARCHER': ('A person who conducts research in the field of AI (e.g., "Yann LeCun", '
+                       '"Geoffrey Hinton").'),
+        'TASK': (
+            'A specific problem or task that AI is used to solve (e.g., "Image Classification", '
+            '"Sentiment Analysis").'
+        ),
+        'UNIVERSITY':
+        ('A university or academic institution involved in AI research (e.g., '
+         '"Stanford University", "MIT").')
+    }
+    return entity_type_map, entity_descriptions
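Each module in cross_ner_entities/ follows the same contract: get_entity_mappings() returns the (entity_type_map, entity_descriptions) pair that CrossNERAdapter.setup_entity_mappings consumes. For example:

from evalscope.benchmarks.ner.cross_ner_entities import ai

entity_type_map, entity_descriptions = ai.get_entity_mappings()
assert entity_type_map['PROGRAMLANG'] == 'programming_language'

# The adapter derives its prompt tags from the mapped names:
print(', '.join(f'<{name.lower()}>' for name in entity_type_map.values()))
# <algorithm>, <conference>, <country>, ...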
evalscope/benchmarks/ner/cross_ner_entities/literature.py
@@ -0,0 +1,36 @@
+def get_entity_mappings():
+    entity_type_map = {
+        'AWARD': 'award',
+        'BOOK': 'book',
+        'COUNTRY': 'country',
+        'EVENT': 'event',
+        'LITERARYGENRE': 'literary_genre',
+        'LOCATION': 'location',
+        'MAGAZINE': 'magazine',
+        'MISC': 'misc',
+        'ORGANISATION': 'organisation',
+        'PERSON': 'person',
+        'POEM': 'poem',
+        'WRITER': 'writer'
+    }
+    entity_descriptions = {
+        'AWARD': ('A literary award or prize (e.g., "Nobel Prize in Literature", "Booker Prize").'),
+        'BOOK': ('The title of a book (e.g., "Pride and Prejudice", "One Hundred Years of Solitude").'),
+        'COUNTRY': ('A country relevant to the literary context (e.g., "England", "Russia").'),
+        'EVENT': ('A literary festival or significant event (e.g., "Hay Festival", "Frankfurt Book Fair").'),
+        'LITERARYGENRE':
+        ('A genre or category of literature (e.g., "Science Fiction", "Gothic novel", '
+         '"magical realism").'),
+        'LOCATION': ('A real or fictional place mentioned in a literary context (e.g., "London", '
+                     '"Middle-earth").'),
+        'MAGAZINE': ('A magazine or literary journal (e.g., "The New Yorker", "Paris Review").'),
+        'MISC': ('Miscellaneous literary terms (e.g., "protagonist", "sonnet", '
+                 '"Shakespeare\'s Globe").'),
+        'ORGANISATION': ('A publishing house or literary organization (e.g., "Penguin Random House").'),
+        'PERSON': ('A character or person mentioned who is not a writer (e.g., "Elizabeth Bennet", '
+                   '"King Lear").'),
+        'POEM': ('The title of a poem (e.g., "The Waste Land", "Ozymandias").'),
+        'WRITER': ('The name of a writer, author, or poet (e.g., "Jane Austen", '
+                   '"Gabriel Garcia Marquez").')
+    }
+    return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/cross_ner_entities/music.py
@@ -0,0 +1,39 @@
+def get_entity_mappings():
+    entity_type_map = {
+        'ALBUM': 'album',
+        'AWARD': 'award',
+        'BAND': 'band',
+        'COUNTRY': 'country',
+        'EVENT': 'event',
+        'LOCATION': 'location',
+        'MISC': 'misc',
+        'MUSICALARTIST': 'musical_artist',
+        'MUSICALINSTRUMENT': 'musical_instrument',
+        'MUSICGENRE': 'music_genre',
+        'ORGANISATION': 'organisation',
+        'PERSON': 'person',
+        'SONG': 'song'
+    }
+    entity_descriptions = {
+        'ALBUM': ('The title of a music album (e.g., "Abbey Road", "Thriller", "Lemonade").'),
+        'AWARD': ('A music award or prize (e.g., "Grammy Award", "MTV Music Award").'),
+        'BAND': ('The name of a musical group or band (e.g., "The Beatles", "Queen", "BTS").'),
+        'COUNTRY': ('A country relevant to the music context (e.g., "USA", "UK", "South Korea").'),
+        'EVENT': ('A music festival, concert tour, or event (e.g., "Glastonbury Festival", '
+                  '"Woodstock").'),
+        'LOCATION':
+        ('A venue, studio, or place relevant to music (e.g., "Madison Square Garden", '
+         '"Abbey Road Studios").'),
+        'MISC': ('Miscellaneous music-related terms (e.g., "synthesizer", "major key", '
+                 '"a cappella").'),
+        'MUSICALARTIST': ('A solo musician or singer (e.g., "Michael Jackson", "Taylor Swift", '
+                          '"Ed Sheeran").'),
+        'MUSICALINSTRUMENT': ('A musical instrument (e.g., "guitar", "piano", "violin").'),
+        'MUSICGENRE': ('A genre or style of music (e.g., "Rock", "Pop", "Jazz", "K-Pop").'),
+        'ORGANISATION': ('A record label or music organization (e.g., "Capitol Records", "Sony Music").'),
+        'PERSON':
+        ('A person related to music who is not a primary artist (e.g., a producer, '
+         'a songwriter, "John Lennon").'),
+        'SONG': ('The title of a song (e.g., "Bohemian Rhapsody", "Hey Jude", "Dynamite").')
+    }
+    return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/cross_ner_entities/politics.py
@@ -0,0 +1,37 @@
+def get_entity_mappings():
+    entity_type_map = {
+        'COUNTRY': 'country',
+        'ELECTION': 'election',
+        'EVENT': 'event',
+        'LOCATION': 'location',
+        'MISC': 'misc',
+        'ORGANISATION': 'organisation',
+        'PERSON': 'person',
+        'POLITICALPARTY': 'political_party',
+        'POLITICIAN': 'politician'
+    }
+    entity_descriptions = {
+        'COUNTRY': ('A country or sovereign state (e.g., "United States", "Germany").'),
+        'ELECTION': ('A specific election event (e.g., "2024 presidential election", '
+                     '"midterm elections").'),
+        'EVENT':
+        ('A significant political event, summit, or incident (e.g., "G7 Summit", '
+         '"Brexit", "Watergate scandal").'),
+        'LOCATION':
+        ('A politically significant building or location (e.g., "The White House", '
+         '"10 Downing Street").'),
+        'MISC': (
+            'Miscellaneous political terms, ideologies, or documents (e.g., "democracy", '
+            '"impeachment", "the Constitution").'
+        ),
+        'ORGANISATION':
+        ('A political or governmental organization (e.g., "United Nations", "NATO", '
+         '"European Union").'),
+        'PERSON':
+        ('A person mentioned in a political context who is not a politician '
+         '(e.g., a journalist, an activist).'),
+        'POLITICALPARTY': ('A named political party (e.g., "Democratic Party", "Conservative Party").'),
+        'POLITICIAN': ('A person who holds or seeks political office (e.g., "Joe Biden", '
+                       '"Angela Merkel").')
+    }
+    return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/cross_ner_entities/science.py
@@ -0,0 +1,58 @@
+def get_entity_mappings():
+    entity_type_map = {
+        'ACADEMICJOURNAL': 'academic_journal',
+        'ASTRONOMICALOBJECT': 'astronomical_object',
+        'AWARD': 'award',
+        'CHEMICALCOMPOUND': 'chemical_compound',
+        'CHEMICALELEMENT': 'chemical_element',
+        'COUNTRY': 'country',
+        'DISCIPLINE': 'discipline',
+        'ENZYME': 'enzyme',
+        'EVENT': 'event',
+        'LOCATION': 'location',
+        'MISC': 'misc',
+        'ORGANISATION': 'organisation',
+        'PERSON': 'person',
+        'PROTEIN': 'protein',
+        'SCIENTIST': 'scientist',
+        'THEORY': 'theory',
+        'UNIVERSITY': 'university'
+    }
+    entity_descriptions = {
+        'ACADEMICJOURNAL': ('A scientific journal or publication (e.g., "Nature", "Science", "The Lancet").'),
+        'ASTRONOMICALOBJECT': ('A natural object in space (e.g., "Mars", "Andromeda Galaxy", '
+                               '"Halley\'s Comet").'),
+        'AWARD': ('A scientific award or prize (e.g., "Nobel Prize in Physics", "Fields Medal").'),
+        'CHEMICALCOMPOUND':
+        ('A chemical substance consisting of two or more elements (e.g., "H2O", '
+         '"Carbon Dioxide").'),
+        'CHEMICALELEMENT': ('An element from the periodic table (e.g., "Hydrogen", "Oxygen", "Gold").'),
+        'COUNTRY': ('A country relevant to a scientific context (e.g., "Switzerland" for CERN).'),
+        'DISCIPLINE':
+        ('A branch of science or academic discipline (e.g., "Physics", '
+         '"Molecular Biology", "Astronomy").'),
+        'ENZYME': ('A specific type of protein that acts as a catalyst (e.g., "Lactase", "Catalase").'),
+        'EVENT': ('A significant scientific mission or event (e.g., "Apollo 11 mission", '
+                  '"Human Genome Project").'),
+        'LOCATION':
+        ('A research facility or location of scientific importance (e.g., "CERN", '
+         '"International Space Station").'),
+        'MISC':
+        ('Miscellaneous scientific terms or concepts (e.g., "double helix", '
+         '"black hole", "quantum mechanics").'),
+        'ORGANISATION': ('A scientific organization or agency (e.g., "NASA", "Max Planck Society", "WHO").'),
+        'PERSON':
+        ('A person mentioned in a scientific context who is not a scientist '
+         '(e.g., a patient, a benefactor).'),
+        'PROTEIN': ('A specific protein (that is not an enzyme) (e.g., "Hemoglobin", '
+                    '"Insulin", "Keratin").'),
+        'SCIENTIST':
+        ('A person who is a scientist, researcher, or inventor (e.g., "Albert Einstein", '
+         '"Marie Curie").'),
+        'THEORY': ('A named scientific theory or law (e.g., "Theory of Relativity", '
+                   '"Big Bang Theory").'),
+        'UNIVERSITY':
+        ('A university or academic institution involved in science (e.g., '
+         '"Cambridge University", "Caltech").')
+    }
+    return entity_type_map, entity_descriptions
evalscope/benchmarks/ner/genia_ner_adapter.py
@@ -0,0 +1,66 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'GeniaNER consists of 2,000 MEDLINE abstracts with more than '
+    '400,000 words and almost 100,000 annotations for biological terms.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='genia-ner',
+        pretty_name='GeniaNER',
+        dataset_id='extraordinarylab/genia-ner',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class GeniaNERAdapter(NERAdapter):
+    """
+    Adapter for the GeniaNER Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the GeniaNER dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define GeniaNER-specific entity mappings
+        self.entity_type_map = {
+            'CELL_LINE': 'cell_line',
+            'CELL_TYPE': 'cell_type',
+            'DNA': 'dna',
+            'PROTEIN': 'protein',
+            'RNA': 'rna'
+        }
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'CELL_LINE':
+            'A population of cells derived from a single cell and grown in a culture.',
+            'CELL_TYPE':
+            ('A category of cells that are part of a larger organism and share a specific '
+             'structure and function.'),
+            'DNA':
+            'Deoxyribonucleic acid. This includes specific genes, domains, and regions of a DNA molecule.',
+            'PROTEIN': (
+                'Molecules composed of amino acids that perform a vast array of functions within '
+                'organisms. This includes enzymes, receptors, and signaling molecules.'
+            ),
+            'RNA':
+            'Ribonucleic acid. This refers to RNA molecules, including messenger RNA (mRNA) and other types.'
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
evalscope/benchmarks/ner/harvey_ner_adapter.py
@@ -0,0 +1,58 @@
+from evalscope.api.benchmark import BenchmarkMeta, NERAdapter
+from evalscope.api.registry import register_benchmark
+from evalscope.constants import Tags
+from evalscope.utils.ner import FEWSHOT_TEMPLATE, PROMPT_TEMPLATE
+
+DESCRIPTION = (
+    'HarveyNER is a dataset with fine-grained locations annotated in tweets. This dataset '
+    'presents unique challenges and characterizes many complex and long location mentions '
+    'in informal descriptions.'
+)
+
+
+@register_benchmark(
+    BenchmarkMeta(
+        name='harvey-ner',
+        pretty_name='HarveyNER',
+        dataset_id='extraordinarylab/harvey-ner',
+        tags=[Tags.KNOWLEDGE, Tags.NER],
+        description=DESCRIPTION.strip(),
+        few_shot_num=5,
+        train_split='train',
+        eval_split='test',
+        metric_list=['precision', 'recall', 'f1_score', 'accuracy'],
+        prompt_template=PROMPT_TEMPLATE,
+        few_shot_prompt_template=FEWSHOT_TEMPLATE,
+    )
+)
+class HarveyNERAdapter(NERAdapter):
+    """
+    Adapter for the HarveyNER Named Entity Recognition dataset.
+
+    This adapter inherits the NER functionality from NERAdapter and
+    configures it specifically for the HarveyNER dataset's entity types.
+    """
+
+    def __init__(self, **kwargs):
+        # Initialize the parent class first
+        super().__init__(**kwargs)
+
+        # Define HarveyNER-specific entity mappings
+        self.entity_type_map = {'AREA': 'area', 'POINT': 'point', 'RIVER': 'river', 'ROAD': 'road'}
+
+        # Add descriptions for each entity type
+        self.entity_descriptions = {
+            'AREA':
+            'Geographical entities such as city subdivisions, neighborhoods, etc.',
+            'POINT': (
+                'An exact location that a geocoordinate can be assigned. E.g., a uniquely named '
+                'building, intersections of roads or rivers.'
+            ),
+            'RIVER':
+            'A river or a section of a river.',
+            'ROAD':
+            'A road or a section of a road.'
+        }
+
+        # Setup entity mappings based on the defined entity types
+        self.setup_entity_mappings()
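Every adapter in this family declares the same metric_list of precision, recall, f1_score, and accuracy. The shipped scorer lives in evalscope/api/benchmark/adapters/ner_adapter.py (+212 lines, not shown here) and may differ, but entity-level precision/recall/F1 are conventionally computed over (span, type) pairs, with accuracy typically measured per token. A hedged sketch:

from typing import Set, Tuple

Entity = Tuple[str, str]  # (surface span, entity type)

def ner_scores(predicted: Set[Entity], gold: Set[Entity]) -> dict:
    """Sketch of entity-level precision/recall/F1 over (span, type) pairs."""
    tp = len(predicted & gold)  # exact span-and-type matches
    precision = tp / len(predicted) if predicted else 0.0
    recall = tp / len(gold) if gold else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0.0
    return {'precision': precision, 'recall': recall, 'f1_score': f1}

# Example: one correct entity, one spurious, one missed.
pred = {('John Smith', 'person'), ('Acme', 'organization')}
gold = {('John Smith', 'person'), ('Paris', 'location')}
print(ner_scores(pred, gold))  # precision=0.5, recall=0.5, f1_score=0.5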