evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/api/benchmark/__init__.py +8 -1
- evalscope/api/benchmark/adapters/__init__.py +1 -0
- evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
- evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
- evalscope/api/benchmark/benchmark.py +14 -0
- evalscope/api/dataset/dataset.py +21 -0
- evalscope/api/dataset/loader.py +6 -2
- evalscope/api/mixin/sandbox_mixin.py +32 -54
- evalscope/api/model/generate_config.py +6 -0
- evalscope/app/ui/multi_model.py +6 -1
- evalscope/app/ui/single_model.py +8 -2
- evalscope/app/utils/data_utils.py +3 -2
- evalscope/app/utils/visualization.py +2 -2
- evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
- evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
- evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
- evalscope/benchmarks/blink/__init__.py +0 -0
- evalscope/benchmarks/blink/blink_adapter.py +61 -0
- evalscope/benchmarks/chartqa/__init__.py +0 -0
- evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
- evalscope/benchmarks/chartqa/utils.py +38 -0
- evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
- evalscope/benchmarks/docvqa/__init__.py +0 -0
- evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
- evalscope/benchmarks/general_arena/utils.py +2 -1
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
- evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
- evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
- evalscope/benchmarks/hle/hle_adapter.py +3 -2
- evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
- evalscope/benchmarks/infovqa/__init__.py +0 -0
- evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
- evalscope/benchmarks/math_verse/__init__.py +0 -0
- evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
- evalscope/benchmarks/math_vision/__init__.py +0 -0
- evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
- evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
- evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
- evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
- evalscope/benchmarks/ner/__init__.py +0 -0
- evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
- evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
- evalscope/benchmarks/ner/copious_adapter.py +85 -0
- evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
- evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
- evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
- evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
- evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
- evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
- evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
- evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
- evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
- evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
- evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
- evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
- evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
- evalscope/benchmarks/ocr_bench/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
- evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
- evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
- evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
- evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
- evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
- evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
- evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
- evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
- evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
- evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
- evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
- evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
- evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
- evalscope/benchmarks/poly_math/__init__.py +0 -0
- evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
- evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
- evalscope/benchmarks/pope/__init__.py +0 -0
- evalscope/benchmarks/pope/pope_adapter.py +111 -0
- evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
- evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
- evalscope/benchmarks/simple_vqa/__init__.py +0 -0
- evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
- evalscope/benchmarks/visu_logic/__init__.py +0 -0
- evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
- evalscope/benchmarks/zerobench/__init__.py +0 -0
- evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
- evalscope/constants.py +4 -0
- evalscope/evaluator/evaluator.py +72 -79
- evalscope/metrics/math_parser.py +14 -0
- evalscope/metrics/metric.py +52 -1
- evalscope/metrics/metrics.py +16 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
- evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
- evalscope/models/utils/openai.py +4 -0
- evalscope/perf/arguments.py +24 -4
- evalscope/perf/benchmark.py +74 -89
- evalscope/perf/http_client.py +31 -16
- evalscope/perf/main.py +15 -2
- evalscope/perf/plugin/api/base.py +9 -7
- evalscope/perf/plugin/api/custom_api.py +13 -58
- evalscope/perf/plugin/api/default_api.py +179 -79
- evalscope/perf/plugin/api/openai_api.py +4 -3
- evalscope/perf/plugin/datasets/base.py +21 -0
- evalscope/perf/plugin/datasets/custom.py +2 -3
- evalscope/perf/plugin/datasets/line_by_line.py +2 -3
- evalscope/perf/plugin/datasets/longalpaca.py +2 -3
- evalscope/perf/plugin/datasets/openqa.py +2 -4
- evalscope/perf/plugin/datasets/random_dataset.py +1 -3
- evalscope/perf/utils/benchmark_util.py +36 -22
- evalscope/perf/utils/db_util.py +14 -19
- evalscope/perf/utils/local_server.py +0 -44
- evalscope/perf/utils/log_utils.py +21 -6
- evalscope/report/__init__.py +11 -2
- evalscope/report/combinator.py +52 -2
- evalscope/run.py +4 -0
- evalscope/utils/function_utils.py +195 -12
- evalscope/utils/io_utils.py +74 -0
- evalscope/utils/json_schema.py +8 -6
- evalscope/utils/logger.py +49 -17
- evalscope/utils/multi_choices.py +16 -1
- evalscope/utils/ner.py +377 -0
- evalscope/version.py +2 -2
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
- tests/__init__.py +0 -1
- tests/benchmark/__init__.py +0 -1
- tests/benchmark/test_eval.py +0 -429
- tests/benchmark/test_image_edit.py +0 -65
- tests/benchmark/test_sandbox.py +0 -81
- tests/benchmark/test_t2i.py +0 -142
- tests/benchmark/test_vlm.py +0 -137
- tests/cli/__init__.py +0 -1
- tests/cli/test_all.py +0 -269
- tests/cli/test_collection.py +0 -99
- tests/cli/test_custom.py +0 -268
- tests/cli/test_reasoning.py +0 -81
- tests/common.py +0 -73
- tests/perf/__init__.py +0 -1
- tests/perf/test_perf.py +0 -206
- tests/rag/test_clip_benchmark.py +0 -87
- tests/rag/test_mteb.py +0 -213
- tests/rag/test_ragas.py +0 -128
- tests/swift/__init__.py +0 -1
- tests/swift/test_run_swift_eval.py +0 -146
- tests/swift/test_run_swift_vlm_eval.py +0 -128
- tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
- tests/test_run_all.py +0 -12
- tests/utils.py +0 -13
- tests/vlm/__init__.py +0 -1
- tests/vlm/test_vlmeval.py +0 -102
- {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
- {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
evalscope/api/benchmark/__init__.py
CHANGED
@@ -1,3 +1,10 @@
-from .adapters import
+from .adapters import (
+    DefaultDataAdapter,
+    ImageEditAdapter,
+    MultiChoiceAdapter,
+    NERAdapter,
+    Text2ImageAdapter,
+    VisionLanguageAdapter,
+)
 from .benchmark import DataAdapter
 from .meta import BenchmarkMeta

evalscope/api/benchmark/adapters/__init__.py
CHANGED
@@ -1,5 +1,6 @@
 from .default_data_adapter import DefaultDataAdapter
 from .image_edit_adapter import ImageEditAdapter
 from .multi_choice_adapter import MultiChoiceAdapter
+from .ner_adapter import NERAdapter
 from .text2image_adapter import Text2ImageAdapter
 from .vision_language_adapter import VisionLanguageAdapter

evalscope/api/benchmark/adapters/default_data_adapter.py
CHANGED
@@ -128,6 +128,9 @@ class DefaultDataAdapter(DataAdapter):
         for sample in self.test_dataset[subset]:
             if isinstance(sample.input, str):
                 sample.input = self.process_sample_str_input(sample, subset)
+            elif isinstance(sample.input, list):
+                # Handle list[ChatMessage] and add system prompt if needed
+                sample.input = self.process_sample_messages_input(sample, subset)

     def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
         """
@@ -142,6 +145,15 @@ class DefaultDataAdapter(DataAdapter):
             input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
         return input_messages

+    def process_sample_messages_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
+        """
+        Normalize a sample's existing List[ChatMessage] input and ensure system prompt is set once.
+        """
+        messages = list(sample.input)  # shallow copy to avoid in-place mutations
+        if self.system_prompt and not any(isinstance(m, ChatMessageSystem) for m in messages):
+            messages = [ChatMessageSystem(content=self.system_prompt)] + messages
+        return messages
+
     def process_sample_input(self, sample: Sample, subset: str) -> str:
         """
         Process a single sample's input by applying prompt templates and few-shot formatting.

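For illustration, a minimal self-contained sketch of the normalization rule the new branch applies: prepend a system message only when the message list does not already contain one. The Simple* classes are placeholder stand-ins for evalscope's ChatMessage types, not the real API.

# Illustrative sketch only; SimpleSystemMessage/SimpleUserMessage are hypothetical
# stand-ins for evalscope's ChatMessageSystem/ChatMessageUser types.
from dataclasses import dataclass
from typing import List, Union


@dataclass
class SimpleSystemMessage:
    content: str


@dataclass
class SimpleUserMessage:
    content: str


Message = Union[SimpleSystemMessage, SimpleUserMessage]


def normalize_messages(messages: List[Message], system_prompt: str) -> List[Message]:
    """Prepend the system prompt once, mirroring process_sample_messages_input."""
    normalized = list(messages)  # shallow copy, do not mutate the sample in place
    if system_prompt and not any(isinstance(m, SimpleSystemMessage) for m in normalized):
        normalized = [SimpleSystemMessage(content=system_prompt)] + normalized
    return normalized


if __name__ == '__main__':
    msgs = [SimpleUserMessage(content='Translate to English: bonjour')]
    print(normalize_messages(msgs, 'You are a helpful assistant.'))
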
evalscope/api/benchmark/adapters/ner_adapter.py
@@ -0,0 +1,212 @@
+from typing import Any, Dict, List, Set, Tuple
+
+from evalscope.api.dataset import Sample
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+from evalscope.utils.ner import (
+    DEFAULT_TAG_FIX_PATTERNS,
+    calculate_bio_metrics,
+    clean_prediction,
+    create_target_text,
+    extract_entities_from_text,
+    extract_spans_from_bio,
+    xml_to_bio_tags,
+)
+from .default_data_adapter import DefaultDataAdapter
+
+logger = get_logger()
+
+
+class NERAdapter(DefaultDataAdapter):
+    """
+    Base adapter class for Named Entity Recognition (NER) tasks.
+
+    This adapter handles converting between BIO tagging schemes and XML-style entity markup,
+    and provides evaluation metrics using seqeval.
+
+    Subclasses should define their entity types and register the benchmark.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Define mapping from BIO tags to user-friendly tag names
+        self.entity_type_map = {}
+        # Add descriptions for each entity type
+        self.entity_descriptions = {}
+
+        # These will be initialized in setup_entity_mappings
+        self.reverse_entity_map = {}
+        self.entity_list = []
+        self.entities_description = ''
+
+        # Define common error patterns to handle
+        self.tag_fix_patterns = DEFAULT_TAG_FIX_PATTERNS
+
+        check_import('seqeval', 'seqeval', raise_error=True, feature_name='NER metrics')
+        # Note: setup_entity_mappings() should be called by subclasses
+        # after they define their entity_type_map and entity_descriptions
+
+    def setup_entity_mappings(self):
+        """
+        Setup entity mappings and descriptions for prompt formatting.
+        This should be called after entity_type_map and entity_descriptions are defined.
+        """
+        # Reverse mapping for converting back from prediction to evaluation
+        self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+        # Create list of tags for prompt formatting
+        self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+        # Create description of entities for prompt
+        self.entities_description = ', '.join([
+            f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+        ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a record with tokens and NER tags into a Sample.
+        Creates both the raw text input and annotated text target.
+        """
+        tokens: List[str] = record['tokens']
+        ner_tags: List[str] = record['ner_tags']
+
+        # Create the input text by joining tokens
+        input_text = ' '.join(tokens)
+
+        # Process tokens and tags to create annotated target text
+        target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+        # Store tokens and tags in metadata for evaluation
+        metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+        return Sample(input=input_text, target=target_text, metadata=metadata)
+
+    def format_prompt_template(self, sample):
+        """
+        Format the prompt with entity types, available tags, and text to annotate.
+        """
+        return self.prompt_template.format(
+            entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+        )
+
+    def format_fewshot_template(self, fewshot, sample):
+        """
+        Format the few-shot prompt with all required parameters.
+        """
+        return self.few_shot_prompt_template.format(
+            fewshot=fewshot,
+            entities=self.entities_description,
+            entity_list=', '.join(self.entity_list),
+            text=sample.input
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        """
+        Format a sample as a few-shot example showing original and annotated text.
+        """
+        if not sample.metadata:
+            return ''
+
+        # Format few-shot examples to match the expected response format
+        return f'Input:\n{sample.input}\n\nOutput:\n{sample.target}'
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        """
+        Evaluate named entity recognition performance using seqeval.
+        """
+        from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        try:
+            # Get the original tokens and tags from the reference metadata
+            original_tokens = task_state.metadata['tokens']
+            original_tags = task_state.metadata['ner_tags']
+
+            if not original_tokens or len(original_tokens) == 0:
+                if hasattr(reference, 'metadata') and reference.metadata:
+                    original_tokens = reference.metadata['tokens']
+                    original_tags = reference.metadata['ner_tags']
+
+            # Clean and normalize the prediction
+            cleaned_prediction = clean_prediction(filtered_prediction, self.tag_fix_patterns)
+
+            # Convert XML-style prediction back to BIO tags aligned with original tokens
+            pred_bio_tags = xml_to_bio_tags(cleaned_prediction, original_tokens, self.reverse_entity_map)
+
+            # Use seqeval to calculate metrics
+            # Note: seqeval expects lists of lists (one per sequence)
+            y_true = [original_tags]
+            y_pred = [pred_bio_tags]
+
+            precision = precision_score(y_true, y_pred)
+            recall = recall_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred)
+            accuracy = accuracy_score(y_true, y_pred)
+
+            score.value = {'precision': precision, 'recall': recall, 'f1_score': f1, 'accuracy': accuracy}
+
+            # Store tags for aggregation (proper micro-averaging in aggregate_scores)
+            # This way aggregate_scores can compute metrics across all samples at once,
+            # which gives you true micro-averaged scores rather than averaged macro scores.
+            score.metadata = {'y_true': original_tags, 'y_pred': pred_bio_tags}
+        except Exception as e:
+            logger.warning(f'Error evaluating NER prediction: {str(e)}')
+            score.value = {'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0, 'accuracy': 0.0}
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate metrics across all samples using seqeval.
+        """
+        from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+
+        # Collect all predictions and references
+        y_true_all = []
+        y_pred_all = []
+
+        for ss in sample_scores:
+            # Extract the BIO tags from metadata if available
+            # You may need to store these during match_score
+            if hasattr(ss.score, 'metadata') and 'y_true' in ss.score.metadata and 'y_pred' in ss.score.metadata:
+                y_true_all.append(ss.score.metadata['y_true'])
+                y_pred_all.append(ss.score.metadata['y_pred'])
+
+        if not y_true_all:
+            # Fallback: calculate averages from individual scores
+            num_samples = len(sample_scores)
+            avg_precision = sum(ss.score.value.get('precision', 0.0) for ss in sample_scores) / num_samples
+            avg_recall = sum(ss.score.value.get('recall', 0.0) for ss in sample_scores) / num_samples
+            avg_f1 = sum(ss.score.value.get('f1_score', 0.0) for ss in sample_scores) / num_samples
+            avg_accuracy = sum(ss.score.value.get('accuracy', 0.0) for ss in sample_scores) / num_samples
+        else:
+            # Use seqeval for micro-averaged metrics across all samples
+            avg_precision = precision_score(y_true_all, y_pred_all)
+            avg_recall = recall_score(y_true_all, y_pred_all)
+            avg_f1 = f1_score(y_true_all, y_pred_all)
+            avg_accuracy = accuracy_score(y_true_all, y_pred_all)

+        num_samples = len(sample_scores)
+
+        agg_scores = [
+            AggScore(
+                metric_name='precision',
+                score=avg_precision,
+                num=num_samples,
+                metadata={'type': 'seqeval-micro-average'}
+            ),
+            AggScore(
+                metric_name='recall', score=avg_recall, num=num_samples, metadata={'type': 'seqeval-micro-average'}
+            ),
+            AggScore(metric_name='f1_score', score=avg_f1, num=num_samples, metadata={'type': 'seqeval-micro-average'}),
+            AggScore(
+                metric_name='accuracy', score=avg_accuracy, num=num_samples, metadata={'type': 'seqeval-accuracy'}
+            )
+        ]
+
+        return agg_scores

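Concrete benchmarks subclass this base and mainly supply their tag set. A rough sketch of the wiring the comments above describe follows; the entity names, descriptions, and class name are illustrative assumptions, not copied from any shipped adapter under evalscope/benchmarks/ner/.

# Hypothetical subclass sketch; the entity map and descriptions are made up.
from evalscope.api.benchmark.adapters import NERAdapter


class ToyNERAdapter(NERAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # BIO entity type -> user-friendly name used as the XML tag in model output
        self.entity_type_map = {'PER': 'Person', 'ORG': 'Organization', 'LOC': 'Location'}
        self.entity_descriptions = {
            'PER': 'names of people',
            'ORG': 'companies and institutions',
            'LOC': 'countries, cities and places',
        }
        # Must run after the maps are defined: builds reverse_entity_map,
        # entity_list (e.g. '<person>, <organization>, <location>') and
        # entities_description for the prompt templates.
        self.setup_entity_mappings()

Per format_prompt_template above, the adapter's prompt_template is then expected to expose {entities}, {entity_list}, and {text} placeholders.
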
evalscope/api/benchmark/benchmark.py
CHANGED
@@ -216,6 +216,13 @@ class DataAdapter(LLMJudgeMixin, SandboxMixin, ABC):
         """
         return self._benchmark_meta.train_split

+    @train_split.setter
+    def train_split(self, value: str):
+        """
+        Set the train split of the benchmark.
+        """
+        self._benchmark_meta.train_split = value
+
     @property
     def eval_split(self) -> Optional[str]:
         """
@@ -223,6 +230,13 @@
         """
         return self._benchmark_meta.eval_split

+    @eval_split.setter
+    def eval_split(self, value: str):
+        """
+        Set the eval split of the benchmark.
+        """
+        self._benchmark_meta.eval_split = value
+
     @property
     def prompt_template(self) -> Optional[str]:
         """

evalscope/api/dataset/dataset.py
CHANGED
@@ -347,3 +347,24 @@ class DatasetDict:
             cur_dataset.reindex(group_size=repeats)
             dataset_dict[key] = cur_dataset
         return cls(dataset_dict)
+
+    @classmethod
+    def from_dataset_dicts(cls, dataset_dicts: List['DatasetDict']) -> 'DatasetDict':
+        """
+        Create a DatasetDict by merging multiple DatasetDicts.
+
+        Args:
+            dataset_dicts (List[DatasetDict]): List of DatasetDicts to merge.
+
+        Returns:
+            DatasetDict: A new DatasetDict containing the merged datasets.
+        """
+        merged_dict = defaultdict(list)
+        for dataset_dict in dataset_dicts:
+            for key, dataset in dataset_dict.items():
+                merged_dict[key].extend(dataset.samples)
+        # Create a MemoryDataset for each subset key
+        final_dict = {}
+        for key, samples in merged_dict.items():
+            final_dict[key] = MemoryDataset(samples, name=key)
+        return cls(final_dict)

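A possible usage sketch of the new merge helper. The constructor calls mirror the ones visible elsewhere in this diff (MemoryDataset(samples, name=...), Sample(input=..., target=...)), but the concrete values and the one-argument DatasetDict constructor are assumptions.

# Hedged usage sketch; exact constructor signatures may differ from this minimal form.
from evalscope.api.dataset.dataset import DatasetDict, MemoryDataset, Sample

part_a = DatasetDict({'default': MemoryDataset([Sample(input='1+1=?', target='2')], name='default')})
part_b = DatasetDict({'default': MemoryDataset([Sample(input='2+2=?', target='4')], name='default')})

# Samples that share a subset key are concatenated into one MemoryDataset per key.
merged = DatasetDict.from_dataset_dicts([part_a, part_b])
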
evalscope/api/dataset/loader.py
CHANGED
@@ -8,7 +8,7 @@ from typing import Callable, Dict, List, Optional, Union
 from evalscope.api.dataset.utils import record_to_sample_fn
 from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, HubType
 from evalscope.utils import get_logger
-from evalscope.utils.io_utils import csv_to_list, gen_hash, jsonl_to_list, safe_filename
+from evalscope.utils.io_utils import csv_to_list, gen_hash, jsonl_to_list, safe_filename, tsv_to_list
 from .dataset import Dataset, FieldSpec, MemoryDataset, Sample
 from .utils import data_to_samples, shuffle_choices_if_requested

@@ -168,7 +168,11 @@ class LocalDataLoader(DataLoader):
         dataset = []

         # Check for JSONL or CSV files in the specified path
-        for ext, loader in [
+        for ext, loader in [
+            ('.jsonl', jsonl_to_list),
+            ('.csv', csv_to_list),
+            ('.tsv', tsv_to_list),
+        ]:
             # Check if the file exists with the given extension
             if os.path.isfile(path) and path.endswith(ext):
                 file_paths = [path]

evalscope/api/mixin/sandbox_mixin.py
CHANGED
@@ -1,7 +1,6 @@
-import asyncio
-import threading
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional

+from evalscope.utils.function_utils import AsyncioLoopRunner, thread_safe
 from evalscope.utils.logger import get_logger

 if TYPE_CHECKING:
@@ -24,25 +23,10 @@ class SandboxMixin:
         self._sandbox_id: Optional[str] = None
         """Sandbox ID."""

-
-
-
-        # Initialize sandbox synchronously by running async methods
-        if self.use_sandbox:
-            self._loop = asyncio.new_event_loop()
-
-            # Start the loop in a separate thread
-            def run_loop():
-                asyncio.set_event_loop(self._loop)
-                self._loop.run_forever()
-
-            self._loop_thread = threading.Thread(target=run_loop, daemon=True)
-            self._loop_thread.start()
-
-            # Wait for initialization
-            future = asyncio.run_coroutine_threadsafe(self._async_init(), self._loop)
-            future.result()
+        # Lazy init state
+        self._initialized: bool = False

+        # NOTE: Initialization is deferred.
         super().__init__()

     async def _async_init(self):
@@ -70,6 +54,25 @@ class SandboxMixin:
         """Get the sandbox ID."""
         return self._sandbox_id

+    @thread_safe
+    def ensure_sandbox_ready(self) -> bool:
+        """
+        Ensure the sandbox loop, manager, and sandbox instance are initialized.
+        This method is thread-safe and idempotent.
+        """
+        if not self.use_sandbox:
+            return False
+
+        if self._initialized and self._manager and self._sandbox_id:
+            return True
+
+        # Initialize manager and sandbox using the class-level runner
+        AsyncioLoopRunner.run(self.init_sandbox_manager_async())
+        AsyncioLoopRunner.run(self.init_sandbox_async())
+
+        self._initialized = True
+        return True
+
     async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
         """Initialize the sandbox manager asynchronously."""
         if self._manager is not None:
@@ -100,13 +103,7 @@ class SandboxMixin:
         if not self.use_sandbox:
             return None

-
-        if self._loop and not self._loop.is_closed():
-            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_manager_async(), self._loop)
-            return future.result()
-        else:
-            # Fallback for cases where no loop is available
-            return asyncio.run(self.init_sandbox_manager_async())
+        return AsyncioLoopRunner.run(self.init_sandbox_manager_async())

     async def init_sandbox_async(self) -> Optional[str]:
         """Initialize the sandbox instance asynchronously."""
@@ -141,17 +138,12 @@ class SandboxMixin:
         if not self.use_sandbox:
             return None

-
-        if self._loop and not self._loop.is_closed():
-            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_async(), self._loop)
-            return future.result()
-        else:
-            # Fallback for cases where no loop is available
-            return asyncio.run(self.init_sandbox_async())
+        return AsyncioLoopRunner.run(self.init_sandbox_async())

     def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
         """Execute code in the sandbox."""
-
+        # Lazy, thread-safe initialization
+        if not self.ensure_sandbox_ready():
             logger.warning('Sandbox is not initialized.')
             return {'error': 'Sandbox is not initialized.'}

@@ -175,30 +167,16 @@ class SandboxMixin:
             )
             return result

-        #
-
-            future = asyncio.run_coroutine_threadsafe(_execute_async(), self._loop)
-            result = future.result(timeout + 10)  # Add some buffer to the timeout
-        else:
-            # Fallback for cases where no loop is available
-            result = asyncio.run(_execute_async())
-
+        # Execute in background loop via class-level runner
+        result = AsyncioLoopRunner.run(_execute_async(), timeout=timeout + 10)
         return result.model_dump(exclude_none=True)

     def sandbox_finalize(self, *args, **kwargs):
         """Finalize the sandbox manager."""
         if self._manager:
             try:
-
-
-                future = asyncio.run_coroutine_threadsafe(self._manager.stop(), self._loop)
-                future.result(timeout=30)
-
-                # Stop the event loop
-                self._loop.call_soon_threadsafe(self._loop.stop)
-                if hasattr(self, '_loop_thread'):
-                    self._loop_thread.join(timeout=5)
-
+                # Stop the manager but keep the shared loop alive
+                AsyncioLoopRunner.run(self._manager.stop(), timeout=30)
                 logger.info('Sandbox manager finalized.')
             except Exception as e:
                 logger.warning(f'Error finalizing sandbox manager: {e}')

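The refactor above replaces the per-instance event loop and thread with a shared, class-level AsyncioLoopRunner plus a lazy, thread-safe ensure_sandbox_ready() guard. Below is a self-contained sketch of what such a background-loop runner can look like; it is an illustrative re-implementation, not evalscope's actual AsyncioLoopRunner, whose only surface used in this diff is run(coro, timeout=...).

# Illustrative re-implementation of a shared background-loop runner.
import asyncio
import threading
from typing import Any, Coroutine, Optional


class BackgroundLoopRunner:
    _loop: Optional[asyncio.AbstractEventLoop] = None
    _lock = threading.Lock()

    @classmethod
    def _ensure_loop(cls) -> asyncio.AbstractEventLoop:
        # Start one daemon thread running a single shared event loop.
        with cls._lock:
            if cls._loop is None or cls._loop.is_closed():
                cls._loop = asyncio.new_event_loop()
                threading.Thread(target=cls._loop.run_forever, daemon=True).start()
            return cls._loop

    @classmethod
    def run(cls, coro: Coroutine, timeout: Optional[float] = None) -> Any:
        # Submit the coroutine to the shared loop and block until it completes.
        future = asyncio.run_coroutine_threadsafe(coro, cls._ensure_loop())
        return future.result(timeout)


async def _demo():
    await asyncio.sleep(0.1)
    return 'sandbox ready'


if __name__ == '__main__':
    print(BackgroundLoopRunner.run(_demo(), timeout=5))

Keeping the loop at class level means sandbox_finalize() only stops the manager; the shared loop stays alive for other adapters, which is why the old call_soon_threadsafe(loop.stop) shutdown code is gone.
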
evalscope/api/model/generate_config.py
CHANGED
@@ -108,6 +108,12 @@ class GenerateConfig(BaseModel):
     extra_body: Optional[Dict[str, Any]] = Field(default=None)
     """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""

+    extra_query: Optional[Dict[str, Any]] = Field(default=None)
+    """Extra query parameters to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
+    extra_headers: Optional[Dict[str, str]] = Field(default=None)
+    """Extra headers to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
     height: Optional[int] = Field(default=None)
     """Image height for image generation model only"""

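The two new fields mirror the existing extra_body escape hatch. A minimal sketch of populating them is shown below; only the field names are confirmed by the diff above, while the header, query, and body values are invented examples.

# Minimal sketch; the field names come from the diff above, the values are placeholders.
from evalscope.api.model.generate_config import GenerateConfig

config = GenerateConfig(
    extra_headers={'X-Request-Source': 'evalscope'},    # sent as extra HTTP headers
    extra_query={'api-version': '2024-06-01'},          # appended as URL query parameters
    extra_body={'chat_template_kwargs': {'enable_thinking': False}},
)
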
evalscope/app/ui/multi_model.py
CHANGED
@@ -204,7 +204,12 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name)

         # Get subset choices - should be same for both models
-        subsets
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df_a.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )

         return gr.update(choices=subsets, value=None), None

evalscope/app/ui/single_model.py
CHANGED
@@ -134,11 +134,17 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
         )
         def update_single_report_dataset(dataset_name, report_list):
             logger.debug(f'Updating single report dataset: {dataset_name}')
-            report_df = get_data_frame(report_list=report_list)
+            report_df = get_data_frame(report_list=report_list, flatten_metrics=True, flatten_categories=True)
             analysis = get_report_analysis(report_list, dataset_name)
             data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
             data_score_plot = plot_single_dataset_scores(data_score_df)
-            subsets
+            # Only select the subsets that Cat.0 is not '-'
+            df_for_subsets = data_score_df.copy()
+            subsets = sorted(
+                df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                                   ReportKey.subset_name].dropna().unique().tolist()
+            )
+
             logger.debug(f'subsets: {subsets}')
             return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis

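Both UI tabs now hide aggregate rows whose first category column is '-'. The same filter is reproduced below on a toy DataFrame, with 'category_0' and 'subset_name' as stand-ins for the actual ReportKey column names.

# Toy reproduction of the subset filter; column names are stand-ins, not ReportKey constants.
import pandas as pd

df = pd.DataFrame({
    'category_0': ['math', 'math', '-', 'code'],
    'subset_name': ['algebra', 'geometry', 'OVERALL', 'python'],
})

subsets = sorted(
    df.loc[df['category_0'].ne('-'), 'subset_name'].dropna().unique().tolist()
)
print(subsets)  # ['algebra', 'geometry', 'python'] -- rows whose category is '-' are dropped
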
evalscope/app/utils/data_utils.py
CHANGED
@@ -168,9 +168,10 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
             'Index': str(review_result.index),
             'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
             'Metadata': metadata,
-            'Generated': prediction,
+            'Generated': prediction or '',  # Ensure no None value
             'Gold': target,
-            'Pred': extracted_prediction if extracted_prediction != prediction else '*Same as Generated*'
+            'Pred': (extracted_prediction if extracted_prediction != prediction else '*Same as Generated*')
+            or '',  # Ensure no None value
             'Score': score.model_dump(exclude_none=True),
             'NScore': normalize_score(score.main_value)
         }

evalscope/app/utils/visualization.py
CHANGED
@@ -18,7 +18,7 @@ logger = get_logger()
 def plot_single_report_scores(df: pd.DataFrame):
     if df is None:
         return None
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])

     width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
@@ -36,7 +36,7 @@ def plot_single_report_sunburst(report_list: List[Report]):
     df = get_data_frame(report_list=report_list, flatten_metrics=False)
     categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
     path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories

     plot = px.sunburst(