evalscope 1.0.2__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (176)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/default_data_adapter.py +12 -0
  4. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  5. evalscope/api/benchmark/benchmark.py +14 -0
  6. evalscope/api/dataset/dataset.py +21 -0
  7. evalscope/api/dataset/loader.py +6 -2
  8. evalscope/api/mixin/sandbox_mixin.py +32 -54
  9. evalscope/api/model/generate_config.py +6 -0
  10. evalscope/app/ui/multi_model.py +6 -1
  11. evalscope/app/ui/single_model.py +8 -2
  12. evalscope/app/utils/data_utils.py +3 -2
  13. evalscope/app/utils/visualization.py +2 -2
  14. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  15. evalscope/benchmarks/ai2d/ai2d_adapter.py +3 -2
  16. evalscope/benchmarks/bfcl/bfcl_adapter.py +11 -46
  17. evalscope/benchmarks/blink/__init__.py +0 -0
  18. evalscope/benchmarks/blink/blink_adapter.py +61 -0
  19. evalscope/benchmarks/chartqa/__init__.py +0 -0
  20. evalscope/benchmarks/chartqa/chartqa_adapter.py +80 -0
  21. evalscope/benchmarks/chartqa/utils.py +38 -0
  22. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  23. evalscope/benchmarks/docvqa/__init__.py +0 -0
  24. evalscope/benchmarks/docvqa/docvqa_adapter.py +67 -0
  25. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  26. evalscope/benchmarks/general_arena/utils.py +2 -1
  27. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  28. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  29. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  30. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  31. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  32. evalscope/benchmarks/hle/hle_adapter.py +3 -2
  33. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  34. evalscope/benchmarks/infovqa/__init__.py +0 -0
  35. evalscope/benchmarks/infovqa/infovqa_adapter.py +66 -0
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  37. evalscope/benchmarks/math_verse/__init__.py +0 -0
  38. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  39. evalscope/benchmarks/math_vision/__init__.py +0 -0
  40. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  41. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  42. evalscope/benchmarks/mm_bench/mm_bench_adapter.py +2 -2
  43. evalscope/benchmarks/mmmu/mmmu_adapter.py +1 -1
  44. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  45. evalscope/benchmarks/ner/__init__.py +0 -0
  46. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  47. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  48. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  49. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  50. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  51. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  52. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  53. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  54. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  55. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  56. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  57. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  58. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  59. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  60. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  61. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  62. evalscope/benchmarks/ocr_bench/__init__.py +0 -0
  63. evalscope/benchmarks/ocr_bench/ocr_bench_adapter.py +101 -0
  64. evalscope/benchmarks/ocr_bench_v2/IoUscore_metric.py +87 -0
  65. evalscope/benchmarks/ocr_bench_v2/TEDS_metric.py +963 -0
  66. evalscope/benchmarks/ocr_bench_v2/__init__.py +0 -0
  67. evalscope/benchmarks/ocr_bench_v2/ocr_bench_v2_adapter.py +161 -0
  68. evalscope/benchmarks/ocr_bench_v2/page_ocr_metric.py +50 -0
  69. evalscope/benchmarks/ocr_bench_v2/parallel.py +46 -0
  70. evalscope/benchmarks/ocr_bench_v2/spotting_eval/__init__.py +0 -0
  71. evalscope/benchmarks/ocr_bench_v2/spotting_eval/readme.txt +26 -0
  72. evalscope/benchmarks/ocr_bench_v2/spotting_eval/rrc_evaluation_funcs_1_1.py +537 -0
  73. evalscope/benchmarks/ocr_bench_v2/spotting_eval/script.py +481 -0
  74. evalscope/benchmarks/ocr_bench_v2/spotting_metric.py +179 -0
  75. evalscope/benchmarks/ocr_bench_v2/utils.py +433 -0
  76. evalscope/benchmarks/ocr_bench_v2/vqa_metric.py +254 -0
  77. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  78. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  79. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  80. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  81. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  82. evalscope/benchmarks/poly_math/__init__.py +0 -0
  83. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  84. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  85. evalscope/benchmarks/pope/__init__.py +0 -0
  86. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  87. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  88. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  89. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  90. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  91. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  92. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  93. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  94. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  95. evalscope/benchmarks/zerobench/__init__.py +0 -0
  96. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  97. evalscope/constants.py +4 -0
  98. evalscope/evaluator/evaluator.py +72 -79
  99. evalscope/metrics/math_parser.py +14 -0
  100. evalscope/metrics/metric.py +52 -1
  101. evalscope/metrics/metrics.py +16 -0
  102. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/config.py +0 -0
  103. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/dist_utils.py +0 -0
  104. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/gradcam.py +0 -0
  105. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/logger.py +0 -0
  106. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/optims.py +0 -0
  107. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/registry.py +0 -0
  108. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/utils.py +0 -0
  109. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/__init__.py +0 -0
  110. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa.py +0 -0
  111. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/common/vqa_tools/vqa_eval.py +0 -0
  112. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip2_models/Qformer.py +2 -6
  113. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/blip_models/nlvr_encoder.py +2 -6
  114. evalscope/metrics/t2v_metrics/models/vqascore_models/lavis/models/med.py +2 -6
  115. evalscope/models/utils/openai.py +4 -0
  116. evalscope/perf/arguments.py +24 -4
  117. evalscope/perf/benchmark.py +74 -89
  118. evalscope/perf/http_client.py +31 -16
  119. evalscope/perf/main.py +15 -2
  120. evalscope/perf/plugin/api/base.py +9 -7
  121. evalscope/perf/plugin/api/custom_api.py +13 -58
  122. evalscope/perf/plugin/api/default_api.py +179 -79
  123. evalscope/perf/plugin/api/openai_api.py +4 -3
  124. evalscope/perf/plugin/datasets/base.py +21 -0
  125. evalscope/perf/plugin/datasets/custom.py +2 -3
  126. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  127. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  128. evalscope/perf/plugin/datasets/openqa.py +2 -4
  129. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  130. evalscope/perf/utils/benchmark_util.py +36 -22
  131. evalscope/perf/utils/db_util.py +14 -19
  132. evalscope/perf/utils/local_server.py +0 -44
  133. evalscope/perf/utils/log_utils.py +21 -6
  134. evalscope/report/__init__.py +11 -2
  135. evalscope/report/combinator.py +52 -2
  136. evalscope/run.py +4 -0
  137. evalscope/utils/function_utils.py +195 -12
  138. evalscope/utils/io_utils.py +74 -0
  139. evalscope/utils/json_schema.py +8 -6
  140. evalscope/utils/logger.py +49 -17
  141. evalscope/utils/multi_choices.py +16 -1
  142. evalscope/utils/ner.py +377 -0
  143. evalscope/version.py +2 -2
  144. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/METADATA +239 -393
  145. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/RECORD +140 -98
  146. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  147. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -1
  148. tests/__init__.py +0 -1
  149. tests/benchmark/__init__.py +0 -1
  150. tests/benchmark/test_eval.py +0 -429
  151. tests/benchmark/test_image_edit.py +0 -65
  152. tests/benchmark/test_sandbox.py +0 -81
  153. tests/benchmark/test_t2i.py +0 -142
  154. tests/benchmark/test_vlm.py +0 -137
  155. tests/cli/__init__.py +0 -1
  156. tests/cli/test_all.py +0 -269
  157. tests/cli/test_collection.py +0 -99
  158. tests/cli/test_custom.py +0 -268
  159. tests/cli/test_reasoning.py +0 -81
  160. tests/common.py +0 -73
  161. tests/perf/__init__.py +0 -1
  162. tests/perf/test_perf.py +0 -206
  163. tests/rag/test_clip_benchmark.py +0 -87
  164. tests/rag/test_mteb.py +0 -213
  165. tests/rag/test_ragas.py +0 -128
  166. tests/swift/__init__.py +0 -1
  167. tests/swift/test_run_swift_eval.py +0 -146
  168. tests/swift/test_run_swift_vlm_eval.py +0 -128
  169. tests/swift/test_run_swift_vlm_jugde_eval.py +0 -157
  170. tests/test_run_all.py +0 -12
  171. tests/utils.py +0 -13
  172. tests/vlm/__init__.py +0 -1
  173. tests/vlm/test_vlmeval.py +0 -102
  174. {tests/rag → evalscope/benchmarks/aa_lcr}/__init__.py +0 -0
  175. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  176. {evalscope-1.0.2.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0

evalscope/api/benchmark/__init__.py
@@ -1,3 +1,10 @@
-from .adapters import DefaultDataAdapter, ImageEditAdapter, MultiChoiceAdapter, Text2ImageAdapter, VisionLanguageAdapter
+from .adapters import (
+    DefaultDataAdapter,
+    ImageEditAdapter,
+    MultiChoiceAdapter,
+    NERAdapter,
+    Text2ImageAdapter,
+    VisionLanguageAdapter,
+)
 from .benchmark import DataAdapter
 from .meta import BenchmarkMeta

evalscope/api/benchmark/adapters/__init__.py
@@ -1,5 +1,6 @@
 from .default_data_adapter import DefaultDataAdapter
 from .image_edit_adapter import ImageEditAdapter
 from .multi_choice_adapter import MultiChoiceAdapter
+from .ner_adapter import NERAdapter
 from .text2image_adapter import Text2ImageAdapter
 from .vision_language_adapter import VisionLanguageAdapter

evalscope/api/benchmark/adapters/default_data_adapter.py
@@ -128,6 +128,9 @@ class DefaultDataAdapter(DataAdapter):
         for sample in self.test_dataset[subset]:
             if isinstance(sample.input, str):
                 sample.input = self.process_sample_str_input(sample, subset)
+            elif isinstance(sample.input, list):
+                # Handle list[ChatMessage] and add system prompt if needed
+                sample.input = self.process_sample_messages_input(sample, subset)
 
     def process_sample_str_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
         """
@@ -142,6 +145,15 @@ class DefaultDataAdapter(DataAdapter):
             input_messages.insert(0, ChatMessageSystem(content=self.system_prompt))
         return input_messages
 
+    def process_sample_messages_input(self, sample: Sample, subset: str) -> List[ChatMessage]:
+        """
+        Normalize a sample's existing List[ChatMessage] input and ensure system prompt is set once.
+        """
+        messages = list(sample.input)  # shallow copy to avoid in-place mutations
+        if self.system_prompt and not any(isinstance(m, ChatMessageSystem) for m in messages):
+            messages = [ChatMessageSystem(content=self.system_prompt)] + messages
+        return messages
+
     def process_sample_input(self, sample: Sample, subset: str) -> str:
         """
         Process a single sample's input by applying prompt templates and few-shot formatting.

evalscope/api/benchmark/adapters/ner_adapter.py (new file)
@@ -0,0 +1,212 @@
+from typing import Any, Dict, List, Set, Tuple
+
+from evalscope.api.dataset import Sample
+from evalscope.api.metric.scorer import AggScore, SampleScore, Score
+from evalscope.utils.import_utils import check_import
+from evalscope.utils.logger import get_logger
+from evalscope.utils.ner import (
+    DEFAULT_TAG_FIX_PATTERNS,
+    calculate_bio_metrics,
+    clean_prediction,
+    create_target_text,
+    extract_entities_from_text,
+    extract_spans_from_bio,
+    xml_to_bio_tags,
+)
+from .default_data_adapter import DefaultDataAdapter
+
+logger = get_logger()
+
+
+class NERAdapter(DefaultDataAdapter):
+    """
+    Base adapter class for Named Entity Recognition (NER) tasks.
+
+    This adapter handles converting between BIO tagging schemes and XML-style entity markup,
+    and provides evaluation metrics using seqeval.
+
+    Subclasses should define their entity types and register the benchmark.
+    """
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # Define mapping from BIO tags to user-friendly tag names
+        self.entity_type_map = {}
+        # Add descriptions for each entity type
+        self.entity_descriptions = {}
+
+        # These will be initialized in setup_entity_mappings
+        self.reverse_entity_map = {}
+        self.entity_list = []
+        self.entities_description = ''
+
+        # Define common error patterns to handle
+        self.tag_fix_patterns = DEFAULT_TAG_FIX_PATTERNS
+
+        check_import('seqeval', 'seqeval', raise_error=True, feature_name='NER metrics')
+        # Note: setup_entity_mappings() should be called by subclasses
+        # after they define their entity_type_map and entity_descriptions
+
+    def setup_entity_mappings(self):
+        """
+        Setup entity mappings and descriptions for prompt formatting.
+        This should be called after entity_type_map and entity_descriptions are defined.
+        """
+        # Reverse mapping for converting back from prediction to evaluation
+        self.reverse_entity_map = {v.lower(): k for k, v in self.entity_type_map.items()}
+
+        # Create list of tags for prompt formatting
+        self.entity_list = [f'<{ent.lower()}>' for ent in self.entity_type_map.values()]
+
+        # Create description of entities for prompt
+        self.entities_description = ', '.join([
+            f'{self.entity_type_map[tag]} ({self.entity_descriptions[tag]})' for tag in self.entity_type_map
+        ])
+
+    def record_to_sample(self, record: Dict[str, Any]) -> Sample:
+        """
+        Convert a record with tokens and NER tags into a Sample.
+        Creates both the raw text input and annotated text target.
+        """
+        tokens: List[str] = record['tokens']
+        ner_tags: List[str] = record['ner_tags']
+
+        # Create the input text by joining tokens
+        input_text = ' '.join(tokens)
+
+        # Process tokens and tags to create annotated target text
+        target_text = create_target_text(tokens, ner_tags, self.entity_type_map)
+
+        # Store tokens and tags in metadata for evaluation
+        metadata = {'tokens': tokens, 'ner_tags': ner_tags}
+
+        return Sample(input=input_text, target=target_text, metadata=metadata)
+
+    def format_prompt_template(self, sample):
+        """
+        Format the prompt with entity types, available tags, and text to annotate.
+        """
+        return self.prompt_template.format(
+            entities=self.entities_description, entity_list=', '.join(self.entity_list), text=sample.input
+        )
+
+    def format_fewshot_template(self, fewshot, sample):
+        """
+        Format the few-shot prompt with all required parameters.
+        """
+        return self.few_shot_prompt_template.format(
+            fewshot=fewshot,
+            entities=self.entities_description,
+            entity_list=', '.join(self.entity_list),
+            text=sample.input
+        )
+
+    def sample_to_fewshot(self, sample: Sample) -> str:
+        """
+        Format a sample as a few-shot example showing original and annotated text.
+        """
+        if not sample.metadata:
+            return ''
+
+        # Format few-shot examples to match the expected response format
+        return f'Input:\n{sample.input}\n\nOutput:\n{sample.target}'
+
+    def match_score(self, original_prediction, filtered_prediction, reference, task_state) -> Score:
+        """
+        Evaluate named entity recognition performance using seqeval.
+        """
+        from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+
+        score = Score(
+            extracted_prediction=filtered_prediction,
+            prediction=original_prediction,
+        )
+
+        try:
+            # Get the original tokens and tags from the reference metadata
+            original_tokens = task_state.metadata['tokens']
+            original_tags = task_state.metadata['ner_tags']
+
+            if not original_tokens or len(original_tokens) == 0:
+                if hasattr(reference, 'metadata') and reference.metadata:
+                    original_tokens = reference.metadata['tokens']
+                    original_tags = reference.metadata['ner_tags']
+
+            # Clean and normalize the prediction
+            cleaned_prediction = clean_prediction(filtered_prediction, self.tag_fix_patterns)
+
+            # Convert XML-style prediction back to BIO tags aligned with original tokens
+            pred_bio_tags = xml_to_bio_tags(cleaned_prediction, original_tokens, self.reverse_entity_map)
+
+            # Use seqeval to calculate metrics
+            # Note: seqeval expects lists of lists (one per sequence)
+            y_true = [original_tags]
+            y_pred = [pred_bio_tags]
+
+            precision = precision_score(y_true, y_pred)
+            recall = recall_score(y_true, y_pred)
+            f1 = f1_score(y_true, y_pred)
+            accuracy = accuracy_score(y_true, y_pred)
+
+            score.value = {'precision': precision, 'recall': recall, 'f1_score': f1, 'accuracy': accuracy}
+
+            # Store tags for aggregation (proper micro-averaging in aggregate_scores)
+            # This way aggregate_scores can compute metrics across all samples at once,
+            # which gives you true micro-averaged scores rather than averaged macro scores.
+            score.metadata = {'y_true': original_tags, 'y_pred': pred_bio_tags}
+        except Exception as e:
+            logger.warning(f'Error evaluating NER prediction: {str(e)}')
+            score.value = {'precision': 0.0, 'recall': 0.0, 'f1_score': 0.0, 'accuracy': 0.0}
+
+        return score
+
+    def aggregate_scores(self, sample_scores: List[SampleScore]) -> List[AggScore]:
+        """
+        Aggregate metrics across all samples using seqeval.
+        """
+        from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+
+        # Collect all predictions and references
+        y_true_all = []
+        y_pred_all = []
+
+        for ss in sample_scores:
+            # Extract the BIO tags from metadata if available
+            # You may need to store these during match_score
+            if hasattr(ss.score, 'metadata') and 'y_true' in ss.score.metadata and 'y_pred' in ss.score.metadata:
+                y_true_all.append(ss.score.metadata['y_true'])
+                y_pred_all.append(ss.score.metadata['y_pred'])
+
+        if not y_true_all:
+            # Fallback: calculate averages from individual scores
+            num_samples = len(sample_scores)
+            avg_precision = sum(ss.score.value.get('precision', 0.0) for ss in sample_scores) / num_samples
+            avg_recall = sum(ss.score.value.get('recall', 0.0) for ss in sample_scores) / num_samples
+            avg_f1 = sum(ss.score.value.get('f1_score', 0.0) for ss in sample_scores) / num_samples
+            avg_accuracy = sum(ss.score.value.get('accuracy', 0.0) for ss in sample_scores) / num_samples
+        else:
+            # Use seqeval for micro-averaged metrics across all samples
+            avg_precision = precision_score(y_true_all, y_pred_all)
+            avg_recall = recall_score(y_true_all, y_pred_all)
+            avg_f1 = f1_score(y_true_all, y_pred_all)
+            avg_accuracy = accuracy_score(y_true_all, y_pred_all)
+
+        num_samples = len(sample_scores)
+
+        agg_scores = [
+            AggScore(
+                metric_name='precision',
+                score=avg_precision,
+                num=num_samples,
+                metadata={'type': 'seqeval-micro-average'}
+            ),
+            AggScore(
+                metric_name='recall', score=avg_recall, num=num_samples, metadata={'type': 'seqeval-micro-average'}
+            ),
+            AggScore(metric_name='f1_score', score=avg_f1, num=num_samples, metadata={'type': 'seqeval-micro-average'}),
+            AggScore(
+                metric_name='accuracy', score=avg_accuracy, num=num_samples, metadata={'type': 'seqeval-accuracy'}
+            )
+        ]
+
+        return agg_scores
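
For orientation, here is a minimal sketch of how a concrete benchmark might subclass NERAdapter, following the constructor pattern above. The class name, entity types, and descriptions are illustrative assumptions, and benchmark registration is omitted; this is not code from the release.

    from evalscope.api.benchmark.adapters import NERAdapter

    class ExampleCorpusNERAdapter(NERAdapter):  # hypothetical subclass, for illustration only
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            # Map BIO tag suffixes to the friendly names used in the XML-style markup
            self.entity_type_map = {'PER': 'Person', 'ORG': 'Organization', 'LOC': 'Location'}
            # Short descriptions of each entity type, injected into the prompt
            self.entity_descriptions = {
                'PER': 'names of people',
                'ORG': 'companies, institutions, and other organizations',
                'LOC': 'geographic locations',
            }
            # Build reverse_entity_map, entity_list, and entities_description
            self.setup_entity_mappings()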

evalscope/api/benchmark/benchmark.py
@@ -216,6 +216,13 @@ class DataAdapter(LLMJudgeMixin, SandboxMixin, ABC):
         """
         return self._benchmark_meta.train_split
 
+    @train_split.setter
+    def train_split(self, value: str):
+        """
+        Set the train split of the benchmark.
+        """
+        self._benchmark_meta.train_split = value
+
     @property
     def eval_split(self) -> Optional[str]:
         """
@@ -223,6 +230,13 @@
         """
         return self._benchmark_meta.eval_split
 
+    @eval_split.setter
+    def eval_split(self, value: str):
+        """
+        Set the eval split of the benchmark.
+        """
+        self._benchmark_meta.eval_split = value
+
     @property
     def prompt_template(self) -> Optional[str]:
         """

evalscope/api/dataset/dataset.py
@@ -347,3 +347,24 @@ class DatasetDict:
             cur_dataset.reindex(group_size=repeats)
             dataset_dict[key] = cur_dataset
         return cls(dataset_dict)
+
+    @classmethod
+    def from_dataset_dicts(cls, dataset_dicts: List['DatasetDict']) -> 'DatasetDict':
+        """
+        Create a DatasetDict by merging multiple DatasetDicts.
+
+        Args:
+            dataset_dicts (List[DatasetDict]): List of DatasetDicts to merge.
+
+        Returns:
+            DatasetDict: A new DatasetDict containing the merged datasets.
+        """
+        merged_dict = defaultdict(list)
+        for dataset_dict in dataset_dicts:
+            for key, dataset in dataset_dict.items():
+                merged_dict[key].extend(dataset.samples)
+        # Create a MemoryDataset for each subset key
+        final_dict = {}
+        for key, samples in merged_dict.items():
+            final_dict[key] = MemoryDataset(samples, name=key)
+        return cls(final_dict)
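
A short usage sketch of the new merge helper; the subset name and samples below are made up, and the constructor calls simply mirror what the diff shows:

    from evalscope.api.dataset.dataset import DatasetDict, MemoryDataset, Sample

    part_a = DatasetDict({'default': MemoryDataset([Sample(input='q1', target='a1')], name='default')})
    part_b = DatasetDict({'default': MemoryDataset([Sample(input='q2', target='a2')], name='default')})

    # The 'default' subset of the result contains the samples from both parts.
    merged = DatasetDict.from_dataset_dicts([part_a, part_b])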

evalscope/api/dataset/loader.py
@@ -8,7 +8,7 @@ from typing import Callable, Dict, List, Optional, Union
 from evalscope.api.dataset.utils import record_to_sample_fn
 from evalscope.constants import DEFAULT_EVALSCOPE_CACHE_DIR, HubType
 from evalscope.utils import get_logger
-from evalscope.utils.io_utils import csv_to_list, gen_hash, jsonl_to_list, safe_filename
+from evalscope.utils.io_utils import csv_to_list, gen_hash, jsonl_to_list, safe_filename, tsv_to_list
 from .dataset import Dataset, FieldSpec, MemoryDataset, Sample
 from .utils import data_to_samples, shuffle_choices_if_requested
 
@@ -168,7 +168,11 @@ class LocalDataLoader(DataLoader):
         dataset = []
 
         # Check for JSONL or CSV files in the specified path
-        for ext, loader in [('.jsonl', jsonl_to_list), ('.csv', csv_to_list)]:
+        for ext, loader in [
+            ('.jsonl', jsonl_to_list),
+            ('.csv', csv_to_list),
+            ('.tsv', tsv_to_list),
+        ]:
             # Check if the file exists with the given extension
             if os.path.isfile(path) and path.endswith(ext):
                 file_paths = [path]

evalscope/api/mixin/sandbox_mixin.py
@@ -1,7 +1,6 @@
-import asyncio
-import threading
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional
 
+from evalscope.utils.function_utils import AsyncioLoopRunner, thread_safe
 from evalscope.utils.logger import get_logger
 
 if TYPE_CHECKING:
@@ -24,25 +23,10 @@ class SandboxMixin:
         self._sandbox_id: Optional[str] = None
         """Sandbox ID."""
 
-        self._loop: Optional[asyncio.AbstractEventLoop] = None
-        """Event loop for async operations."""
-
-        # Initialize sandbox synchronously by running async methods
-        if self.use_sandbox:
-            self._loop = asyncio.new_event_loop()
-
-            # Start the loop in a separate thread
-            def run_loop():
-                asyncio.set_event_loop(self._loop)
-                self._loop.run_forever()
-
-            self._loop_thread = threading.Thread(target=run_loop, daemon=True)
-            self._loop_thread.start()
-
-            # Wait for initialization
-            future = asyncio.run_coroutine_threadsafe(self._async_init(), self._loop)
-            future.result()
+        # Lazy init state
+        self._initialized: bool = False
 
+        # NOTE: Initialization is deferred.
         super().__init__()
 
     async def _async_init(self):
@@ -70,6 +54,25 @@
         """Get the sandbox ID."""
         return self._sandbox_id
 
+    @thread_safe
+    def ensure_sandbox_ready(self) -> bool:
+        """
+        Ensure the sandbox loop, manager, and sandbox instance are initialized.
+        This method is thread-safe and idempotent.
+        """
+        if not self.use_sandbox:
+            return False
+
+        if self._initialized and self._manager and self._sandbox_id:
+            return True
+
+        # Initialize manager and sandbox using the class-level runner
+        AsyncioLoopRunner.run(self.init_sandbox_manager_async())
+        AsyncioLoopRunner.run(self.init_sandbox_async())
+
+        self._initialized = True
+        return True
+
     async def init_sandbox_manager_async(self) -> Optional['SandboxManager']:
         """Initialize the sandbox manager asynchronously."""
         if self._manager is not None:
@@ -100,13 +103,7 @@
         if not self.use_sandbox:
             return None
 
-        # Use the dedicated loop if available
-        if self._loop and not self._loop.is_closed():
-            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_manager_async(), self._loop)
-            return future.result()
-        else:
-            # Fallback for cases where no loop is available
-            return asyncio.run(self.init_sandbox_manager_async())
+        return AsyncioLoopRunner.run(self.init_sandbox_manager_async())
 
     async def init_sandbox_async(self) -> Optional[str]:
         """Initialize the sandbox instance asynchronously."""
@@ -141,17 +138,12 @@
         if not self.use_sandbox:
             return None
 
-        # Use the dedicated loop if available
-        if self._loop and not self._loop.is_closed():
-            future = asyncio.run_coroutine_threadsafe(self.init_sandbox_async(), self._loop)
-            return future.result()
-        else:
-            # Fallback for cases where no loop is available
-            return asyncio.run(self.init_sandbox_async())
+        return AsyncioLoopRunner.run(self.init_sandbox_async())
 
     def execute_code_in_sandbox(self, code: str, timeout: int = 60, language: str = 'python') -> Dict[str, Any]:
         """Execute code in the sandbox."""
-        if not self._sandbox_id or not self._manager:
+        # Lazy, thread-safe initialization
+        if not self.ensure_sandbox_ready():
             logger.warning('Sandbox is not initialized.')
             return {'error': 'Sandbox is not initialized.'}
 
@@ -175,30 +167,16 @@
             )
             return result
 
-        # Use the dedicated loop if available
-        if self._loop and not self._loop.is_closed():
-            future = asyncio.run_coroutine_threadsafe(_execute_async(), self._loop)
-            result = future.result(timeout + 10)  # Add some buffer to the timeout
-        else:
-            # Fallback for cases where no loop is available
-            result = asyncio.run(_execute_async())
-
+        # Execute in background loop via class-level runner
+        result = AsyncioLoopRunner.run(_execute_async(), timeout=timeout + 10)
         return result.model_dump(exclude_none=True)
 
     def sandbox_finalize(self, *args, **kwargs):
         """Finalize the sandbox manager."""
         if self._manager:
             try:
-                if self._loop and not self._loop.is_closed():
-                    # Stop the manager using the dedicated loop
-                    future = asyncio.run_coroutine_threadsafe(self._manager.stop(), self._loop)
-                    future.result(timeout=30)
-
-                    # Stop the event loop
-                    self._loop.call_soon_threadsafe(self._loop.stop)
-                    if hasattr(self, '_loop_thread'):
-                        self._loop_thread.join(timeout=5)
-
+                # Stop the manager but keep the shared loop alive
+                AsyncioLoopRunner.run(self._manager.stop(), timeout=30)
                 logger.info('Sandbox manager finalized.')
             except Exception as e:
                 logger.warning(f'Error finalizing sandbox manager: {e}')
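
The refactor above swaps the per-instance event loop for a shared runner imported from evalscope.utils.function_utils. As a rough illustration of that pattern only (not the actual AsyncioLoopRunner implementation), a class-level runner can keep one background loop in a daemon thread and submit coroutines to it with run_coroutine_threadsafe:

    import asyncio
    import threading

    class BackgroundLoopRunner:  # illustrative stand-in, not evalscope's AsyncioLoopRunner
        _loop = None
        _lock = threading.Lock()

        @classmethod
        def _ensure_loop(cls) -> asyncio.AbstractEventLoop:
            # Create the shared loop once and drive it from a daemon thread
            with cls._lock:
                if cls._loop is None:
                    cls._loop = asyncio.new_event_loop()
                    threading.Thread(target=cls._loop.run_forever, daemon=True).start()
            return cls._loop

        @classmethod
        def run(cls, coro, timeout=None):
            # Submit the coroutine to the shared loop and block until it finishes
            future = asyncio.run_coroutine_threadsafe(coro, cls._ensure_loop())
            return future.result(timeout)

Calls from synchronous code then take the shape result = BackgroundLoopRunner.run(some_coroutine(), timeout=30), which matches how the mixin now invokes its async helpers.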

evalscope/api/model/generate_config.py
@@ -108,6 +108,12 @@ class GenerateConfig(BaseModel):
     extra_body: Optional[Dict[str, Any]] = Field(default=None)
     """Extra body to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
 
+    extra_query: Optional[Dict[str, Any]] = Field(default=None)
+    """Extra query parameters to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
+    extra_headers: Optional[Dict[str, str]] = Field(default=None)
+    """Extra headers to be sent with requests to OpenAI compatible servers. OpenAI, vLLM, and SGLang only."""
+
     height: Optional[int] = Field(default=None)
     """Image height for image generation model only"""
 

evalscope/app/ui/multi_model.py
@@ -204,7 +204,12 @@ def create_multi_model_tab(sidebar: 'SidebarComponents', lang: str):
         data_score_df_b, _ = get_single_dataset_df(report_df_b, dataset_name)
 
         # Get subset choices - should be same for both models
-        subsets = data_score_df_a[ReportKey.subset_name].unique().tolist()
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df_a.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
 
         return gr.update(choices=subsets, value=None), None
 

evalscope/app/ui/single_model.py
@@ -134,11 +134,17 @@ def create_single_model_tab(sidebar: 'SidebarComponents', lang: str):
     )
     def update_single_report_dataset(dataset_name, report_list):
         logger.debug(f'Updating single report dataset: {dataset_name}')
-        report_df = get_data_frame(report_list=report_list)
+        report_df = get_data_frame(report_list=report_list, flatten_metrics=True, flatten_categories=True)
         analysis = get_report_analysis(report_list, dataset_name)
         data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
         data_score_plot = plot_single_dataset_scores(data_score_df)
-        subsets = data_score_df[ReportKey.subset_name].unique().tolist()
+        # Only select the subsets that Cat.0 is not '-'
+        df_for_subsets = data_score_df.copy()
+        subsets = sorted(
+            df_for_subsets.loc[df_for_subsets[f'{ReportKey.category_prefix}0'].ne('-'),
+                               ReportKey.subset_name].dropna().unique().tolist()
+        )
+
         logger.debug(f'subsets: {subsets}')
         return data_score_plot, styler, gr.update(choices=subsets, value=None), None, analysis
 

evalscope/app/utils/data_utils.py
@@ -168,9 +168,10 @@ def get_model_prediction(work_dir: str, model_name: str, dataset_name: str, subs
         'Index': str(review_result.index),
         'Input': review_result.input.replace('\n', '\n\n'),  # for markdown
         'Metadata': metadata,
-        'Generated': prediction,
+        'Generated': prediction or '',  # Ensure no None value
         'Gold': target,
-        'Pred': extracted_prediction if extracted_prediction != prediction else '*Same as Generated*',
+        'Pred': (extracted_prediction if extracted_prediction != prediction else '*Same as Generated*')
+        or '',  # Ensure no None value
         'Score': score.model_dump(exclude_none=True),
         'NScore': normalize_score(score.main_value)
     }

evalscope/app/utils/visualization.py
@@ -18,7 +18,7 @@ logger = get_logger()
 def plot_single_report_scores(df: pd.DataFrame):
     if df is None:
         return None
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     plot = px.bar(df, x=df[ReportKey.dataset_name], y=df[ReportKey.score], text=df[ReportKey.score])
 
     width = DEFAULT_BAR_WIDTH if len(df[ReportKey.dataset_name]) <= 5 else None
@@ -36,7 +36,7 @@ def plot_single_report_sunburst(report_list: List[Report]):
     df = get_data_frame(report_list=report_list, flatten_metrics=False)
     categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
     path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
-    logger.debug(f'df: {df}')
+    logger.debug(f'df: \n{df}')
     df[categories] = df[categories].fillna('default')  # NOTE: fillna for empty categories
 
     plot = px.sunburst(