evalscope 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of evalscope might be problematic; see the registry's advisory for more details.

Files changed (100)
  1. evalscope/api/benchmark/__init__.py +8 -1
  2. evalscope/api/benchmark/adapters/__init__.py +1 -0
  3. evalscope/api/benchmark/adapters/ner_adapter.py +212 -0
  4. evalscope/api/benchmark/benchmark.py +14 -0
  5. evalscope/api/dataset/dataset.py +21 -0
  6. evalscope/api/dataset/loader.py +6 -2
  7. evalscope/api/mixin/sandbox_mixin.py +32 -54
  8. evalscope/api/model/generate_config.py +6 -0
  9. evalscope/benchmarks/aa_lcr/__init__.py +0 -0
  10. evalscope/benchmarks/aa_lcr/aa_lcr_adapter.py +205 -0
  11. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  12. evalscope/benchmarks/data_collection/data_collection_adapter.py +2 -1
  13. evalscope/benchmarks/general_arena/general_arena_adapter.py +1 -1
  14. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  15. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  16. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +23 -4
  17. evalscope/benchmarks/hallusion_bench/__init__.py +0 -0
  18. evalscope/benchmarks/hallusion_bench/hallusion_bench_adapter.py +158 -0
  19. evalscope/benchmarks/humaneval/humaneval_adapter.py +2 -1
  20. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +3 -1
  21. evalscope/benchmarks/math_verse/__init__.py +0 -0
  22. evalscope/benchmarks/math_verse/math_verse_adapter.py +100 -0
  23. evalscope/benchmarks/math_vision/__init__.py +0 -0
  24. evalscope/benchmarks/math_vision/math_vision_adapter.py +111 -0
  25. evalscope/benchmarks/math_vista/math_vista_adapter.py +6 -26
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +1 -1
  27. evalscope/benchmarks/ner/__init__.py +0 -0
  28. evalscope/benchmarks/ner/broad_twitter_corpus_adapter.py +52 -0
  29. evalscope/benchmarks/ner/conll2003_adapter.py +48 -0
  30. evalscope/benchmarks/ner/copious_adapter.py +85 -0
  31. evalscope/benchmarks/ner/cross_ner_adapter.py +120 -0
  32. evalscope/benchmarks/ner/cross_ner_entities/__init__.py +0 -0
  33. evalscope/benchmarks/ner/cross_ner_entities/ai.py +54 -0
  34. evalscope/benchmarks/ner/cross_ner_entities/literature.py +36 -0
  35. evalscope/benchmarks/ner/cross_ner_entities/music.py +39 -0
  36. evalscope/benchmarks/ner/cross_ner_entities/politics.py +37 -0
  37. evalscope/benchmarks/ner/cross_ner_entities/science.py +58 -0
  38. evalscope/benchmarks/ner/genia_ner_adapter.py +66 -0
  39. evalscope/benchmarks/ner/harvey_ner_adapter.py +58 -0
  40. evalscope/benchmarks/ner/mit_movie_trivia_adapter.py +74 -0
  41. evalscope/benchmarks/ner/mit_restaurant_adapter.py +66 -0
  42. evalscope/benchmarks/ner/ontonotes5_adapter.py +87 -0
  43. evalscope/benchmarks/ner/wnut2017_adapter.py +61 -0
  44. evalscope/benchmarks/ocr_bench_v2/utils.py +1 -0
  45. evalscope/benchmarks/omnidoc_bench/__init__.py +0 -0
  46. evalscope/benchmarks/omnidoc_bench/end2end_eval.py +349 -0
  47. evalscope/benchmarks/omnidoc_bench/metrics.py +547 -0
  48. evalscope/benchmarks/omnidoc_bench/omnidoc_bench_adapter.py +135 -0
  49. evalscope/benchmarks/omnidoc_bench/utils.py +1937 -0
  50. evalscope/benchmarks/poly_math/__init__.py +0 -0
  51. evalscope/benchmarks/poly_math/poly_math_adapter.py +127 -0
  52. evalscope/benchmarks/poly_math/utils/instruction.py +105 -0
  53. evalscope/benchmarks/pope/__init__.py +0 -0
  54. evalscope/benchmarks/pope/pope_adapter.py +111 -0
  55. evalscope/benchmarks/seed_bench_2_plus/__init__.py +0 -0
  56. evalscope/benchmarks/seed_bench_2_plus/seed_bench_2_plus_adapter.py +72 -0
  57. evalscope/benchmarks/simple_vqa/__init__.py +0 -0
  58. evalscope/benchmarks/simple_vqa/simple_vqa_adapter.py +169 -0
  59. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +1 -1
  60. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +1 -1
  61. evalscope/benchmarks/visu_logic/__init__.py +0 -0
  62. evalscope/benchmarks/visu_logic/visu_logic_adapter.py +75 -0
  63. evalscope/benchmarks/zerobench/__init__.py +0 -0
  64. evalscope/benchmarks/zerobench/zerobench_adapter.py +64 -0
  65. evalscope/constants.py +4 -0
  66. evalscope/evaluator/evaluator.py +72 -79
  67. evalscope/metrics/math_parser.py +14 -0
  68. evalscope/metrics/metric.py +1 -1
  69. evalscope/models/utils/openai.py +4 -0
  70. evalscope/perf/arguments.py +24 -4
  71. evalscope/perf/benchmark.py +74 -89
  72. evalscope/perf/http_client.py +31 -16
  73. evalscope/perf/main.py +15 -2
  74. evalscope/perf/plugin/api/base.py +9 -7
  75. evalscope/perf/plugin/api/custom_api.py +13 -58
  76. evalscope/perf/plugin/api/default_api.py +179 -79
  77. evalscope/perf/plugin/api/openai_api.py +4 -3
  78. evalscope/perf/plugin/datasets/base.py +21 -0
  79. evalscope/perf/plugin/datasets/custom.py +2 -3
  80. evalscope/perf/plugin/datasets/line_by_line.py +2 -3
  81. evalscope/perf/plugin/datasets/longalpaca.py +2 -3
  82. evalscope/perf/plugin/datasets/openqa.py +2 -4
  83. evalscope/perf/plugin/datasets/random_dataset.py +1 -3
  84. evalscope/perf/utils/benchmark_util.py +36 -22
  85. evalscope/perf/utils/db_util.py +14 -19
  86. evalscope/perf/utils/local_server.py +0 -44
  87. evalscope/perf/utils/log_utils.py +21 -6
  88. evalscope/report/__init__.py +2 -1
  89. evalscope/run.py +4 -0
  90. evalscope/utils/function_utils.py +195 -12
  91. evalscope/utils/io_utils.py +74 -0
  92. evalscope/utils/logger.py +49 -17
  93. evalscope/utils/ner.py +377 -0
  94. evalscope/version.py +2 -2
  95. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/METADATA +235 -363
  96. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/RECORD +100 -55
  97. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/WHEEL +1 -1
  98. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/entry_points.txt +0 -0
  99. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info/licenses}/LICENSE +0 -0
  100. {evalscope-1.1.0.dist-info → evalscope-1.1.1.dist-info}/top_level.txt +0 -0
evalscope/utils/ner.py ADDED
@@ -0,0 +1,377 @@
1
+ import re
2
+ from typing import Any, Dict, List, Set, Tuple
3
+
4
+ from evalscope.utils.logger import get_logger
5
+
6
+ logger = get_logger()
7
+
8
# Zero-shot prompt for XML-style NER tagging.
# Placeholders:
#   {entities}    - human-readable description of each entity type
#   {entity_list} - the exact tag names the model may emit
#   {text}        - the raw input text to annotate
# The model must echo the input verbatim inside <response>...</response>,
# inserting only <tag>...</tag> markup around entities.
PROMPT_TEMPLATE = """
You are a named entity recognition system that identifies the following entity types:
{entities}

Process the provided text and mark all named entities with XML-style tags.

For example:
<person>John Smith</person> works at <organization>Google</organization> in <location>Mountain View</location>.

Available entity tags: {entity_list}

INSTRUCTIONS:
1. Wrap your entire response in <response>...</response> tags.
2. Inside these tags, include the original text with entity tags inserted.
3. Do not change the original text in any way (preserve spacing, punctuation, case, etc.).
4. Tag ALL entities you can identify using the exact tag names provided.
5. Do not include explanations, just the tagged text.
6. If entity spans overlap, choose the most specific entity type.
7. Ensure every opening tag has a matching closing tag.

Text to process:
{text}
""".lstrip()

# Few-shot variant of PROMPT_TEMPLATE: identical instructions, preceded by a
# {fewshot} section of worked examples rendered by the caller.
FEWSHOT_TEMPLATE = """
Here are some examples of named entity recognition:

{fewshot}

You are a named entity recognition system that identifies the following entity types:
{entities}

Process the provided text and mark all named entities with XML-style tags.

For example:
<person>John Smith</person> works at <organization>Google</organization> in <location>Mountain View</location>.

Available entity tags: {entity_list}

INSTRUCTIONS:
1. Wrap your entire response in <response>...</response> tags.
2. Inside these tags, include the original text with entity tags inserted.
3. Do not change the original text in any way (preserve spacing, punctuation, case, etc.).
4. Tag ALL entities you can identify using the exact tag names provided.
5. Do not include explanations, just the tagged text.
6. If entity spans overlap, choose the most specific entity type.
7. Ensure every opening tag has a matching closing tag.

Text to process:
{text}
""".lstrip()

# Common error patterns to handle in XML predictions.
# Each entry is a (regex, replacement) pair applied via re.sub in
# clean_prediction(). The single default pattern rewrites a mismatched
# closing tag to match its opening tag, e.g.
# '<person>John</org>' -> '<person>John</person>'.
DEFAULT_TAG_FIX_PATTERNS = [
    # Fix mismatched tags
    (r'<(\w+)>(.*?)</\w+>', r'<\1>\2</\1>'),
]
65
+
66
+
67
+ def create_target_text(tokens: List[str], ner_tags: List[str], entity_type_map: Dict[str, str]) -> str:
68
+ """
69
+ Create annotated text from tokens and NER tags.
70
+ Handles BIO tagging scheme conversion to inline XML-style tags.
71
+
72
+ Args:
73
+ tokens: List of text tokens
74
+ ner_tags: List of BIO tags corresponding to tokens
75
+ entity_type_map: Mapping from BIO entity types to user-friendly tag names
76
+
77
+ Returns:
78
+ String with XML-style entity markup wrapped in <response> tags
79
+ """
80
+ result = []
81
+ current_entity = None
82
+ entity_tokens = []
83
+
84
+ for i, (token, tag) in enumerate(zip(tokens, ner_tags)):
85
+ if tag.startswith('B-'): # Beginning of entity
86
+ # Close previous entity if exists
87
+ if current_entity:
88
+ entity_type = entity_type_map.get(current_entity, '')
89
+ if entity_type:
90
+ result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
91
+ else:
92
+ result.append(' '.join(entity_tokens))
93
+ entity_tokens = []
94
+
95
+ current_entity = tag[2:] # Remove B- prefix
96
+ entity_tokens.append(token)
97
+ elif tag.startswith('I-') and current_entity and tag[2:] == current_entity: # Inside entity
98
+ entity_tokens.append(token)
99
+ else: # Outside any entity (O tag)
100
+ if current_entity: # Close previous entity
101
+ entity_type = entity_type_map.get(current_entity, '')
102
+ if entity_type:
103
+ result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
104
+ else:
105
+ result.append(' '.join(entity_tokens))
106
+ current_entity = None
107
+ entity_tokens = []
108
+
109
+ result.append(token)
110
+
111
+ # Handle any remaining entity at end of sequence
112
+ if current_entity:
113
+ entity_type = entity_type_map.get(current_entity, '')
114
+ if entity_type:
115
+ result.append(f'<{entity_type.lower()}>{" ".join(entity_tokens)}</{entity_type.lower()}>')
116
+ else:
117
+ result.append(' '.join(entity_tokens))
118
+
119
+ # Wrap the entire response in <response> tags as required by the pipeline
120
+ return f'<response>{" ".join(result)}</response>'
121
+
122
+
123
def clean_prediction(text: str, tag_fix_patterns: List[Tuple[str, str]] = None) -> str:
    """
    Clean and fix common XML errors in model predictions.

    Strips the surrounding <response>...</response> envelope (when present)
    and then applies each (pattern, replacement) regex pair in turn.

    Args:
        text: The prediction text to clean
        tag_fix_patterns: Regex (pattern, replacement) pairs used to repair
            common XML mistakes; defaults to DEFAULT_TAG_FIX_PATTERNS.

    Returns:
        Cleaned text with fixed XML tags
    """
    patterns = DEFAULT_TAG_FIX_PATTERNS if tag_fix_patterns is None else tag_fix_patterns

    # Unwrap the <response> envelope if the model emitted one.
    wrapper = re.search(r'<response>(.*?)</response>', text, re.DOTALL)
    cleaned = wrapper.group(1) if wrapper else text

    # Repair common tagging mistakes one pattern at a time.
    for regex, substitute in patterns:
        cleaned = re.sub(regex, substitute, cleaned)

    return cleaned
149
+
150
+
151
def extract_entities_from_text(text: str, reverse_entity_map: Dict[str, str]) -> List[Tuple]:
    """
    Extract entities from tagged text with robust error handling.

    Well-formed ``<tag>...</tag>`` pairs are collected first; a second pass
    then salvages unclosed tags that do not overlap anything already found
    (a fallback for malformed model output).

    Args:
        text: Text with XML entity tags
        reverse_entity_map: Mapping from user-friendly tag names to BIO entity types

    Returns:
        List of (entity_type, entity_text, start_idx, end_idx) tuples
    """
    found: List[Tuple] = []

    # Pass 1: well-formed pairs where the closing tag matches the opening tag
    # (the \1 backreference enforces the match).
    well_formed = r'<(\w+)>(.*?)</\1>'
    try:
        for hit in re.finditer(well_formed, text):
            tag_name = hit.group(1).lower()  # normalize tag name to lowercase
            # Unknown tag names are kept as-is so the prediction still
            # counts during evaluation.
            resolved = reverse_entity_map.get(tag_name) or tag_name
            found.append((resolved, hit.group(2), hit.start(), hit.end()))
    except Exception as err:
        logger.warning(f'Error parsing entities in text: {str(err)}')

    # Pass 2: salvage unclosed tags. Only spans that do not start inside a
    # previously found entity and that map to a known type are accepted.
    unclosed = r'<(\w+)>(.*?)(?=<|$)'
    try:
        for hit in re.finditer(unclosed, text):
            begin, finish = hit.start(), hit.end()
            if any(s <= begin < e for _, _, s, e in found):
                continue  # already covered by a well-formed tag

            resolved = reverse_entity_map.get(hit.group(1).lower())
            if resolved:
                found.append((resolved, hit.group(2), begin, finish))
    except Exception as err:
        logger.warning(f'Error handling malformed tags: {str(err)}')

    return found
210
+
211
+
212
def xml_to_bio_tags(xml_text: str, original_tokens: List[str], reverse_entity_map: Dict[str, str]) -> List[str]:
    """
    Convert XML-annotated text back to BIO tags aligned with the original tokens.

    The alignment is heuristic: each extracted entity is located in the
    space-joined original text, using up to 20 characters of (tag-stripped)
    context from the XML text to disambiguate repeated occurrences, then any
    token overlapping the located character span is tagged.

    Args:
        xml_text: Text with XML entity annotations
        original_tokens: Original tokens to align with
        reverse_entity_map: Mapping from user-friendly tag names to BIO entity types

    Returns:
        List of BIO tags corresponding to the original tokens
    """
    # Extract entities with their character positions
    entities = extract_entities_from_text(xml_text, reverse_entity_map)

    # Initialize all tags as 'O'
    bio_tags = ['O'] * len(original_tokens)

    # Reconstruct the original text to find character positions for each token.
    # NOTE(review): assumes tokens were single-space joined; tokenizers that
    # drop or merge whitespace may shift offsets — confirm against callers.
    original_text = ' '.join(original_tokens)

    # Track token start positions in the original text
    token_positions = []
    pos = 0
    for token in original_tokens:
        token_pos = original_text.find(token, pos)
        if token_pos == -1:
            # Fallback: just use the current position if we can't find the exact match
            token_positions.append(pos)
        else:
            token_positions.append(token_pos)
            pos = token_pos + len(token)

    # Add token end positions (exclusive of the trailing space)
    token_ends = [pos + len(token) for pos, token in zip(token_positions, original_tokens)]

    # Map entities to tokens based on character positions
    for entity_type, entity_text, start_pos, end_pos in entities:
        # Extract the context from the XML text to help locate the correct entity occurrence
        # Get some context before and after the entity in the XML text
        context_start = max(0, start_pos - 20)
        context_end = min(len(xml_text), end_pos + 20)

        # Extract context without XML tags
        context_before = re.sub(r'<[^>]+>', '', xml_text[context_start:start_pos])
        context_after = re.sub(r'<[^>]+>', '', xml_text[end_pos:context_end])

        # Use context to find the correct entity position in original text
        search_pos = 0
        entity_start = -1

        while search_pos < len(original_text):
            # Find the next occurrence of the entity
            potential_start = original_text.find(entity_text, search_pos)
            if potential_start == -1:
                break

            # Window of original text expected to contain the contexts
            potential_context_start = max(0, potential_start - len(context_before))
            potential_context_end = min(len(original_text), potential_start + len(entity_text) + len(context_after))

            # Check if the surrounding text agrees with the XML context
            before_match = context_before.strip() in original_text[potential_context_start:potential_start].strip()
            after_match = context_after.strip() in original_text[potential_start
                                                                 + len(entity_text):potential_context_end].strip()

            # If context matches — or we are already past the midpoint of the
            # text — accept this occurrence rather than searching forever.
            if before_match or after_match or search_pos > len(original_text) // 2:
                entity_start = potential_start
                break

            # Move search position forward
            search_pos = potential_start + 1

        # If we couldn't find the entity with context, fall back to the first occurrence
        if entity_start == -1:
            entity_start = original_text.find(entity_text)
            if entity_start == -1:
                # Entity text does not appear in the original at all; skip it.
                continue

        entity_end = entity_start + len(entity_text)

        # Find tokens that overlap with this entity.
        # NOTE(review): the comparison is inclusive (<= / >=), so a token that
        # merely touches the span boundary would match; with single-space
        # joins the separating space prevents this in practice — confirm.
        for i, (token_start, token_end) in enumerate(zip(token_positions, token_ends)):
            if token_start <= entity_end and token_end >= entity_start:
                # This token overlaps with the entity
                if bio_tags[i] == 'O':
                    # Start of entity when the previous token is untagged or
                    # tagged with a different type.
                    # NOTE(review): endswith() is a loose type check; types
                    # that are suffixes of one another could merge spans.
                    if i == 0 or bio_tags[i - 1] == 'O' or not bio_tags[i - 1].endswith(entity_type):
                        bio_tags[i] = f'B-{entity_type}'
                    else:
                        # Continuation of entity
                        bio_tags[i] = f'I-{entity_type}'

    return bio_tags
306
+
307
+
308
def calculate_bio_metrics(pred_tags: List[str], gold_tags: List[str], tokens: List[str]) -> Tuple[int, int, int]:
    """
    Calculate span-level metrics by comparing BIO tag sequences.

    Args:
        pred_tags: Predicted BIO tags
        gold_tags: Gold standard BIO tags
        tokens: Original tokens (used to recover each span's surface text)

    Returns:
        Tuple of (true_positives, false_positives, false_negatives)
    """
    predicted = extract_spans_from_bio(pred_tags, tokens)
    reference = extract_spans_from_bio(gold_tags, tokens)

    # Exact-match comparison: a span counts only if type, boundaries and
    # surface text all agree.
    tp = len(predicted & reference)
    fp = len(predicted - reference)
    fn = len(reference - predicted)

    return tp, fp, fn
330
+
331
+
332
def extract_spans_from_bio(tags: List[str], tokens: List[str]) -> Set[Tuple]:
    """
    Extract entity spans from BIO tags.

    Args:
        tags: List of BIO tags
        tokens: List of tokens corresponding to the tags

    Returns:
        Set of (entity_type, start_idx, end_idx, text) tuples, where the
        indices are inclusive token positions.

    NOTE(review): an 'I-<type>' tag extends the open span even when <type>
    differs from the span's type — confirm this tolerance is intended
    (create_target_text requires matching types).
    """
    spans: Set[Tuple] = set()
    open_type = None   # entity type of the span currently being built
    open_start = None  # token index where that span began
    collected: List[str] = []

    def _close(end_index: int) -> None:
        # Emit the currently open span (caller guarantees one exists).
        spans.add((open_type, open_start, end_index, ' '.join(collected)))

    for idx, (word, label) in enumerate(zip(tokens, tags)):
        if label.startswith('B-'):  # a new span begins here
            if open_type:
                _close(idx - 1)
                collected = []
            open_type = label[2:]
            open_start = idx
            collected.append(word)
        elif label.startswith('I-') and open_type:  # continue the open span
            collected.append(word)
        elif label == 'O':  # outside any entity
            if open_type:
                _close(idx - 1)
                open_type = None
                open_start = None
                collected = []
        # Any other label (e.g. orphan 'I-' with no open span) is ignored.

    # Flush a span still open at the end of the sequence.
    if open_type:
        _close(len(tokens) - 1)

    return spans
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
1
1
  # Copyright (c) Alibaba, Inc. and its affiliates.
2
2
 
3
- __version__ = '1.1.0'
4
- __release_datetime__ = '2025-10-14 14:00:00'
3
+ __version__ = '1.1.1'
4
+ __release_datetime__ = '2025-10-27 17:00:00'