evalscope 0.16.1__py3-none-any.whl → 0.16.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (82)
  1. evalscope/app/app.py +20 -5
  2. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  3. evalscope/backend/rag_eval/utils/embedding.py +2 -4
  4. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  5. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  6. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  7. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  8. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  9. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  10. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  11. evalscope/benchmarks/benchmark.py +1 -0
  12. evalscope/benchmarks/bfcl/__init__.py +0 -0
  13. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  14. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  15. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  16. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  17. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  18. evalscope/benchmarks/data_adapter.py +2 -0
  19. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  20. evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
  21. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +1 -0
  23. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  26. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  27. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  29. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  30. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  32. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  34. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  35. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  36. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  37. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  38. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
  39. evalscope/benchmarks/needle_haystack/utils.py +2 -2
  40. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  41. evalscope/benchmarks/race/race_adapter.py +3 -0
  42. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  43. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  44. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  45. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  46. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  48. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  49. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  50. evalscope/collections/evaluator.py +50 -28
  51. evalscope/constants.py +1 -1
  52. evalscope/evaluator/evaluator.py +6 -5
  53. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  54. evalscope/models/adapters/__init__.py +2 -0
  55. evalscope/models/adapters/base_adapter.py +29 -27
  56. evalscope/models/adapters/bfcl_adapter.py +244 -0
  57. evalscope/models/adapters/server_adapter.py +78 -17
  58. evalscope/models/custom/custom_model.py +0 -3
  59. evalscope/models/custom/dummy_model.py +77 -39
  60. evalscope/models/local_model.py +1 -1
  61. evalscope/models/register.py +2 -1
  62. evalscope/perf/arguments.py +2 -0
  63. evalscope/perf/benchmark.py +16 -3
  64. evalscope/perf/plugin/api/openai_api.py +2 -0
  65. evalscope/report/combinator.py +38 -12
  66. evalscope/report/utils.py +24 -1
  67. evalscope/run.py +1 -1
  68. evalscope/summarizer.py +1 -1
  69. evalscope/utils/io_utils.py +59 -2
  70. evalscope/version.py +2 -2
  71. {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/METADATA +4 -3
  72. {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/RECORD +82 -79
  73. tests/aigc/test_t2i.py +8 -8
  74. tests/cli/test_all.py +40 -33
  75. tests/cli/test_collection.py +4 -3
  76. tests/cli/test_run.py +36 -21
  77. tests/rag/test_clip_benchmark.py +5 -1
  78. tests/rag/test_mteb.py +46 -2
  79. {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/LICENSE +0 -0
  80. {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/WHEEL +0 -0
  81. {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/entry_points.txt +0 -0
  82. {evalscope-0.16.1.dist-info → evalscope-0.16.3.dist-info}/top_level.txt +0 -0
evalscope/app/app.py CHANGED
@@ -1,6 +1,7 @@
  import argparse
  import glob
  import gradio as gr
+ import json
  import numpy as np
  import os
  import pandas as pd
@@ -135,11 +136,11 @@ def plot_single_report_scores(df: pd.DataFrame):

  def plot_single_report_sunburst(report_list: List[Report]):
  if report_list[0].name == DataCollection.NAME:
- df = get_data_frame(report_list)
+ df = get_data_frame(report_list=report_list)
  categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
  path = categories + [ReportKey.subset_name]
  else:
- df = get_data_frame(report_list, flatten_metrics=False)
+ df = get_data_frame(report_list=report_list, flatten_metrics=False)
  categories = sorted([i for i in df.columns if i.startswith(ReportKey.category_prefix)])
  path = [ReportKey.dataset_name] + categories + [ReportKey.subset_name]
  logger.debug(f'df: {df}')
@@ -233,7 +234,7 @@ def convert_html_tags(text):
  def process_string(string: str, max_length: int = 2048) -> str:
  string = convert_html_tags(string) # for display labels e.g.
  if max_length and len(string) > max_length:
- return f'{string[:max_length // 2]}......{string[-max_length // 2:]}'
+ return f'{string[:max_length // 2]}...[truncate]...{string[-max_length // 2:]}'
  return string


@@ -257,7 +258,7 @@ def dict_to_markdown(data) -> str:
  return '\n\n'.join(markdown_lines)


- def process_model_prediction(item: Any, max_length: int = 2048) -> str:
+ def process_model_prediction_old(item: Any, max_length: int = 2048) -> str:
  """
  Process model prediction output into a formatted string.

@@ -281,6 +282,20 @@ def process_model_prediction(item: Any, max_length: int = 2048) -> str:
  return result


+ def process_model_prediction(item: Any, max_length: int = 4096) -> str:
+ if isinstance(item, (dict, list)):
+ result = json.dumps(item, ensure_ascii=False, indent=2)
+ result = f'```json\n{result}\n```'
+ else:
+ result = str(item)
+
+ # Apply HTML tag conversion and truncation only at the final output
+ if max_length is not None:
+ return process_string(result, max_length)
+
+ return result
+
+
  def normalize_score(score):
  try:
  if isinstance(score, bool):
@@ -583,7 +598,7 @@ def create_single_model_tab(sidebar: SidebarComponents, lang: str):
  outputs=[dataset_plot, dataset_table, subset_select, data_review_df, report_analysis])
  def update_single_report_dataset(dataset_name, report_list):
  logger.debug(f'Updating single report dataset: {dataset_name}')
- report_df = get_data_frame(report_list)
+ report_df = get_data_frame(report_list=report_list)
  analysis = get_report_analysis(report_list, dataset_name)
  data_score_df, styler = get_single_dataset_df(report_df, dataset_name)
  data_score_plot = plot_single_dataset_scores(data_score_df)
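Note on the reworked `process_model_prediction` helper above: dict or list predictions are now serialized to indented JSON and wrapped in a fenced json block before truncation, so structured outputs render as formatted JSON in the review table. A rough standalone sketch of that behavior (the function name `render_prediction` is hypothetical, and truncation is simplified to a plain slice because `process_string` lives elsewhere in app.py):

    import json

    def render_prediction(item, max_length=4096):
        # Mirrors the new process_model_prediction: structured data -> fenced JSON text
        if isinstance(item, (dict, list)):
            text = '```json\n' + json.dumps(item, ensure_ascii=False, indent=2) + '\n```'
        else:
            text = str(item)
        # Stand-in for process_string(): truncate only at the final output
        return text if max_length is None or len(text) <= max_length else text[:max_length]

    print(render_prediction({'answer': 42, 'choices': ['A', 'B']}))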
evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py CHANGED
@@ -1,4 +1,5 @@
  import os
+ import posixpath # For URL path handling
  import torch
  from torch.utils.data import DataLoader
  from torch.utils.data import Dataset as TorchDataset
@@ -186,42 +187,53 @@ def build_wds_dataset(dataset_name, transform, split='test', data_dir='root', ca

  Set `cache_dir` to a path to cache the dataset, otherwise, no caching will occur.
  """
+ import requests
  import webdataset as wds

  def read_txt(fname):
- if '://' in fname:
- stream = os.popen("curl -L -s --fail '%s'" % fname, 'r')
- value = stream.read()
- if stream.close():
- raise FileNotFoundError('Failed to retreive data')
+ if fname.startswith(('http://', 'https://')):
+ try:
+ response = requests.get(fname)
+ response.raise_for_status() # Ensure the HTTP request was successful
+ return response.text
+ except requests.exceptions.RequestException as e:
+ raise FileNotFoundError(f'Failed to read {fname}: {e}')
  else:
  with open(fname, 'r') as file:
- value = file.read()
- return value
+ return file.read()
+
+ def url_path_join(*parts):
+ """Join URL path parts with forward slashes regardless of platform"""
+ return posixpath.join(*parts)

  if not data_dir:
  data_dir = f'https://modelscope.cn/datasets/clip-benchmark/wds_{dataset_name}/resolve/master'

  # Git LFS files have a different file path to access the raw data than other files
- if data_dir.startswith('https://modelscope.cn/datasets'):
+ is_url = data_dir.startswith(('http://', 'https://'))
+ if is_url and data_dir.startswith('https://modelscope.cn/datasets'):
  *split_url_head, _, url_path = data_dir.split('/', 7)
  url_head = '/'.join(split_url_head)
  metadata_dir = '/'.join([url_head, 'resolve', url_path])
  tardata_dir = '/'.join([url_head, 'resolve', url_path])
  else:
  metadata_dir = tardata_dir = data_dir
+
+ # Use appropriate path joining function based on whether we're dealing with a URL
+ path_join = url_path_join if is_url else os.path.join
+
  # Get number of shards
- nshards_fname = os.path.join(metadata_dir, split, 'nshards.txt')
+ nshards_fname = path_join(metadata_dir, split, 'nshards.txt')
  nshards = int(read_txt(nshards_fname)) # Do not catch FileNotFound, nshards.txt should be mandatory

  # Get dataset type (classification or retrieval)
- type_fname = os.path.join(metadata_dir, 'dataset_type.txt')
+ type_fname = path_join(metadata_dir, 'dataset_type.txt')
  try:
  dataset_type = read_txt(type_fname).strip().lower()
  except FileNotFoundError:
  dataset_type = 'classification'

- filepattern = os.path.join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
+ filepattern = path_join(tardata_dir, split, '{0..%d}.tar' % (nshards - 1))
  # Load webdataset (support WEBP, PNG, and JPG for now)
  if not cache_dir or not isinstance(cache_dir, str):
  cache_dir = None
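The switch from `os.path.join` to the `posixpath`-based `url_path_join` above matters mainly on Windows, where `os.path.join` would splice backslashes into remote URLs; `posixpath.join` always uses forward slashes. A minimal illustration (the dataset name in the URL is only an example):

    import posixpath

    base = 'https://modelscope.cn/datasets/clip-benchmark/wds_mnist/resolve/master'
    # posixpath.join keeps URL separators intact on every platform
    print(posixpath.join(base, 'test', 'nshards.txt'))
    # -> https://modelscope.cn/datasets/clip-benchmark/wds_mnist/resolve/master/test/nshards.txt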
evalscope/backend/rag_eval/utils/embedding.py CHANGED
@@ -172,11 +172,9 @@ class CrossEncoderModel(BaseModel):
  kwargs.pop(key)
  self.encode_kwargs.update(kwargs)

- if len(sentences[0]) == 3: # Note: For mteb retrieval task
+ if len(sentences[0]) == 2: # Note: For mteb retrieval task
  processed_sentences = []
- for query, docs, instruction in sentences:
- if isinstance(docs, dict):
- docs = docs['text']
+ for query, docs in sentences:
  processed_sentences.append((self.prompt + query, docs))
  sentences = processed_sentences
  embeddings = self.model.predict(sentences, **self.encode_kwargs)
evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py CHANGED
@@ -69,6 +69,7 @@ class EvalMuseAdapter(T2IBaseAdapter):
  if 'FGA_BLIP2Score' in metric_name and '(' in metric_name: # FGA_BLIP2Score element score
  metrics_prefix = metric_name.split(':')[0]
  category = metric_name.rpartition('(')[-1].split(')')[0]
+ category = category.split('-')[0].lower() # remove the suffix if exists
  new_items[f'{metrics_prefix}:{category}'].extend(value_list)
  else:
  new_items[metric_name].extend(value_list)
evalscope/benchmarks/aime/aime24_adapter.py CHANGED
@@ -1,5 +1,4 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import OutputType
  from evalscope.metrics import extract_answer, math_equal, strip_answer_string
  from evalscope.utils.logger import get_logger

@@ -11,6 +10,9 @@ logger = get_logger()
  @Benchmark.register(
  name='aime24',
  pretty_name='AIME-2024',
+ tags=['Mathematics'],
+ description=
+ 'The AIME 2024 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.', # noqa: E501
  dataset_id='HuggingFaceH4/aime_2024',
  subset_list=['default'],
  metric_list=['AveragePass@1'],
evalscope/benchmarks/aime/aime25_adapter.py CHANGED
@@ -1,5 +1,4 @@
  from evalscope.benchmarks import Benchmark, DataAdapter
- from evalscope.constants import OutputType
  from evalscope.metrics import extract_answer, math_equal, strip_answer_string
  from evalscope.utils.logger import get_logger

@@ -11,6 +10,9 @@ logger = get_logger()
  @Benchmark.register(
  name='aime25',
  pretty_name='AIME-2025',
+ tags=['Mathematics'],
+ description=
+ 'The AIME 2025 benchmark is based on problems from the American Invitational Mathematics Examination, a prestigious high school mathematics competition. This benchmark tests a model’s ability to solve challenging mathematics problems by generating step-by-step solutions and providing the correct final answer.',
  dataset_id='opencompass/AIME2025',
  subset_list=['AIME2025-I', 'AIME2025-II'],
  metric_list=['AveragePass@1'],
evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py CHANGED
@@ -47,6 +47,11 @@ Evaluate the models based on the quality and relevance of their outputs, and sel
  @Benchmark.register(
  name='alpaca_eval',
  pretty_name='AlpacaEval2.0',
+ tags=['Instruction-Following', 'Reasoning'],
+ description='Alpaca Eval 2.0 is an enhanced framework for evaluating instruction-following language models, '
+ 'featuring an improved auto-annotator, updated baselines, and continuous preference calculation to '
+ 'provide more accurate and cost-effective model assessments. '
+ 'Currently not support `length-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-turbo`.', # noqa: E501
  dataset_id='AI-ModelScope/alpaca_eval',
  subset_list=['alpaca_eval_gpt4_baseline'],
  metric_list=['winrate'],
evalscope/benchmarks/arc/arc_adapter.py CHANGED
@@ -17,6 +17,9 @@ logger = get_logger()
  @Benchmark.register(
  name='arc',
  pretty_name='ARC',
+ tags=['Reasoning', 'MCQ'],
+ description=
+ 'The ARC (AI2 Reasoning Challenge) benchmark is designed to evaluate the reasoning capabilities of AI models through multiple-choice questions derived from science exams. It includes two subsets: ARC-Easy and ARC-Challenge, which vary in difficulty.', # noqa: E501
  dataset_id='modelscope/ai2_arc',
  model_adapter=OutputType.GENERATION,
  output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/arena_hard/arena_hard_adapter.py CHANGED
@@ -1,5 +1,3 @@
- import re
- from collections import defaultdict
  from typing import Any, List

  from evalscope.benchmarks import Benchmark, DataAdapter
@@ -19,12 +17,18 @@ GRADER_TEMPLATE = "<|User Prompt|>\n{question}\n\n<|The Start of Assistant A's A
  @Benchmark.register(
  name='arena_hard',
  pretty_name='ArenaHard',
+ tags=['Instruction-Following', 'Reasoning'],
+ description=
+ 'ArenaHard is a benchmark designed to evaluate the performance of large language models in a competitive setting, '
+ 'where models are pitted against each other in a series of tasks to determine their relative strengths and weaknesses. '
+ 'It includes a set of challenging tasks that require reasoning, understanding, and generation capabilities. '
+ 'Currently not support `style-controlled winrate`; the official Judge model is `gpt-4-1106-preview`, while the baseline model is `gpt-4-0314`.', # noqa: E501
  dataset_id='AI-ModelScope/arena-hard-auto-v0.1',
  metric_list=['winrate'],
  few_shot_num=0,
  train_split=None,
  eval_split='test')
- class AlpacaEvalAdapter(DataAdapter):
+ class ArenaHardAdapter(DataAdapter):

  def __init__(self, *args, **kwargs):
  super().__init__(*args, **kwargs)
evalscope/benchmarks/bbh/bbh_adapter.py CHANGED
@@ -59,6 +59,9 @@ SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST
  @Benchmark.register(
  name='bbh',
  pretty_name='BBH',
+ tags=['Reasoning'],
+ description=
+ 'The BBH (Big Bench Hard) benchmark is a collection of challenging tasks designed to evaluate the reasoning capabilities of AI models. It includes both free-form and multiple-choice tasks, covering a wide range of reasoning skills.', # noqa: E501
  dataset_id='modelscope/bbh',
  subset_list=SUBSET_LIST,
  metric_list=['AverageAccuracy'],
evalscope/benchmarks/benchmark.py CHANGED
@@ -29,6 +29,7 @@ class BenchmarkMeta:
  query_template: Optional[str] = None
  pretty_name: Optional[str] = None
  description: Optional[str] = None
+ tags: Optional[List[str]] = field(default_factory=list)
  filters: Optional[OrderedDict] = None
  extra_params: Optional[Dict] = field(default_factory=dict)

evalscope/benchmarks/bfcl/__init__.py ADDED
File without changes
evalscope/benchmarks/bfcl/bfcl_adapter.py ADDED
@@ -0,0 +1,237 @@
+ import copy
+ import importlib
+ import json
+ import re
+ import traceback
+ from typing import Any, List
+
+ from evalscope.benchmarks import Benchmark, DataAdapter
+ from evalscope.constants import EvalType
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
+
+ SUBJECT_MAPPING = {
+ 'simple': 'AST_NON_LIVE',
+ 'multiple': 'AST_NON_LIVE',
+ 'parallel': 'AST_NON_LIVE',
+ 'parallel_multiple': 'AST_NON_LIVE',
+ 'java': 'AST_NON_LIVE',
+ 'javascript': 'AST_NON_LIVE',
+ 'live_simple': 'AST_LIVE',
+ 'live_multiple': 'AST_LIVE',
+ 'live_parallel': 'AST_LIVE',
+ 'live_parallel_multiple': 'AST_LIVE',
+ 'irrelevance': 'RELEVANCE',
+ 'live_relevance': 'RELEVANCE',
+ 'live_irrelevance': 'RELEVANCE',
+ 'multi_turn_base': 'MULTI_TURN',
+ 'multi_turn_miss_func': 'MULTI_TURN',
+ 'multi_turn_miss_param': 'MULTI_TURN',
+ 'multi_turn_long_context': 'MULTI_TURN'
+ }
+
+
+ @Benchmark.register(
+ name='bfcl_v3',
+ pretty_name='BFCL-v3',
+ tags=['Agent'],
+ description=
+ 'Berkeley Function Calling Leaderboard (BFCL), the **first comprehensive and executable function call evaluation** '
+ 'dedicated to assessing Large Language Models\' (LLMs) ability to invoke functions. Unlike previous evaluations, '
+ 'BFCL accounts for various forms of function calls, diverse scenarios, and executability. '
+ 'Need to run `pip install bfcl-eval` before evaluating. '
+ '[Usage Example](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html)', # noqa: E501
+ dataset_id='AI-ModelScope/bfcl_v3',
+ subset_list=list(SUBJECT_MAPPING.keys()),
+ model_adapter='bfcl_server',
+ metric_list=['AverageAccuracy'],
+ few_shot_num=0,
+ train_split=None,
+ eval_split='train',
+ extra_params={
+ 'underscore_to_dot': True,
+ 'is_fc_model': True,
+ })
+ class BFCLAdapter(DataAdapter):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ spec = importlib.util.find_spec('bfcl_eval')
+ if spec is None:
+ raise ImportError(
+ '`bfcl_eval` not found, please install it with `pip install bfcl-eval` before evaluating.')
+
+ self.category_map = SUBJECT_MAPPING
+
+ extra_params = kwargs.get('extra_params', {})
+ self.underscore_to_dot = extra_params.get('underscore_to_dot', False)
+ self.is_fc_model = extra_params.get('is_fc_model', True)
+
+ def load(self, **kwargs):
+ kwargs['subset_list'] = ['default']
+ data_dict = super().load(**kwargs)
+ return self.reformat_subset(data_dict, subset_key='subset', format='{}')
+
+ def preprocess_row(self, row: dict):
+ """
+ Inplace preprocess the row to ensure it has the correct format for BFCL evaluation.
+ """
+ row['should_execute_tool_calls'] = True if row['multi_turn'] else False
+ row['functions'] = json.loads(row['functions'])
+ row['tools'] = json.loads(row['tools'])
+ row['turns'] = json.loads(row['turns'])
+ row['missing_functions'] = json.loads(row['missed_functions'])
+ row['ground_truth'] = json.loads(row.get('ground_truth', '{}'))
+ row['initial_config'] = json.loads(row['initial_config'])
+ row['is_fc_model'] = self.is_fc_model
+
+ def gen_prompt(self, input_d, subset_name, few_shot_list, **kwargs):
+ self.preprocess_row(input_d)
+
+ # If the model is a function calling model, we need to remove the system prompt
+ if self.is_fc_model:
+ turns = input_d['turns']
+ new_turns = []
+ for turn_idx, messages in enumerate(turns):
+ current_messages = messages.copy()
+ if len(current_messages) > 0 and current_messages[0]['role'] == 'system':
+ current_messages = current_messages[1:]
+ new_turns.append(current_messages)
+ input_d['turns'] = new_turns
+
+ return self.gen_prompt_data(prompt='', messages=input_d)
+
+ def get_gold_answer(self, input_d: dict) -> str:
+ # Get the gold choice
+ return input_d.get('ground_truth', )
+
+ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> dict:
+ row = copy.deepcopy(raw_input_d)
+ del row['turns'] # Remove turns as they are not needed for the match function
+
+ row['generation'] = result
+ return row
+
+ def match(self, gold: dict, pred: dict) -> dict:
+ from bfcl_eval.eval_checker.ast_eval.ast_checker import ast_checker
+ from bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker import multi_turn_checker
+ from bfcl_eval.model_handler.utils import (convert_to_function_call, default_decode_ast_prompting,
+ default_decode_execute_prompting)
+ from bfcl_eval.utils import is_empty_output
+
+ # NOTE: This is hardcoded dummy model since its only use is to infer underscore_to_dot
+ # which decides if model was provided with functions of the type
+ # spotify.list_songs or spotify_list_songs
+ # It is False for all llama models (when using via prompting)
+ # and True for API calls
+ if self.underscore_to_dot:
+ dummy_model = 'gpt-4o-2024-11-20-FC'
+ else:
+ dummy_model = 'meta-llama/Llama-3.3-70B-Instruct-FC'
+
+ row = pred
+ test_category = re.sub(r'_[0-9_-]+$', '', row['id'])
+ if test_category in {'irrelevance', 'live_irrelevance', 'live_relevance'}:
+ error = None
+ try:
+ if self.is_fc_model:
+ decoded_tool_calls = []
+ for tool_call in row['generation'][0]:
+ name = list(tool_call.keys())[0]
+ params = json.loads(tool_call[name])
+ decoded_tool_calls.append({name: params})
+ else:
+ decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+ # successful decode means valid function call was present
+ contains_func_call = True
+ if is_empty_output(decoded_tool_calls):
+ # Empty output is not considered as a valid function call
+ contains_func_call = False
+ error = 'Empty decoded output.'
+ except Exception:
+ contains_func_call = False
+ error = f'Failed to decode with traceback: {traceback.format_exc()}'
+ finally:
+ valid = contains_func_call if test_category == 'live_relevance' else not contains_func_call
+ score_result = {'valid': valid, 'error_message': error}
+
+ elif row['multi_turn']:
+ # each step might give a list of tool calls and each turn is multi-step
+ # and multi-turn has generations of all the turns
+ # hence in a multi-turn setting,
+ # multi_turn_decoded_generations is a list of list of list of strings
+ multi_turn_decoded_generations: list[list[list[str]]] = []
+ for single_turn_generations in row['generation']:
+ single_turn_decoded_generations: list[list[str]] = []
+ for generation in single_turn_generations:
+ try:
+ if self.is_fc_model:
+ tool_calls = convert_to_function_call(generation)
+ else:
+ tool_calls = default_decode_execute_prompting(generation)
+
+ single_turn_decoded_generations.append(tool_calls)
+ except Exception:
+ single_turn_decoded_generations.append([generation])
+
+ multi_turn_decoded_generations.append(single_turn_decoded_generations)
+
+ try:
+ raw_score_result = multi_turn_checker(
+ multi_turn_decoded_generations,
+ row['ground_truth'],
+ row,
+ test_category,
+ dummy_model,
+ )
+ except Exception:
+ raw_score_result = {
+ 'valid': False,
+ 'error_type': 'multi_turn:checker_failed',
+ 'error_message': f'Failed to grade multi-turn. Traceback: {traceback.format_exc()}',
+ }
+
+ score_result = {
+ 'valid': float(raw_score_result['valid']),
+ 'error_message': raw_score_result.get('error_message', ''),
+ 'error_type': raw_score_result.get('error_type', ''),
+ }
+ else:
+ try:
+ if self.is_fc_model:
+ decoded_tool_calls = []
+ for tool_call in row['generation'][0]:
+ name = list(tool_call.keys())[0]
+ params = json.loads(tool_call[name])
+ decoded_tool_calls.append({name: params})
+ else:
+ decoded_tool_calls = default_decode_ast_prompting(row['generation'][0][0], row['language'])
+
+ score_result = ast_checker(
+ row['functions'],
+ decoded_tool_calls,
+ row['ground_truth'],
+ row['language'],
+ row['test_category'],
+ dummy_model,
+ )
+ except Exception:
+ score_result = {
+ 'valid': False,
+ 'error_message': f'Invalid syntax. Failed to decode AST. Traceback: {traceback.format_exc()}',
+ 'error_type': 'ast_decoder:decoder_failed',
+ }
+
+ return {
+ 'AverageAccuracy': float(score_result['valid']),
+ 'raw_score_result': score_result,
+ }
+
+ def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
+ # aggregate review results
+ res_dict = super().compute_dict_metric(review_res_list, **kwargs)
+
+ return super().compute_metric(res_dict, **kwargs)
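For orientation, the registration above suggests the new `bfcl_v3` benchmark is driven like any other evalscope dataset once `bfcl-eval` is installed. A hedged sketch of a possible invocation (model name, endpoint, and evaluation settings are illustrative; check the linked usage example for the authoritative configuration):

    # pip install bfcl-eval   # required by BFCLAdapter.__init__ above
    from evalscope import TaskConfig, run_task

    task_cfg = TaskConfig(
        model='my-function-calling-model',    # illustrative served model name
        api_url='http://127.0.0.1:8000/v1',   # illustrative OpenAI-compatible endpoint
        eval_type='service',                  # assumption: served-model evaluation path
        datasets=['bfcl_v3'],
        dataset_args={'bfcl_v3': {'extra_params': {'underscore_to_dot': True, 'is_fc_model': True}}},
    )
    run_task(task_cfg=task_cfg)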
evalscope/benchmarks/ceval/ceval_adapter.py CHANGED
@@ -126,6 +126,9 @@ SUBJECT_MAPPING = {
  @Benchmark.register(
  name='ceval',
  pretty_name='C-Eval',
+ tags=['Knowledge', 'MCQ', 'Chinese'],
+ description=
+ 'C-Eval is a benchmark designed to evaluate the performance of AI models on Chinese exams across various subjects, including STEM, social sciences, and humanities. It consists of multiple-choice questions that test knowledge and reasoning abilities in these areas.', # noqa: E501
  dataset_id='modelscope/ceval-exam',
  model_adapter=OutputType.GENERATION,
  output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py CHANGED
@@ -87,7 +87,10 @@ SUBSET_LIST = ['中华文化', '人文与社会科学', '工程、技术与应

  @Benchmark.register(
  name='chinese_simpleqa',
- pretty_name='Chinese SimpleQA',
+ pretty_name='Chinese-SimpleQA',
+ tags=['Knowledge', 'QA', 'Chinese'],
+ description=
+ "Chinese SimpleQA is a Chinese question-answering dataset designed to evaluate the performance of language models on simple factual questions. It includes a variety of topics and is structured to test the model's ability to understand and generate correct answers in Chinese.", # noqa: E501
  subset_list=SUBSET_LIST,
  dataset_id='AI-ModelScope/Chinese-SimpleQA',
  metric_list=['is_correct', 'is_incorrect', 'is_not_attempted'],
evalscope/benchmarks/cmmlu/cmmlu_adapter.py CHANGED
@@ -103,6 +103,9 @@ SUBJECT_MAPPING = {
  @Benchmark.register(
  name='cmmlu',
  pretty_name='C-MMLU',
+ tags=['Knowledge', 'MCQ', 'Chinese'],
+ description=
+ 'C-MMLU is a benchmark designed to evaluate the performance of AI models on Chinese language tasks, including reading comprehension, text classification, and more.',
  dataset_id='modelscope/cmmlu',
  model_adapter=OutputType.GENERATION,
  output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
evalscope/benchmarks/competition_math/competition_math_adapter.py CHANGED
@@ -17,6 +17,9 @@ logger = get_logger()
  @Benchmark.register(
  name='competition_math',
  pretty_name='MATH',
+ tags=['Mathematics'],
+ description=
+ 'The MATH (Mathematics) benchmark is designed to evaluate the mathematical reasoning abilities of AI models through a variety of problem types, including arithmetic, algebra, geometry, and more.',
  dataset_id='modelscope/competition_math',
  subset_list=['Level 1', 'Level 2', 'Level 3', 'Level 4', 'Level 5'],
  metric_list=['AveragePass@1'],
evalscope/benchmarks/data_adapter.py CHANGED
@@ -39,6 +39,7 @@ class DataAdapter(ABC):
  query_template: Optional[str] = None,
  pretty_name: Optional[str] = None,
  description: Optional[str] = None,
+ tags: Optional[List[str]] = None,
  **kwargs):
  """
  Args:
@@ -76,6 +77,7 @@ class DataAdapter(ABC):
  self.query_template = query_template
  self.pretty_name = pretty_name
  self.description = description
+ self.tags = tags or []
  self.config_kwargs = kwargs
  self.category_map = kwargs.get('category_map', {})
  self.choices = kwargs.get('choices', None)
evalscope/benchmarks/data_collection/data_collection_adapter.py CHANGED
@@ -14,6 +14,7 @@ logger = get_logger()
  @Benchmark.register(
  name='data_collection',
  dataset_id='', # dataset_id need to be set
+ description='Data collection',
  subset_list=['default'],
  metric_list=['AverageAccuracy'],
  few_shot_num=0,
evalscope/benchmarks/docmath/docmath_adapter.py CHANGED
@@ -16,6 +16,7 @@ Format your response as follows: "Therefore, the answer is (insert answer here)"
  @Benchmark.register(
  name='docmath',
  pretty_name='DocMath',
+ tags=['Reasoning', 'Mathematics', 'Long Context'],
  description=
  'DocMath-Eval is a comprehensive benchmark focused on numerical reasoning within specialized domains. It requires the model to comprehend long and specialized documents and perform numerical reasoning to answer the given question.', # noqa: E501
  dataset_id='yale-nlp/DocMath-Eval',
evalscope/benchmarks/drop/drop_adapter.py CHANGED
@@ -31,6 +31,9 @@ Answer: 43
  @Benchmark.register(
  name='drop',
  pretty_name='DROP',
+ tags=['Reasoning'],
+ description=
+ 'The DROP (Discrete Reasoning Over Paragraphs) benchmark is designed to evaluate the reading comprehension and reasoning capabilities of AI models. It includes a variety of tasks that require models to read passages and answer questions based on the content.', # noqa: E501
  dataset_id='AI-ModelScope/DROP',
  metric_list=['AverageAccuracy'],
  few_shot_num=0,
evalscope/benchmarks/frames/frames_adapter.py CHANGED
@@ -16,6 +16,7 @@ Format your response as follows: "Therefore, the answer is (insert answer here)"
  @Benchmark.register(
  name='frames',
  pretty_name='FRAMES',
+ tags=['Reasoning', 'Long Context'],
  description=
  'FRAMES is a comprehensive evaluation dataset designed to test the capabilities of Retrieval-Augmented Generation (RAG) systems across factuality, retrieval accuracy, and reasoning.', # noqa: E501
  dataset_id='iic/frames',