evalscope 0.16.1__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (82)
  1. evalscope/app/app.py +20 -5
  2. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  3. evalscope/backend/rag_eval/utils/embedding.py +2 -4
  4. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  5. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  6. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  7. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  8. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  9. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  10. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  11. evalscope/benchmarks/benchmark.py +1 -0
  12. evalscope/benchmarks/bfcl/__init__.py +0 -0
  13. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  14. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  15. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  16. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  17. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  18. evalscope/benchmarks/data_adapter.py +2 -0
  19. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  20. evalscope/benchmarks/docmath/docmath_adapter.py +1 -0
  21. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +1 -0
  23. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  26. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  27. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  28. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  29. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  30. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  31. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  32. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  33. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  34. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  35. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  36. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  37. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  38. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +15 -8
  39. evalscope/benchmarks/needle_haystack/utils.py +2 -2
  40. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  41. evalscope/benchmarks/race/race_adapter.py +3 -0
  42. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  43. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  44. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  45. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  46. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +5 -0
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  48. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  49. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  50. evalscope/collections/evaluator.py +50 -28
  51. evalscope/constants.py +1 -1
  52. evalscope/evaluator/evaluator.py +6 -5
  53. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  54. evalscope/models/adapters/__init__.py +2 -0
  55. evalscope/models/adapters/base_adapter.py +31 -27
  56. evalscope/models/adapters/bfcl_adapter.py +244 -0
  57. evalscope/models/adapters/server_adapter.py +78 -17
  58. evalscope/models/custom/custom_model.py +0 -3
  59. evalscope/models/custom/dummy_model.py +77 -39
  60. evalscope/models/local_model.py +1 -1
  61. evalscope/models/register.py +2 -1
  62. evalscope/perf/arguments.py +2 -0
  63. evalscope/perf/benchmark.py +16 -3
  64. evalscope/perf/plugin/api/openai_api.py +2 -0
  65. evalscope/report/combinator.py +38 -12
  66. evalscope/report/utils.py +24 -1
  67. evalscope/run.py +1 -1
  68. evalscope/summarizer.py +1 -1
  69. evalscope/utils/io_utils.py +59 -2
  70. evalscope/version.py +2 -2
  71. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA +4 -3
  72. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/RECORD +82 -79
  73. tests/aigc/test_t2i.py +8 -8
  74. tests/cli/test_all.py +40 -33
  75. tests/cli/test_collection.py +4 -3
  76. tests/cli/test_run.py +36 -21
  77. tests/rag/test_clip_benchmark.py +5 -1
  78. tests/rag/test_mteb.py +46 -2
  79. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  80. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  81. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  82. {evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/models/custom/dummy_model.py CHANGED
@@ -1,61 +1,99 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
  import time
  from typing import List

+ from evalscope.models import CustomModel
  from evalscope.utils.logger import get_logger
- from .custom_model import CustomModel

  logger = get_logger()
- """
- This script is used to rewrite the evaluation results without re-running the model predictions.
- """


  class DummyCustomModel(CustomModel):

-     def __init__(self, config: dict = {'model_id': 'dummy-model'}, **kwargs):
+     def __init__(self, config: dict = {}, **kwargs):
          super(DummyCustomModel, self).__init__(config=config, **kwargs)

-     def predict(self, prompts: List[dict], **kwargs):
-         # ONLY FOR DUMMY IMPLEMENTATION, DO NOT EDIT OR USE IN PRODUCTION.
+     def make_request_messages(self, input_item: dict) -> list:
+         """
+         Make request messages for OpenAI API.
+         """
+         if input_item.get('messages', None):
+             return input_item['messages']

-         response = ''
+         data: list = input_item['data']
+         if isinstance(data[0], tuple):  # for truthful_qa and hellaswag
+             query = '\n'.join(''.join(item) for item in data)
+             system_prompt = input_item.get('system_prompt', None)
+         else:
+             query = data[0]
+             system_prompt = input_item.get('system_prompt', None)

-         res_d: dict = {
-             'choices': [{
-                 'index': 0,
-                 'message': {
-                     'content': response,
-                     'role': 'assistant'
-                 }
-             }],
-             'created': time.time(),
-             'model': self.config.get('model_id'),  # should be model_id
-             'object': 'chat.completion',
-             'usage': {
-                 'completion_tokens': 0,
-                 'prompt_tokens': 0,
-                 'total_tokens': 0
-             }
-         }
+         messages = []
+         if system_prompt:
+             messages.append({'role': 'system', 'content': system_prompt})

-         return [res_d for _ in prompts]
+         messages.append({'role': 'user', 'content': query})

+         return messages

- if __name__ == '__main__':
-     from evalscope.run import run_task
-     from evalscope.utils.io_utils import yaml_to_dict
-
-     # step1: If the outputs directory has been migrated, update the path settings in outputs/eval_xxx/configs/task_output_config.yaml
-     # step2: Run this script; use_cache=True is the default, so the eval results are refreshed without re-running inference
+     def predict(self, prompts: List[dict], **kwargs):
+         original_inputs = kwargs.get('origin_inputs', None)
+         infer_cfg = kwargs.get('infer_cfg', None)
+
+         logger.debug(f'** Prompts: {prompts}')
+         if original_inputs is not None:
+             logger.debug(f'** Original inputs: {original_inputs}')
+         if infer_cfg is not None:
+             logger.debug(f'** Inference config: {infer_cfg}')
+
+         # Simulate a response based on the prompts
+         # Must return a list of dicts with the same format as the OpenAI API.
+         responses = []
+         for input_item in original_inputs:
+             message = self.make_request_messages(input_item)
+             response = f'Dummy response for prompt: {message}'
+
+             res_d = {
+                 'choices': [{
+                     'index': 0,
+                     'message': {
+                         'content': response,
+                         'role': 'assistant'
+                     }
+                 }],
+                 'created': time.time(),
+                 'model': self.config.get('model_id'),
+                 'object': 'chat.completion',
+                 'usage': {
+                     'completion_tokens': 0,
+                     'prompt_tokens': 0,
+                     'total_tokens': 0
+                 }
+             }

-     swift_model = DummyCustomModel(config={'model_id': 'swift-model-dummy'})
+             responses.append(res_d)

-     task_cfg_file = '/path/to/eval_your_model_results/configs/task_output_config.yaml'
+         return responses

-     task_cfg_d = yaml_to_dict(task_cfg_file)
-     task_cfg_d.update({'model': swift_model})

-     eval_results: dict = run_task(task_cfg=task_cfg_d)
-     print('** Evaluation results finished !\n')
+ if __name__ == '__main__':
+     from evalscope import TaskConfig, run_task
+
+     dummy_model = DummyCustomModel()
+     task_config = TaskConfig(
+         model=dummy_model,
+         model_id='evalscope-model-dummy',
+         datasets=['gsm8k'],
+         eval_type='custom',  # must be custom for custom model evaluation
+         generation_config={
+             'max_new_tokens': 100,
+             'temperature': 0.0,
+             'top_p': 1.0,
+             'top_k': 50,
+             'repetition_penalty': 1.0
+         },
+         debug=True,
+         limit=5,
+     )
+
+     eval_results = run_task(task_cfg=task_config)
evalscope/models/local_model.py CHANGED
@@ -82,7 +82,7 @@ class LocalImageModel(LocalModel):
      def __init__(self, **kwargs):
          super().__init__(**kwargs)

-         self.pipeline_cls = kwargs.pop('pipeline_cls', None)
+         self.pipeline_cls = self.kwargs.pop('pipeline_cls', None)
          # default to DiffusionPipeline if not specified
          if self.pipeline_cls is None:
              if 'flux' in self.model_id.lower():
evalscope/models/register.py CHANGED
@@ -47,8 +47,9 @@ def register_model_adapter_class(cls, name=None):
  # register all model adapters
  register_model_adapter_class(BaseModelAdapter, name='base')
  register_model_adapter_class(ChatGenerationModelAdapter, name=OutputType.GENERATION)
- register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.LOGITS)
+ register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.CONTINUOUS)
  register_model_adapter_class(MultiChoiceModelAdapter, name=OutputType.MULTIPLE_CHOICE)
  register_model_adapter_class(CustomModelAdapter, name='custom')
  register_model_adapter_class(ServerModelAdapter, name='server')
+ register_model_adapter_class(BFCLAdapter, name='bfcl_server')
  register_model_adapter_class(T2IModelAdapter, name=OutputType.IMAGE_GENERATION)
evalscope/perf/arguments.py CHANGED
@@ -55,6 +55,7 @@ class Arguments:

      # Response settings
      frequency_penalty: Optional[float] = None  # Frequency penalty for the response
+     repetition_penalty: Optional[float] = None  # Repetition penalty for the response
      logprobs: Optional[bool] = None  # Whether to log probabilities
      max_tokens: Optional[int] = 2048  # Maximum number of tokens in the response
      min_tokens: Optional[int] = None  # Minimum number of tokens in the response
@@ -181,6 +182,7 @@ def add_argument(parser: argparse.ArgumentParser):

      # Response settings
      parser.add_argument('--frequency-penalty', type=float, help='The frequency_penalty value', default=None)
+     parser.add_argument('--repetition-penalty', type=float, help='The repetition_penalty value', default=None)
      parser.add_argument('--logprobs', action='store_true', help='The logprobs', default=None)
      parser.add_argument(
          '--max-tokens', type=int, help='The maximum number of tokens that can be generated', default=2048)
evalscope/perf/benchmark.py CHANGED
@@ -41,14 +41,27 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
      message_generator_class = DatasetRegistry(args.dataset)
      message_generator = message_generator_class(args)

+     dataset_messages = []
+     try:
+         for messages in message_generator:
+             dataset_messages.append(messages)
+     except StopIteration:
+         pass
+
+     if not dataset_messages:
+         raise Exception('Dataset is empty!')
+
      count = 0
-     for messages in message_generator:
+     dataset_index = 0
+
+     while count < args.number:
+         messages = dataset_messages[dataset_index]
          request = query_generator.build_request(messages, args)
          if request is not None:
              yield request
              count += 1
-         if args.number and count >= args.number:
-             break
+
+         dataset_index = (dataset_index + 1) % len(dataset_messages)

      if args.prompt:
          prompt = load_prompt(args.prompt)
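The net effect of the benchmark.py hunk above: `get_requests` previously stopped as soon as the dataset iterator ran out, so a `--number` larger than the dataset silently yielded fewer requests. It now materializes the dataset once and cycles through it until `args.number` requests have been produced. A minimal, self-contained sketch of that cycling pattern (illustrative only, not evalscope code; it skips the `build_request` filtering):

```python
# Round-robin over a fixed message pool until `number` requests are emitted.
def cycle_requests(dataset_messages, number):
    if not dataset_messages:
        raise Exception('Dataset is empty!')
    count, index = 0, 0
    while count < number:
        yield dataset_messages[index]
        count += 1
        index = (index + 1) % len(dataset_messages)

# A 2-prompt dataset can now back a 5-request stress run.
print(list(cycle_requests(['hello', 'world'], 5)))
```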
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -75,6 +75,8 @@ class OpenaiPlugin(ApiPluginBase):
              payload['min_tokens'] = param.min_tokens
          if param.frequency_penalty is not None:
              payload['frequency_penalty'] = param.frequency_penalty
+         if param.repetition_penalty is not None:
+             payload['repetition_penalty'] = param.repetition_penalty
          if param.logprobs is not None:
              payload['logprobs'] = param.logprobs
          if param.n_choices is not None:
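Together with the arguments.py change above, the perf plugin now forwards a `--repetition-penalty` value into the OpenAI-compatible request body. A hedged sketch of the resulting payload shape (field values are placeholders, not defaults taken from evalscope):

```python
# Shape of the request body once repetition_penalty is set (values are illustrative).
payload = {
    'model': 'my-model',
    'messages': [{'role': 'user', 'content': 'Hello'}],
    'max_tokens': 2048,
    'frequency_penalty': 0.0,      # forwarded when --frequency-penalty is given
    'repetition_penalty': 1.05,    # new: forwarded when --repetition-penalty is given
}
```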
evalscope/report/combinator.py CHANGED
@@ -34,25 +34,51 @@ def get_report_list(reports_path_list: List[str]) -> List[Report]:

  def get_data_frame(report_list: List[Report],
                     flatten_metrics: bool = True,
-                    flatten_categories: bool = True) -> pd.DataFrame:
+                    flatten_categories: bool = True,
+                    add_overall_metric: bool = False) -> pd.DataFrame:
      tables = []
      for report in report_list:
-         df = report.to_dataframe(flatten_metrics=flatten_metrics, flatten_categories=flatten_categories)
+         df = report.to_dataframe(
+             flatten_metrics=flatten_metrics,
+             flatten_categories=flatten_categories,
+             add_overall_metric=add_overall_metric)
          tables.append(df)
      return pd.concat(tables, ignore_index=True)


- def gen_table(reports_path_list: list) -> str:
-     report_list = get_report_list(reports_path_list)
-     table = get_data_frame(report_list)
-     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
-
-
- def gen_report_table(report: Report) -> str:
+ def gen_table(reports_path_list: list[str] = None,
+               report_list: list[Report] = None,
+               flatten_metrics: bool = True,
+               flatten_categories: bool = True,
+               add_overall_metric: bool = False) -> str:
      """
-     Generate a report table for a single report.
+     Generates a formatted table from a list of report paths or Report objects.
+
+     Args:
+         reports_path_list (list[str], optional): List of file paths to report files.
+             Either this or `report_list` must be provided.
+         report_list (list[Report], optional): List of Report objects.
+             Either this or `reports_path_list` must be provided.
+         flatten_metrics (bool, optional): Whether to flatten the metrics in the output table. Defaults to True.
+         flatten_categories (bool, optional): Whether to flatten the categories in the output table. Defaults to True.
+         add_overall_metric (bool, optional): Whether to add an overall metric column to the table. Defaults to False.
+
+     Returns:
+         str: A string representation of the table in grid format.
+
+     Raises:
+         AssertionError: If neither `reports_path_list` nor `report_list` is provided.
      """
-     table = report.to_dataframe(flatten_metrics=True, flatten_categories=True)
+     assert (reports_path_list is not None) or (report_list is not None), \
+         'Either reports_path_list or report_list must be provided.'
+     if report_list is None:
+         report_list = get_report_list(reports_path_list)
+     # Generate a DataFrame from the report list
+     table = get_data_frame(
+         report_list,
+         flatten_metrics=flatten_metrics,
+         flatten_categories=flatten_categories,
+         add_overall_metric=add_overall_metric)
      return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)


@@ -68,7 +94,7 @@ if __name__ == '__main__':
      report_dir_1 = './outputs/20250117_151926'
      # report_dir_2 = './outputs/20250107_204445/reports'

-     report_table = gen_table([report_dir_1])
+     report_table = gen_table(reports_path_list=[report_dir_1])
      print(report_table)

      # ALL VALUES ONLY FOR EXAMPLE
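With this change `gen_table` accepts either report paths on disk or already-loaded `Report` objects, plus the new flatten/overall flags. A hedged usage sketch (import paths assumed from the module layout shown in this diff; the output directory is a placeholder):

```python
from evalscope.report.combinator import gen_table, get_report_list

# 1) Build the table straight from report directories on disk.
print(gen_table(reports_path_list=['./outputs/20250117_151926'], add_overall_metric=True))

# 2) Reuse Report objects that are already in memory.
reports = get_report_list(['./outputs/20250117_151926'])
print(gen_table(report_list=reports, flatten_categories=False))
```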
evalscope/report/utils.py CHANGED
@@ -133,11 +133,25 @@ class Report:
              data = json.load(f)
          return cls.from_dict(data)

-     def to_dataframe(self, flatten_metrics: bool = True, flatten_categories: bool = True):
+     def to_dataframe(self,
+                      flatten_metrics: bool = True,
+                      flatten_categories: bool = True,
+                      add_overall_metric: bool = False) -> pd.DataFrame:
+         """
+         Convert the report to a pandas DataFrame.
+         Args:
+             flatten_metrics (bool): Whether to flatten the metrics to a single row.
+             flatten_categories (bool): Whether to flatten the categories to multiple rows.
+             add_overall_metric (bool): Whether to add an overall metric row.
+         Returns:
+             pd.DataFrame: The report as a pandas DataFrame.
+         """
          table = defaultdict(list)
          for metric in self.metrics:
+             metric_count = 0
              for category in metric.categories:
                  for subset in category.subsets:
+                     metric_count += 1
                      table[ReportKey.model_name].append(self.model_name)
                      table[ReportKey.dataset_name].append(self.dataset_name)
                      table[ReportKey.metric_name].append(metric.name)
@@ -145,6 +159,15 @@ class Report:
                      table[ReportKey.subset_name].append(subset.name)
                      table[ReportKey.num].append(subset.num)
                      table[ReportKey.score].append(subset.score)
+             # add overall metric when there are multiple subsets
+             if metric_count > 1 and add_overall_metric:
+                 table[ReportKey.model_name].append(self.model_name)
+                 table[ReportKey.dataset_name].append(self.dataset_name)
+                 table[ReportKey.metric_name].append(metric.name)
+                 table[ReportKey.category_name].append(('-', ))
+                 table[ReportKey.subset_name].append('OVERALL')
+                 table[ReportKey.num].append(metric.num)
+                 table[ReportKey.score].append(metric.score)
              # NOTE: only flatten metrics if needed, use the first metric by default
              if not flatten_metrics:
                  break
evalscope/run.py CHANGED
@@ -133,7 +133,7 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:

      # Make overall report
      try:
-         report_table: str = gen_table([outputs.reports_dir])
+         report_table: str = gen_table(reports_path_list=[outputs.reports_dir], add_overall_metric=True)
          logger.info(f'Overall report table: \n{report_table} \n')
      except Exception:
          logger.error('Failed to generate report table.')
evalscope/summarizer.py CHANGED
@@ -30,7 +30,7 @@ class Summarizer:
              with open(report_file, 'r') as f:
                  res_list.append(json.load(f))

-         report_table: str = gen_table([reports_dir])
+         report_table: str = gen_table(reports_path_list=[reports_dir])
          logger.info(f'*** Report table ***\n{report_table}')

          return res_list
evalscope/utils/io_utils.py CHANGED
@@ -1,3 +1,4 @@
+ import csv
  import json
  import jsonlines as jsonl
  import os
@@ -112,8 +113,58 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
          writer.write_all(data_list)


- def jsonl_to_csv():
-     pass
+ def jsonl_to_csv(jsonl_file, csv_file):
+     """
+     Convert jsonl file to csv file.
+
+     Args:
+         jsonl_file: jsonl file path.
+         csv_file: csv file path.
+     """
+     data = jsonl_to_list(jsonl_file)
+     if not data:
+         logger.warning(f'No data found in {jsonl_file}.')
+         return
+
+     with open(csv_file, 'w', newline='', encoding='utf-8') as f:
+         writer = csv.writer(f)
+         writer.writerow(data[0].keys())  # Write header
+         for item in data:
+             writer.writerow(item.values())
+
+
+ def csv_to_list(csv_file) -> list:
+     """
+     Read csv file to list.
+
+     Args:
+         csv_file: csv file path.
+
+     Returns:
+         list: list of lines. Each line is a dict.
+     """
+     res_list = []
+     with open(csv_file, 'r', encoding='utf-8') as f:
+         reader = csv.DictReader(f)
+         for row in reader:
+             res_list.append(row)
+     return res_list
+
+
+ def csv_to_jsonl(csv_file, jsonl_file):
+     """
+     Convert csv file to jsonl file.
+
+     Args:
+         csv_file: csv file path.
+         jsonl_file: jsonl file path.
+     """
+     data = csv_to_list(csv_file)
+     if not data:
+         logger.warning(f'No data found in {csv_file}.')
+         return
+
+     dump_jsonl_data(data, jsonl_file, dump_mode=DumpMode.OVERWRITE)


  def yaml_to_dict(yaml_file) -> dict:
@@ -168,3 +219,9 @@ def dict_to_json(d: dict, json_file: str):
      """
      with open(json_file, 'w') as f:
          json.dump(d, f, indent=4, ensure_ascii=False)
+
+
+ if __name__ == '__main__':
+     csv_file = 'custom_eval/text/mcq/example_val.csv'
+     jsonl_file = 'custom_eval/text/mcq/example_val.jsonl'
+     csv_to_jsonl(csv_file, jsonl_file)
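The new CSV helpers mirror the existing JSONL utilities, so custom MCQ-style datasets can be converted in either direction. A hedged sketch (file paths are placeholders; the import path follows the evalscope/utils/io_utils.py location):

```python
from evalscope.utils.io_utils import csv_to_jsonl, csv_to_list, jsonl_to_csv

rows = csv_to_list('custom_eval/example_val.csv')  # list of dicts, one per CSV row
csv_to_jsonl('custom_eval/example_val.csv', 'custom_eval/example_val.jsonl')  # CSV -> JSONL
jsonl_to_csv('custom_eval/example_val.jsonl', 'custom_eval/roundtrip.csv')    # JSONL -> CSV
```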
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.16.1'
- __release_datetime__ = '2025-06-03 20:00:00'
+ __version__ = '0.16.2'
+ __release_datetime__ = '2025-06-23 20:00:00'
{evalscope-0.16.1.dist-info → evalscope-0.16.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: evalscope
- Version: 0.16.1
+ Version: 0.16.2
  Summary: EvalScope: Lightweight LLMs Evaluation Framework
  Home-page: https://github.com/modelscope/evalscope
  Author: ModelScope team
@@ -40,7 +40,6 @@ Requires-Dist: seaborn
  Requires-Dist: sympy
  Requires-Dist: tabulate
  Requires-Dist: torch
- Requires-Dist: torchvision
  Requires-Dist: tqdm
  Requires-Dist: transformers>=4.33
  Requires-Dist: word2number
@@ -50,6 +49,7 @@ Requires-Dist: iopath; extra == "aigc"
  Requires-Dist: omegaconf; extra == "aigc"
  Requires-Dist: open-clip-torch; extra == "aigc"
  Requires-Dist: opencv-python; extra == "aigc"
+ Requires-Dist: torchvision; extra == "aigc"
  Provides-Extra: all
  Requires-Dist: accelerate; extra == "all"
  Requires-Dist: datasets>=3.0; extra == "all"
@@ -75,7 +75,6 @@ Requires-Dist: seaborn; extra == "all"
  Requires-Dist: sympy; extra == "all"
  Requires-Dist: tabulate; extra == "all"
  Requires-Dist: torch; extra == "all"
- Requires-Dist: torchvision; extra == "all"
  Requires-Dist: tqdm; extra == "all"
  Requires-Dist: transformers>=4.33; extra == "all"
  Requires-Dist: word2number; extra == "all"
@@ -102,6 +101,7 @@ Requires-Dist: iopath; extra == "all"
  Requires-Dist: omegaconf; extra == "all"
  Requires-Dist: open-clip-torch; extra == "all"
  Requires-Dist: opencv-python; extra == "all"
+ Requires-Dist: torchvision; extra == "all"
  Provides-Extra: app
  Requires-Dist: gradio==5.4.0; extra == "app"
  Requires-Dist: plotly<6.0.0,>=5.23.0; extra == "app"
@@ -230,6 +230,7 @@ Please scan the QR code below to join our community groups:

  ## 🎉 News

+ - 🔥 **[2025.06.19]** Added support for the BFCL-v3 benchmark, designed to evaluate model function-calling capabilities across various scenarios. For more information, refer to the [documentation](https://evalscope.readthedocs.io/zh-cn/latest/third_party/bfcl_v3.html).
  - 🔥 **[2025.06.02]** Added support for the Needle-in-a-Haystack test. Simply specify `needle_haystack` to conduct the test, and a corresponding heatmap will be generated in the `outputs/reports` folder, providing a visual representation of the model's performance. Refer to the [documentation](https://evalscope.readthedocs.io/en/latest/third_party/needle_haystack.html) for more details.
  - 🔥 **[2025.05.29]** Added support for two long document evaluation benchmarks: [DocMath](https://modelscope.cn/datasets/yale-nlp/DocMath-Eval/summary) and [FRAMES](https://modelscope.cn/datasets/iic/frames/summary). For usage guidelines, please refer to the [documentation](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
  - 🔥 **[2025.05.16]** Model service performance stress testing now supports setting various levels of concurrency and outputs a performance test report. [Reference example](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#id3).