evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +40 -30
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
  7. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  8. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  9. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  10. evalscope/backend/rag_eval/utils/embedding.py +77 -39
  11. evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
  12. evalscope/benchmarks/aime/aime24_adapter.py +3 -1
  13. evalscope/benchmarks/aime/aime25_adapter.py +3 -1
  14. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
  15. evalscope/benchmarks/arc/arc_adapter.py +3 -0
  16. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
  17. evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
  18. evalscope/benchmarks/benchmark.py +2 -0
  19. evalscope/benchmarks/bfcl/__init__.py +0 -0
  20. evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
  21. evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
  22. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
  23. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
  24. evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
  25. evalscope/benchmarks/data_adapter.py +99 -16
  26. evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
  27. evalscope/benchmarks/docmath/__init__.py +0 -0
  28. evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
  29. evalscope/benchmarks/docmath/utils.py +220 -0
  30. evalscope/benchmarks/drop/drop_adapter.py +3 -0
  31. evalscope/benchmarks/frames/__init__.py +0 -0
  32. evalscope/benchmarks/frames/frames_adapter.py +91 -0
  33. evalscope/benchmarks/frames/utils.py +37 -0
  34. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
  35. evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
  36. evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
  37. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
  38. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
  39. evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
  40. evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
  41. evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
  42. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
  43. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
  44. evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
  45. evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
  46. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
  47. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
  48. evalscope/benchmarks/musr/musr_adapter.py +3 -0
  49. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  50. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
  51. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  52. evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
  53. evalscope/benchmarks/race/race_adapter.py +3 -0
  54. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
  55. evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
  56. evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
  57. evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
  58. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
  59. evalscope/benchmarks/tool_bench/utils.py +5 -4
  60. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
  61. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
  62. evalscope/benchmarks/utils.py +25 -0
  63. evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
  64. evalscope/cli/start_app.py +2 -2
  65. evalscope/collections/__init__.py +35 -3
  66. evalscope/collections/evaluator.py +68 -34
  67. evalscope/config.py +8 -2
  68. evalscope/constants.py +1 -1
  69. evalscope/evaluator/evaluator.py +40 -28
  70. evalscope/metrics/__init__.py +3 -1
  71. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  72. evalscope/metrics/llm_judge.py +12 -5
  73. evalscope/metrics/math_parser.py +1 -1
  74. evalscope/metrics/t2v_metrics/__init__.py +9 -23
  75. evalscope/models/adapters/__init__.py +2 -0
  76. evalscope/models/adapters/base_adapter.py +31 -27
  77. evalscope/models/adapters/bfcl_adapter.py +244 -0
  78. evalscope/models/adapters/server_adapter.py +80 -23
  79. evalscope/models/custom/custom_model.py +0 -3
  80. evalscope/models/custom/dummy_model.py +77 -39
  81. evalscope/models/local_model.py +1 -1
  82. evalscope/models/register.py +2 -1
  83. evalscope/perf/arguments.py +4 -2
  84. evalscope/perf/benchmark.py +16 -12
  85. evalscope/perf/main.py +7 -0
  86. evalscope/perf/plugin/api/openai_api.py +2 -0
  87. evalscope/perf/plugin/datasets/custom.py +15 -0
  88. evalscope/perf/utils/benchmark_util.py +1 -1
  89. evalscope/perf/utils/local_server.py +1 -0
  90. evalscope/perf/utils/log_utils.py +12 -5
  91. evalscope/perf/utils/rich_display.py +1 -1
  92. evalscope/report/__init__.py +36 -4
  93. evalscope/report/combinator.py +40 -6
  94. evalscope/report/generator.py +33 -9
  95. evalscope/report/utils.py +84 -4
  96. evalscope/run.py +12 -0
  97. evalscope/summarizer.py +1 -1
  98. evalscope/utils/io_utils.py +59 -2
  99. evalscope/utils/logger.py +1 -1
  100. evalscope/utils/utils.py +12 -0
  101. evalscope/version.py +2 -2
  102. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
  103. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
  104. tests/aigc/test_t2i.py +48 -11
  105. tests/cli/test_all.py +14 -3
  106. tests/cli/test_collection.py +6 -4
  107. tests/cli/test_run.py +50 -25
  108. tests/rag/test_clip_benchmark.py +5 -1
  109. tests/rag/test_mteb.py +51 -7
  110. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  111. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
  112. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
  113. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
  114. {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/perf/benchmark.py CHANGED
@@ -1,11 +1,8 @@
  import asyncio
- import copy
  import json
  import numpy as np
- import os
  import platform
  import sqlite3
- import threading
  import time
  from http import HTTPStatus
  from tqdm import tqdm
@@ -17,7 +14,6 @@ from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
  from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
  from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
  from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
- from evalscope.perf.utils.local_server import start_app
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -45,14 +41,27 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
      message_generator_class = DatasetRegistry(args.dataset)
      message_generator = message_generator_class(args)

+     dataset_messages = []
+     try:
+         for messages in message_generator:
+             dataset_messages.append(messages)
+     except StopIteration:
+         pass
+
+     if not dataset_messages:
+         raise Exception('Dataset is empty!')
+
      count = 0
-     for messages in message_generator:
+     dataset_index = 0
+
+     while count < args.number:
+         messages = dataset_messages[dataset_index]
          request = query_generator.build_request(messages, args)
          if request is not None:
              yield request
              count += 1
-             if args.number and count >= args.number:
-                 break
+
+         dataset_index = (dataset_index + 1) % len(dataset_messages)

      if args.prompt:
          prompt = load_prompt(args.prompt)
@@ -164,11 +173,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:

  @exception_handler
  async def connect_test(args: Arguments) -> bool:
-     if args.api.startswith('local'):
-         # start local server
-         server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
-         server.start()
-
      if (not args.no_test_connection) and (not await test_connection(args)):
          raise TimeoutError('Test connection failed')

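The change to `get_requests` above materializes the dataset once and then cycles it with a modulo index, so `--number` can exceed the dataset size instead of being capped by it. A minimal sketch of the same round-robin pattern in isolation (illustrative names only, not evalscope internals):

```python
# Round-robin over a finite message list until `number` requests have been produced.
# Sketch only: `build_request` stands in for the API plugin's build_request call.
def cycle_requests(dataset_messages, number, build_request):
    if not dataset_messages:
        raise Exception('Dataset is empty!')
    count, index = 0, 0
    while count < number:
        request = build_request(dataset_messages[index])
        if request is not None:
            yield request
            count += 1
        index = (index + 1) % len(dataset_messages)

# A 3-prompt dataset can now serve a 10-request benchmark:
requests = list(cycle_requests(['a', 'b', 'c'], 10, lambda m: {'prompt': m}))
assert len(requests) == 10
```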
evalscope/perf/main.py CHANGED
@@ -2,9 +2,11 @@ import asyncio
  import copy
  import os
  import platform
+ import threading
  import time
  from argparse import Namespace

+ from evalscope.perf.utils.local_server import start_app
  from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
  from evalscope.utils.logger import configure_logging, get_logger
  from evalscope.utils.utils import seed_everything
@@ -82,6 +84,11 @@ def run_perf_benchmark(args):
      if args.swanlab_api_key:
          init_swanlab(args)

+     # Initialize local server if needed
+     if args.api.startswith('local'):
+         # start local server
+         server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
+         server.start()
      # Start benchmark
      if len(args.number) == 1:
          return run_one_benchmark(args, output_path=output_path)
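With this move, the uvicorn daemon thread for a local model is spawned once in `run_perf_benchmark`, before the connection test, rather than inside `connect_test`. A rough sketch of a local-API run via the Python entry point; the field names (`api`, `model`, `port`, `number`) are inferred from how `args` is used in this diff and are not a validated configuration:

```python
# Sketch under stated assumptions: Arguments fields are inferred from this diff, not from docs.
from evalscope.perf.arguments import Arguments
from evalscope.perf.main import run_perf_benchmark

args = Arguments(
    api='local',       # any 'local*' api triggers the background start_app thread
    model='Qwen/Qwen2.5-0.5B-Instruct',  # hypothetical model id/path passed to create_app
    port=8877,         # start_app serves on 0.0.0.0:<port>
    number=[10],       # a single entry takes the run_one_benchmark branch
)
run_perf_benchmark(args)
```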
evalscope/perf/plugin/api/openai_api.py CHANGED
@@ -75,6 +75,8 @@ class OpenaiPlugin(ApiPluginBase):
              payload['min_tokens'] = param.min_tokens
          if param.frequency_penalty is not None:
              payload['frequency_penalty'] = param.frequency_penalty
+         if param.repetition_penalty is not None:
+             payload['repetition_penalty'] = param.repetition_penalty
          if param.logprobs is not None:
              payload['logprobs'] = param.logprobs
          if param.n_choices is not None:
evalscope/perf/plugin/datasets/custom.py CHANGED
@@ -22,3 +22,18 @@ class CustomDatasetPlugin(DatasetPluginBase):
                  yield [{'role': 'user', 'content': prompt}]
              else:
                  yield prompt
+
+
+ if __name__ == '__main__':
+     from evalscope.perf.arguments import Arguments
+     from evalscope.perf.main import run_perf_benchmark
+
+     args = Arguments(
+         model='qwen2.5-7b-instruct',
+         url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+         dataset_path='outputs/perf_data.txt',
+         api_key='EMPTY',
+         dataset='custom',
+     )
+
+     run_perf_benchmark(args)
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -38,7 +38,7 @@ class BenchmarkData:
          self.first_chunk_latency = self.query_latency
          self.n_chunks = 1
          self.n_chunks_time = self.query_latency
-         self.time_per_output_token = self.n_chunks_time / self.completion_tokens
+         self.time_per_output_token = self.n_chunks_time / self.n_chunks

      def _calculate_tokens(self, api_plugin):
          self.prompt_tokens, self.completion_tokens = \
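For context on the metric change above: a non-streamed response arrives as a single chunk, so `time_per_output_token` now records the per-chunk time instead of spreading the single-chunk latency across completion tokens. A toy calculation with illustrative numbers:

```python
# Illustrative numbers for a non-streamed response.
query_latency = 2.0      # seconds for the whole request
completion_tokens = 100
n_chunks = 1
n_chunks_time = query_latency

previous = n_chunks_time / completion_tokens  # 0.02 s per token (old formula)
current = n_chunks_time / n_chunks            # 2.0 s per chunk (formula after this change)
```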
evalscope/perf/utils/local_server.py CHANGED
@@ -96,6 +96,7 @@ def create_app(model, attn_implementation=None) -> FastAPI:


  def start_app(args: Arguments):
+     logger.info('Starting local server, please wait...')
      if args.api == 'local':
          app = create_app(args.model, args.attn_implementation)
          uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
evalscope/perf/utils/log_utils.py CHANGED
@@ -34,8 +34,15 @@ def init_swanlab(args: Arguments) -> None:
      current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
      name = args.name if args.name else f'{args.model_id}_{current_time}'
      swanlab.config.update({'framework': '📏evalscope'})
-     swanlab.init(
-         project=os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
-         name=name,
-         config=args.to_dict(),
-         mode='local' if args.swanlab_api_key == 'local' else None)
+     init_kwargs = {
+         'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
+         'name': name,
+         'config': args.to_dict(),
+         'mode': 'local' if args.swanlab_api_key == 'local' else None
+     }
+
+     workspace = os.getenv('SWANLAB_WORKSPACE')
+     if workspace:
+         init_kwargs['workspace'] = workspace
+
+     swanlab.init(**init_kwargs)
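`init_swanlab` now forwards an optional workspace: when the `SWANLAB_WORKSPACE` environment variable is set it is passed through to `swanlab.init`. Both variables must be set before `init_swanlab` runs; the values below are placeholders:

```python
import os

os.environ['SWANLAB_PROJ_NAME'] = 'perf_benchmark'  # project name (this is also the default)
os.environ['SWANLAB_WORKSPACE'] = 'my-team'         # hypothetical workspace/organization name
# ...then launch the perf benchmark with a SwanLab API key configured as before.
```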
evalscope/perf/utils/rich_display.py CHANGED
@@ -92,7 +92,7 @@ def print_summary(all_results, model_name):
      basic_info.add_row('Model', model_name)
      basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
      basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
-     basic_info.add_row('Avg Output Rate', f'{total_tokens/total_time:.2f} tokens/sec')
+     basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')

      console.print('\nBasic Information:')
      console.print(basic_info)
evalscope/report/__init__.py CHANGED
@@ -1,6 +1,38 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import TYPE_CHECKING

- from evalscope.report.app_arguments import add_argument
- from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
- from evalscope.report.generator import ReportGenerator
- from evalscope.report.utils import Category, Report, ReportKey, Subset
+ from evalscope.utils.import_utils import _LazyModule
+
+ if TYPE_CHECKING:
+     from .combinator import gen_report_table, gen_table, get_data_frame, get_report_list
+     from .generator import ReportGenerator
+     from .utils import Category, Report, ReportKey, Subset
+
+ else:
+     _import_structure = {
+         'combinator': [
+             'gen_table',
+             'get_data_frame',
+             'get_report_list',
+             'gen_report_table',
+         ],
+         'generator': [
+             'ReportGenerator',
+         ],
+         'utils': [
+             'Category',
+             'Report',
+             'ReportKey',
+             'Subset',
+         ],
+     }
+
+     import sys
+
+     sys.modules[__name__] = _LazyModule(
+         __name__,
+         globals()['__file__'],
+         _import_structure,
+         module_spec=__spec__,
+         extra_objects={},
+     )
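`evalscope.report` now resolves its public names lazily through `_LazyModule`, so importing the package does not load the heavier submodules until an attribute is first accessed; `from evalscope.report import gen_table` keeps working unchanged. The sketch below shows the general shape of such a lazy module (a simplified stand-in, not evalscope's actual `_LazyModule`):

```python
import importlib
import types


class LazyModule(types.ModuleType):
    """Simplified stand-in: resolves exported attributes from submodules on first access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        # Map each exported attribute to the submodule that defines it.
        self._attr_to_module = {attr: mod for mod, attrs in import_structure.items() for attr in attrs}

    def __getattr__(self, attr):
        module_name = self._attr_to_module.get(attr)
        if module_name is None:
            raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
        module = importlib.import_module(f'{self.__name__}.{module_name}')
        value = getattr(module, attr)
        setattr(self, attr, value)  # cache so later lookups skip __getattr__
        return value
```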
evalscope/report/combinator.py CHANGED
@@ -34,17 +34,51 @@ def get_report_list(reports_path_list: List[str]) -> List[Report]:

  def get_data_frame(report_list: List[Report],
                     flatten_metrics: bool = True,
-                    flatten_categories: bool = True) -> pd.DataFrame:
+                    flatten_categories: bool = True,
+                    add_overall_metric: bool = False) -> pd.DataFrame:
      tables = []
      for report in report_list:
-         df = report.to_dataframe(flatten_metrics=flatten_metrics, flatten_categories=flatten_categories)
+         df = report.to_dataframe(
+             flatten_metrics=flatten_metrics,
+             flatten_categories=flatten_categories,
+             add_overall_metric=add_overall_metric)
          tables.append(df)
      return pd.concat(tables, ignore_index=True)


- def gen_table(reports_path_list: list) -> str:
-     report_list = get_report_list(reports_path_list)
-     table = get_data_frame(report_list)
+ def gen_table(reports_path_list: list[str] = None,
+               report_list: list[Report] = None,
+               flatten_metrics: bool = True,
+               flatten_categories: bool = True,
+               add_overall_metric: bool = False) -> str:
+     """
+     Generates a formatted table from a list of report paths or Report objects.
+
+     Args:
+         reports_path_list (list[str], optional): List of file paths to report files.
+             Either this or `report_list` must be provided.
+         report_list (list[Report], optional): List of Report objects.
+             Either this or `reports_path_list` must be provided.
+         flatten_metrics (bool, optional): Whether to flatten the metrics in the output table. Defaults to True.
+         flatten_categories (bool, optional): Whether to flatten the categories in the output table. Defaults to True.
+         add_overall_metric (bool, optional): Whether to add an overall metric column to the table. Defaults to False.
+
+     Returns:
+         str: A string representation of the table in grid format.
+
+     Raises:
+         AssertionError: If neither `reports_path_list` nor `report_list` is provided.
+     """
+     assert (reports_path_list is not None) or (report_list is not None), \
+         'Either reports_path_list or report_list must be provided.'
+     if report_list is None:
+         report_list = get_report_list(reports_path_list)
+     # Generate a DataFrame from the report list
+     table = get_data_frame(
+         report_list,
+         flatten_metrics=flatten_metrics,
+         flatten_categories=flatten_categories,
+         add_overall_metric=add_overall_metric)
      return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)


@@ -60,7 +94,7 @@ if __name__ == '__main__':
      report_dir_1 = './outputs/20250117_151926'
      # report_dir_2 = './outputs/20250107_204445/reports'

-     report_table = gen_table([report_dir_1])
+     report_table = gen_table(reports_path_list=[report_dir_1])
      print(report_table)

      # ALL VALUES ONLY FOR EXAMPLE
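For callers, the practical difference is that `gen_table` is now keyword-driven and accepts either report directories or already-loaded `Report` objects, for example:

```python
from evalscope.report import gen_table, get_report_list

# From report files on disk (path is illustrative):
print(gen_table(reports_path_list=['./outputs/20250117_151926'], add_overall_metric=True))

# Or from Report objects that are already in memory:
reports = get_report_list(['./outputs/20250117_151926'])
print(gen_table(report_list=reports, flatten_categories=False))
```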
evalscope/report/generator.py CHANGED
@@ -1,24 +1,42 @@
  import pandas as pd
  from pandas import DataFrame
+ from typing import TYPE_CHECKING

  from evalscope.constants import DataCollection
  from evalscope.report.utils import *

+ if TYPE_CHECKING:
+     from evalscope.benchmarks import DataAdapter
+

  class ReportGenerator:

      @staticmethod
-     def gen_report(subset_score_map: dict, report_name: str, **kwargs) -> Report:
+     def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
          """
-         Generate report for specific dataset.
-         subset_score_map: e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}, {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}]}
-         category_map: e.g. {'subset_name': ['category_name1', 'category_name2'], ...}
-         metric_list: e.g. [{'object': AverageAccuracy, 'name': 'AverageAccuracy'}, {'object': 'WeightedAverageAccuracy', 'name': 'WeightedAverageAccuracy'}]
+         Generate a report for a specific dataset based on provided subset scores.
+
+         Args:
+             subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
+                 {
+                     'subset_name': [
+                         {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+                         {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+                     ],
+                     ...
+                 }
+             report_name (str): The name of the report to generate.
+             data_adapter (DataAdapter): An adapter object for data handling.
+
+         Returns:
+             Report: A structured report object containing metrics, categories, and subsets.
+
+         >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
          """ # noqa: E501

-         dataset_name = kwargs.get('dataset_name', None)
-         model_name = kwargs.get('model_name', None)
-         category_map = kwargs.get('category_map', {})
+         dataset_name = data_adapter.name
+         category_map = data_adapter.category_map
+         report_name = f'{model_name}@{dataset_name}'

          def flatten_subset() -> DataFrame:
              """
@@ -59,7 +77,13 @@ class ReportGenerator:

              metrics_list.append(Metric(name=metric_name, categories=categories))

-         report = Report(name=report_name, metrics=metrics_list, dataset_name=dataset_name, model_name=model_name)
+         report = Report(
+             name=report_name,
+             metrics=metrics_list,
+             dataset_name=dataset_name,
+             model_name=model_name,
+             dataset_description=data_adapter.description,
+             dataset_pretty_name=data_adapter.pretty_name)
          return report

      @staticmethod
evalscope/report/utils.py CHANGED
@@ -1,4 +1,5 @@
  import json
+ import os
  import pandas as pd
  from collections import defaultdict
  from dataclasses import asdict, dataclass, field
@@ -6,6 +7,9 @@ from typing import Any, Dict, List

  from evalscope.metrics import macro_mean, micro_mean
  from evalscope.utils import normalize_score
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()


  @dataclass
@@ -70,13 +74,28 @@ class ReportKey:
      score = 'Score'


+ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
+ 1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
+ 2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
+ 3. 只列出报告本身,不要有其他多余内容
+ 4. 输出报告语言为{language}
+
+ ```json
+ {report_str}
+ ```
+ """
+
+
  @dataclass
  class Report:
      name: str = 'default_report'
      dataset_name: str = 'default_dataset'
+     dataset_pretty_name: str = ''
+     dataset_description: str = ''
      model_name: str = 'default_model'
      score: float = 0.0
      metrics: List[Metric] = field(default_factory=list)
+     analysis: str = 'N/A'

      def __post_init__(self):
          self.score = self.metrics[0].score  # NOTE: only use the first metric by default
@@ -84,15 +103,29 @@ class Report:
      def to_dict(self) -> Dict[str, Any]:
          return asdict(self)

+     def to_json_str(self) -> str:
+         return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)
+
+     def to_json(self, json_file: str):
+         # ensure the directory exists
+         os.makedirs(os.path.dirname(json_file), exist_ok=True)
+         # write the report to a json file
+         with open(json_file, 'w', encoding='utf-8') as f:
+             json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)
+
      @classmethod
      def from_dict(cls, data: dict):
          metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
          return cls(
              name=data['name'],
+             dataset_name=data['dataset_name'],
+             dataset_pretty_name=data.get('dataset_pretty_name'),
+             dataset_description=data.get('dataset_description'),
              score=data['score'],
+             model_name=data['model_name'],
              metrics=metrics,
-             dataset_name=data['dataset_name'],
-             model_name=data['model_name'])
+             analysis=data.get('analysis', 'N/A'),
+         )

      @classmethod
      def from_json(cls, json_file: str):
@@ -100,18 +133,41 @@ class Report:
              data = json.load(f)
          return cls.from_dict(data)

-     def to_dataframe(self, flatten_metrics: bool = True, flatten_categories: bool = True):
+     def to_dataframe(self,
+                      flatten_metrics: bool = True,
+                      flatten_categories: bool = True,
+                      add_overall_metric: bool = False) -> pd.DataFrame:
+         """
+         Convert the report to a pandas DataFrame.
+         Args:
+             flatten_metrics (bool): Whether to flatten the metrics to a single row.
+             flatten_categories (bool): Whether to flatten the categories to multiple rows.
+             add_overall_metric (bool): Whether to add an overall metric row.
+         Returns:
+             pd.DataFrame: The report as a pandas DataFrame.
+         """
          table = defaultdict(list)
          for metric in self.metrics:
+             metric_count = 0
              for category in metric.categories:
                  for subset in category.subsets:
+                     metric_count += 1
                      table[ReportKey.model_name].append(self.model_name)
                      table[ReportKey.dataset_name].append(self.dataset_name)
                      table[ReportKey.metric_name].append(metric.name)
                      table[ReportKey.category_name].append(category.name)
                      table[ReportKey.subset_name].append(subset.name)
                      table[ReportKey.num].append(subset.num)
-                     table[ReportKey.score].append(subset.score)  # TODO: convert to percentage
+                     table[ReportKey.score].append(subset.score)
+             # add overall metric when there are multiple subsets
+             if metric_count > 1 and add_overall_metric:
+                 table[ReportKey.model_name].append(self.model_name)
+                 table[ReportKey.dataset_name].append(self.dataset_name)
+                 table[ReportKey.metric_name].append(metric.name)
+                 table[ReportKey.category_name].append(('-', ))
+                 table[ReportKey.subset_name].append('OVERALL')
+                 table[ReportKey.num].append(metric.num)
+                 table[ReportKey.score].append(metric.score)
              # NOTE: only flatten metrics if needed, use the first metric by default
              if not flatten_metrics:
                  break
@@ -131,3 +187,27 @@ class Report:

          df_categories.drop(columns=[ReportKey.category_name], inplace=True)
          return df_categories
+
+     def generate_analysis(self, judge_llm_config: dict) -> str:
+         import locale
+
+         from evalscope.metrics import LLMJudge
+
+         try:
+             # get the default locale
+             lang, _ = locale.getlocale()
+
+             if lang is None:
+                 language = '中文'
+             else:
+                 language = 'en' if lang.startswith('en') else '中文'
+
+             prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
+             judge_llm = LLMJudge(**judge_llm_config)
+             response = judge_llm(prompt)
+         except Exception as e:
+             logger.error(f'Error generating analysis: {e}')
+             response = 'N/A'
+
+         self.analysis = response
+         return response
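Taken together, a `Report` loaded from disk can now be re-serialized, flattened into a DataFrame with an OVERALL row per multi-subset metric, and optionally summarized by a judge model. A hedged sketch, with an illustrative report path and the judge call left commented out because the exact `judge_llm_config` keys depend on `LLMJudge`'s constructor, which this diff does not show:

```python
from evalscope.report import Report

# Illustrative path to a report JSON produced by an earlier evaluation run.
report = Report.from_json('outputs/20250117_151926/reports/my_model/my_dataset.json')

df = report.to_dataframe(add_overall_metric=True)  # adds an 'OVERALL' row when a metric has multiple subsets
print(df)

report.to_json('outputs/analysis/my_dataset.json')  # round-trips the report via to_dict()

# generate_analysis() formats ANALYSIS_PROMPT with the report JSON and asks an LLM judge:
# report.generate_analysis(judge_llm_config={...})  # keys must match LLMJudge's constructor
```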
evalscope/run.py CHANGED
@@ -43,6 +43,9 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
      else:
          result = evaluate_model(task_cfg, outputs)

+     logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
+     logger.info(f'Output directory: {outputs.outputs_dir}')
+
      return result


@@ -109,6 +112,7 @@ def get_backend_manager_class(eval_backend: EvalBackend):
  def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      """Evaluate the model based on the provided task configuration."""
      from evalscope.models import get_local_model
+     from evalscope.report import gen_table

      # Initialize evaluator
      eval_results = {}
@@ -122,10 +126,18 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      task_cfg.dump_yaml(outputs.configs_dir)
      logger.info(task_cfg)

+     # Run evaluation for each evaluator
      for evaluator in evaluators:
          res_dict = evaluator.eval()
          eval_results[evaluator.dataset_name] = res_dict

+     # Make overall report
+     try:
+         report_table: str = gen_table(reports_path_list=[outputs.reports_dir], add_overall_metric=True)
+         logger.info(f'Overall report table: \n{report_table} \n')
+     except Exception:
+         logger.error('Failed to generate report table.')
+
      # Clean up
      if base_model is not None:
          import gc
evalscope/summarizer.py CHANGED
@@ -30,7 +30,7 @@ class Summarizer:
          with open(report_file, 'r') as f:
              res_list.append(json.load(f))

-         report_table: str = gen_table([reports_dir])
+         report_table: str = gen_table(reports_path_list=[reports_dir])
          logger.info(f'*** Report table ***\n{report_table}')

          return res_list
evalscope/utils/io_utils.py CHANGED
@@ -1,3 +1,4 @@
+ import csv
  import json
  import jsonlines as jsonl
  import os
@@ -112,8 +113,58 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
          writer.write_all(data_list)


- def jsonl_to_csv():
-     pass
+ def jsonl_to_csv(jsonl_file, csv_file):
+     """
+     Convert jsonl file to csv file.
+
+     Args:
+         jsonl_file: jsonl file path.
+         csv_file: csv file path.
+     """
+     data = jsonl_to_list(jsonl_file)
+     if not data:
+         logger.warning(f'No data found in {jsonl_file}.')
+         return
+
+     with open(csv_file, 'w', newline='', encoding='utf-8') as f:
+         writer = csv.writer(f)
+         writer.writerow(data[0].keys())  # Write header
+         for item in data:
+             writer.writerow(item.values())
+
+
+ def csv_to_list(csv_file) -> list:
+     """
+     Read csv file to list.
+
+     Args:
+         csv_file: csv file path.
+
+     Returns:
+         list: list of lines. Each line is a dict.
+     """
+     res_list = []
+     with open(csv_file, 'r', encoding='utf-8') as f:
+         reader = csv.DictReader(f)
+         for row in reader:
+             res_list.append(row)
+     return res_list
+
+
+ def csv_to_jsonl(csv_file, jsonl_file):
+     """
+     Convert csv file to jsonl file.
+
+     Args:
+         csv_file: csv file path.
+         jsonl_file: jsonl file path.
+     """
+     data = csv_to_list(csv_file)
+     if not data:
+         logger.warning(f'No data found in {csv_file}.')
+         return
+
+     dump_jsonl_data(data, jsonl_file, dump_mode=DumpMode.OVERWRITE)


  def yaml_to_dict(yaml_file) -> dict:
@@ -168,3 +219,9 @@ def dict_to_json(d: dict, json_file: str):
      """
      with open(json_file, 'w') as f:
          json.dump(d, f, indent=4, ensure_ascii=False)
+
+
+ if __name__ == '__main__':
+     csv_file = 'custom_eval/text/mcq/example_val.csv'
+     jsonl_file = 'custom_eval/text/mcq/example_val.jsonl'
+     csv_to_jsonl(csv_file, jsonl_file)
evalscope/utils/logger.py CHANGED
@@ -10,7 +10,7 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

  detailed_formatter = logging.Formatter(detailed_format)
  simple_formatter = logging.Formatter(simple_format)
- DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
+ DEFAULT_LEVEL = logging.DEBUG if os.getenv('EVALSCOPE_LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO

  logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)

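Note the rename: debug logging is now keyed to `EVALSCOPE_LOG_LEVEL` rather than the generic `LOG_LEVEL`, and it is read when the logger module is first imported, so set it beforehand:

```python
import os

os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'  # must be set before evalscope modules are imported

from evalscope.utils.logger import get_logger

logger = get_logger()
logger.debug('debug output is now enabled')
```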
evalscope/utils/utils.py CHANGED
@@ -10,6 +10,7 @@ import os
  import random
  import re
  import torch
+ from inspect import signature
  from typing import Any, Dict, List, Tuple, Union

  from evalscope.utils.logger import get_logger
@@ -313,6 +314,17 @@ def seed_everything(seed: int):
      torch.backends.cudnn.deterministic = True
      torch.backends.cudnn.benchmark = False

+ def get_supported_params(func):
+     """Get the supported parameters of a function."""
+     sig = signature(func)
+     return list(sig.parameters.keys())
+
+ def parse_int_or_float(num):
+     number = float(num)
+     if number.is_integer():
+         return int(number)
+     return number
+

  if __name__ == '__main__':
      options = ['A', 'B', 'C', 'D']
      answers = ['Context .... ANSWER: A', 'answer: A']
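The two new helpers are small; assuming they are module-level functions as the diff suggests, the expected behaviour looks like:

```python
from evalscope.utils.utils import get_supported_params, parse_int_or_float

def example(a, b=1, *, c=None):
    pass

print(get_supported_params(example))  # ['a', 'b', 'c']
print(parse_int_or_float('3.0'))      # 3 (int)
print(parse_int_or_float('3.14'))     # 3.14 (float)
```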
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.

- __version__ = '0.16.0'
- __release_datetime__ = '2025-05-19 18:00:00'
+ __version__ = '0.16.2'
+ __release_datetime__ = '2025-06-23 20:00:00'