evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (78)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +67 -59
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +12 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  7. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  8. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  9. evalscope/backend/rag_eval/utils/embedding.py +75 -35
  10. evalscope/backend/rag_eval/utils/llm.py +1 -1
  11. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
  12. evalscope/benchmarks/benchmark.py +1 -0
  13. evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
  14. evalscope/benchmarks/data_adapter.py +101 -18
  15. evalscope/benchmarks/docmath/__init__.py +0 -0
  16. evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
  17. evalscope/benchmarks/docmath/utils.py +220 -0
  18. evalscope/benchmarks/drop/__init__.py +0 -0
  19. evalscope/benchmarks/drop/drop_adapter.py +133 -0
  20. evalscope/benchmarks/drop/utils.py +59 -0
  21. evalscope/benchmarks/frames/__init__.py +0 -0
  22. evalscope/benchmarks/frames/frames_adapter.py +90 -0
  23. evalscope/benchmarks/frames/utils.py +37 -0
  24. evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
  25. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  26. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
  27. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  28. evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
  29. evalscope/benchmarks/tool_bench/__init__.py +0 -0
  30. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
  31. evalscope/benchmarks/tool_bench/utils.py +203 -0
  32. evalscope/benchmarks/utils.py +28 -2
  33. evalscope/benchmarks/winogrande/__init__.py +0 -0
  34. evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
  35. evalscope/cli/start_app.py +2 -2
  36. evalscope/collections/__init__.py +35 -3
  37. evalscope/collections/evaluator.py +94 -32
  38. evalscope/config.py +54 -17
  39. evalscope/evaluator/evaluator.py +80 -41
  40. evalscope/metrics/__init__.py +3 -1
  41. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
  42. evalscope/metrics/llm_judge.py +15 -8
  43. evalscope/metrics/math_parser.py +1 -1
  44. evalscope/metrics/rouge_metric.py +11 -13
  45. evalscope/models/adapters/chat_adapter.py +51 -34
  46. evalscope/models/adapters/server_adapter.py +17 -25
  47. evalscope/perf/arguments.py +16 -7
  48. evalscope/perf/benchmark.py +0 -15
  49. evalscope/perf/main.py +72 -15
  50. evalscope/perf/plugin/datasets/custom.py +15 -0
  51. evalscope/perf/utils/benchmark_util.py +34 -16
  52. evalscope/perf/utils/db_util.py +25 -15
  53. evalscope/perf/utils/local_server.py +1 -0
  54. evalscope/perf/utils/log_utils.py +12 -5
  55. evalscope/perf/utils/rich_display.py +186 -0
  56. evalscope/report/__init__.py +36 -4
  57. evalscope/report/combinator.py +8 -0
  58. evalscope/report/generator.py +33 -9
  59. evalscope/report/utils.py +61 -4
  60. evalscope/run.py +12 -0
  61. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  62. evalscope/utils/deprecation_utils.py +42 -0
  63. evalscope/utils/logger.py +1 -1
  64. evalscope/utils/utils.py +12 -0
  65. evalscope/version.py +2 -2
  66. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
  67. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
  68. tests/aigc/test_t2i.py +40 -3
  69. tests/cli/test_all.py +39 -32
  70. tests/cli/test_collection.py +8 -6
  71. tests/cli/test_run.py +43 -17
  72. tests/perf/test_perf.py +23 -0
  73. tests/rag/test_mteb.py +5 -5
  74. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  75. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
  76. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
  77. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
  78. {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/perf/utils/rich_display.py ADDED
@@ -0,0 +1,186 @@
+ # the following code is largely adapted from https://github.com/lework/llm-benchmark
+
+ import numpy as np
+ from rich.console import Console
+ from rich.panel import Panel
+ from rich.style import Style
+ from rich.table import Table
+ from rich.text import Text
+
+ from evalscope.utils.logger import get_logger
+ from .benchmark_util import Metrics
+ from .db_util import PercentileMetrics
+
+ logger = get_logger()
+
+
+ def analyze_results(all_results):
+     """Analyze all test results and generate a summary report"""
+     summary = []
+     total_tokens = 0
+     total_time = 0
+
+     for result in all_results:
+         total_metrics = result[0]
+         percentile_metrics = result[1]
+         percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
+         try:
+             concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
+             rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
+             avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
+             p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
+             avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
+             avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
+             p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
+             success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+                             / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
+             avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
+             p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]
+
+             # Ensure all values are valid numbers
+             if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
+                 logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
+                 continue
+
+             summary.append([
+                 concurrency,
+                 f'{rps:.2f}' if rps is not None else 'N/A',
+                 f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
+                 f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
+                 f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
+                 f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
+                 f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
+                 f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
+                 f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
+                 f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
+             ])
+
+             total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
+                 Metrics.SUCCEED_REQUESTS, 0)
+             total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
+         except Exception as e:
+             logger.warning(
+                 f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}")
+             continue
+
+     if not summary:
+         logger.warning('Error: No valid test result data')
+         return [], 0, 0
+
+     return summary, total_tokens, total_time
+
+
+ def print_summary(all_results, model_name):
+     """Print test results summary"""
+     summary, total_tokens, total_time = analyze_results(all_results)
+
+     if not summary:
+         logger.warning('No available test result data to display')
+         return
+
+     console = Console(width=100)  # Set fixed width
+
+     # Create title panel
+     title = Text('Performance Test Summary Report', style='bold')
+     console.print(Panel(title, width=60))
+
+     # Print basic information
+     basic_info = Table(show_header=False, width=60)
+     basic_info.add_column('Name', style='cyan', width=25)
+     basic_info.add_column('Value', style='green', width=35)
+
+     basic_info.add_row('Model', model_name)
+     basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
+     basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
+     basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')
+
+     console.print('\nBasic Information:')
+     console.print(basic_info)
+
+     # Create detailed performance metrics table
+     table = Table(
+         title='Detailed Performance Metrics',
+         show_header=True,
+         header_style='bold cyan',
+         border_style='blue',
+         width=100,  # Set total table width
+         pad_edge=False,  # Reduce edge padding
+         min_width=60,  # Minimum width
+     )
+
+     # Add columns (set fixed column widths)
+     table.add_column('Conc.', justify='right', style='cyan')
+     table.add_column('RPS', justify='right')
+     table.add_column('Avg Lat.(s)', justify='right')
+     table.add_column('P99 Lat.(s)', justify='right')
+     table.add_column('Gen. toks/s', justify='right')
+     table.add_column('Avg TTFT(s)', justify='right')
+     table.add_column('P99 TTFT(s)', justify='right')
+     table.add_column('Avg TPOT(s)', justify='right')
+     table.add_column('P99 TPOT(s)', justify='right')
+     table.add_column('Success Rate', justify='right', style='green')
+
+     # Add data rows
+     for row in summary:
+         try:
+             # Set row style based on success rate
+             success_rate = float(row[6].rstrip('%'))
+             row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
+
+             table.add_row(
+                 str(row[0]),  # Concurrency
+                 f'{float(row[1]):.2f}',  # RPS
+                 f'{float(row[2]):.3f}',  # Average Latency
+                 f'{float(row[3]):.3f}',  # P99 Latency
+                 f'{float(row[4]):.2f}',  # Average TPS
+                 f'{float(row[5]):.3f}',  # First Token Latency
+                 f'{float(row[7]):.3f}',  # P99 TTFT
+                 f'{float(row[8]):.3f}',  # Average TPOT
+                 f'{float(row[9]):.3f}',  # P99 TPOT
+                 row[6],  # Success Rate
+                 style=row_style)
+         except ValueError as e:
+             console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
+             continue
+
+     console.print('\n')
+     console.print(table)
+
+     # Calculate and display best performance configuration
+     try:
+         best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
+         best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])
+
+         perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
+         perf_info.add_column('Metric', style='cyan', width=20)
+         perf_info.add_column('Value', style='green', width=40)
+
+         perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
+         perf_info.add_row('Lowest Latency',
+                           f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
+
+         console.print('\n')
+         console.print(perf_info)
+
+         # Performance recommendations
+         recommendations = []
+         if best_rps_idx == len(summary) - 1:
+             recommendations.append(
+                 'The system seems not to have reached its performance bottleneck, try higher concurrency')
+         elif best_rps_idx == 0:
+             recommendations.append('Consider lowering concurrency, current load may be too high')
+         else:
+             recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')
+
+         success_rate = float(summary[-1][6][:-1])
+         if success_rate < 95:
+             recommendations.append(
+                 'Success rate is low at high concurrency, check system resources or reduce concurrency')
+
+         recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
+         console.print(recommend_text)
+         for rec in recommendations:
+             console.print(f'• {rec}', style='yellow')
+
+     except Exception as e:
+         console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
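To make the expected input shape concrete, here is a minimal, hypothetical driver for the new summary display. It only assumes what analyze_results reads above: all_results is a list of (total_metrics, percentile_metrics) pairs, one per concurrency level, keyed by the Metrics and PercentileMetrics constants. Every number and the model name below are made up; in the real flow these dictionaries are produced by evalscope perf itself.

from evalscope.perf.utils.benchmark_util import Metrics
from evalscope.perf.utils.db_util import PercentileMetrics
from evalscope.perf.utils.rich_display import print_summary

total_metrics = {
    Metrics.NUMBER_OF_CONCURRENCY: 8,
    Metrics.REQUEST_THROUGHPUT: 12.5,
    Metrics.AVERAGE_LATENCY: 0.42,
    Metrics.OUTPUT_TOKEN_THROUGHPUT: 950.0,
    Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.08,
    Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: 0.012,
    Metrics.SUCCEED_REQUESTS: 100,
    Metrics.TOTAL_REQUESTS: 100,
    Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: 256,
    Metrics.TIME_TAKEN_FOR_TESTS: 80.0,
}
percentiles = ['50%', '90%', '99%']
percentile_metrics = {
    PercentileMetrics.PERCENTILES: percentiles,
    PercentileMetrics.LATENCY: [0.35, 0.55, 0.90],
    PercentileMetrics.TTFT: [0.06, 0.10, 0.15],
    PercentileMetrics.TPOT: [0.010, 0.014, 0.020],
}

# one (total_metrics, percentile_metrics) pair per tested concurrency level
print_summary([(total_metrics, percentile_metrics)], model_name='my-model')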
evalscope/report/__init__.py CHANGED
@@ -1,6 +1,38 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import TYPE_CHECKING
 
- from evalscope.report.app_arguments import add_argument
- from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
- from evalscope.report.generator import ReportGenerator
- from evalscope.report.utils import Category, Report, ReportKey, Subset
+ from evalscope.utils.import_utils import _LazyModule
+
+ if TYPE_CHECKING:
+     from .combinator import gen_report_table, gen_table, get_data_frame, get_report_list
+     from .generator import ReportGenerator
+     from .utils import Category, Report, ReportKey, Subset
+
+ else:
+     _import_structure = {
+         'combinator': [
+             'gen_table',
+             'get_data_frame',
+             'get_report_list',
+             'gen_report_table',
+         ],
+         'generator': [
+             'ReportGenerator',
+         ],
+         'utils': [
+             'Category',
+             'Report',
+             'ReportKey',
+             'Subset',
+         ],
+     }
+
+     import sys
+
+     sys.modules[__name__] = _LazyModule(
+         __name__,
+         globals()['__file__'],
+         _import_structure,
+         module_spec=__spec__,
+         extra_objects={},
+     )
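Assuming _LazyModule follows the usual lazy-import pattern (names listed in _import_structure are resolved on first attribute access), the public surface of evalscope.report is unchanged while the heavier submodules load on demand. A minimal sketch of what callers see:

# These imports still work; the underlying modules are only loaded on first use.
from evalscope.report import Report, ReportGenerator, gen_report_table

print(ReportGenerator)  # triggers the actual import of evalscope.report.generator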
evalscope/report/combinator.py CHANGED
@@ -48,6 +48,14 @@ def gen_table(reports_path_list: list) -> str:
      return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
 
 
+ def gen_report_table(report: Report) -> str:
+     """
+     Generate a report table for a single report.
+     """
+     table = report.to_dataframe(flatten_metrics=True, flatten_categories=True)
+     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
  class ReportsRecorder:
      COMMON_DATASET_PATH = []
      CUSTOM_DATASET_PATH = []
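A hypothetical use of the new helper, pairing it with Report.from_json from the report/utils.py changes further down; the report path is illustrative only.

from evalscope.report import Report, gen_report_table

report = Report.from_json('outputs/reports/my-model/gsm8k.json')  # illustrative path
print(gen_report_table(report))  # grid-formatted table for this single report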
evalscope/report/generator.py CHANGED
@@ -1,24 +1,42 @@
  import pandas as pd
  from pandas import DataFrame
+ from typing import TYPE_CHECKING
 
  from evalscope.constants import DataCollection
  from evalscope.report.utils import *
 
+ if TYPE_CHECKING:
+     from evalscope.benchmarks import DataAdapter
+
 
  class ReportGenerator:
 
      @staticmethod
-     def gen_report(subset_score_map: dict, report_name: str, **kwargs) -> Report:
+     def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
          """
-         Generate report for specific dataset.
-         subset_score_map: e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}, {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}]}
-         category_map: e.g. {'subset_name': ['category_name1', 'category_name2'], ...}
-         metric_list: e.g. [{'object': AverageAccuracy, 'name': 'AverageAccuracy'}, {'object': 'WeightedAverageAccuracy', 'name': 'WeightedAverageAccuracy'}]
+         Generate a report for a specific dataset based on provided subset scores.
+
+         Args:
+             subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
+                 {
+                     'subset_name': [
+                         {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+                         {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+                     ],
+                     ...
+                 }
+             report_name (str): The name of the report to generate.
+             data_adapter (DataAdapter): An adapter object for data handling.
+
+         Returns:
+             Report: A structured report object containing metrics, categories, and subsets.
+
+         >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
          """ # noqa: E501
 
-         dataset_name = kwargs.get('dataset_name', None)
-         model_name = kwargs.get('model_name', None)
-         category_map = kwargs.get('category_map', {})
+         dataset_name = data_adapter.name
+         category_map = data_adapter.category_map
+         report_name = f'{model_name}@{dataset_name}'
 
          def flatten_subset() -> DataFrame:
              """
@@ -59,7 +77,13 @@ class ReportGenerator:
 
              metrics_list.append(Metric(name=metric_name, categories=categories))
 
-         report = Report(name=report_name, metrics=metrics_list, dataset_name=dataset_name, model_name=model_name)
+         report = Report(
+             name=report_name,
+             metrics=metrics_list,
+             dataset_name=dataset_name,
+             model_name=model_name,
+             dataset_description=data_adapter.description,
+             dataset_pretty_name=data_adapter.pretty_name)
          return report
 
      @staticmethod
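For a sense of the new call shape only: a minimal, hypothetical sketch that stands in for the adapter with a plain object carrying just the attributes this hunk reads (name, category_map, description, pretty_name). Real callers pass the benchmark's registered DataAdapter; the names and scores below are made up, and parts of gen_report not shown here may expect more from the adapter.

from types import SimpleNamespace

from evalscope.report import ReportGenerator

fake_adapter = SimpleNamespace(  # hypothetical stand-in, not a real DataAdapter
    name='gsm8k',
    category_map={},             # subset -> category list; empty means one flat category
    description='Grade-school math word problems',
    pretty_name='GSM8K',
)
subset_score_map = {
    'main': [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}],
}
report = ReportGenerator.gen_report(subset_score_map, model_name='my-model', data_adapter=fake_adapter)
print(report.name)  # 'my-model@gsm8k', derived inside gen_report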
evalscope/report/utils.py CHANGED
@@ -1,4 +1,5 @@
  import json
+ import os
  import pandas as pd
  from collections import defaultdict
  from dataclasses import asdict, dataclass, field
@@ -6,6 +7,9 @@ from typing import Any, Dict, List
 
  from evalscope.metrics import macro_mean, micro_mean
  from evalscope.utils import normalize_score
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()
 
 
  @dataclass
@@ -70,13 +74,28 @@ class ReportKey:
      score = 'Score'
 
 
+ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
+ 1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
+ 2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
+ 3. 只列出报告本身,不要有其他多余内容
+ 4. 输出报告语言为{language}
+
+ ```json
+ {report_str}
+ ```
+ """
+
+
  @dataclass
  class Report:
      name: str = 'default_report'
      dataset_name: str = 'default_dataset'
+     dataset_pretty_name: str = ''
+     dataset_description: str = ''
      model_name: str = 'default_model'
      score: float = 0.0
      metrics: List[Metric] = field(default_factory=list)
+     analysis: str = 'N/A'
 
      def __post_init__(self):
          self.score = self.metrics[0].score  # NOTE: only use the first metric by default
@@ -84,19 +103,33 @@ class Report:
      def to_dict(self) -> Dict[str, Any]:
          return asdict(self)
 
+     def to_json_str(self) -> str:
+         return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)
+
+     def to_json(self, json_file: str):
+         # ensure the directory exists
+         os.makedirs(os.path.dirname(json_file), exist_ok=True)
+         # write the report to a json file
+         with open(json_file, 'w', encoding='utf-8') as f:
+             json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)
+
      @classmethod
      def from_dict(cls, data: dict):
          metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
          return cls(
              name=data['name'],
+             dataset_name=data['dataset_name'],
+             dataset_pretty_name=data.get('dataset_pretty_name'),
+             dataset_description=data.get('dataset_description'),
              score=data['score'],
+             model_name=data['model_name'],
              metrics=metrics,
-             dataset_name=data['dataset_name'],
-             model_name=data['model_name'])
+             analysis=data.get('analysis', 'N/A'),
+         )
 
      @classmethod
      def from_json(cls, json_file: str):
-         with open(json_file, 'r') as f:
+         with open(json_file, 'r', encoding='utf-8') as f:
              data = json.load(f)
          return cls.from_dict(data)
 
@@ -111,7 +144,7 @@ class Report:
                      table[ReportKey.category_name].append(category.name)
                      table[ReportKey.subset_name].append(subset.name)
                      table[ReportKey.num].append(subset.num)
-                     table[ReportKey.score].append(subset.score)  # TODO: convert to percentage
+                     table[ReportKey.score].append(subset.score)
              # NOTE: only flatten metrics if needed, use the first metric by default
              if not flatten_metrics:
                  break
@@ -131,3 +164,27 @@ class Report:
 
          df_categories.drop(columns=[ReportKey.category_name], inplace=True)
          return df_categories
+
+     def generate_analysis(self, judge_llm_config: dict) -> str:
+         import locale
+
+         from evalscope.metrics import LLMJudge
+
+         try:
+             # get the default locale
+             lang, _ = locale.getlocale()
+
+             if lang is None:
+                 language = '中文'
+             else:
+                 language = 'en' if lang.startswith('en') else '中文'
+
+             prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
+             judge_llm = LLMJudge(**judge_llm_config)
+             response = judge_llm(prompt)
+         except Exception as e:
+             logger.error(f'Error generating analysis: {e}')
+             response = 'N/A'
+
+         self.analysis = response
+         return response
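Taken together, a Report can now round-trip through JSON and carry an LLM-written analysis. ANALYSIS_PROMPT (written in Chinese) asks the judge model for a four-part report (overall performance, key-metric analysis, improvement suggestions, conclusion), grouping metrics into low/medium/high-score markdown tables, in the language picked from the system locale. A hypothetical round trip, with illustrative paths:

from evalscope.report import Report

report = Report.from_json('outputs/reports/my-model/gsm8k.json')  # illustrative path
report.to_json('outputs/reports/my-model/gsm8k_copy.json')        # creates parent dirs, writes UTF-8 JSON
print(report.to_json_str()[:200])

# report.generate_analysis(judge_llm_config) formats ANALYSIS_PROMPT with the report JSON and
# queries an LLM judge; the config keys must match whatever LLMJudge's constructor accepts.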
evalscope/run.py CHANGED
@@ -43,6 +43,9 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
      else:
          result = evaluate_model(task_cfg, outputs)
 
+     logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
+     logger.info(f'Output directory: {outputs.outputs_dir}')
+
      return result
 
 
@@ -109,6 +112,7 @@ def get_backend_manager_class(eval_backend: EvalBackend):
  def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      """Evaluate the model based on the provided task configuration."""
      from evalscope.models import get_local_model
+     from evalscope.report import gen_table
 
      # Initialize evaluator
      eval_results = {}
@@ -122,10 +126,18 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
      task_cfg.dump_yaml(outputs.configs_dir)
      logger.info(task_cfg)
 
+     # Run evaluation for each evaluator
      for evaluator in evaluators:
          res_dict = evaluator.eval()
          eval_results[evaluator.dataset_name] = res_dict
 
+     # Make overall report
+     try:
+         report_table: str = gen_table([outputs.reports_dir])
+         logger.info(f'Overall report table: \n{report_table} \n')
+     except Exception:
+         logger.error('Failed to generate report table.')
+
      # Clean up
      if base_model is not None:
          import gc
evalscope/third_party/toolbench_static/toolbench_static.py CHANGED
@@ -6,11 +6,12 @@ from typing import Union
  from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
  from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
  from evalscope.utils import get_logger
+ from evalscope.utils.deprecation_utils import deprecated
  from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
  logger = get_logger()
 
-
+ @deprecated(since='0.15.1', remove_in='0.18.0', alternative='Native implementation of ToolBench')
  def run_task(task_cfg: Union[str, dict]):
 
      if isinstance(task_cfg, str):
evalscope/utils/deprecation_utils.py ADDED
@@ -0,0 +1,42 @@
+ import functools
+ import inspect
+ from typing import Callable, Optional
+
+ from .logger import get_logger
+
+ logger = get_logger()
+
+
+ def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optional[str] = None) -> Callable:
+     """
+     Decorator to mark functions as deprecated.
+
+     :param since: String indicating the version since deprecation
+     :param remove_in: Optional string indicating the version when it will be removed
+     :param alternative: Optional string suggesting an alternative
+     :return: Decorated function
+     """
+
+     def decorator(func: Callable) -> Callable:
+
+         @functools.wraps(func)
+         def wrapper(*args, **kwargs):
+             # Get the file name where the function is defined
+             file_name = inspect.getfile(func)
+
+             # Construct the warning message
+             warning_parts = [
+                 f'{func.__name__} in {file_name} has been deprecated since version {since}',
+                 f'and will be removed in version {remove_in}' if remove_in else None,
+                 f'Use {alternative} instead' if alternative else None
+             ]
+             warning_message = '. '.join(filter(None, warning_parts))
+
+             # Log the warning
+             logger.warning(warning_message)
+
+             return func(*args, **kwargs)
+
+         return wrapper
+
+     return decorator
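Applying the decorator is straightforward; the toolbench_static change above is the first in-tree user. A small sketch, with a made-up function name:

from evalscope.utils.deprecation_utils import deprecated

@deprecated(since='0.16.0', remove_in='0.18.0', alternative='new_entrypoint')
def old_entrypoint():
    return 'still works'

old_entrypoint()  # logs a deprecation warning on every call, then runs the original function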
evalscope/utils/logger.py CHANGED
@@ -10,7 +10,7 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 
  detailed_formatter = logging.Formatter(detailed_format)
  simple_formatter = logging.Formatter(simple_format)
- DEFAULT_LEVEL = logging.DEBUG if os.getenv('LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
+ DEFAULT_LEVEL = logging.DEBUG if os.getenv('EVALSCOPE_LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
 
  logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)
 
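The debug switch is evaluated once at module import, so it has to be in the environment before evalscope's logger module is first imported; the old LOG_LEVEL name is no longer consulted. A small sketch:

import os
os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'  # must be set before the logger module is imported

from evalscope.utils.logger import get_logger

logger = get_logger()
logger.debug('debug logging is now enabled')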
evalscope/utils/utils.py CHANGED
@@ -10,6 +10,7 @@ import os
  import random
  import re
  import torch
+ from inspect import signature
  from typing import Any, Dict, List, Tuple, Union
 
  from evalscope.utils.logger import get_logger
@@ -313,6 +314,17 @@ def seed_everything(seed: int):
      torch.backends.cudnn.deterministic = True
      torch.backends.cudnn.benchmark = False
 
+ def get_supported_params(func):
+     """Get the supported parameters of a function."""
+     sig = signature(func)
+     return list(sig.parameters.keys())
+
+ def parse_int_or_float(num):
+     number = float(num)
+     if number.is_integer():
+         return int(number)
+     return number
+
  if __name__ == '__main__':
      options = ['A', 'B', 'C', 'D']
      answers = ['Context .... ANSWER: A', 'answer: A']
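Both helpers are plain module-level functions, so (assuming they stay exported from evalscope.utils.utils as the hunk suggests) they can be exercised directly:

from evalscope.utils.utils import get_supported_params, parse_int_or_float

def sample(a, b, *, c=None):
    return a

print(get_supported_params(sample))  # ['a', 'b', 'c']
print(parse_int_or_float('3.0'))     # 3  (whole numbers collapse to int)
print(parse_int_or_float('3.5'))     # 3.5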
evalscope/version.py CHANGED
@@ -1,4 +1,4 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
 
- __version__ = '0.15.1'
- __release_datetime__ = '2025-04-30 12:00:00'
+ __version__ = '0.16.1'
+ __release_datetime__ = '2025-06-03 20:00:00'