evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +67 -59
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +12 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +101 -18
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/utils.py +28 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +94 -32
- evalscope/config.py +54 -17
- evalscope/evaluator/evaluator.py +80 -41
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +15 -8
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +17 -25
- evalscope/perf/arguments.py +16 -7
- evalscope/perf/benchmark.py +0 -15
- evalscope/perf/main.py +72 -15
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +34 -16
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +61 -4
- evalscope/run.py +12 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -32
- tests/cli/test_collection.py +8 -6
- tests/cli/test_run.py +43 -17
- tests/perf/test_perf.py +23 -0
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,186 @@
+# the following code is largely adapted from https://github.com/lework/llm-benchmark
+
+import numpy as np
+from rich.console import Console
+from rich.panel import Panel
+from rich.style import Style
+from rich.table import Table
+from rich.text import Text
+
+from evalscope.utils.logger import get_logger
+from .benchmark_util import Metrics
+from .db_util import PercentileMetrics
+
+logger = get_logger()
+
+
+def analyze_results(all_results):
+    """Analyze all test results and generate a summary report"""
+    summary = []
+    total_tokens = 0
+    total_time = 0
+
+    for result in all_results:
+        total_metrics = result[0]
+        percentile_metrics = result[1]
+        percentiles = percentile_metrics[PercentileMetrics.PERCENTILES]
+        try:
+            concurrency = total_metrics.get(Metrics.NUMBER_OF_CONCURRENCY, 0)
+            rps = total_metrics.get(Metrics.REQUEST_THROUGHPUT, 0)
+            avg_latency = total_metrics.get(Metrics.AVERAGE_LATENCY, 0)
+            p99_latency = percentile_metrics.get(PercentileMetrics.LATENCY)[percentiles.index('99%')]
+            avg_tps = total_metrics.get(Metrics.OUTPUT_TOKEN_THROUGHPUT, 0)
+            avg_ttft = total_metrics.get(Metrics.AVERAGE_TIME_TO_FIRST_TOKEN, 0)
+            p99_ttft = percentile_metrics.get(PercentileMetrics.TTFT)[percentiles.index('99%')]
+            success_rate = (total_metrics.get(Metrics.SUCCEED_REQUESTS, 0)
+                            / total_metrics.get(Metrics.TOTAL_REQUESTS, 1)) * 100
+            avg_tpot = total_metrics.get(Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN, 0)
+            p99_tpot = percentile_metrics.get(PercentileMetrics.TPOT)[percentiles.index('99%')]
+
+            # Ensure all values are valid numbers
+            if any(x is None for x in [concurrency, rps, avg_latency, p99_latency, avg_tps, avg_ttft]):
+                logger.warning(f'Warning: Test results for concurrency {concurrency} contain invalid data, skipped')
+                continue
+
+            summary.append([
+                concurrency,
+                f'{rps:.2f}' if rps is not None else 'N/A',
+                f'{avg_latency:.3f}' if avg_latency is not None else 'N/A',
+                f'{p99_latency:.3f}' if p99_latency is not None else 'N/A',
+                f'{avg_tps:.2f}' if avg_tps is not None else 'N/A',
+                f'{avg_ttft:.3f}' if avg_ttft is not None else 'N/A',
+                f'{success_rate:.1f}%' if success_rate is not None else 'N/A',
+                f'{p99_ttft:.3f}' if p99_ttft is not None else 'N/A',
+                f'{avg_tpot:.3f}' if avg_tpot is not None else 'N/A',
+                f'{p99_tpot:.3f}' if p99_tpot is not None else 'N/A',
+            ])
+
+            total_tokens += total_metrics.get(Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST, 0) * total_metrics.get(
+                Metrics.SUCCEED_REQUESTS, 0)
+            total_time += total_metrics.get(Metrics.TIME_TAKEN_FOR_TESTS, 0)
+        except Exception as e:
+            logger.warning(
+                f"Warning: Error processing results for concurrency {result.get('concurrency', 'unknown')}: {str(e)}")
+            continue
+
+    if not summary:
+        logger.warning('Error: No valid test result data')
+        return [], 0, 0
+
+    return summary, total_tokens, total_time
+
+
+def print_summary(all_results, model_name):
+    """Print test results summary"""
+    summary, total_tokens, total_time = analyze_results(all_results)
+
+    if not summary:
+        logger.warning('No available test result data to display')
+        return
+
+    console = Console(width=100)  # Set fixed width
+
+    # Create title panel
+    title = Text('Performance Test Summary Report', style='bold')
+    console.print(Panel(title, width=60))
+
+    # Print basic information
+    basic_info = Table(show_header=False, width=60)
+    basic_info.add_column('Name', style='cyan', width=25)
+    basic_info.add_column('Value', style='green', width=35)
+
+    basic_info.add_row('Model', model_name)
+    basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
+    basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
+    basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')
+
+    console.print('\nBasic Information:')
+    console.print(basic_info)
+
+    # Create detailed performance metrics table
+    table = Table(
+        title='Detailed Performance Metrics',
+        show_header=True,
+        header_style='bold cyan',
+        border_style='blue',
+        width=100,  # Set total table width
+        pad_edge=False,  # Reduce edge padding
+        min_width=60,  # Minimum width
+    )
+
+    # Add columns (set fixed column widths)
+    table.add_column('Conc.', justify='right', style='cyan')
+    table.add_column('RPS', justify='right')
+    table.add_column('Avg Lat.(s)', justify='right')
+    table.add_column('P99 Lat.(s)', justify='right')
+    table.add_column('Gen. toks/s', justify='right')
+    table.add_column('Avg TTFT(s)', justify='right')
+    table.add_column('P99 TTFT(s)', justify='right')
+    table.add_column('Avg TPOT(s)', justify='right')
+    table.add_column('P99 TPOT(s)', justify='right')
+    table.add_column('Success Rate', justify='right', style='green')
+
+    # Add data rows
+    for row in summary:
+        try:
+            # Set row style based on success rate
+            success_rate = float(row[6].rstrip('%'))
+            row_style = 'green' if success_rate >= 95 else 'yellow' if success_rate >= 80 else 'red'
+
+            table.add_row(
+                str(row[0]),  # Concurrency
+                f'{float(row[1]):.2f}',  # RPS
+                f'{float(row[2]):.3f}',  # Average Latency
+                f'{float(row[3]):.3f}',  # P99 Latency
+                f'{float(row[4]):.2f}',  # Average TPS
+                f'{float(row[5]):.3f}',  # First Token Latency
+                f'{float(row[7]):.3f}',  # P99 TTFT
+                f'{float(row[8]):.3f}',  # Average TPOT
+                f'{float(row[9]):.3f}',  # P99 TPOT
+                row[6],  # Success Rate
+                style=row_style)
+        except ValueError as e:
+            console.print(f'Warning: Error processing row data: {str(e)}', style='bold red')
+            continue
+
+    console.print('\n')
+    console.print(table)
+
+    # Calculate and display best performance configuration
+    try:
+        best_rps_idx = np.argmax([float(row[1]) if row[1] != 'N/A' else -1 for row in summary])
+        best_latency_idx = np.argmin([float(row[2]) if row[2] != 'N/A' else float('inf') for row in summary])
+
+        perf_info = Table(title='Best Performance Configuration', show_header=False, box=None, width=60)
+        perf_info.add_column('Metric', style='cyan', width=20)
+        perf_info.add_column('Value', style='green', width=40)
+
+        perf_info.add_row('Highest RPS', f'Concurrency {summary[best_rps_idx][0]} ({summary[best_rps_idx][1]} req/sec)')
+        perf_info.add_row('Lowest Latency',
+                          f'Concurrency {summary[best_latency_idx][0]} ({summary[best_latency_idx][2]} seconds)')
+
+        console.print('\n')
+        console.print(perf_info)
+
+        # Performance recommendations
+        recommendations = []
+        if best_rps_idx == len(summary) - 1:
+            recommendations.append(
+                'The system seems not to have reached its performance bottleneck, try higher concurrency')
+        elif best_rps_idx == 0:
+            recommendations.append('Consider lowering concurrency, current load may be too high')
+        else:
+            recommendations.append(f'Optimal concurrency range is around {summary[best_rps_idx][0]}')
+
+        success_rate = float(summary[-1][6][:-1])
+        if success_rate < 95:
+            recommendations.append(
+                'Success rate is low at high concurrency, check system resources or reduce concurrency')
+
+        recommend_text = Text('\nPerformance Recommendations:', style='bold cyan')
+        console.print(recommend_text)
+        for rec in recommendations:
+            console.print(f'• {rec}', style='yellow')
+
+    except Exception as e:
+        console.print(f'Warning: Error generating performance analysis: {str(e)}', style='bold red')
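
The new evalscope/perf/utils/rich_display.py module (listed above with +186 lines) renders the perf benchmark summary with rich tables. For orientation, a minimal driver sketch follows: only the input shape consumed by analyze_results above — a list of (total_metrics, percentile_metrics) pairs keyed by the Metrics and PercentileMetrics constants — comes from this diff; the numeric values and the model name are illustrative, and in normal use evalscope.perf assembles these structures itself.

# Hedged sketch: drive the new rich summary with hand-built metric dicts.
from evalscope.perf.utils.benchmark_util import Metrics
from evalscope.perf.utils.db_util import PercentileMetrics
from evalscope.perf.utils.rich_display import print_summary

# One (total_metrics, percentile_metrics) pair per concurrency level; values are made up.
total_metrics = {
    Metrics.NUMBER_OF_CONCURRENCY: 8,
    Metrics.REQUEST_THROUGHPUT: 5.2,
    Metrics.AVERAGE_LATENCY: 1.42,
    Metrics.OUTPUT_TOKEN_THROUGHPUT: 830.0,
    Metrics.AVERAGE_TIME_TO_FIRST_TOKEN: 0.21,
    Metrics.AVERAGE_TIME_PER_OUTPUT_TOKEN: 0.009,
    Metrics.SUCCEED_REQUESTS: 100,
    Metrics.TOTAL_REQUESTS: 100,
    Metrics.AVERAGE_OUTPUT_TOKENS_PER_REQUEST: 160,
    Metrics.TIME_TAKEN_FOR_TESTS: 30.5,
}
percentile_metrics = {
    PercentileMetrics.PERCENTILES: ['50%', '90%', '99%'],
    PercentileMetrics.LATENCY: [1.3, 1.8, 2.4],
    PercentileMetrics.TTFT: [0.18, 0.3, 0.5],
    PercentileMetrics.TPOT: [0.008, 0.011, 0.015],
}

print_summary([(total_metrics, percentile_metrics)], model_name='qwen2.5-7b-instruct')  # sample model name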
evalscope/report/__init__.py
CHANGED
@@ -1,6 +1,38 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
 
-from evalscope.
-
-
-from
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .combinator import gen_report_table, gen_table, get_data_frame, get_report_list
+    from .generator import ReportGenerator
+    from .utils import Category, Report, ReportKey, Subset
+
+else:
+    _import_structure = {
+        'combinator': [
+            'gen_table',
+            'get_data_frame',
+            'get_report_list',
+            'gen_report_table',
+        ],
+        'generator': [
+            'ReportGenerator',
+        ],
+        'utils': [
+            'Category',
+            'Report',
+            'ReportKey',
+            'Subset',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
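
evalscope/report now defers its submodule imports through _LazyModule from evalscope.utils.import_utils, whose implementation is not part of this diff. As a rough illustration of the pattern only (a sketch, not evalscope's actual class), a lazy module records which submodule defines each exported symbol and performs the real import on first attribute access:

# Illustrative lazy-module pattern; evalscope's _LazyModule takes more arguments.
import importlib
import types


class LazyModule(types.ModuleType):

    def __init__(self, name, import_structure):
        super().__init__(name)
        # e.g. {'combinator': ['gen_table', ...]} -> {'gen_table': 'combinator', ...}
        self._symbol_to_module = {
            symbol: submodule
            for submodule, symbols in import_structure.items() for symbol in symbols
        }
        self.__all__ = list(self._symbol_to_module)

    def __getattr__(self, name):
        if name not in self._symbol_to_module:
            raise AttributeError(f'module {self.__name__!r} has no attribute {name!r}')
        submodule = importlib.import_module(f'{self.__name__}.{self._symbol_to_module[name]}')
        value = getattr(submodule, name)
        setattr(self, name, value)  # cache so the submodule is imported only once
        return value

# Wiring, as done in the __init__.py above:
# sys.modules[__name__] = LazyModule(__name__, _import_structure)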
evalscope/report/combinator.py
CHANGED
@@ -48,6 +48,14 @@ def gen_table(reports_path_list: list) -> str:
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
 
 
+def gen_report_table(report: Report) -> str:
+    """
+    Generate a report table for a single report.
+    """
+    table = report.to_dataframe(flatten_metrics=True, flatten_categories=True)
+    return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
 class ReportsRecorder:
     COMMON_DATASET_PATH = []
     CUSTOM_DATASET_PATH = []
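
A short sketch of the new gen_report_table helper applied to a saved report; the JSON path below is a placeholder for a report file written by an earlier evaluation run.

# Render a single saved report as a grid table with the new helper.
from evalscope.report import Report, gen_report_table

report = Report.from_json('outputs/20250101_000000/reports/qwen2.5/arc.json')  # placeholder path
print(gen_report_table(report))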
evalscope/report/generator.py
CHANGED
@@ -1,24 +1,42 @@
 import pandas as pd
 from pandas import DataFrame
+from typing import TYPE_CHECKING
 
 from evalscope.constants import DataCollection
 from evalscope.report.utils import *
 
+if TYPE_CHECKING:
+    from evalscope.benchmarks import DataAdapter
+
 
 class ReportGenerator:
 
     @staticmethod
-    def gen_report(subset_score_map: dict,
+    def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
         """
-        Generate report for specific dataset.
-
-
-
+        Generate a report for a specific dataset based on provided subset scores.
+
+        Args:
+            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
+                {
+                    'subset_name': [
+                        {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+                        {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+                    ],
+                    ...
+                }
+            report_name (str): The name of the report to generate.
+            data_adapter (DataAdapter): An adapter object for data handling.
+
+        Returns:
+            Report: A structured report object containing metrics, categories, and subsets.
+
+        >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
         """  # noqa: E501
 
-        dataset_name =
-
-
+        dataset_name = data_adapter.name
+        category_map = data_adapter.category_map
+        report_name = f'{model_name}@{dataset_name}'
 
         def flatten_subset() -> DataFrame:
             """
@@ -59,7 +77,13 @@ class ReportGenerator:
 
         metrics_list.append(Metric(name=metric_name, categories=categories))
 
-        report = Report(
+        report = Report(
+            name=report_name,
+            metrics=metrics_list,
+            dataset_name=dataset_name,
+            model_name=model_name,
+            dataset_description=data_adapter.description,
+            dataset_pretty_name=data_adapter.pretty_name)
         return report
 
     @staticmethod
evalscope/report/utils.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import os
 import pandas as pd
 from collections import defaultdict
 from dataclasses import asdict, dataclass, field
@@ -6,6 +7,9 @@ from typing import Any, Dict, List
 
 from evalscope.metrics import macro_mean, micro_mean
 from evalscope.utils import normalize_score
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()
 
 
 @dataclass
@@ -70,13 +74,28 @@ class ReportKey:
     score = 'Score'
 
 
+ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
+1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
+2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
+3. 只列出报告本身,不要有其他多余内容
+4. 输出报告语言为{language}
+
+```json
+{report_str}
+```
+"""
+
+
 @dataclass
 class Report:
     name: str = 'default_report'
     dataset_name: str = 'default_dataset'
+    dataset_pretty_name: str = ''
+    dataset_description: str = ''
     model_name: str = 'default_model'
     score: float = 0.0
     metrics: List[Metric] = field(default_factory=list)
+    analysis: str = 'N/A'
 
     def __post_init__(self):
         self.score = self.metrics[0].score  # NOTE: only use the first metric by default
@@ -84,19 +103,33 @@ class Report:
     def to_dict(self) -> Dict[str, Any]:
         return asdict(self)
 
+    def to_json_str(self) -> str:
+        return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)
+
+    def to_json(self, json_file: str):
+        # ensure the directory exists
+        os.makedirs(os.path.dirname(json_file), exist_ok=True)
+        # write the report to a json file
+        with open(json_file, 'w', encoding='utf-8') as f:
+            json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)
+
     @classmethod
     def from_dict(cls, data: dict):
         metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
         return cls(
             name=data['name'],
+            dataset_name=data['dataset_name'],
+            dataset_pretty_name=data.get('dataset_pretty_name'),
+            dataset_description=data.get('dataset_description'),
             score=data['score'],
+            model_name=data['model_name'],
             metrics=metrics,
-
-
+            analysis=data.get('analysis', 'N/A'),
+        )
 
     @classmethod
     def from_json(cls, json_file: str):
-        with open(json_file, 'r') as f:
+        with open(json_file, 'r', encoding='utf-8') as f:
             data = json.load(f)
         return cls.from_dict(data)
 
@@ -111,7 +144,7 @@ class Report:
                 table[ReportKey.category_name].append(category.name)
                 table[ReportKey.subset_name].append(subset.name)
                 table[ReportKey.num].append(subset.num)
-                table[ReportKey.score].append(subset.score)
+                table[ReportKey.score].append(subset.score)
                 # NOTE: only flatten metrics if needed, use the first metric by default
                 if not flatten_metrics:
                     break
@@ -131,3 +164,27 @@ class Report:
 
         df_categories.drop(columns=[ReportKey.category_name], inplace=True)
         return df_categories
+
+    def generate_analysis(self, judge_llm_config: dict) -> str:
+        import locale
+
+        from evalscope.metrics import LLMJudge
+
+        try:
+            # get the default locale
+            lang, _ = locale.getlocale()
+
+            if lang is None:
+                language = '中文'
+            else:
+                language = 'en' if lang.startswith('en') else '中文'
+
+            prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
+            judge_llm = LLMJudge(**judge_llm_config)
+            response = judge_llm(prompt)
+        except Exception as e:
+            logger.error(f'Error generating analysis: {e}')
+            response = 'N/A'
+
+        self.analysis = response
+        return response
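
Taken together, the new Report fields and helpers let a saved report be reloaded, re-serialized, and annotated with an LLM-written analysis. The Chinese ANALYSIS_PROMPT asks the judge model for a four-part report (overall performance, key metric analysis, improvement suggestions, conclusion) in the detected locale language. A sketch follows, with placeholder paths and placeholder judge settings; LLMJudge's actual keyword arguments live in evalscope/metrics/llm_judge.py and are not shown in this diff.

# Sketch of the Report JSON round-trip and LLM-based analysis (placeholder values).
from evalscope.report import Report

report = Report.from_json('outputs/reports/qwen2.5/gsm8k.json')  # placeholder path
print(report.to_json_str())  # pretty-printed JSON, ensure_ascii=False

# Hypothetical judge settings; replace with the kwargs your LLMJudge actually accepts.
judge_llm_config = {'api_url': 'http://127.0.0.1:8801/v1', 'model_id': 'qwen2.5-72b-instruct'}
report.generate_analysis(judge_llm_config)  # falls back to 'N/A' if the judge call fails
print(report.analysis)

report.to_json('outputs/reports/qwen2.5/gsm8k_with_analysis.json')  # creates parent dirs if needed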
evalscope/run.py
CHANGED
@@ -43,6 +43,9 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     else:
         result = evaluate_model(task_cfg, outputs)
 
+    logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
+    logger.info(f'Output directory: {outputs.outputs_dir}')
+
     return result
 
 
@@ -109,6 +112,7 @@ def get_backend_manager_class(eval_backend: EvalBackend):
 def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
     from evalscope.models import get_local_model
+    from evalscope.report import gen_table
 
     # Initialize evaluator
     eval_results = {}
@@ -122,10 +126,18 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     task_cfg.dump_yaml(outputs.configs_dir)
     logger.info(task_cfg)
 
+    # Run evaluation for each evaluator
     for evaluator in evaluators:
         res_dict = evaluator.eval()
         eval_results[evaluator.dataset_name] = res_dict
 
+    # Make overall report
+    try:
+        report_table: str = gen_table([outputs.reports_dir])
+        logger.info(f'Overall report table: \n{report_table} \n')
+    except Exception:
+        logger.error('Failed to generate report table.')
+
     # Clean up
     if base_model is not None:
         import gc
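
run_single_task and evaluate_model are internal; the usual entry point is run_task with a TaskConfig, as in the sketch below. The model/datasets/limit fields shown are the customary evalscope configuration fields and are assumed here rather than defined by this diff; with this release the run additionally logs the output directory and an overall report table at the end.

# Hedged sketch of a full run, assuming the usual TaskConfig fields.
from evalscope.config import TaskConfig
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='Qwen/Qwen2.5-0.5B-Instruct',  # placeholder model id
    datasets=['gsm8k'],
    limit=5,
)
run_task(task_cfg=task_cfg)  # now logs 'Finished evaluation ...' and the overall report table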
evalscope/third_party/toolbench_static/toolbench_static.py
CHANGED
@@ -6,11 +6,12 @@ from typing import Union
 from evalscope.third_party.toolbench_static.eval import EvalArgs, run_eval
 from evalscope.third_party.toolbench_static.infer import InferArgs, run_infer
 from evalscope.utils import get_logger
+from evalscope.utils.deprecation_utils import deprecated
 from evalscope.utils.io_utils import json_to_dict, yaml_to_dict
 
 logger = get_logger()
 
-
+@deprecated(since='0.15.1', remove_in='0.18.0', alternative='Native implementation of ToolBench')
 def run_task(task_cfg: Union[str, dict]):
 
     if isinstance(task_cfg, str):
evalscope/utils/deprecation_utils.py
ADDED
@@ -0,0 +1,42 @@
+import functools
+import inspect
+from typing import Callable, Optional
+
+from .logger import get_logger
+
+logger = get_logger()
+
+
+def deprecated(since: str, remove_in: Optional[str] = None, alternative: Optional[str] = None) -> Callable:
+    """
+    Decorator to mark functions as deprecated.
+
+    :param since: String indicating the version since deprecation
+    :param remove_in: Optional string indicating the version when it will be removed
+    :param alternative: Optional string suggesting an alternative
+    :return: Decorated function
+    """
+
+    def decorator(func: Callable) -> Callable:
+
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # Get the file name where the function is defined
+            file_name = inspect.getfile(func)
+
+            # Construct the warning message
+            warning_parts = [
+                f'{func.__name__} in {file_name} has been deprecated since version {since}',
+                f'and will be removed in version {remove_in}' if remove_in else None,
+                f'Use {alternative} instead' if alternative else None
+            ]
+            warning_message = '. '.join(filter(None, warning_parts))
+
+            # Log the warning
+            logger.warning(warning_message)
+
+            return func(*args, **kwargs)
+
+        return wrapper
+
+    return decorator
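
This release applies the decorator to toolbench_static.run_task (above); marking any other function works the same way. A small sketch with a hypothetical helper:

# Hypothetical function marked with the new decorator; each call logs a warning
# through evalscope's logger and then runs the function normally.
from evalscope.utils.deprecation_utils import deprecated


@deprecated(since='0.16.0', remove_in='0.18.0', alternative='new_score()')
def old_score(x):
    return x * 100


old_score(0.42)  # warning is logged, return value is unchanged (42.0)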
evalscope/utils/logger.py
CHANGED
@@ -10,7 +10,7 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 
 detailed_formatter = logging.Formatter(detailed_format)
 simple_formatter = logging.Formatter(simple_format)
-DEFAULT_LEVEL = logging.DEBUG if os.getenv('
+DEFAULT_LEVEL = logging.DEBUG if os.getenv('EVALSCOPE_LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO
 
 logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)
 
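
Because DEFAULT_LEVEL is evaluated when the logger module is imported, the EVALSCOPE_LOG_LEVEL variable has to be set before evalscope is imported; a minimal sketch:

# Enable debug logging; the environment variable is read at import time.
import os

os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

from evalscope.utils.logger import get_logger  # noqa: E402 (import after setting the env var)

logger = get_logger()
logger.debug('debug logging is now enabled')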
evalscope/utils/utils.py
CHANGED
@@ -10,6 +10,7 @@ import os
 import random
 import re
 import torch
+from inspect import signature
 from typing import Any, Dict, List, Tuple, Union
 
 from evalscope.utils.logger import get_logger
@@ -313,6 +314,17 @@ def seed_everything(seed: int):
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
 
+def get_supported_params(func):
+    """Get the supported parameters of a function."""
+    sig = signature(func)
+    return list(sig.parameters.keys())
+
+def parse_int_or_float(num):
+    number = float(num)
+    if number.is_integer():
+        return int(number)
+    return number
+
 if __name__ == '__main__':
     options = ['A', 'B', 'C', 'D']
     answers = ['Context .... ANSWER: A', 'answer: A']
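
A quick illustration of the two new helpers in evalscope/utils/utils.py; the generate function below is just a stand-in.

from evalscope.utils.utils import get_supported_params, parse_int_or_float


def generate(prompt, max_tokens=512, temperature=0.7):  # stand-in function
    ...


print(get_supported_params(generate))  # ['prompt', 'max_tokens', 'temperature']
print(parse_int_or_float('3.0'))       # 3 (int)
print(parse_int_or_float('0.25'))      # 0.25 (float)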
evalscope/version.py
CHANGED