evalscope 0.16.0__py3-none-any.whl → 0.16.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +40 -30
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/clip_benchmark/dataset_builder.py +23 -11
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +77 -39
- evalscope/benchmarks/aigc/t2i/evalmuse_adapter.py +1 -0
- evalscope/benchmarks/aime/aime24_adapter.py +3 -1
- evalscope/benchmarks/aime/aime25_adapter.py +3 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +5 -0
- evalscope/benchmarks/arc/arc_adapter.py +3 -0
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +7 -3
- evalscope/benchmarks/bbh/bbh_adapter.py +3 -0
- evalscope/benchmarks/benchmark.py +2 -0
- evalscope/benchmarks/bfcl/__init__.py +0 -0
- evalscope/benchmarks/bfcl/bfcl_adapter.py +237 -0
- evalscope/benchmarks/ceval/ceval_adapter.py +3 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +4 -1
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +3 -0
- evalscope/benchmarks/competition_math/competition_math_adapter.py +3 -0
- evalscope/benchmarks/data_adapter.py +99 -16
- evalscope/benchmarks/data_collection/data_collection_adapter.py +1 -0
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +85 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/drop_adapter.py +3 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +91 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +19 -23
- evalscope/benchmarks/general_qa/general_qa_adapter.py +3 -0
- evalscope/benchmarks/gpqa/gpqa_adapter.py +3 -0
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +3 -0
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +3 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +3 -0
- evalscope/benchmarks/ifeval/ifeval_adapter.py +3 -0
- evalscope/benchmarks/iquiz/iquiz_adapter.py +3 -0
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +4 -1
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +3 -0
- evalscope/benchmarks/math_500/math_500_adapter.py +3 -0
- evalscope/benchmarks/mmlu/mmlu_adapter.py +3 -0
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +3 -0
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +3 -0
- evalscope/benchmarks/musr/musr_adapter.py +3 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +348 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/process_bench/process_bench_adapter.py +3 -0
- evalscope/benchmarks/race/race_adapter.py +3 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +3 -0
- evalscope/benchmarks/super_gpqa/five_shot_prompt.txt +1 -0
- evalscope/benchmarks/super_gpqa/super_gpqa_adapter.py +21 -3
- evalscope/benchmarks/super_gpqa/zero_shot_prompt.txt +1 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +9 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +3 -0
- evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +3 -0
- evalscope/benchmarks/utils.py +25 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +3 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +68 -34
- evalscope/config.py +8 -2
- evalscope/constants.py +1 -1
- evalscope/evaluator/evaluator.py +40 -28
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/t2v_metrics/__init__.py +9 -23
- evalscope/models/adapters/__init__.py +2 -0
- evalscope/models/adapters/base_adapter.py +31 -27
- evalscope/models/adapters/bfcl_adapter.py +244 -0
- evalscope/models/adapters/server_adapter.py +80 -23
- evalscope/models/custom/custom_model.py +0 -3
- evalscope/models/custom/dummy_model.py +77 -39
- evalscope/models/local_model.py +1 -1
- evalscope/models/register.py +2 -1
- evalscope/perf/arguments.py +4 -2
- evalscope/perf/benchmark.py +16 -12
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/api/openai_api.py +2 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +40 -6
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +84 -4
- evalscope/run.py +12 -0
- evalscope/summarizer.py +1 -1
- evalscope/utils/io_utils.py +59 -2
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/METADATA +16 -13
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/RECORD +114 -100
- tests/aigc/test_t2i.py +48 -11
- tests/cli/test_all.py +14 -3
- tests/cli/test_collection.py +6 -4
- tests/cli/test_run.py +50 -25
- tests/rag/test_clip_benchmark.py +5 -1
- tests/rag/test_mteb.py +51 -7
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.2.dist-info}/top_level.txt +0 -0
evalscope/perf/benchmark.py
CHANGED

@@ -1,11 +1,8 @@
 import asyncio
-import copy
 import json
 import numpy as np
-import os
 import platform
 import sqlite3
-import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm

@@ -17,7 +14,6 @@ from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
 from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
 from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
-from evalscope.perf.utils.local_server import start_app
 from evalscope.utils.logger import get_logger

 logger = get_logger()

@@ -45,14 +41,27 @@ async def get_requests(args: Arguments) -> AsyncGenerator[dict, None]:
     message_generator_class = DatasetRegistry(args.dataset)
     message_generator = message_generator_class(args)

+    dataset_messages = []
+    try:
+        for messages in message_generator:
+            dataset_messages.append(messages)
+    except StopIteration:
+        pass
+
+    if not dataset_messages:
+        raise Exception('Dataset is empty!')
+
     count = 0
-
+    dataset_index = 0
+
+    while count < args.number:
+        messages = dataset_messages[dataset_index]
         request = query_generator.build_request(messages, args)
         if request is not None:
             yield request
             count += 1
-
-
+
+        dataset_index = (dataset_index + 1) % len(dataset_messages)

     if args.prompt:
         prompt = load_prompt(args.prompt)

@@ -164,11 +173,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:

 @exception_handler
 async def connect_test(args: Arguments) -> bool:
-    if args.api.startswith('local'):
-        # start local server
-        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
-        server.start()
-
     if (not args.no_test_connection) and (not await test_connection(args)):
         raise TimeoutError('Test connection failed')
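The practical effect of this change is that `get_requests` now materializes the dataset once and then cycles through it until the requested number of prompts has been produced, instead of stopping when the dataset runs out. A minimal self-contained sketch of that scheduling logic, with made-up prompts and a plain list standing in for the dataset plugin and `query_generator.build_request()`:

```python
# Standalone sketch of the new request cycling in get_requests(); the prompts
# and `number` value are placeholders, not package data.
dataset_messages = [
    [{'role': 'user', 'content': 'hello'}],
    [{'role': 'user', 'content': 'write a haiku about the sea'}],
]
number = 5  # corresponds to args.number (total requests to send)

requests = []
count = 0
dataset_index = 0
while count < number:
    messages = dataset_messages[dataset_index]
    requests.append(messages)  # the real code yields a built request here
    count += 1
    dataset_index = (dataset_index + 1) % len(dataset_messages)

print(len(requests))  # 5 -- prompts are reused once the dataset is exhausted
```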
evalscope/perf/main.py
CHANGED

@@ -2,9 +2,11 @@ import asyncio
 import copy
 import os
 import platform
+import threading
 import time
 from argparse import Namespace

+from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything

@@ -82,6 +84,11 @@ def run_perf_benchmark(args):
     if args.swanlab_api_key:
         init_swanlab(args)

+    # Initialize local server if needed
+    if args.api.startswith('local'):
+        # start local server
+        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
+        server.start()
     # Start benchmark
     if len(args.number) == 1:
         return run_one_benchmark(args, output_path=output_path)
evalscope/perf/plugin/api/openai_api.py
CHANGED

@@ -75,6 +75,8 @@ class OpenaiPlugin(ApiPluginBase):
             payload['min_tokens'] = param.min_tokens
         if param.frequency_penalty is not None:
             payload['frequency_penalty'] = param.frequency_penalty
+        if param.repetition_penalty is not None:
+            payload['repetition_penalty'] = param.repetition_penalty
         if param.logprobs is not None:
             payload['logprobs'] = param.logprobs
         if param.n_choices is not None:
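A hedged sketch of the pattern this hunk extends: optional sampling parameters are copied into the request payload only when the user actually set them, so unset values never override server-side defaults. `param` and `payload` below are stand-ins, not the plugin's real objects:

```python
from types import SimpleNamespace

# Stand-in for the parsed perf arguments; only repetition_penalty is set here.
param = SimpleNamespace(frequency_penalty=None, repetition_penalty=1.05, logprobs=None)

payload = {'model': 'qwen2.5-7b-instruct', 'messages': [{'role': 'user', 'content': 'hi'}]}
for key in ('frequency_penalty', 'repetition_penalty', 'logprobs'):
    value = getattr(param, key, None)
    if value is not None:  # unset options are simply omitted from the payload
        payload[key] = value

print(payload)  # only repetition_penalty was added
```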
evalscope/perf/plugin/datasets/custom.py
CHANGED

@@ -22,3 +22,18 @@ class CustomDatasetPlugin(DatasetPluginBase):
                 yield [{'role': 'user', 'content': prompt}]
             else:
                 yield prompt
+
+
+if __name__ == '__main__':
+    from evalscope.perf.arguments import Arguments
+    from evalscope.perf.main import run_perf_benchmark
+
+    args = Arguments(
+        model='qwen2.5-7b-instruct',
+        url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        dataset_path='outputs/perf_data.txt',
+        api_key='EMPTY',
+        dataset='custom',
+    )
+
+    run_perf_benchmark(args)
evalscope/perf/utils/benchmark_util.py
CHANGED

@@ -38,7 +38,7 @@ class BenchmarkData:
             self.first_chunk_latency = self.query_latency
             self.n_chunks = 1
             self.n_chunks_time = self.query_latency
-            self.time_per_output_token = self.n_chunks_time / self.
+            self.time_per_output_token = self.n_chunks_time / self.n_chunks

     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
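The removed right-hand side is cut off in the upstream diff listing; what 0.16.2 ships is a fallback that divides the chunk time by the chunk count when a response arrives as a single chunk. A tiny worked example with made-up latencies:

```python
# Hypothetical non-streaming response treated as one chunk; values are made up.
query_latency = 1.8          # seconds for the whole request
first_chunk_latency = query_latency
n_chunks = 1
n_chunks_time = query_latency

# Fallback used in 0.16.2 when no per-token timing is available.
time_per_output_token = n_chunks_time / n_chunks
print(time_per_output_token)  # 1.8 seconds
```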
evalscope/perf/utils/local_server.py
CHANGED

@@ -96,6 +96,7 @@ def create_app(model, attn_implementation=None) -> FastAPI:


 def start_app(args: Arguments):
+    logger.info('Starting local server, please wait...')
     if args.api == 'local':
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
evalscope/perf/utils/log_utils.py
CHANGED

@@ -34,8 +34,15 @@ def init_swanlab(args: Arguments) -> None:
     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
     name = args.name if args.name else f'{args.model_id}_{current_time}'
     swanlab.config.update({'framework': '📏evalscope'})
-
-        project
-        name
-        config
-        mode
+    init_kwargs = {
+        'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
+        'name': name,
+        'config': args.to_dict(),
+        'mode': 'local' if args.swanlab_api_key == 'local' else None
+    }
+
+    workspace = os.getenv('SWANLAB_WORKSPACE')
+    if workspace:
+        init_kwargs['workspace'] = workspace
+
+    swanlab.init(**init_kwargs)
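The swanlab initialization is now assembled as a kwargs dict so that optional settings can come from the environment. A hedged sketch of how the two environment variables appear to be consumed; the run name, config values, and API-key handling below are placeholders, and `swanlab.init` is left commented out because it needs a swanlab account or local mode:

```python
import os

# Placeholder run settings standing in for the real perf arguments.
os.environ.setdefault('SWANLAB_PROJ_NAME', 'perf_benchmark')  # project name
os.environ.setdefault('SWANLAB_WORKSPACE', 'my-team')         # optional workspace
name = 'qwen2.5-7b-instruct_20250101_120000'
swanlab_api_key = 'local'

init_kwargs = {
    'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
    'name': name,
    'config': {'model': 'qwen2.5-7b-instruct'},
    'mode': 'local' if swanlab_api_key == 'local' else None,
}
workspace = os.getenv('SWANLAB_WORKSPACE')
if workspace:
    init_kwargs['workspace'] = workspace

# swanlab.init(**init_kwargs)  # requires the swanlab package and credentials
print(init_kwargs)
```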
evalscope/perf/utils/rich_display.py
CHANGED

@@ -92,7 +92,7 @@ def print_summary(all_results, model_name):
     basic_info.add_row('Model', model_name)
     basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
     basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
-    basic_info.add_row('Avg Output Rate', f'{total_tokens/total_time:.2f} tokens/sec')
+    basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')

     console.print('\nBasic Information:')
     console.print(basic_info)
evalscope/report/__init__.py
CHANGED

@@ -1,6 +1,38 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING

-from evalscope.
-
-
-from
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .combinator import gen_report_table, gen_table, get_data_frame, get_report_list
+    from .generator import ReportGenerator
+    from .utils import Category, Report, ReportKey, Subset
+
+else:
+    _import_structure = {
+        'combinator': [
+            'gen_table',
+            'get_data_frame',
+            'get_report_list',
+            'gen_report_table',
+        ],
+        'generator': [
+            'ReportGenerator',
+        ],
+        'utils': [
+            'Category',
+            'Report',
+            'ReportKey',
+            'Subset',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
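The module now registers a lazy proxy so `import evalscope.report` stays cheap and submodules are imported only when one of the listed names is first accessed. `_LazyModule` is evalscope's internal helper; the snippet below is a generic illustration of the same pattern, not that class:

```python
# Generic lazy-import pattern (illustrative only, not evalscope's _LazyModule).
import importlib
import types


class LazyModule(types.ModuleType):
    """Module proxy that imports a submodule on first attribute access."""

    def __init__(self, name: str, import_structure: dict):
        super().__init__(name)
        # Map each exported attribute to the submodule that defines it.
        self._attr_to_module = {
            attr: module for module, attrs in import_structure.items() for attr in attrs
        }

    def __getattr__(self, attr: str):
        module_name = self._attr_to_module.get(attr)
        if module_name is None:
            raise AttributeError(f'module {self.__name__!r} has no attribute {attr!r}')
        submodule = importlib.import_module(f'{self.__name__}.{module_name}')
        value = getattr(submodule, attr)
        setattr(self, attr, value)  # cache so later lookups skip __getattr__
        return value
```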
evalscope/report/combinator.py
CHANGED

@@ -34,17 +34,51 @@ def get_report_list(reports_path_list: List[str]) -> List[Report]:

 def get_data_frame(report_list: List[Report],
                    flatten_metrics: bool = True,
-                   flatten_categories: bool = True
+                   flatten_categories: bool = True,
+                   add_overall_metric: bool = False) -> pd.DataFrame:
     tables = []
     for report in report_list:
-        df = report.to_dataframe(
+        df = report.to_dataframe(
+            flatten_metrics=flatten_metrics,
+            flatten_categories=flatten_categories,
+            add_overall_metric=add_overall_metric)
         tables.append(df)
     return pd.concat(tables, ignore_index=True)


-def gen_table(reports_path_list: list
-
-
+def gen_table(reports_path_list: list[str] = None,
+              report_list: list[Report] = None,
+              flatten_metrics: bool = True,
+              flatten_categories: bool = True,
+              add_overall_metric: bool = False) -> str:
+    """
+    Generates a formatted table from a list of report paths or Report objects.
+
+    Args:
+        reports_path_list (list[str], optional): List of file paths to report files.
+            Either this or `report_list` must be provided.
+        report_list (list[Report], optional): List of Report objects.
+            Either this or `reports_path_list` must be provided.
+        flatten_metrics (bool, optional): Whether to flatten the metrics in the output table. Defaults to True.
+        flatten_categories (bool, optional): Whether to flatten the categories in the output table. Defaults to True.
+        add_overall_metric (bool, optional): Whether to add an overall metric column to the table. Defaults to False.
+
+    Returns:
+        str: A string representation of the table in grid format.
+
+    Raises:
+        AssertionError: If neither `reports_path_list` nor `report_list` is provided.
+    """
+    assert (reports_path_list is not None) or (report_list is not None), \
+        'Either reports_path_list or report_list must be provided.'
+    if report_list is None:
+        report_list = get_report_list(reports_path_list)
+    # Generate a DataFrame from the report list
+    table = get_data_frame(
+        report_list,
+        flatten_metrics=flatten_metrics,
+        flatten_categories=flatten_categories,
+        add_overall_metric=add_overall_metric)
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)


@@ -60,7 +94,7 @@ if __name__ == '__main__':
     report_dir_1 = './outputs/20250117_151926'
     # report_dir_2 = './outputs/20250107_204445/reports'

-    report_table = gen_table([report_dir_1])
+    report_table = gen_table(reports_path_list=[report_dir_1])
     print(report_table)

     # ALL VALUES ONLY FOR EXAMPLE
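Usage sketch for the new `gen_table` signature: it accepts either report directories or already-loaded `Report` objects, and `add_overall_metric=True` appends an OVERALL row per metric. The output directory below is a placeholder:

```python
# Hedged usage sketch; './outputs/20250117_151926' is a placeholder report dir.
from evalscope.report import gen_table, get_report_list

# 1) Directly from report directories on disk.
print(gen_table(reports_path_list=['./outputs/20250117_151926'], add_overall_metric=True))

# 2) From Report objects that are already loaded in memory.
reports = get_report_list(['./outputs/20250117_151926'])
print(gen_table(report_list=reports, flatten_categories=False))
```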
evalscope/report/generator.py
CHANGED

@@ -1,24 +1,42 @@
 import pandas as pd
 from pandas import DataFrame
+from typing import TYPE_CHECKING

 from evalscope.constants import DataCollection
 from evalscope.report.utils import *

+if TYPE_CHECKING:
+    from evalscope.benchmarks import DataAdapter
+

 class ReportGenerator:

     @staticmethod
-    def gen_report(subset_score_map: dict,
+    def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
         """
-        Generate report for specific dataset.
-
-
-
+        Generate a report for a specific dataset based on provided subset scores.
+
+        Args:
+            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
+                {
+                    'subset_name': [
+                        {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+                        {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+                    ],
+                    ...
+                }
+            report_name (str): The name of the report to generate.
+            data_adapter (DataAdapter): An adapter object for data handling.
+
+        Returns:
+            Report: A structured report object containing metrics, categories, and subsets.
+
+        >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
         """  # noqa: E501

-        dataset_name =
-
-
+        dataset_name = data_adapter.name
+        category_map = data_adapter.category_map
+        report_name = f'{model_name}@{dataset_name}'

         def flatten_subset() -> DataFrame:
             """

@@ -59,7 +77,13 @@ class ReportGenerator:

         metrics_list.append(Metric(name=metric_name, categories=categories))

-        report = Report(
+        report = Report(
+            name=report_name,
+            metrics=metrics_list,
+            dataset_name=dataset_name,
+            model_name=model_name,
+            dataset_description=data_adapter.description,
+            dataset_pretty_name=data_adapter.pretty_name)
         return report

     @staticmethod
evalscope/report/utils.py
CHANGED

@@ -1,4 +1,5 @@
 import json
+import os
 import pandas as pd
 from collections import defaultdict
 from dataclasses import asdict, dataclass, field

@@ -6,6 +7,9 @@ from typing import Any, Dict, List

 from evalscope.metrics import macro_mean, micro_mean
 from evalscope.utils import normalize_score
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()


 @dataclass

@@ -70,13 +74,28 @@ class ReportKey:
     score = 'Score'


+ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
+1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
+2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
+3. 只列出报告本身,不要有其他多余内容
+4. 输出报告语言为{language}
+
+```json
+{report_str}
+```
+"""
+
+
 @dataclass
 class Report:
     name: str = 'default_report'
     dataset_name: str = 'default_dataset'
+    dataset_pretty_name: str = ''
+    dataset_description: str = ''
     model_name: str = 'default_model'
     score: float = 0.0
     metrics: List[Metric] = field(default_factory=list)
+    analysis: str = 'N/A'

     def __post_init__(self):
         self.score = self.metrics[0].score  # NOTE: only use the first metric by default

@@ -84,15 +103,29 @@ class Report:
     def to_dict(self) -> Dict[str, Any]:
         return asdict(self)

+    def to_json_str(self) -> str:
+        return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)
+
+    def to_json(self, json_file: str):
+        # ensure the directory exists
+        os.makedirs(os.path.dirname(json_file), exist_ok=True)
+        # write the report to a json file
+        with open(json_file, 'w', encoding='utf-8') as f:
+            json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)
+
     @classmethod
     def from_dict(cls, data: dict):
         metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
         return cls(
             name=data['name'],
+            dataset_name=data['dataset_name'],
+            dataset_pretty_name=data.get('dataset_pretty_name'),
+            dataset_description=data.get('dataset_description'),
             score=data['score'],
+            model_name=data['model_name'],
             metrics=metrics,
-
-
+            analysis=data.get('analysis', 'N/A'),
+        )

     @classmethod
     def from_json(cls, json_file: str):

@@ -100,18 +133,41 @@ class Report:
             data = json.load(f)
         return cls.from_dict(data)

-    def to_dataframe(self,
+    def to_dataframe(self,
+                     flatten_metrics: bool = True,
+                     flatten_categories: bool = True,
+                     add_overall_metric: bool = False) -> pd.DataFrame:
+        """
+        Convert the report to a pandas DataFrame.
+        Args:
+            flatten_metrics (bool): Whether to flatten the metrics to a single row.
+            flatten_categories (bool): Whether to flatten the categories to multiple rows.
+            add_overall_metric (bool): Whether to add an overall metric row.
+        Returns:
+            pd.DataFrame: The report as a pandas DataFrame.
+        """
         table = defaultdict(list)
         for metric in self.metrics:
+            metric_count = 0
             for category in metric.categories:
                 for subset in category.subsets:
+                    metric_count += 1
                     table[ReportKey.model_name].append(self.model_name)
                     table[ReportKey.dataset_name].append(self.dataset_name)
                     table[ReportKey.metric_name].append(metric.name)
                     table[ReportKey.category_name].append(category.name)
                     table[ReportKey.subset_name].append(subset.name)
                     table[ReportKey.num].append(subset.num)
-                    table[ReportKey.score].append(subset.score)
+                    table[ReportKey.score].append(subset.score)
+            # add overall metric when there are multiple subsets
+            if metric_count > 1 and add_overall_metric:
+                table[ReportKey.model_name].append(self.model_name)
+                table[ReportKey.dataset_name].append(self.dataset_name)
+                table[ReportKey.metric_name].append(metric.name)
+                table[ReportKey.category_name].append(('-', ))
+                table[ReportKey.subset_name].append('OVERALL')
+                table[ReportKey.num].append(metric.num)
+                table[ReportKey.score].append(metric.score)
             # NOTE: only flatten metrics if needed, use the first metric by default
             if not flatten_metrics:
                 break

@@ -131,3 +187,27 @@ class Report:

         df_categories.drop(columns=[ReportKey.category_name], inplace=True)
         return df_categories
+
+    def generate_analysis(self, judge_llm_config: dict) -> str:
+        import locale
+
+        from evalscope.metrics import LLMJudge
+
+        try:
+            # get the default locale
+            lang, _ = locale.getlocale()
+
+            if lang is None:
+                language = '中文'
+            else:
+                language = 'en' if lang.startswith('en') else '中文'
+
+            prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
+            judge_llm = LLMJudge(**judge_llm_config)
+            response = judge_llm(prompt)
+        except Exception as e:
+            logger.error(f'Error generating analysis: {e}')
+            response = 'N/A'
+
+        self.analysis = response
+        return response
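The new `ANALYSIS_PROMPT` is written in Chinese and asks a judge model to turn the JSON report into a four-part write-up (overall performance, key-metric analysis, improvement suggestions, conclusion), grouping metrics into low/mid/high-score markdown tables and answering in the language inferred from the locale. A hedged sketch of the new `Report` helpers follows; the file paths are placeholders and the exact keys accepted by `LLMJudge` are assumed, not documented here:

```python
from evalscope.report import Report

# Placeholder path to a report produced by a previous evaluation run.
report = Report.from_json('outputs/20250117_151926/reports/qwen2.5-7b-instruct/gsm8k.json')

# New in 0.16.2: write the report back out and flatten it with an OVERALL row.
report.to_json('outputs/report_copy/gsm8k.json')
df = report.to_dataframe(add_overall_metric=True)
print(df)

# Optional LLM-written analysis; the judge config keys below are assumptions.
analysis = report.generate_analysis({
    'model_id': 'qwen2.5-72b-instruct',
    'api_url': 'http://127.0.0.1:8801/v1',
    'api_key': 'EMPTY',
})
print(analysis)
```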
evalscope/run.py
CHANGED

@@ -43,6 +43,9 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     else:
         result = evaluate_model(task_cfg, outputs)

+    logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
+    logger.info(f'Output directory: {outputs.outputs_dir}')
+
     return result


@@ -109,6 +112,7 @@ def get_backend_manager_class(eval_backend: EvalBackend):
 def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
     from evalscope.models import get_local_model
+    from evalscope.report import gen_table

     # Initialize evaluator
     eval_results = {}

@@ -122,10 +126,18 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     task_cfg.dump_yaml(outputs.configs_dir)
     logger.info(task_cfg)

+    # Run evaluation for each evaluator
     for evaluator in evaluators:
         res_dict = evaluator.eval()
         eval_results[evaluator.dataset_name] = res_dict

+    # Make overall report
+    try:
+        report_table: str = gen_table(reports_path_list=[outputs.reports_dir], add_overall_metric=True)
+        logger.info(f'Overall report table: \n{report_table} \n')
+    except Exception:
+        logger.error('Failed to generate report table.')
+
     # Clean up
     if base_model is not None:
         import gc
evalscope/summarizer.py
CHANGED

@@ -30,7 +30,7 @@ class Summarizer:
             with open(report_file, 'r') as f:
                 res_list.append(json.load(f))

-        report_table: str = gen_table([reports_dir])
+        report_table: str = gen_table(reports_path_list=[reports_dir])
         logger.info(f'*** Report table ***\n{report_table}')

         return res_list
evalscope/utils/io_utils.py
CHANGED

@@ -1,3 +1,4 @@
+import csv
 import json
 import jsonlines as jsonl
 import os

@@ -112,8 +113,58 @@ def dump_jsonl_data(data_list, jsonl_file, dump_mode=DumpMode.OVERWRITE):
         writer.write_all(data_list)


-def jsonl_to_csv():
-
+def jsonl_to_csv(jsonl_file, csv_file):
+    """
+    Convert jsonl file to csv file.
+
+    Args:
+        jsonl_file: jsonl file path.
+        csv_file: csv file path.
+    """
+    data = jsonl_to_list(jsonl_file)
+    if not data:
+        logger.warning(f'No data found in {jsonl_file}.')
+        return
+
+    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
+        writer = csv.writer(f)
+        writer.writerow(data[0].keys())  # Write header
+        for item in data:
+            writer.writerow(item.values())
+
+
+def csv_to_list(csv_file) -> list:
+    """
+    Read csv file to list.
+
+    Args:
+        csv_file: csv file path.
+
+    Returns:
+        list: list of lines. Each line is a dict.
+    """
+    res_list = []
+    with open(csv_file, 'r', encoding='utf-8') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            res_list.append(row)
+    return res_list
+
+
+def csv_to_jsonl(csv_file, jsonl_file):
+    """
+    Convert csv file to jsonl file.
+
+    Args:
+        csv_file: csv file path.
+        jsonl_file: jsonl file path.
+    """
+    data = csv_to_list(csv_file)
+    if not data:
+        logger.warning(f'No data found in {csv_file}.')
+        return
+
+    dump_jsonl_data(data, jsonl_file, dump_mode=DumpMode.OVERWRITE)


 def yaml_to_dict(yaml_file) -> dict:

@@ -168,3 +219,9 @@ def dict_to_json(d: dict, json_file: str):
     """
     with open(json_file, 'w') as f:
         json.dump(d, f, indent=4, ensure_ascii=False)
+
+
+if __name__ == '__main__':
+    csv_file = 'custom_eval/text/mcq/example_val.csv'
+    jsonl_file = 'custom_eval/text/mcq/example_val.jsonl'
+    csv_to_jsonl(csv_file, jsonl_file)
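A quick round-trip with the new CSV helpers; the input paths are the sample ones from the `__main__` block above and may not exist in your checkout:

```python
from evalscope.utils.io_utils import csv_to_jsonl, csv_to_list, jsonl_to_csv

csv_file = 'custom_eval/text/mcq/example_val.csv'      # sample path from the diff
jsonl_file = 'custom_eval/text/mcq/example_val.jsonl'

rows = csv_to_list(csv_file)               # list of dicts, one per CSV row
csv_to_jsonl(csv_file, jsonl_file)         # CSV -> JSONL
jsonl_to_csv(jsonl_file, 'roundtrip.csv')  # and back again
print(f'{len(rows)} rows converted')
```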
evalscope/utils/logger.py
CHANGED

@@ -10,7 +10,7 @@ simple_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

 detailed_formatter = logging.Formatter(detailed_format)
 simple_formatter = logging.Formatter(simple_format)
-DEFAULT_LEVEL = logging.DEBUG if os.getenv('
+DEFAULT_LEVEL = logging.DEBUG if os.getenv('EVALSCOPE_LOG_LEVEL', 'INFO') == 'DEBUG' else logging.INFO

 logging.basicConfig(format=simple_format, level=DEFAULT_LEVEL, force=True)
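The default log level is now driven by the `EVALSCOPE_LOG_LEVEL` environment variable, which is read at import time, so it has to be set before evalscope is imported. A minimal sketch:

```python
import os

# Must be set before the first evalscope import, since DEFAULT_LEVEL is
# evaluated when evalscope.utils.logger is loaded.
os.environ['EVALSCOPE_LOG_LEVEL'] = 'DEBUG'

from evalscope.utils.logger import get_logger

logger = get_logger()
logger.debug('visible only when EVALSCOPE_LOG_LEVEL=DEBUG')
```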
evalscope/utils/utils.py
CHANGED

@@ -10,6 +10,7 @@ import os
 import random
 import re
 import torch
+from inspect import signature
 from typing import Any, Dict, List, Tuple, Union

 from evalscope.utils.logger import get_logger

@@ -313,6 +314,17 @@ def seed_everything(seed: int):
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False

+def get_supported_params(func):
+    """Get the supported parameters of a function."""
+    sig = signature(func)
+    return list(sig.parameters.keys())
+
+def parse_int_or_float(num):
+    number = float(num)
+    if number.is_integer():
+        return int(number)
+    return number
+
 if __name__ == '__main__':
     options = ['A', 'B', 'C', 'D']
     answers = ['Context .... ANSWER: A', 'answer: A']
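Hedged usage examples for the two helpers added above; `chat` is a dummy function defined only for the demonstration:

```python
from evalscope.utils.utils import get_supported_params, parse_int_or_float


def chat(model: str, prompt: str, temperature: float = 0.7):
    """Dummy function used only to inspect its signature."""


print(get_supported_params(chat))  # ['model', 'prompt', 'temperature']
print(parse_int_or_float('3'))     # 3    (int)
print(parse_int_or_float('3.5'))   # 3.5  (float)
```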
evalscope/version.py
CHANGED