evalscope 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +20 -25
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +2 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/data_adapter.py +97 -16
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +4 -1
- evalscope/benchmarks/tool_bench/utils.py +5 -4
- evalscope/benchmarks/utils.py +25 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +18 -6
- evalscope/config.py +8 -2
- evalscope/evaluator/evaluator.py +38 -27
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
- evalscope/metrics/llm_judge.py +12 -5
- evalscope/metrics/math_parser.py +1 -1
- evalscope/models/adapters/server_adapter.py +2 -6
- evalscope/perf/arguments.py +2 -2
- evalscope/perf/benchmark.py +0 -9
- evalscope/perf/main.py +7 -0
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +1 -1
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +60 -3
- evalscope/run.py +12 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/METADATA +13 -11
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/RECORD +61 -50
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -35
- tests/cli/test_collection.py +7 -6
- tests/cli/test_run.py +21 -11
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/evaluator/evaluator.py
CHANGED
@@ -13,7 +13,7 @@ from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
 from evalscope.models import BaseModelAdapter
-from evalscope.report import Report,
+from evalscope.report import Report, gen_report_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -46,7 +46,6 @@ class Evaluator(object):
         self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
-        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

         self.data_adapter = data_adapter
         self.model_adapter = model_adapter
@@ -79,8 +78,16 @@ class Evaluator(object):
         # Limit and index prompts
         limited_prompts = defaultdict(list)
         for subset_name, prompts_list in prompts.items():
-
-
+            # If limit is None, use all prompts
+            if self.task_cfg.limit is None:
+                limit = len(prompts_list)
+            else:
+                if isinstance(self.task_cfg.limit, int):
+                    limit = self.task_cfg.limit
+                elif isinstance(self.task_cfg.limit, float):
+                    limit = int(len(prompts_list) * self.task_cfg.limit)
+            # Limit the number of prompts
+            for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
                 prompt[AnswerKeys.INDEX] = index
                 limited_prompts[subset_name].append(prompt)

@@ -371,41 +378,45 @@ class Evaluator(object):

         return metric_score

-    def dump_report(self, reviews_score_all: List[dict]
+    def dump_report(self, reviews_score_all: List[dict]):
         """
         Get report for total reviews of specific dataset.
         It is required to rewrite this method to support your own evaluator.

         Args:
             reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
-            use_table: whether to generate table for reports. Default to True.

         Returns: None
         """
+        report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
+        os.makedirs(report_path, exist_ok=True)
         # Get report map
         report_map: Report = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all,
-            report_name=self.custom_task_name,
-            model_name=self.model_name,
-            dataset_name=self.dataset_name)
-
-        # Dump report
-        report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
-                                        self.dataset_name + '.json')
-        os.makedirs(os.path.dirname(report_path), exist_ok=True)
+            subset_score_map=reviews_score_all, model_name=self.model_name)

-        #
-
-        f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
-        logger.info(f'Dump report: {report_path} \n')
+        # Post process report
+        self.data_adapter.post_process_report(report_map, report_path=report_path)

         # Make table
-
-
-
-
-
-
+        try:
+            report_table = gen_report_table(report_map)
+            logger.info(f'{self.dataset_name_or_path} report table: \n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+        # Make report analysis
+        if self.task_cfg.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report_map.generate_analysis(self.task_cfg.judge_model_args)
+            logger.info('Report analysis:\n%s', analysis)
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
+        # Dump report
+        report_file = os.path.join(report_path, f'{self.dataset_name}.json')
+        report_map.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+
         return report_map

     def eval(self, **kwargs) -> dict:
@@ -431,7 +442,7 @@ class Evaluator(object):
             stage == 'review': return the reviews_map
         """

-        logger.info(f'
+        logger.info(f'Start evaluating on dataset {self.dataset_name_or_path}')

         reviews_score_all = {}  # {subset_name: (score, num)}
         stage_answers_dict = {}
@@ -461,6 +472,6 @@ class Evaluator(object):
         # Generate report
         report_map = self.dump_report(reviews_score_all)

-        logger.info(f'
+        logger.info(f'Evaluation finished on {self.dataset_name_or_path}')

         return report_map
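The reworked prompt limiting above now accepts either an int (absolute count per subset) or a float (fraction of a subset). A minimal standalone sketch of the same arithmetic, using made-up prompt counts; `resolve_limit` is a hypothetical helper written here only to illustrate the hunk, not part of evalscope:

from typing import Optional, Union

def resolve_limit(limit_cfg: Optional[Union[int, float]], n_prompts: int) -> int:
    # Mirrors the branch added to Evaluator above.
    if limit_cfg is None:
        return n_prompts                      # use all prompts
    if isinstance(limit_cfg, int):
        return min(limit_cfg, n_prompts)      # absolute cap per subset
    if isinstance(limit_cfg, float):
        return int(n_prompts * limit_cfg)     # fraction of the subset
    raise TypeError('limit must be None, int or float')

print(resolve_limit(None, 200))   # 200
print(resolve_limit(50, 200))     # 50
print(resolve_limit(0.1, 200))    # 20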
evalscope/metrics/__init__.py
CHANGED
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                           weighted_mean)
     from .named_metrics import Metric, metric_registry
-    from .rouge_metric import compute_rouge_score_one_sample_zh
+    from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh

 else:
     _import_structure = {
@@ -28,6 +28,8 @@ else:
         ],
         'rouge_metric': [
             'compute_rouge_score_one_sample_zh',
+            'compute_rouge_score',
+            'compute_rouge_score_one_sample',
         ],
         'llm_judge': [
             'LLMJudge',
evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED
@@ -88,11 +88,11 @@ class RougeScorer(scoring.BaseScorer):
    """

    def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
-        check_nltk_data()
        self.rouge_types = rouge_types
        if tokenizer:
            self._tokenizer = tokenizer
        else:
+            check_nltk_data()
            self._tokenizer = tokenizers.DefaultTokenizer(use_stemmer)
            logging.info('Using default tokenizer.')

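With the expanded `_import_structure` above, the extra ROUGE helpers now resolve lazily from the package root. A quick import check; the call signatures are not shown in this diff, so none are invoked here:

# These names are re-exported by evalscope.metrics as of 0.16.1
# (per the _import_structure change above).
from evalscope.metrics import (
    compute_rouge_score,
    compute_rouge_score_one_sample,
    compute_rouge_score_one_sample_zh,
)

print(compute_rouge_score_one_sample)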
evalscope/metrics/llm_judge.py
CHANGED
@@ -22,6 +22,9 @@ B: INCORRECT
 Just return the letters "A" or "B", with no text around it.
 """  # noqa: E501

+DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
+DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
+

 class LLMJudge:
     """
@@ -47,12 +50,12 @@ class LLMJudge:
             prompt_template (str, optional): Prompt template for the judge
             generation_config (dict, optional): Generation configuration for the judge
         """
-        self.api_key = api_key or os.environ.get('
-        self.api_url = api_url or os.environ.get('
-        self.model_id = model_id or os.environ.get('
+        self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
+        self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
+        self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
-        self.generation_config = generation_config
+        self.generation_config = generation_config or {}

         from evalscope.models import ServerModelAdapter

@@ -74,6 +77,10 @@ class LLMJudge:
         if self.generation_config:
             infer_cfg.update(self.generation_config)

+        if self.model_id == DEFAULT_JUDGE_MODEL:
+            # Disable thinking for the default judge model
+            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
+
         try:
             # Send request using ServerModelAdapter
             response = self.server_adapter.process_single_input(input_data, infer_cfg)
@@ -82,7 +89,7 @@ class LLMJudge:
             llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
             return llm_response
         except Exception as e:
-            logger.error(f'Error during LLM evaluation: {e}')
+            logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
             return ''

     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
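A hedged sketch of how the new defaults resolve: constructed with no arguments, LLMJudge now falls back to the ModelScope inference endpoint and Qwen/Qwen3-235B-A22B, reading the environment variables named in the hunk above. The token value below is a placeholder:

import os

# Placeholder credential; MODELSCOPE_SDK_TOKEN is read by LLMJudge.__init__ above.
os.environ['MODELSCOPE_SDK_TOKEN'] = '<your-modelscope-token>'

from evalscope.metrics import LLMJudge

judge = LLMJudge()  # model_id -> 'Qwen/Qwen3-235B-A22B', api_url -> ModelScope inference API
prompt = judge.build_prompt(pred='4', gold='4', question='What is 2 + 2?')
verdict = judge(prompt)
print(verdict)  # expected: 'A' (CORRECT) or 'B' (INCORRECT) per DEFAULT_PROMPT_TEMPLATE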
evalscope/metrics/math_parser.py
CHANGED
@@ -4,7 +4,7 @@ The logic in this file largely borrows from Qwen2.5-Math codebase at https://git
 # flake8: noqa
 import re
 import regex
-from
+from latex2sympy2_extended import latex2sympy
 from math import isclose
 from sympy import N, simplify
 from sympy.parsing.latex import parse_latex
evalscope/models/adapters/server_adapter.py
CHANGED
@@ -1,11 +1,11 @@
 import openai
 from collections import defaultdict
-from inspect import signature
 from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union

 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import get_supported_params
 from .base_adapter import BaseModelAdapter

 logger = get_logger()
@@ -31,7 +31,7 @@ class ServerModelAdapter(BaseModelAdapter):
             api_key=api_key,
             base_url=self.api_url,
         )
-        self.supported_params = self.
+        self.supported_params = get_supported_params(self.client.chat.completions.create)

         self.seed = kwargs.get('seed', None)
         self.timeout = kwargs.get('timeout', 60)
@@ -39,10 +39,6 @@ class ServerModelAdapter(BaseModelAdapter):
         self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
         super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

-    def _get_supported_params(self):
-        sig = signature(self.client.chat.completions.create)
-        return list(sig.parameters.keys())
-
     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
         """
         Model prediction func.
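The helper that replaces the removed `_get_supported_params` is not shown in this diff (it moved into evalscope/utils/utils.py, listed above with +12 lines). A sketch of what it presumably does, based on the deleted method body:

from inspect import signature
from typing import Callable, List

def get_supported_params(func: Callable) -> List[str]:
    # Mirror of the removed ServerModelAdapter._get_supported_params: list the
    # parameter names accepted by `func` (e.g. client.chat.completions.create)
    # so unsupported generation kwargs can be filtered out before the request.
    return list(signature(func).parameters.keys())

# Quick self-check against a plain function.
def example(model: str, temperature: float = 0.0, stream: bool = False):
    ...

print(get_supported_params(example))  # ['model', 'temperature', 'stream']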
evalscope/perf/arguments.py
CHANGED
@@ -60,8 +60,8 @@ class Arguments:
     min_tokens: Optional[int] = None  # Minimum number of tokens in the response
     n_choices: Optional[int] = None  # Number of response choices
     seed: Optional[int] = 0  # Random seed for reproducibility
-    stop: Optional[List[str]] =
-    stop_token_ids: Optional[List[str]] =
+    stop: Optional[List[str]] = None  # Stop sequences for the response
+    stop_token_ids: Optional[List[str]] = None  # Stop token IDs for the response
     stream: Optional[bool] = True  # Whether to stream the response
     temperature: float = 0.0  # Temperature setting for the response
     top_p: Optional[float] = None  # Top-p (nucleus) sampling setting for the response
evalscope/perf/benchmark.py
CHANGED
@@ -1,11 +1,8 @@
 import asyncio
-import copy
 import json
 import numpy as np
-import os
 import platform
 import sqlite3
-import threading
 import time
 from http import HTTPStatus
 from tqdm import tqdm
@@ -17,7 +14,6 @@ from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
 from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
 from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
 from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
-from evalscope.perf.utils.local_server import start_app
 from evalscope.utils.logger import get_logger

 logger = get_logger()
@@ -164,11 +160,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:

 @exception_handler
 async def connect_test(args: Arguments) -> bool:
-    if args.api.startswith('local'):
-        # start local server
-        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
-        server.start()
-
     if (not args.no_test_connection) and (not await test_connection(args)):
         raise TimeoutError('Test connection failed')

evalscope/perf/main.py
CHANGED
@@ -2,9 +2,11 @@ import asyncio
 import copy
 import os
 import platform
+import threading
 import time
 from argparse import Namespace

+from evalscope.perf.utils.local_server import start_app
 from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
 from evalscope.utils.logger import configure_logging, get_logger
 from evalscope.utils.utils import seed_everything
@@ -82,6 +84,11 @@ def run_perf_benchmark(args):
     if args.swanlab_api_key:
         init_swanlab(args)

+    # Initialize local server if needed
+    if args.api.startswith('local'):
+        # start local server
+        server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
+        server.start()
     # Start benchmark
     if len(args.number) == 1:
         return run_one_benchmark(args, output_path=output_path)
evalscope/perf/plugin/datasets/custom.py
CHANGED
@@ -22,3 +22,18 @@ class CustomDatasetPlugin(DatasetPluginBase):
                 yield [{'role': 'user', 'content': prompt}]
             else:
                 yield prompt
+
+
+if __name__ == '__main__':
+    from evalscope.perf.arguments import Arguments
+    from evalscope.perf.main import run_perf_benchmark
+
+    args = Arguments(
+        model='qwen2.5-7b-instruct',
+        url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+        dataset_path='outputs/perf_data.txt',
+        api_key='EMPTY',
+        dataset='custom',
+    )
+
+    run_perf_benchmark(args)
evalscope/perf/utils/benchmark_util.py
CHANGED
@@ -38,7 +38,7 @@ class BenchmarkData:
             self.first_chunk_latency = self.query_latency
             self.n_chunks = 1
             self.n_chunks_time = self.query_latency
-            self.time_per_output_token = self.n_chunks_time / self.
+            self.time_per_output_token = self.n_chunks_time / self.n_chunks

     def _calculate_tokens(self, api_plugin):
         self.prompt_tokens, self.completion_tokens = \
evalscope/perf/utils/local_server.py
CHANGED
@@ -96,6 +96,7 @@ def create_app(model, attn_implementation=None) -> FastAPI:


 def start_app(args: Arguments):
+    logger.info('Starting local server, please wait...')
     if args.api == 'local':
         app = create_app(args.model, args.attn_implementation)
         uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
evalscope/perf/utils/log_utils.py
CHANGED
@@ -34,8 +34,15 @@ def init_swanlab(args: Arguments) -> None:
     current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
     name = args.name if args.name else f'{args.model_id}_{current_time}'
     swanlab.config.update({'framework': '📏evalscope'})
-
-        project
-        name
-        config
-        mode
+    init_kwargs = {
+        'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
+        'name': name,
+        'config': args.to_dict(),
+        'mode': 'local' if args.swanlab_api_key == 'local' else None
+    }
+
+    workspace = os.getenv('SWANLAB_WORKSPACE')
+    if workspace:
+        init_kwargs['workspace'] = workspace
+
+    swanlab.init(**init_kwargs)
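The rewritten init_swanlab also honours two optional environment variables. A small, hedged example of setting them before launching a perf run; the values are placeholders:

import os

# Read via os.getenv in init_swanlab above; both are optional.
os.environ['SWANLAB_PROJ_NAME'] = 'perf_benchmark'    # project name (default shown in the hunk)
os.environ['SWANLAB_WORKSPACE'] = '<your-workspace>'  # forwarded as swanlab.init(workspace=...)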
evalscope/perf/utils/rich_display.py
CHANGED
@@ -92,7 +92,7 @@ def print_summary(all_results, model_name):
     basic_info.add_row('Model', model_name)
     basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
     basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
-    basic_info.add_row('Avg Output Rate', f'{total_tokens/total_time:.2f} tokens/sec')
+    basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')

     console.print('\nBasic Information:')
     console.print(basic_info)
evalscope/report/__init__.py
CHANGED
@@ -1,6 +1,38 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING

-from evalscope.
-
-
-from
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .combinator import gen_report_table, gen_table, get_data_frame, get_report_list
+    from .generator import ReportGenerator
+    from .utils import Category, Report, ReportKey, Subset
+
+else:
+    _import_structure = {
+        'combinator': [
+            'gen_table',
+            'get_data_frame',
+            'get_report_list',
+            'gen_report_table',
+        ],
+        'generator': [
+            'ReportGenerator',
+        ],
+        'utils': [
+            'Category',
+            'Report',
+            'ReportKey',
+            'Subset',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
evalscope/report/combinator.py
CHANGED
@@ -48,6 +48,14 @@ def gen_table(reports_path_list: list) -> str:
     return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)


+def gen_report_table(report: Report) -> str:
+    """
+    Generate a report table for a single report.
+    """
+    table = report.to_dataframe(flatten_metrics=True, flatten_categories=True)
+    return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
 class ReportsRecorder:
     COMMON_DATASET_PATH = []
     CUSTOM_DATASET_PATH = []
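A hedged usage sketch for the new helper, mirroring how Evaluator.dump_report calls it above; the JSON path is illustrative, and Report.from_json comes from evalscope/report/utils.py further down:

from evalscope.report import Report, gen_report_table

# Load a per-dataset report dumped by Evaluator.dump_report (illustrative path).
report = Report.from_json('outputs/20250101_000000/reports/my-model/gsm8k.json')
print(gen_report_table(report))  # grid-formatted table, one row per subset/metric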
evalscope/report/generator.py
CHANGED
@@ -1,24 +1,42 @@
 import pandas as pd
 from pandas import DataFrame
+from typing import TYPE_CHECKING

 from evalscope.constants import DataCollection
 from evalscope.report.utils import *

+if TYPE_CHECKING:
+    from evalscope.benchmarks import DataAdapter
+

 class ReportGenerator:

     @staticmethod
-    def gen_report(subset_score_map: dict,
+    def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
         """
-        Generate report for specific dataset.
-
-
-
+        Generate a report for a specific dataset based on provided subset scores.
+
+        Args:
+            subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
+                {
+                    'subset_name': [
+                        {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+                        {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+                    ],
+                    ...
+                }
+            report_name (str): The name of the report to generate.
+            data_adapter (DataAdapter): An adapter object for data handling.
+
+        Returns:
+            Report: A structured report object containing metrics, categories, and subsets.
+
+        >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
         """  # noqa: E501

-        dataset_name =
-
-
+        dataset_name = data_adapter.name
+        category_map = data_adapter.category_map
+        report_name = f'{model_name}@{dataset_name}'

         def flatten_subset() -> DataFrame:
             """
@@ -59,7 +77,13 @@ class ReportGenerator:

             metrics_list.append(Metric(name=metric_name, categories=categories))

-        report = Report(
+        report = Report(
+            name=report_name,
+            metrics=metrics_list,
+            dataset_name=dataset_name,
+            model_name=model_name,
+            dataset_description=data_adapter.description,
+            dataset_pretty_name=data_adapter.pretty_name)
         return report

     @staticmethod
evalscope/report/utils.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+import os
 import pandas as pd
 from collections import defaultdict
 from dataclasses import asdict, dataclass, field
@@ -6,6 +7,9 @@ from typing import Any, Dict, List

 from evalscope.metrics import macro_mean, micro_mean
 from evalscope.utils import normalize_score
+from evalscope.utils.logger import get_logger
+
+logger = get_logger()


 @dataclass
@@ -70,13 +74,28 @@ class ReportKey:
     score = 'Score'


+ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果，输出分析报告，要求如下：
+1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
+2. 若模型有多种指标，将其分为低分、中分、高分三个部分，并列出markdown表格
+3. 只列出报告本身，不要有其他多余内容
+4. 输出报告语言为{language}
+
+```json
+{report_str}
+```
+"""
+
+
 @dataclass
 class Report:
     name: str = 'default_report'
     dataset_name: str = 'default_dataset'
+    dataset_pretty_name: str = ''
+    dataset_description: str = ''
     model_name: str = 'default_model'
     score: float = 0.0
     metrics: List[Metric] = field(default_factory=list)
+    analysis: str = 'N/A'

     def __post_init__(self):
         self.score = self.metrics[0].score  # NOTE: only use the first metric by default
@@ -84,15 +103,29 @@ class Report:
     def to_dict(self) -> Dict[str, Any]:
         return asdict(self)

+    def to_json_str(self) -> str:
+        return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)
+
+    def to_json(self, json_file: str):
+        # ensure the directory exists
+        os.makedirs(os.path.dirname(json_file), exist_ok=True)
+        # write the report to a json file
+        with open(json_file, 'w', encoding='utf-8') as f:
+            json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)
+
     @classmethod
     def from_dict(cls, data: dict):
         metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
         return cls(
             name=data['name'],
+            dataset_name=data['dataset_name'],
+            dataset_pretty_name=data.get('dataset_pretty_name'),
+            dataset_description=data.get('dataset_description'),
             score=data['score'],
+            model_name=data['model_name'],
             metrics=metrics,
-
-
+            analysis=data.get('analysis', 'N/A'),
+        )

     @classmethod
     def from_json(cls, json_file: str):
@@ -111,7 +144,7 @@ class Report:
                 table[ReportKey.category_name].append(category.name)
                 table[ReportKey.subset_name].append(subset.name)
                 table[ReportKey.num].append(subset.num)
-                table[ReportKey.score].append(subset.score)
+                table[ReportKey.score].append(subset.score)
                 # NOTE: only flatten metrics if needed, use the first metric by default
                 if not flatten_metrics:
                     break
@@ -131,3 +164,27 @@ class Report:

         df_categories.drop(columns=[ReportKey.category_name], inplace=True)
         return df_categories
+
+    def generate_analysis(self, judge_llm_config: dict) -> str:
+        import locale
+
+        from evalscope.metrics import LLMJudge
+
+        try:
+            # get the default locale
+            lang, _ = locale.getlocale()
+
+            if lang is None:
+                language = '中文'
+            else:
+                language = 'en' if lang.startswith('en') else '中文'
+
+            prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
+            judge_llm = LLMJudge(**judge_llm_config)
+            response = judge_llm(prompt)
+        except Exception as e:
+            logger.error(f'Error generating analysis: {e}')
+            response = 'N/A'
+
+        self.analysis = response
+        return response
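A hedged sketch tying the new Report pieces together: to_json creates parent directories itself, and generate_analysis feeds the serialized report to an LLMJudge built from the given config dict. The config keys shown are LLMJudge constructor arguments; the paths and values are placeholders:

from evalscope.report import Report

report = Report.from_json('outputs/reports/my-model/gsm8k.json')  # illustrative path

# Re-dump elsewhere; os.makedirs(..., exist_ok=True) is handled inside to_json.
report.to_json('backup/reports/my-model/gsm8k.json')

# Ask a judge model to write the analysis section (also stored on report.analysis).
analysis = report.generate_analysis({
    'model_id': 'Qwen/Qwen3-235B-A22B',      # any LLMJudge kwarg may be passed here
    'api_key': '<your-modelscope-token>',
})
print(analysis)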
evalscope/run.py
CHANGED
@@ -43,6 +43,9 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
     else:
         result = evaluate_model(task_cfg, outputs)

+    logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
+    logger.info(f'Output directory: {outputs.outputs_dir}')
+
     return result


@@ -109,6 +112,7 @@ def get_backend_manager_class(eval_backend: EvalBackend):
 def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     """Evaluate the model based on the provided task configuration."""
     from evalscope.models import get_local_model
+    from evalscope.report import gen_table

     # Initialize evaluator
     eval_results = {}
@@ -122,10 +126,18 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
     task_cfg.dump_yaml(outputs.configs_dir)
     logger.info(task_cfg)

+    # Run evaluation for each evaluator
     for evaluator in evaluators:
         res_dict = evaluator.eval()
         eval_results[evaluator.dataset_name] = res_dict

+    # Make overall report
+    try:
+        report_table: str = gen_table([outputs.reports_dir])
+        logger.info(f'Overall report table: \n{report_table} \n')
+    except Exception:
+        logger.error('Failed to generate report table.')
+
     # Clean up
     if base_model is not None:
         import gc