evalscope 0.16.0__py3-none-any.whl → 0.16.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (61)
  1. evalscope/app/__init__.py +28 -0
  2. evalscope/{report → app}/app.py +20 -25
  3. evalscope/app/constants.py +21 -0
  4. evalscope/arguments.py +2 -1
  5. evalscope/backend/opencompass/backend_manager.py +2 -1
  6. evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
  7. evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
  8. evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
  9. evalscope/backend/rag_eval/utils/embedding.py +75 -35
  10. evalscope/benchmarks/benchmark.py +1 -0
  11. evalscope/benchmarks/data_adapter.py +97 -16
  12. evalscope/benchmarks/docmath/__init__.py +0 -0
  13. evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
  14. evalscope/benchmarks/docmath/utils.py +220 -0
  15. evalscope/benchmarks/frames/__init__.py +0 -0
  16. evalscope/benchmarks/frames/frames_adapter.py +90 -0
  17. evalscope/benchmarks/frames/utils.py +37 -0
  18. evalscope/benchmarks/needle_haystack/__init__.py +0 -0
  19. evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
  20. evalscope/benchmarks/needle_haystack/utils.py +79 -0
  21. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +4 -1
  22. evalscope/benchmarks/tool_bench/utils.py +5 -4
  23. evalscope/benchmarks/utils.py +25 -0
  24. evalscope/cli/start_app.py +2 -2
  25. evalscope/collections/__init__.py +35 -3
  26. evalscope/collections/evaluator.py +18 -6
  27. evalscope/config.py +8 -2
  28. evalscope/evaluator/evaluator.py +38 -27
  29. evalscope/metrics/__init__.py +3 -1
  30. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  31. evalscope/metrics/llm_judge.py +12 -5
  32. evalscope/metrics/math_parser.py +1 -1
  33. evalscope/models/adapters/server_adapter.py +2 -6
  34. evalscope/perf/arguments.py +2 -2
  35. evalscope/perf/benchmark.py +0 -9
  36. evalscope/perf/main.py +7 -0
  37. evalscope/perf/plugin/datasets/custom.py +15 -0
  38. evalscope/perf/utils/benchmark_util.py +1 -1
  39. evalscope/perf/utils/local_server.py +1 -0
  40. evalscope/perf/utils/log_utils.py +12 -5
  41. evalscope/perf/utils/rich_display.py +1 -1
  42. evalscope/report/__init__.py +36 -4
  43. evalscope/report/combinator.py +8 -0
  44. evalscope/report/generator.py +33 -9
  45. evalscope/report/utils.py +60 -3
  46. evalscope/run.py +12 -0
  47. evalscope/utils/logger.py +1 -1
  48. evalscope/utils/utils.py +12 -0
  49. evalscope/version.py +2 -2
  50. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/METADATA +13 -11
  51. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/RECORD +61 -50
  52. tests/aigc/test_t2i.py +40 -3
  53. tests/cli/test_all.py +39 -35
  54. tests/cli/test_collection.py +7 -6
  55. tests/cli/test_run.py +21 -11
  56. tests/rag/test_mteb.py +5 -5
  57. /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
  58. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
  59. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
  60. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
  61. {evalscope-0.16.0.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/evaluator/evaluator.py CHANGED
@@ -13,7 +13,7 @@ from evalscope.benchmarks import DataAdapter
  from evalscope.config import TaskConfig
  from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
  from evalscope.models import BaseModelAdapter
- from evalscope.report import Report, gen_table
+ from evalscope.report import Report, gen_report_table
  from evalscope.utils import dict_torch_dtype_to_str, gen_hash
  from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
  from evalscope.utils.logger import get_logger
@@ -46,7 +46,6 @@ class Evaluator(object):
  self.dataset_name = data_adapter.name
  self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
  self.model_name = task_cfg.model_id
- self.custom_task_name = f'{self.model_name}_{self.dataset_name}'

  self.data_adapter = data_adapter
  self.model_adapter = model_adapter
@@ -79,8 +78,16 @@ class Evaluator(object):
  # Limit and index prompts
  limited_prompts = defaultdict(list)
  for subset_name, prompts_list in prompts.items():
- limit = self.task_cfg.limit or len(prompts_list)
- for index, prompt in enumerate(prompts_list[:limit]):
+ # If limit is None, use all prompts
+ if self.task_cfg.limit is None:
+ limit = len(prompts_list)
+ else:
+ if isinstance(self.task_cfg.limit, int):
+ limit = self.task_cfg.limit
+ elif isinstance(self.task_cfg.limit, float):
+ limit = int(len(prompts_list) * self.task_cfg.limit)
+ # Limit the number of prompts
+ for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
  prompt[AnswerKeys.INDEX] = index
  limited_prompts[subset_name].append(prompt)

@@ -371,41 +378,45 @@ class Evaluator(object):

  return metric_score

- def dump_report(self, reviews_score_all: List[dict], use_table: bool = True):
+ def dump_report(self, reviews_score_all: List[dict]):
  """
  Get report for total reviews of specific dataset.
  It is required to rewrite this method to support your own evaluator.

  Args:
  reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
- use_table: whether to generate table for reports. Default to True.

  Returns: None
  """
+ report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
+ os.makedirs(report_path, exist_ok=True)
  # Get report map
  report_map: Report = self.data_adapter.gen_report(
- subset_score_map=reviews_score_all,
- report_name=self.custom_task_name,
- model_name=self.model_name,
- dataset_name=self.dataset_name)
-
- # Dump report
- report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
- self.dataset_name + '.json')
- os.makedirs(os.path.dirname(report_path), exist_ok=True)
+ subset_score_map=reviews_score_all, model_name=self.model_name)

- # Write report
- with open(report_path, 'w', encoding='utf-8') as f:
- f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
- logger.info(f'Dump report: {report_path} \n')
+ # Post process report
+ self.data_adapter.post_process_report(report_map, report_path=report_path)

  # Make table
- if use_table:
- try:
- report_table: str = gen_table([self.outputs_structure.reports_dir])
- logger.info(f'Report table: \n{report_table} \n')
- except Exception:
- logger.error('Failed to generate report table.')
+ try:
+ report_table = gen_report_table(report_map)
+ logger.info(f'{self.dataset_name_or_path} report table: \n{report_table} \n')
+ except Exception:
+ logger.error('Failed to generate report table.')
+
+ # Make report analysis
+ if self.task_cfg.analysis_report:
+ logger.info('Generating report analysis, please wait ...')
+ analysis = report_map.generate_analysis(self.task_cfg.judge_model_args)
+ logger.info('Report analysis:\n%s', analysis)
+ else:
+ logger.info('Skipping report analysis (`analysis_report=False`).')
+
+ # Dump report
+ report_file = os.path.join(report_path, f'{self.dataset_name}.json')
+ report_map.to_json(report_file)
+ logger.info(f'Dump report to: {report_file} \n')
+
  return report_map

  def eval(self, **kwargs) -> dict:
@@ -431,7 +442,7 @@ class Evaluator(object):
  stage == 'review': return the reviews_map
  """

- logger.info(f'**** Start evaluating on dataset {self.dataset_name_or_path} ****')
+ logger.info(f'Start evaluating on dataset {self.dataset_name_or_path}')

  reviews_score_all = {} # {subset_name: (score, num)}
  stage_answers_dict = {}
@@ -461,6 +472,6 @@
  # Generate report
  report_map = self.dump_report(reviews_score_all)

- logger.info(f'**** Evaluation finished on {self.dataset_name_or_path} ****\n')
+ logger.info(f'Evaluation finished on {self.dataset_name_or_path}')

  return report_map
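
The limit handling in evaluator.py (the `@@ -79,8 +78,16` hunk above) now accepts either an int (an absolute cap) or a float (a fraction of the subset), with `None` meaning all prompts. A standalone sketch of that selection logic, written outside the Evaluator class purely for illustration:

```python
from typing import List, Optional, Union


def apply_limit(prompts: List[dict], limit: Optional[Union[int, float]]) -> List[dict]:
    """Mirror the new limit semantics: None -> all, int -> count, float -> fraction."""
    if limit is None:
        n = len(prompts)
    elif isinstance(limit, float):
        n = int(len(prompts) * limit)
    else:
        n = limit
    return prompts[:min(n, len(prompts))]


samples = [{'id': i} for i in range(100)]
assert len(apply_limit(samples, None)) == 100
assert len(apply_limit(samples, 10)) == 10
assert len(apply_limit(samples, 0.25)) == 25
```
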
evalscope/metrics/__init__.py CHANGED
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
  from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
  weighted_mean)
  from .named_metrics import Metric, metric_registry
- from .rouge_metric import compute_rouge_score_one_sample_zh
+ from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh

  else:
  _import_structure = {
@@ -28,6 +28,8 @@ else:
  ],
  'rouge_metric': [
  'compute_rouge_score_one_sample_zh',
+ 'compute_rouge_score',
+ 'compute_rouge_score_one_sample',
  ],
  'llm_judge': [
  'LLMJudge',
evalscope/metrics/bundled_rouge_score/rouge_scorer.py CHANGED
@@ -88,11 +88,11 @@ class RougeScorer(scoring.BaseScorer):
  """

  def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
- check_nltk_data()
  self.rouge_types = rouge_types
  if tokenizer:
  self._tokenizer = tokenizer
  else:
+ check_nltk_data()
  self._tokenizer = tokenizers.DefaultTokenizer(use_stemmer)
  logging.info('Using default tokenizer.')

evalscope/metrics/llm_judge.py CHANGED
@@ -22,6 +22,9 @@ B: INCORRECT
  Just return the letters "A" or "B", with no text around it.
  """ # noqa: E501

+ DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
+ DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
+

  class LLMJudge:
  """
@@ -47,12 +50,12 @@ class LLMJudge:
  prompt_template (str, optional): Prompt template for the judge
  generation_config (dict, optional): Generation configuration for the judge
  """
- self.api_key = api_key or os.environ.get('OPENAI_API_KEY', 'EMPTY')
- self.api_url = api_url or os.environ.get('OPENAI_API_BASE', 'https://api.openai.com/v1')
- self.model_id = model_id or os.environ.get('LOCAL_LLM', 'gpt-4')
+ self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
+ self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
+ self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
  self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
  self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
- self.generation_config = generation_config
+ self.generation_config = generation_config or {}

  from evalscope.models import ServerModelAdapter

@@ -74,6 +77,10 @@ class LLMJudge:
  if self.generation_config:
  infer_cfg.update(self.generation_config)

+ if self.model_id == DEFAULT_JUDGE_MODEL:
+ # Disable thinking for the default judge model
+ infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
+
  try:
  # Send request using ServerModelAdapter
  response = self.server_adapter.process_single_input(input_data, infer_cfg)
@@ -82,7 +89,7 @@ class LLMJudge:
  llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
  return llm_response
  except Exception as e:
- logger.error(f'Error during LLM evaluation: {e}')
+ logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
  return ''

  def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
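
With 0.16.1 the judge defaults move from OpenAI-style environment variables to ModelScope ones. A minimal sketch of configuring the judge through the new variables shown in this hunk; the token value is a placeholder, and the judge endpoint is only contacted once `__call__` is invoked:

```python
import os

# Placeholders; MODELSCOPE_SDK_TOKEN must be a real ModelScope token in practice.
os.environ['MODELSCOPE_SDK_TOKEN'] = '<your-modelscope-token>'
os.environ['MODELSCOPE_API_BASE'] = 'https://api-inference.modelscope.cn/v1/'
os.environ['MODELSCOPE_JUDGE_LLM'] = 'Qwen/Qwen3-235B-A22B'

from evalscope.metrics import LLMJudge

judge = LLMJudge()  # falls back to the env vars / defaults above
print(judge.build_prompt(pred='4', gold='4', question='What is 2 + 2?'))
```
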
evalscope/metrics/math_parser.py CHANGED
@@ -4,7 +4,7 @@ The logic in this file largely borrows from Qwen2.5-Math codebase at https://git
  # flake8: noqa
  import re
  import regex
- from latex2sympy2 import latex2sympy
+ from latex2sympy2_extended import latex2sympy
  from math import isclose
  from sympy import N, simplify
  from sympy.parsing.latex import parse_latex
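
The math parser now imports `latex2sympy` from the `latex2sympy2_extended` fork instead of `latex2sympy2`. A quick sanity-check sketch, assuming `latex2sympy2_extended` is installed and keeps the same `latex2sympy` entry point (as this hunk implies):

```python
from latex2sympy2_extended import latex2sympy
from sympy import simplify

# Parse a small LaTeX expression and simplify it with sympy.
expr = latex2sympy(r'\frac{1}{2} + \frac{1}{3}')
print(expr, simplify(expr))  # expect an expression equivalent to 5/6
```
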
evalscope/models/adapters/server_adapter.py CHANGED
@@ -1,11 +1,11 @@
  import openai
  from collections import defaultdict
- from inspect import signature
  from openai.types.chat import ChatCompletion, ChatCompletionChunk
  from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
  from typing import List, Optional, Union

  from evalscope.utils.logger import get_logger
+ from evalscope.utils.utils import get_supported_params
  from .base_adapter import BaseModelAdapter

  logger = get_logger()
@@ -31,7 +31,7 @@ class ServerModelAdapter(BaseModelAdapter):
  api_key=api_key,
  base_url=self.api_url,
  )
- self.supported_params = self._get_supported_params()
+ self.supported_params = get_supported_params(self.client.chat.completions.create)

  self.seed = kwargs.get('seed', None)
  self.timeout = kwargs.get('timeout', 60)
@@ -39,10 +39,6 @@ class ServerModelAdapter(BaseModelAdapter):
  self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
  super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

- def _get_supported_params(self):
- sig = signature(self.client.chat.completions.create)
- return list(sig.parameters.keys())
-
  def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
  """
  Model prediction func.
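
The signature-inspection helper moves out of the adapter into evalscope/utils/utils.py (+12 lines in this release). The new helper's body is not shown in the diff; based on the removed `_get_supported_params`, it presumably looks roughly like this sketch:

```python
from inspect import signature
from typing import Callable, List


def get_supported_params(func: Callable) -> List[str]:
    """Return the parameter names accepted by `func` (sketch of the relocated helper)."""
    return list(signature(func).parameters.keys())
```
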
evalscope/perf/arguments.py CHANGED
@@ -60,8 +60,8 @@ class Arguments:
  min_tokens: Optional[int] = None # Minimum number of tokens in the response
  n_choices: Optional[int] = None # Number of response choices
  seed: Optional[int] = 0 # Random seed for reproducibility
- stop: Optional[List[str]] = field(default_factory=list) # Stop sequences for the response
- stop_token_ids: Optional[List[str]] = field(default_factory=list) # Stop token IDs for the response
+ stop: Optional[List[str]] = None # Stop sequences for the response
+ stop_token_ids: Optional[List[str]] = None # Stop token IDs for the response
  stream: Optional[bool] = True # Whether to stream the response
  temperature: float = 0.0 # Temperature setting for the response
  top_p: Optional[float] = None # Top-p (nucleus) sampling setting for the response
evalscope/perf/benchmark.py CHANGED
@@ -1,11 +1,8 @@
  import asyncio
- import copy
  import json
  import numpy as np
- import os
  import platform
  import sqlite3
- import threading
  import time
  from http import HTTPStatus
  from tqdm import tqdm
@@ -17,7 +14,6 @@ from evalscope.perf.plugin.registry import ApiRegistry, DatasetRegistry
  from evalscope.perf.utils.benchmark_util import BenchmarkData, BenchmarkMetrics
  from evalscope.perf.utils.db_util import create_result_table, get_result_db_path, insert_benchmark_data, summary_result
  from evalscope.perf.utils.handler import add_signal_handlers, exception_handler
- from evalscope.perf.utils.local_server import start_app
  from evalscope.utils.logger import get_logger

  logger = get_logger()
@@ -164,11 +160,6 @@ async def statistic_benchmark_metric(benchmark_data_queue: asyncio.Queue, args:

  @exception_handler
  async def connect_test(args: Arguments) -> bool:
- if args.api.startswith('local'):
- # start local server
- server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
- server.start()
-
  if (not args.no_test_connection) and (not await test_connection(args)):
  raise TimeoutError('Test connection failed')

evalscope/perf/main.py CHANGED
@@ -2,9 +2,11 @@ import asyncio
  import copy
  import os
  import platform
+ import threading
  import time
  from argparse import Namespace

+ from evalscope.perf.utils.local_server import start_app
  from evalscope.perf.utils.log_utils import init_swanlab, init_wandb
  from evalscope.utils.logger import configure_logging, get_logger
  from evalscope.utils.utils import seed_everything
@@ -82,6 +84,11 @@ def run_perf_benchmark(args):
  if args.swanlab_api_key:
  init_swanlab(args)

+ # Initialize local server if needed
+ if args.api.startswith('local'):
+ # start local server
+ server = threading.Thread(target=start_app, args=(copy.deepcopy(args), ), daemon=True)
+ server.start()
  # Start benchmark
  if len(args.number) == 1:
  return run_one_benchmark(args, output_path=output_path)
evalscope/perf/plugin/datasets/custom.py CHANGED
@@ -22,3 +22,18 @@ class CustomDatasetPlugin(DatasetPluginBase):
  yield [{'role': 'user', 'content': prompt}]
  else:
  yield prompt
+
+
+ if __name__ == '__main__':
+ from evalscope.perf.arguments import Arguments
+ from evalscope.perf.main import run_perf_benchmark
+
+ args = Arguments(
+ model='qwen2.5-7b-instruct',
+ url='https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions',
+ dataset_path='outputs/perf_data.txt',
+ api_key='EMPTY',
+ dataset='custom',
+ )
+
+ run_perf_benchmark(args)
evalscope/perf/utils/benchmark_util.py CHANGED
@@ -38,7 +38,7 @@ class BenchmarkData:
  self.first_chunk_latency = self.query_latency
  self.n_chunks = 1
  self.n_chunks_time = self.query_latency
- self.time_per_output_token = self.n_chunks_time / self.completion_tokens
+ self.time_per_output_token = self.n_chunks_time / self.n_chunks

  def _calculate_tokens(self, api_plugin):
  self.prompt_tokens, self.completion_tokens = \
evalscope/perf/utils/local_server.py CHANGED
@@ -96,6 +96,7 @@ def create_app(model, attn_implementation=None) -> FastAPI:


  def start_app(args: Arguments):
+ logger.info('Starting local server, please wait...')
  if args.api == 'local':
  app = create_app(args.model, args.attn_implementation)
  uvicorn.run(app, host='0.0.0.0', port=args.port, workers=1)
evalscope/perf/utils/log_utils.py CHANGED
@@ -34,8 +34,15 @@ def init_swanlab(args: Arguments) -> None:
  current_time = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
  name = args.name if args.name else f'{args.model_id}_{current_time}'
  swanlab.config.update({'framework': '📏evalscope'})
- swanlab.init(
- project=os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
- name=name,
- config=args.to_dict(),
- mode='local' if args.swanlab_api_key == 'local' else None)
+ init_kwargs = {
+ 'project': os.getenv('SWANLAB_PROJ_NAME', 'perf_benchmark'),
+ 'name': name,
+ 'config': args.to_dict(),
+ 'mode': 'local' if args.swanlab_api_key == 'local' else None
+ }
+
+ workspace = os.getenv('SWANLAB_WORKSPACE')
+ if workspace:
+ init_kwargs['workspace'] = workspace
+
+ swanlab.init(**init_kwargs)
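
With this change the perf logger can direct runs to a specific SwanLab workspace. A minimal sketch of driving that from the environment before launching a perf run; both values below are placeholders:

```python
import os

# Read by init_swanlab at run time; the values here are placeholders.
os.environ['SWANLAB_PROJ_NAME'] = 'perf_benchmark'
os.environ['SWANLAB_WORKSPACE'] = 'my-team-workspace'
# init_swanlab(args) would then pass workspace='my-team-workspace' to swanlab.init().
```
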
evalscope/perf/utils/rich_display.py CHANGED
@@ -92,7 +92,7 @@ def print_summary(all_results, model_name):
  basic_info.add_row('Model', model_name)
  basic_info.add_row('Total Generated', f'{total_tokens:,} tokens')
  basic_info.add_row('Total Test Time', f'{total_time:.2f} seconds')
- basic_info.add_row('Avg Output Rate', f'{total_tokens/total_time:.2f} tokens/sec')
+ basic_info.add_row('Avg Output Rate', f'{total_tokens / total_time:.2f} tokens/sec')

  console.print('\nBasic Information:')
  console.print(basic_info)
evalscope/report/__init__.py CHANGED
@@ -1,6 +1,38 @@
  # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import TYPE_CHECKING

- from evalscope.report.app_arguments import add_argument
- from evalscope.report.combinator import gen_table, get_data_frame, get_report_list
- from evalscope.report.generator import ReportGenerator
- from evalscope.report.utils import Category, Report, ReportKey, Subset
+ from evalscope.utils.import_utils import _LazyModule
+
+ if TYPE_CHECKING:
+ from .combinator import gen_report_table, gen_table, get_data_frame, get_report_list
+ from .generator import ReportGenerator
+ from .utils import Category, Report, ReportKey, Subset
+
+ else:
+ _import_structure = {
+ 'combinator': [
+ 'gen_table',
+ 'get_data_frame',
+ 'get_report_list',
+ 'gen_report_table',
+ ],
+ 'generator': [
+ 'ReportGenerator',
+ ],
+ 'utils': [
+ 'Category',
+ 'Report',
+ 'ReportKey',
+ 'Subset',
+ ],
+ }
+
+ import sys
+
+ sys.modules[__name__] = _LazyModule(
+ __name__,
+ globals()['__file__'],
+ _import_structure,
+ module_spec=__spec__,
+ extra_objects={},
+ )
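
evalscope.report now defers its submodule imports through evalscope's `_LazyModule` helper, so importing the package no longer pulls every submodule up front. The helper itself is not shown in this diff; a generic illustration of the same lazy-import idea using PEP 562 module `__getattr__` (not evalscope's actual implementation) looks roughly like this:

```python
# lazy_pkg/__init__.py -- illustrative only, not evalscope's _LazyModule
import importlib
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from .combinator import gen_report_table  # real import only for type checkers

_import_structure = {'combinator': ['gen_report_table']}
_attr_to_module = {attr: mod for mod, attrs in _import_structure.items() for attr in attrs}


def __getattr__(name):
    # Resolve the submodule on first attribute access instead of at package import time.
    if name in _attr_to_module:
        module = importlib.import_module(f'.{_attr_to_module[name]}', __name__)
        return getattr(module, name)
    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
```
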
evalscope/report/combinator.py CHANGED
@@ -48,6 +48,14 @@ def gen_table(reports_path_list: list) -> str:
  return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)


+ def gen_report_table(report: Report) -> str:
+ """
+ Generate a report table for a single report.
+ """
+ table = report.to_dataframe(flatten_metrics=True, flatten_categories=True)
+ return tabulate(table, headers=table.columns, tablefmt='grid', showindex=False)
+
+
  class ReportsRecorder:
  COMMON_DATASET_PATH = []
  CUSTOM_DATASET_PATH = []
evalscope/report/generator.py CHANGED
@@ -1,24 +1,42 @@
  import pandas as pd
  from pandas import DataFrame
+ from typing import TYPE_CHECKING

  from evalscope.constants import DataCollection
  from evalscope.report.utils import *

+ if TYPE_CHECKING:
+ from evalscope.benchmarks import DataAdapter
+

  class ReportGenerator:

  @staticmethod
- def gen_report(subset_score_map: dict, report_name: str, **kwargs) -> Report:
+ def gen_report(subset_score_map: dict, model_name: str, data_adapter: 'DataAdapter', **kwargs) -> Report:
  """
- Generate report for specific dataset.
- subset_score_map: e.g. {subset_name: [{'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100}, {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}]}
- category_map: e.g. {'subset_name': ['category_name1', 'category_name2'], ...}
- metric_list: e.g. [{'object': AverageAccuracy, 'name': 'AverageAccuracy'}, {'object': 'WeightedAverageAccuracy', 'name': 'WeightedAverageAccuracy'}]
+ Generate a report for a specific dataset based on provided subset scores.
+
+ Args:
+ subset_score_map (dict): A mapping from subset names to a list of score dictionaries.
+ {
+ 'subset_name': [
+ {'metric_name': 'AverageAccuracy', 'score': 0.3389, 'num': 100},
+ {'metric_name': 'WeightedAverageAccuracy', 'score': 0.3389, 'num': 100}
+ ],
+ ...
+ }
+ report_name (str): The name of the report to generate.
+ data_adapter (DataAdapter): An adapter object for data handling.
+
+ Returns:
+ Report: A structured report object containing metrics, categories, and subsets.
+
+ >>> report = gen_report(subset_score_map, "My Report", data_adapter, dataset_name="Dataset", model_name="Model")
  """ # noqa: E501

- dataset_name = kwargs.get('dataset_name', None)
- model_name = kwargs.get('model_name', None)
- category_map = kwargs.get('category_map', {})
+ dataset_name = data_adapter.name
+ category_map = data_adapter.category_map
+ report_name = f'{model_name}@{dataset_name}'

  def flatten_subset() -> DataFrame:
  """
@@ -59,7 +77,13 @@ class ReportGenerator:

  metrics_list.append(Metric(name=metric_name, categories=categories))

- report = Report(name=report_name, metrics=metrics_list, dataset_name=dataset_name, model_name=model_name)
+ report = Report(
+ name=report_name,
+ metrics=metrics_list,
+ dataset_name=dataset_name,
+ model_name=model_name,
+ dataset_description=data_adapter.description,
+ dataset_pretty_name=data_adapter.pretty_name)
  return report

  @staticmethod
evalscope/report/utils.py CHANGED
@@ -1,4 +1,5 @@
  import json
+ import os
  import pandas as pd
  from collections import defaultdict
  from dataclasses import asdict, dataclass, field
@@ -6,6 +7,9 @@ from typing import Any, Dict, List

  from evalscope.metrics import macro_mean, micro_mean
  from evalscope.utils import normalize_score
+ from evalscope.utils.logger import get_logger
+
+ logger = get_logger()


  @dataclass
@@ -70,13 +74,28 @@
  score = 'Score'


+ ANALYSIS_PROMPT = """根据给出的json格式的模型评测结果,输出分析报告,要求如下:
+ 1. 报告分为 总体表现、关键指标分析、改进建议、结论 四部分
+ 2. 若模型有多种指标,将其分为低分、中分、高分三个部分,并列出markdown表格
+ 3. 只列出报告本身,不要有其他多余内容
+ 4. 输出报告语言为{language}
+
+ ```json
+ {report_str}
+ ```
+ """
+
+
  @dataclass
  class Report:
  name: str = 'default_report'
  dataset_name: str = 'default_dataset'
+ dataset_pretty_name: str = ''
+ dataset_description: str = ''
  model_name: str = 'default_model'
  score: float = 0.0
  metrics: List[Metric] = field(default_factory=list)
+ analysis: str = 'N/A'

  def __post_init__(self):
  self.score = self.metrics[0].score # NOTE: only use the first metric by default
@@ -84,15 +103,29 @@
  def to_dict(self) -> Dict[str, Any]:
  return asdict(self)

+ def to_json_str(self) -> str:
+ return json.dumps(self.to_dict(), indent=4, ensure_ascii=False)
+
+ def to_json(self, json_file: str):
+ # ensure the directory exists
+ os.makedirs(os.path.dirname(json_file), exist_ok=True)
+ # write the report to a json file
+ with open(json_file, 'w', encoding='utf-8') as f:
+ json.dump(self.to_dict(), f, indent=4, ensure_ascii=False)
+
  @classmethod
  def from_dict(cls, data: dict):
  metrics = [Metric.from_dict(metric) for metric in data.get('metrics', [])]
  return cls(
  name=data['name'],
+ dataset_name=data['dataset_name'],
+ dataset_pretty_name=data.get('dataset_pretty_name'),
+ dataset_description=data.get('dataset_description'),
  score=data['score'],
+ model_name=data['model_name'],
  metrics=metrics,
- dataset_name=data['dataset_name'],
- model_name=data['model_name'])
+ analysis=data.get('analysis', 'N/A'),
+ )

  @classmethod
  def from_json(cls, json_file: str):
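
A hedged sketch of how the new serialization helpers shown above could round-trip a report; the path is a placeholder and assumes a report dumped by an earlier evaluation run:

```python
from evalscope.report import Report

# Hypothetical path to a report written by Evaluator.dump_report().
report_file = 'outputs/reports/my_model/my_dataset.json'

report = Report.from_json(report_file)           # load an existing report
print(report.dataset_pretty_name, report.score)  # dataset_pretty_name is new in 0.16.1
report.to_json(report_file)                      # write it back with the new to_json()
```
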
@@ -111,7 +144,7 @@
  table[ReportKey.category_name].append(category.name)
  table[ReportKey.subset_name].append(subset.name)
  table[ReportKey.num].append(subset.num)
- table[ReportKey.score].append(subset.score) # TODO: convert to percentage
+ table[ReportKey.score].append(subset.score)
  # NOTE: only flatten metrics if needed, use the first metric by default
  if not flatten_metrics:
  break
@@ -131,3 +164,27 @@

  df_categories.drop(columns=[ReportKey.category_name], inplace=True)
  return df_categories
+
+ def generate_analysis(self, judge_llm_config: dict) -> str:
+ import locale
+
+ from evalscope.metrics import LLMJudge
+
+ try:
+ # get the default locale
+ lang, _ = locale.getlocale()
+
+ if lang is None:
+ language = '中文'
+ else:
+ language = 'en' if lang.startswith('en') else '中文'
+
+ prompt = ANALYSIS_PROMPT.format(language=language, report_str=self.to_json_str())
+ judge_llm = LLMJudge(**judge_llm_config)
+ response = judge_llm(prompt)
+ except Exception as e:
+ logger.error(f'Error generating analysis: {e}')
+ response = 'N/A'
+
+ self.analysis = response
+ return response
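
A hedged sketch of calling the new analysis hook directly; the judge settings below are placeholders and assume an OpenAI-compatible endpoint reachable from the evaluation machine:

```python
from evalscope.report import Report

report = Report.from_json('outputs/reports/my_model/my_dataset.json')  # placeholder path

# Keys mirror LLMJudge.__init__ (api_key, api_url, model_id); values are placeholders.
judge_cfg = {
    'api_key': '<token>',
    'api_url': 'https://api-inference.modelscope.cn/v1/',
    'model_id': 'Qwen/Qwen3-235B-A22B',
}

analysis_text = report.generate_analysis(judge_cfg)
print(analysis_text)  # falls back to 'N/A' if the judge call fails
```
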
evalscope/run.py CHANGED
@@ -43,6 +43,9 @@ def run_single_task(task_cfg: TaskConfig, run_time: str) -> dict:
  else:
  result = evaluate_model(task_cfg, outputs)

+ logger.info(f'Finished evaluation for {task_cfg.model_id} on {task_cfg.datasets}')
+ logger.info(f'Output directory: {outputs.outputs_dir}')
+
  return result


@@ -109,6 +112,7 @@ def get_backend_manager_class(eval_backend: EvalBackend):
  def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
  """Evaluate the model based on the provided task configuration."""
  from evalscope.models import get_local_model
+ from evalscope.report import gen_table

  # Initialize evaluator
  eval_results = {}
@@ -122,10 +126,18 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict:
  task_cfg.dump_yaml(outputs.configs_dir)
  logger.info(task_cfg)

+ # Run evaluation for each evaluator
  for evaluator in evaluators:
  res_dict = evaluator.eval()
  eval_results[evaluator.dataset_name] = res_dict

+ # Make overall report
+ try:
+ report_table: str = gen_table([outputs.reports_dir])
+ logger.info(f'Overall report table: \n{report_table} \n')
+ except Exception:
+ logger.error('Failed to generate report table.')
+
  # Clean up
  if base_model is not None:
  import gc
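
The overall table added here reuses the existing combinator helper. A small sketch of calling it directly on a finished run's reports directory; the path is a placeholder for whatever output directory a previous evalscope run produced:

```python
from evalscope.report import gen_table

# Hypothetical reports directory from an earlier evalscope run.
table_str = gen_table(['outputs/20250101_000000/reports'])
print(table_str)
```
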