evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope was flagged as possibly problematic by the registry; see the registry page for details.

Files changed (117)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  25. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  26. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  27. evalscope/benchmarks/data_adapter.py +20 -5
  28. evalscope/benchmarks/general_arena/__init__.py +0 -0
  29. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  30. evalscope/benchmarks/general_arena/utils.py +226 -0
  31. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
  32. evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
  33. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  34. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  35. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  36. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  37. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  38. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  39. evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
  40. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  41. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  42. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  43. evalscope/benchmarks/race/race_adapter.py +1 -1
  44. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  45. evalscope/benchmarks/utils.py +1 -2
  46. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  47. evalscope/config.py +8 -123
  48. evalscope/evaluator/evaluator.py +15 -12
  49. evalscope/metrics/__init__.py +6 -0
  50. evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
  51. evalscope/metrics/llm_judge.py +105 -20
  52. evalscope/metrics/metrics.py +1 -1
  53. evalscope/models/adapters/base_adapter.py +0 -2
  54. evalscope/models/adapters/server_adapter.py +2 -2
  55. evalscope/models/custom/dummy_model.py +3 -3
  56. evalscope/perf/arguments.py +2 -16
  57. evalscope/perf/main.py +1 -1
  58. evalscope/perf/utils/analysis_result.py +24 -23
  59. evalscope/perf/utils/benchmark_util.py +1 -1
  60. evalscope/report/__init__.py +1 -1
  61. evalscope/report/utils.py +34 -15
  62. evalscope/run.py +1 -1
  63. evalscope/summarizer.py +1 -2
  64. evalscope/utils/__init__.py +63 -2
  65. evalscope/utils/argument_utils.py +64 -0
  66. evalscope/utils/import_utils.py +16 -0
  67. evalscope/utils/io_utils.py +45 -4
  68. evalscope/utils/model_utils.py +37 -1
  69. evalscope/version.py +2 -2
  70. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
  71. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
  72. tests/aigc/test_t2i.py +1 -1
  73. tests/cli/test_all.py +50 -2
  74. tests/cli/test_collection.py +1 -1
  75. tests/cli/test_custom.py +261 -0
  76. tests/cli/test_run.py +13 -37
  77. tests/perf/test_perf.py +2 -2
  78. tests/rag/test_clip_benchmark.py +2 -1
  79. tests/rag/test_mteb.py +3 -1
  80. tests/rag/test_ragas.py +3 -1
  81. tests/swift/test_run_swift_eval.py +2 -1
  82. tests/swift/test_run_swift_vlm_eval.py +2 -1
  83. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  84. tests/utils.py +13 -0
  85. tests/vlm/test_vlmeval.py +8 -2
  86. evalscope/evaluator/rating_eval.py +0 -157
  87. evalscope/evaluator/reviewer/__init__.py +0 -1
  88. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  89. evalscope/registry/__init__.py +0 -1
  90. evalscope/registry/config/cfg_arena.yaml +0 -77
  91. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  92. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  93. evalscope/registry/config/cfg_single.yaml +0 -78
  94. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  95. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  96. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  97. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  98. evalscope/registry/data/question.jsonl +0 -80
  99. evalscope/registry/tasks/arc.yaml +0 -28
  100. evalscope/registry/tasks/bbh.yaml +0 -26
  101. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  102. evalscope/registry/tasks/ceval.yaml +0 -27
  103. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  104. evalscope/registry/tasks/cmmlu.yaml +0 -27
  105. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  106. evalscope/registry/tasks/general_qa.yaml +0 -27
  107. evalscope/registry/tasks/gsm8k.yaml +0 -29
  108. evalscope/registry/tasks/mmlu.yaml +0 -29
  109. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  110. evalscope/run_arena.py +0 -202
  111. evalscope/utils/arena_utils.py +0 -217
  112. evalscope/utils/completion_parsers.py +0 -82
  113. /evalscope/{utils → benchmarks}/filters.py +0 -0
  114. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
  115. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
  116. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
  117. {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/general_qa/general_qa_adapter.py CHANGED
@@ -25,13 +25,21 @@ logger = get_logger()
     prompt_template='请回答问题\n{query}',
 )
 class GeneralQAAdapter(DataAdapter):
-    # TODO: set few_shot_num
 
     def __init__(self, **kwargs):
-
         super().__init__(**kwargs)
 
     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        """
+        Load dataset from the given path or dataset name.
+
+        Args:
+            dataset_name_or_path (str): Path to dataset directory or file.
+            subset_list (list): List of subset names to load.
+
+        Returns:
+            dict: Loaded dataset organized by subset.
+        """
         dataset_name_or_path = dataset_name_or_path or self.dataset_id
         subset_list = subset_list or self.subset_list
 
@@ -61,58 +69,64 @@ class GeneralQAAdapter(DataAdapter):
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
+        Generate prompt for the model based on input data.
+
         Args:
-            input_d:
-                format1: {'history': [['q1', 'a1'], ['q2', 'a2']], 'question': '', 'answer': ''}
-                format2: {'history': [['q1', 'a1'], ['q2', 'a2']], 'query': '', 'response': ''}
+            input_d (dict): Input data dictionary.
+            subset_name (str): Name of the subset.
+            few_shot_list (list): List of few-shot examples.
 
         Returns:
-            {'data': [prompt]}
-
+            dict: Dictionary containing the generated prompt.
         """
-        # prompt = f"'<|im_start|>user\n{input_d['input']}<|im_end|>\n<|im_start|>assistant\n'"
-        history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
-        if len(history) > 0:
-            logger.warning('The history is not included in the prompt for GeneralQA. \
-                To be supported in the future.')
-
+        messages = input_d.get('messages')
         query = input_d.get('question', '') or input_d.get('query', '')
         system_prompt = input_d.get('system')
         prompt = self.prompt_template.format(query=query)
-        return self.gen_prompt_data(prompt, system_prompt=system_prompt)
+        return self.gen_prompt_data(prompt, system_prompt=system_prompt, messages=messages)
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
+        Extract the gold (reference) answer from the input data.
+
         Args:
-            input_d: {'history': [], 'question': '', 'answer': ''}
+            input_d (dict): Input data dictionary.
 
         Returns:
-            gold_answer: str
-
+            str: Gold answer string.
         """
-        return input_d.get('answer', '') or input_d.get('response', '')
+        return input_d.get('answer') or input_d.get('response')
 
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
+        Parse the prediction result.
+
         Args:
-            result: str
+            result (str): Model prediction result.
+            raw_input_d (dict, optional): Original input data.
+            eval_type (str): Evaluation type.
 
         Returns:
-            pred_result: str
-
+            str: Parsed prediction result.
         """
         return result
 
     def match(self, gold: str, pred: str) -> dict:
         """
+        Compute metric scores between gold and predicted answers.
+
         Args:
-            gold: str
-            pred: str
+            gold (str): Gold answer.
+            pred (str): Predicted answer.
 
         Returns:
-            bleu_score: dict
-
+            dict: Dictionary of computed metric scores.
         """
+        # reference free metrics
+        if gold is None:
+            return {'AverageAccuracy': -1}
+
+        # calculate rouge and bleu scores
         res = dict()
         if 'AverageRouge' in self.metric_list:
             from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
@@ -128,14 +142,13 @@ class GeneralQAAdapter(DataAdapter):
 
     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
         """
-        compute weighted mean of the bleu score of all samples
+        Compute weighted mean of the metric scores for all samples.
 
         Args:
-            review_res_list: [score1, score2, ...]
+            review_res_list (list): List of metric score dictionaries.
 
        Returns:
-            avg_res: List[dict]
-
+            list: List of dictionaries with averaged metric results.
         """
         items = super().compute_dict_metric(review_res_list, **kwargs)
         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
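
Note on the GeneralQA changes above: get_gold_answer now returns None (instead of an empty string) when a sample carries neither an 'answer' nor a 'response' field, and match short-circuits such reference-free samples with a placeholder score. A minimal stand-alone sketch of that behavior; the trivial comparison stands in for evalscope's real rouge/bleu scoring:

def get_gold_answer(input_d: dict):
    # 0.17.0: returns None when no reference answer is present
    return input_d.get('answer') or input_d.get('response')

def match(gold, pred) -> dict:
    if gold is None:
        # reference-free sample: placeholder score, judged by other means (e.g. an LLM judge)
        return {'AverageAccuracy': -1}
    # otherwise compute reference-based scores (rouge/bleu in the real adapter)
    return {'stand_in_exact_match': float(gold.strip() == pred.strip())}

sample = {'query': 'What is the capital of France?'}  # no 'answer' / 'response' key
print(match(get_gold_answer(sample), 'Paris'))        # {'AverageAccuracy': -1}
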
@@ -6,9 +6,9 @@ import re
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser
 
 # flake8: noqa
 
@@ -2,7 +2,6 @@ from collections import defaultdict
 from typing import Any, Dict, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry
 
@@ -43,10 +42,9 @@ class IFEvalAdapter(DataAdapter):
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        return result
-
     def match(self, gold: Any, pred: Any) -> Dict:
+        from evalscope.benchmarks.ifeval.utils import process_results
+
         return process_results(gold, [pred])
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
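
The IFEval change above drops the no-op parse_pred_result override and defers the process_results import into match, so the dependency is only loaded when scoring actually runs. A generic sketch of that deferred-import pattern (DemoAdapter and the difflib stand-in are illustrative, not evalscope code):

class DemoAdapter:
    def match(self, gold, pred):
        # heavy or optional dependency imported only when matching is performed
        import difflib  # stand-in for evalscope.benchmarks.ifeval.utils.process_results
        return difflib.SequenceMatcher(None, gold, pred).ratio()

print(DemoAdapter().match('follow the instruction', 'follow the instructions'))
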
@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
 
 
 @Benchmark.register(
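
The same one-line change recurs across most of the adapter hunks in this diff: ResponseParser moved out of the removed evalscope/utils/utils.py into the new evalscope/metrics/completion_parsers.py. Downstream code that imported it from the old location needs the new path (assuming evalscope 0.17.0 is installed):

# old (0.16.x): from evalscope.utils.utils import ResponseParser
from evalscope.metrics.completion_parsers import ResponseParser
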
@@ -69,12 +69,6 @@ class LiveCodeBenchAdapter(DataAdapter):
         # Extract the gold answer from the input dict.
         return input_d
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-        """
-        return result
-
     def match(self, gold: dict, pred: str) -> float:
         from .evaluate_utils import codegen_metrics
         from .extract_utils import extract_code_generation
@@ -3,7 +3,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
 
 SUBSET_LIST = ['default']
 
@@ -54,4 +54,5 @@ class Math500Adapter(DataAdapter):
         return result
 
     def match(self, gold: str, pred: str) -> float:
-        return math_equal(pred, gold)
+        res = math_equal(pred, gold)
+        return 1.0 if res else 0.0
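
The Math-500 matcher now coerces the boolean equivalence check into an explicit float score. Equivalent stand-alone logic; a trivial string comparison stands in for evalscope's math_equal comparator:

def match(gold: str, pred: str, math_equal=lambda p, g: p.strip() == g.strip()) -> float:
    res = math_equal(pred, gold)
    return 1.0 if res else 0.0

assert match('42', ' 42 ') == 1.0
assert match('42', '41') == 0.0
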
@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -4,7 +4,7 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
 
 SUBSET_LIST = [
     'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
@@ -4,8 +4,8 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser
 
 logger = get_logger()
 
@@ -4,7 +4,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
 
 
 @Benchmark.register(
@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -96,13 +96,16 @@ class TriviaQaAdapter(DataAdapter):
         def get_sys_prompt(inp: dict) -> str:
             return inp['input'][0]['content']
 
-        prompt = get_sys_prompt(input_d)
+        if self.few_shot_num > 0:
+            sys_prompt = get_sys_prompt(input_d)
+        else:
+            sys_prompt = None
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context: str = '\n'.join(few_shot_prompts) + '\n'
+        context = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
         full_prompt = context
 
-        return self.gen_prompt_data(full_prompt)
+        return self.gen_prompt_data(full_prompt, system_prompt=sys_prompt)
 
     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
@@ -124,7 +127,9 @@ class TriviaQaAdapter(DataAdapter):
         return result
 
     def match(self, gold: list, pred: str) -> float:
-        is_correct = any([cand in pred for cand in gold])
+        lower_pred = pred.lower()
+        gold = [g.lower() for g in gold]
+        is_correct = any([cand in lower_pred for cand in gold])
         return 1 if is_correct else 0
 
     @classmethod
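
The TriviaQA matcher above now lowercases both the prediction and the gold candidates before the containment check, and the system prompt is only attached when few-shot examples are used. The matching logic in isolation:

def match(gold: list, pred: str) -> int:
    lower_pred = pred.lower()
    gold = [g.lower() for g in gold]
    is_correct = any(cand in lower_pred for cand in gold)
    return 1 if is_correct else 0

assert match(['Paris', 'paris, france'], 'The capital is PARIS.') == 1
assert match(['London'], 'The capital is Paris.') == 0
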
@@ -2,8 +2,7 @@ from dataclasses import asdict, dataclass
 from functools import wraps
 from typing import Dict, List, Optional, Union
 
-from evalscope.constants import EvalType
-from evalscope.utils.filters import Filter
+from .filters import Filter
 
 
 @dataclass
@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.utils.utils import ResponseParser
+from evalscope.metrics.completion_parsers import ResponseParser
 
 
 @Benchmark.register(
evalscope/config.py CHANGED
@@ -1,7 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import copy
-import json
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
@@ -10,18 +9,15 @@ from typing import Dict, List, Optional, Union
 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
                                  JudgeStrategy, ModelTask, OutputType)
 from evalscope.models import CustomModel, DummyCustomModel
-from evalscope.utils import gen_hash
-from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
+from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import parse_int_or_float
 
 logger = get_logger()
 
-cur_path = os.path.dirname(os.path.abspath(__file__))
-
 
 @dataclass
-class TaskConfig:
+class TaskConfig(BaseArgument):
     # Model-related arguments
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
@@ -132,15 +128,6 @@
         'precision': 'torch.float16',
     }
 
-    def to_dict(self):
-        result = self.__dict__.copy()
-        if isinstance(self.model, CustomModel):
-            result['model'] = self.model.__class__.__name__
-        return result
-
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
             other = other.to_dict()
@@ -155,91 +142,11 @@
         except Exception as e:
             logger.warning(f'Failed to dump overall task config: {e}')
 
-    @staticmethod
-    def list():
-        return list(registry_tasks.keys())
-
-    @staticmethod
-    def from_yaml(yaml_file: str):
-        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
-
-    @staticmethod
-    def from_dict(d: dict):
-        return TaskConfig(**d)
-
-    @staticmethod
-    def from_json(json_file: str):
-        return TaskConfig.from_dict(json_to_dict(json_file))
-
-    @staticmethod
-    def from_args(args: Namespace):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-
-        return TaskConfig.from_dict(args_dict)
-
-    @staticmethod
-    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-        res_list = []
-        for task_name in tasks:
-            task = registry_tasks.get(task_name, None)
-            if task is None:
-                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                continue
-
-            task.model = custom_model
-            task.model_args = custom_model.config
-            task.model_id = type(custom_model).__name__
-            res_list.append(task)
-
-        return res_list
-
-    @staticmethod
-    def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
-        """
-        Register a new task (dataset) for evaluation.
-
-        Args:
-            name: str, the dataset name.
-            data_pattern: str, the data pattern for the task.
-                e.g. `mmlu`, `ceval`, `gsm8k`, ...
-                refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
-                then your specific custom dataset directory will be /path/to/data/{name}
-            subset_list: list, the subset list for the dataset.
-                e.g. ['middle_school_politics', 'operating_system']
-                refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
-        """
-        available_datasets = list(registry_tasks.keys())
-        if data_pattern not in available_datasets:
-            logger.error(
-                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
-            return
-
-        # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks[data_pattern]
-
-        custom_config = copy.deepcopy(pattern_config)
-        custom_config.datasets = [data_pattern]
-        custom_config.dataset_args = {data_pattern: {}}
-        custom_config.eval_type = EvalType.CHECKPOINT
-
-        if dataset_dir is not None:
-            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
-
-        if subset_list is not None:
-            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
-
-        registry_tasks.update({name: custom_config})
-        logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
-
-
-tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
-
-registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
+    def to_dict(self):
+        result = self.__dict__.copy()
+        if isinstance(self.model, CustomModel):
+            result['model'] = self.model.__class__.__name__
+        return result
 
 
 def parse_task_config(task_cfg) -> TaskConfig:
@@ -264,25 +171,3 @@ def parse_task_config(task_cfg) -> TaskConfig:
     else:
         raise ValueError('Args: Please provide a valid task config.')
     return task_cfg
-
-
-class TempModel(CustomModel):
-
-    def __init__(self, config: dict):
-        super().__init__(config=config)
-
-    def predict(self, prompts: str, **kwargs):
-        return [item + ': response' for item in prompts]
-
-
-if __name__ == '__main__':
-    model = TempModel(config={'model_id': 'test-swift-dummy-model'})
-    task_config = TaskConfig()
-
-    # Register a new task
-    TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
-
-    swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
-    for item in swift_eval_task:
-        print(item)
-        print()
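
The config refactor above removes the YAML-based task registry (TaskConfig.registry / TaskConfig.load, together with the evalscope/registry/** files deleted in this release) and the inline from_yaml/from_dict/from_json/from_args/__str__ helpers; TaskConfig now inherits from a shared BaseArgument in evalscope.utils.argument_utils. The real BaseArgument is not shown in this diff; the following is only a hedged sketch of what such a mixin typically provides, with illustrative class and field names:

import json
from dataclasses import asdict, dataclass

class BaseArgument:
    @classmethod
    def from_dict(cls, d: dict):
        return cls(**d)

    def to_dict(self) -> dict:
        return asdict(self)

    def __str__(self) -> str:
        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

@dataclass
class DemoTaskConfig(BaseArgument):
    model: str = None
    datasets: list = None

print(DemoTaskConfig.from_dict({'model': 'dummy-model', 'datasets': ['general_qa']}))
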
@@ -14,9 +14,9 @@ from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
 from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
-from evalscope.utils import dict_torch_dtype_to_str, gen_hash
-from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.model_utils import dict_torch_dtype_to_str
 
 logger = get_logger()
 
@@ -237,9 +237,10 @@ class Evaluator(object):
             if use_llm:
                 # Use LLM as judge
                 assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
+                pred_content = self.data_adapter.llm_parse_pred_result(
+                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
                 review_result = self.data_adapter.llm_match(
-                    gold_content, answer_content, self.judge, raw_input=raw_input_d)
-                pred = answer_content
+                    gold_content, pred_content, self.judge, raw_input=raw_input_d)
             else:
                 # Use rule-based judging
                 pred_content = self.data_adapter.parse_pred_result(
@@ -250,15 +251,14 @@
             if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
                     and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
                 assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}'  # noqa: E501
+                pred_content = self.data_adapter.llm_parse_pred_result(
+                    result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
                 review_result = self.data_adapter.llm_match(
-                    gold_content, answer_content, self.judge, raw_input=raw_input_d)
-                pred = answer_content
-            else:
-                pred = pred_content
+                    gold_content, pred_content, self.judge, raw_input=raw_input_d)
 
             choice[ReviewKeys.REVIEW] = {
                 ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
-                ReviewKeys.PRED: pred,
+                ReviewKeys.PRED: pred_content,
                 ReviewKeys.RESULT: review_result
             }
             rev_choices.append(choice)
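
The two evaluator hunks above change the LLM-judge path so the model output is run through llm_parse_pred_result before llm_match, and the parsed value (pred_content) is what ends up in the review record. A simplified control-flow sketch; the adapter and judge objects are stand-ins and only the method names mirror the diff:

def review_sample(adapter, judge, gold_content, answer_content, raw_input_d, eval_type, use_llm):
    if use_llm:
        # 0.17.0: parse first, then hand the parsed prediction to the judge
        pred_content = adapter.llm_parse_pred_result(
            result=answer_content, raw_input_d=raw_input_d, eval_type=eval_type)
        review_result = adapter.llm_match(gold_content, pred_content, judge, raw_input=raw_input_d)
    else:
        pred_content = adapter.parse_pred_result(
            result=answer_content, raw_input_d=raw_input_d, eval_type=eval_type)
        review_result = adapter.match(gold_content, pred_content)
    # the review now records the parsed prediction rather than the raw answer
    return {'gold': gold_content, 'pred': pred_content, 'result': review_result}
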
@@ -394,9 +394,6 @@
         report_map: Report = self.data_adapter.gen_report(
             subset_score_map=reviews_score_all, model_name=self.model_name)
 
-        # Post process report
-        self.data_adapter.post_process_report(report_map, report_path=report_path)
-
         # Make table
         try:
             report_table = gen_table(report_list=[report_map], add_overall_metric=True)
@@ -418,6 +415,12 @@
         report_map.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
 
+        # Post process report
+        try:
+            self.data_adapter.post_process_report(report_map, report_path=report_path)
+        except Exception as e:
+            logger.error(f'Failed to post process report: {e}')
+
         return report_map
 
     def eval(self, **kwargs) -> dict:
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
+    from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
     from .llm_judge import LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
@@ -39,6 +40,11 @@ else:
         'math_equal',
         'strip_answer_string',
     ],
+    'completion_parsers': [
+        'ResponseParser',
+        'lmsys_parser',
+        'ranking_parser',
+    ],
 }
 
 import sys
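
With the lazy-module entries above, the completion parsers join the public surface of evalscope.metrics, so (with evalscope 0.17.0 installed) they can also be imported from the subpackage root rather than from the module file:

from evalscope.metrics import ResponseParser, lmsys_parser, ranking_parser
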