evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +67 -59
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +12 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +101 -18
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/utils.py +28 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +94 -32
- evalscope/config.py +54 -17
- evalscope/evaluator/evaluator.py +80 -41
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +15 -8
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +17 -25
- evalscope/perf/arguments.py +16 -7
- evalscope/perf/benchmark.py +0 -15
- evalscope/perf/main.py +72 -15
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +34 -16
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +61 -4
- evalscope/run.py +12 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -32
- tests/cli/test_collection.py +8 -6
- tests/cli/test_run.py +43 -17
- tests/perf/test_perf.py +23 -0
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/config.py
CHANGED
@@ -13,29 +13,20 @@ from evalscope.models import CustomModel, DummyCustomModel
 from evalscope.utils import gen_hash
 from evalscope.utils.io_utils import dict_to_yaml, json_to_dict, yaml_to_dict
 from evalscope.utils.logger import get_logger
+from evalscope.utils.utils import parse_int_or_float
 
 logger = get_logger()
 
 cur_path = os.path.dirname(os.path.abspath(__file__))
 
-DEFAULT_MODEL_ARGS = {'revision': 'master', 'precision': 'torch.float16'}
-DEFAULT_GENERATION_CONFIG = {
-    'max_length': 2048,
-    'max_new_tokens': 512,
-    'do_sample': False,
-    'top_k': 50,
-    'top_p': 1.0,
-    'temperature': 1.0,
-}
-
 
 @dataclass
 class TaskConfig:
     # Model-related arguments
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
-    model_args:
-    model_task:
+    model_args: Dict = field(default_factory=dict)
+    model_task: str = ModelTask.TEXT_GENERATION
 
     # Template-related arguments
     template_type: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
@@ -48,14 +39,14 @@ class TaskConfig:
     dataset_hub: str = HubType.MODELSCOPE
 
     # Generation configuration arguments
-    generation_config:
+    generation_config: Dict = field(default_factory=dict)
 
     # Evaluation-related arguments
     eval_type: str = EvalType.CHECKPOINT
     eval_backend: str = EvalBackend.NATIVE
     eval_config: Union[str, Dict, None] = None
     stage: str = EvalStage.ALL
-    limit: Optional[int] = None
+    limit: Optional[Union[int, float]] = None
     eval_batch_size: Optional[int] = None
 
     # Cache and working directory arguments
@@ -65,6 +56,7 @@ class TaskConfig:
     outputs: Optional[str] = None  # Deprecated, will be removed in v1.0.0.
 
     # Debug and runtime mode arguments
+    ignore_errors: bool = False
     debug: bool = False
     dry_run: bool = False
     seed: Optional[int] = 42
@@ -76,7 +68,8 @@ class TaskConfig:
     # LLMJudge arguments
     judge_strategy: str = JudgeStrategy.AUTO
     judge_worker_num: int = 1
-    judge_model_args: Optional[Dict] = field(default_factory=
+    judge_model_args: Optional[Dict] = field(default_factory=dict)
+    analysis_report: bool = False
 
     def __post_init__(self):
         if self.model is None:
@@ -95,6 +88,50 @@ class TaskConfig:
         if self.eval_batch_size is None:
             self.eval_batch_size = 8 if self.eval_type == EvalType.SERVICE else 1
 
+        # Post process limit
+        if self.limit is not None:
+            self.limit = parse_int_or_float(self.limit)
+
+        # Set default generation_config and model_args
+        self.__init_default_generation_config()
+        self.__init_default_model_args()
+
+    def __init_default_generation_config(self):
+        if self.generation_config:
+            return
+        if self.model_task == ModelTask.IMAGE_GENERATION:
+            self.generation_config = {
+                'height': 1024,
+                'width': 1024,
+                'num_inference_steps': 50,
+                'guidance_scale': 9.0,
+            }
+        elif self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.generation_config = {
+                    'max_length': 2048,
+                    'max_new_tokens': 512,
+                    'do_sample': False,
+                    'top_k': 50,
+                    'top_p': 1.0,
+                    'temperature': 1.0,
+                }
+            elif self.eval_type == EvalType.SERVICE:
+                self.generation_config = {
+                    'max_tokens': 2048,
+                    'temperature': 0.0,
+                }
+
+    def __init_default_model_args(self):
+        if self.model_args:
+            return
+        if self.model_task == ModelTask.TEXT_GENERATION:
+            if self.eval_type == EvalType.CHECKPOINT:
+                self.model_args = {
+                    'revision': 'master',
+                    'precision': 'torch.float16',
+                }
+
     def to_dict(self):
         result = self.__dict__.copy()
         if isinstance(self.model, CustomModel):
@@ -218,9 +255,9 @@ def parse_task_config(task_cfg) -> TaskConfig:
     elif isinstance(task_cfg, str):
         extension = os.path.splitext(task_cfg)[-1]
         logger.info(f'Args: Task config is provided with {extension} file type.')
-        if extension in ['yaml', 'yml']:
+        if extension in ['.yaml', '.yml']:
             task_cfg = TaskConfig.from_yaml(task_cfg)
-        elif extension == 'json':
+        elif extension == '.json':
             task_cfg = TaskConfig.from_json(task_cfg)
         else:
             raise ValueError('Args: Unsupported file extension.')
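
For reference, a minimal sketch of how the reworked defaults behave (assuming evalscope 0.16.1 is installed; field and constant names are taken from the diff above, the model name is a placeholder, and the rest of `__post_init__` is not shown here):

from evalscope.config import TaskConfig
from evalscope.constants import EvalType

cfg = TaskConfig(model='qwen2.5-7b-instruct', eval_type=EvalType.SERVICE, limit=0.1)

# `limit` is normalised via parse_int_or_float, so a float now means
# "this fraction of each subset" (as consumed by the evaluator) rather than an absolute count.
print(cfg.limit)              # 0.1
# With no explicit generation_config, the service defaults from
# __init_default_generation_config() apply.
print(cfg.generation_config)  # {'max_tokens': 2048, 'temperature': 0.0}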
evalscope/evaluator/evaluator.py
CHANGED
@@ -13,7 +13,7 @@ from evalscope.benchmarks import DataAdapter
 from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
 from evalscope.models import BaseModelAdapter
-from evalscope.report import Report,
+from evalscope.report import Report, gen_report_table
 from evalscope.utils import dict_torch_dtype_to_str, gen_hash
 from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
 from evalscope.utils.logger import get_logger
@@ -46,7 +46,6 @@ class Evaluator(object):
         self.dataset_name = data_adapter.name
         self.dataset_name_or_path = os.path.expanduser(data_adapter.dataset_id)
         self.model_name = task_cfg.model_id
-        self.custom_task_name = f'{self.model_name}_{self.dataset_name}'
 
         self.data_adapter = data_adapter
         self.model_adapter = model_adapter
@@ -79,8 +78,16 @@ class Evaluator(object):
         # Limit and index prompts
         limited_prompts = defaultdict(list)
         for subset_name, prompts_list in prompts.items():
-
-
+            # If limit is None, use all prompts
+            if self.task_cfg.limit is None:
+                limit = len(prompts_list)
+            else:
+                if isinstance(self.task_cfg.limit, int):
+                    limit = self.task_cfg.limit
+                elif isinstance(self.task_cfg.limit, float):
+                    limit = int(len(prompts_list) * self.task_cfg.limit)
+            # Limit the number of prompts
+            for index, prompt in enumerate(prompts_list[:min(limit, len(prompts_list))]):
                 prompt[AnswerKeys.INDEX] = index
                 limited_prompts[subset_name].append(prompt)
 
@@ -97,13 +104,23 @@ class Evaluator(object):
         answer_d[AnswerKeys.ANSWER_ID] = answer_id
         answer_d[AnswerKeys.SUBSET_NAME] = subset_name
         answer_d[AnswerKeys.RAW_INPUT] = input_d[AnswerKeys.RAW_INPUT]
-        # answer_d[AnswerKeys.ORIGIN_PROMPT] = input_d
         answer_d[AnswerKeys.INDEX] = input_d[AnswerKeys.INDEX]
         return answer_d
 
     def _get_answer(self, input_prompts, subset_name, infer_cfg) -> List[dict]:
         answers_list = []
-
+        try:
+            # get answer from model
+            answer_ds: List[dict] = self.model_adapter.predict(inputs=input_prompts, infer_cfg=infer_cfg)
+        except Exception as e:
+            logger.error(f'Failed to get answer for {input_prompts}, due to {e}')
+            # if ignore_errors is True, continue to next input
+            if self.task_cfg.ignore_errors:
+                logger.warning('`ignore_errors` is set to True. Dropping this prompt and continuing with evaluation.')
+                return answers_list
+            else:
+                raise e
+        # process answer
         for answer_d, input_prompt in zip(answer_ds, input_prompts):
             answer_id = self._generate_answer_id(self.model_adapter.model_cfg, input_prompt, infer_cfg)
             processed_answer = self._process_answer(answer_d, input_prompt, subset_name, answer_id)
@@ -197,16 +214,17 @@ class Evaluator(object):
             reviewer_spec = {}
 
         review_res = deepcopy(answer_d)
-
-
-        review_res[ReviewKeys.REVIEWED] =
+        if AnswerKeys.CHOICES not in review_res:
+            review_res[AnswerKeys.CHOICES] = []
+            review_res[ReviewKeys.REVIEWED] = True
             review_res[ReviewKeys.REVIEW_ID] = None
             review_res[ReviewKeys.REVIEWER_SPEC] = reviewer_spec
             review_res[ReviewKeys.REVIEW_TIME] = time.time()
+            logger.warning(f'No choices found for answer dict: {review_res}')
             return review_res
 
         rev_choices = []
-        for choice in
+        for choice in review_res[AnswerKeys.CHOICES]:
             raw_input_d: dict = review_res[AnswerKeys.RAW_INPUT]
             answer_content = choice[ReviewKeys.MESSAGE][ReviewKeys.CONTENT]
             gold_content = self.data_adapter.get_gold_answer(raw_input_d)
@@ -280,11 +298,20 @@ class Evaluator(object):
         review_file_path = os.path.join(self.outputs_structure.reviews_dir, self.model_name, review_file_name)
         os.makedirs(os.path.dirname(review_file_path), exist_ok=True)
 
+        # Load existing reviews if using cache
+        existing_reviews = {}
         if self.use_cache and os.path.exists(review_file_path):
-
-
+            with open(review_file_path, 'r') as f:
+                for line in f:
+                    review = json.loads(line.strip())
+                    existing_reviews[review['index']] = review
+            logger.info(f'Reusing review result from {review_file_path}, got {len(existing_reviews)} reviews.')
 
         def process_single_review(answer_d):
+            # Check if review already exists in cache
+            if self.use_cache and answer_d['index'] in existing_reviews:
+                return existing_reviews[answer_d['index']]
+
             review_id, reviewer_spec = self._generate_review_id(answer_d)
             # Get review
             review_d = self._get_review(answer_d=answer_d, review_id=review_id, reviewer_spec=reviewer_spec)
@@ -299,8 +326,9 @@ class Evaluator(object):
         for future in tqdm(as_completed(futures), total=len(futures), desc=f'Reviewing({subset_name}): '):
             review_d = future.result()
             reviews_list.append(review_d)
-            # Dump reviews
-
+            # Dump new reviews only if not using cache or review is new
+            if not self.use_cache or review_d['index'] not in existing_reviews:
+                dump_jsonl_data(review_d, review_file_path, dump_mode=DumpMode.APPEND)
 
         return reviews_list
 
@@ -315,17 +343,24 @@ class Evaluator(object):
         Returns:
             The metric result. Depends on the metric function in data_adapter.
         """
+        # Get max choices
+        choices_lengths = [
+            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d.get(ReviewKeys.REVIEWED)
+        ]
+        if choices_lengths:
+            max_choices = max(choices_lengths)
+        else:
+            max_choices = 0
 
+        # Get review result
         review_res_list = []
-        max_choices = max(
-            len(review_d[AnswerKeys.CHOICES]) for review_d in reviews_list if review_d[ReviewKeys.REVIEWED])
         for review_d in reviews_list:
             if not review_d[ReviewKeys.REVIEWED]:
-                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                logger.warning(f'Review not finished for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
                 continue
 
             if len(review_d[AnswerKeys.CHOICES]) == 0:
-                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}')
+                logger.warning(f'No choices found for answer_id: {review_d[AnswerKeys.ANSWER_ID]}, skipping ...')
                 continue
             elif len(review_d[AnswerKeys.CHOICES]) == 1 and max_choices == 1:
                 review_res = review_d[AnswerKeys.CHOICES][0][ReviewKeys.REVIEW][ReviewKeys.RESULT]
@@ -343,41 +378,45 @@ class Evaluator(object):
 
         return metric_score
 
-    def dump_report(self, reviews_score_all: List[dict]
+    def dump_report(self, reviews_score_all: List[dict]):
         """
         Get report for total reviews of specific dataset.
        It is required to rewrite this method to support your own evaluator.
 
         Args:
             reviews_score_all: reviews score list. Generated by func self.data_adapter.compute_metric().
-            use_table: whether to generate table for reports. Default to True.
 
         Returns: None
         """
+        report_path = os.path.join(self.outputs_structure.reports_dir, self.model_name)
+        os.makedirs(report_path, exist_ok=True)
         # Get report map
         report_map: Report = self.data_adapter.gen_report(
-            subset_score_map=reviews_score_all,
-            report_name=self.custom_task_name,
-            model_name=self.model_name,
-            dataset_name=self.dataset_name)
-
-        # Dump report
-        report_path: str = os.path.join(self.outputs_structure.reports_dir, self.model_name,
-                                        self.dataset_name + '.json')
-        os.makedirs(os.path.dirname(report_path), exist_ok=True)
+            subset_score_map=reviews_score_all, model_name=self.model_name)
 
-        #
-
-            f.write(json.dumps(report_map.to_dict(), ensure_ascii=False, indent=4))
-        logger.info(f'Dump report: {report_path} \n')
+        # Post process report
+        self.data_adapter.post_process_report(report_map, report_path=report_path)
 
         # Make table
-
-
-
-
-
-
+        try:
+            report_table = gen_report_table(report_map)
+            logger.info(f'{self.dataset_name_or_path} report table: \n{report_table} \n')
+        except Exception:
+            logger.error('Failed to generate report table.')
+
+        # Make report analysis
+        if self.task_cfg.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report_map.generate_analysis(self.task_cfg.judge_model_args)
+            logger.info('Report analysis:\n%s', analysis)
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
+        # Dump report
+        report_file = os.path.join(report_path, f'{self.dataset_name}.json')
+        report_map.to_json(report_file)
+        logger.info(f'Dump report to: {report_file} \n')
+
         return report_map
 
     def eval(self, **kwargs) -> dict:
@@ -403,7 +442,7 @@ class Evaluator(object):
         stage == 'review': return the reviews_map
         """
 
-        logger.info(f'
+        logger.info(f'Start evaluating on dataset {self.dataset_name_or_path}')
 
         reviews_score_all = {}  # {subset_name: (score, num)}
         stage_answers_dict = {}
@@ -433,6 +472,6 @@ class Evaluator(object):
         # Generate report
         report_map = self.dump_report(reviews_score_all)
 
-        logger.info(f'
+        logger.info(f'Evaluation finished on {self.dataset_name_or_path}')
 
         return report_map
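
The new limit handling is the behavioural core of this file's change. The hypothetical helper below restates that branch outside the class so it can be checked in isolation (it is not part of evalscope):

from typing import Optional, Union

def resolve_limit(limit: Optional[Union[int, float]], subset_size: int) -> int:
    # Mirrors the branch added to Evaluator above:
    # None -> whole subset, int -> absolute cap, float -> fraction of the subset.
    if limit is None:
        return subset_size
    if isinstance(limit, int):
        return min(limit, subset_size)
    return min(int(subset_size * limit), subset_size)

assert resolve_limit(None, 200) == 200
assert resolve_limit(50, 200) == 50
assert resolve_limit(0.1, 200) == 20

Together with the new `ignore_errors` flag, a failed `model_adapter.predict` call now drops the affected prompts and keeps evaluating instead of aborting the run.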
evalscope/metrics/__init__.py
CHANGED
@@ -9,7 +9,7 @@ if TYPE_CHECKING:
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
                           weighted_mean)
     from .named_metrics import Metric, metric_registry
-    from .rouge_metric import compute_rouge_score_one_sample_zh
+    from .rouge_metric import compute_rouge_score, compute_rouge_score_one_sample, compute_rouge_score_one_sample_zh
 
 else:
     _import_structure = {
@@ -28,6 +28,8 @@ else:
         ],
         'rouge_metric': [
             'compute_rouge_score_one_sample_zh',
+            'compute_rouge_score',
+            'compute_rouge_score_one_sample',
         ],
         'llm_judge': [
             'LLMJudge',
evalscope/metrics/bundled_rouge_score/rouge_scorer.py
CHANGED
@@ -44,20 +44,25 @@ from evalscope.utils import get_logger
 
 logger = get_logger()
 
-
-
-
-
-
-
-
-
-os.
-os.
-
-
-
-
+
+def check_nltk_data():
+    """
+    Check if nltk data is available in the system.
+    If not, download the necessary data files.
+    """
+    try:
+        nltk_dir = os.path.join(os.path.expanduser('~'), 'nltk_data/tokenizers')
+        os.makedirs(nltk_dir, exist_ok=True)
+        punkt_path = os.path.join(nltk_dir, 'punkt_tab.zip')
+        punkt_tab_url = 'https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/open_data/nltk_data/punkt_tab.zip'
+
+        if not os.path.exists(punkt_path):
+            os.system(f'wget --timeout=10 --tries=3 -P {nltk_dir} {punkt_tab_url}')
+            os.system(f'unzip {punkt_path} -d {nltk_dir}')
+        else:
+            logger.debug(f'{punkt_path} already exists, skipping download')
+    except Exception as e:
+        logger.error(f'Try to download punkt_tab.zip for nltk failed: {e}')
 
 
 class RougeScorer(scoring.BaseScorer):
@@ -83,11 +88,11 @@ class RougeScorer(scoring.BaseScorer):
     """
 
     def __init__(self, rouge_types, use_stemmer=False, split_summaries=False, tokenizer=None):
-
         self.rouge_types = rouge_types
         if tokenizer:
             self._tokenizer = tokenizer
         else:
+            check_nltk_data()
             self._tokenizer = tokenizers.DefaultTokenizer(use_stemmer)
             logging.info('Using default tokenizer.')
 
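
The nltk download is now deferred to `check_nltk_data()` and only triggered when the default tokenizer is requested. A small sketch of the consequence, under the assumptions that the bundled scorer keeps the upstream `score()` API and that any object with a `tokenize()` method is accepted (as `DummyTokenizer` in rouge_metric.py suggests):

from evalscope.metrics.bundled_rouge_score.rouge_scorer import RougeScorer

class WhitespaceTokenizer:
    # Minimal tokenizer: splits on whitespace, no nltk data required.
    def tokenize(self, text):
        return text.split()

# Passing a tokenizer skips check_nltk_data() entirely, so no punkt_tab
# download (and no network access) is attempted.
scorer = RougeScorer(['rouge1', 'rougeL'], tokenizer=WhitespaceTokenizer())
print(scorer.score('the cat sat', 'the cat sat on the mat'))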
evalscope/metrics/llm_judge.py
CHANGED
@@ -22,6 +22,9 @@ B: INCORRECT
 Just return the letters "A" or "B", with no text around it.
 """  # noqa: E501
 
+DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
+DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
+
 
 class LLMJudge:
     """
@@ -47,25 +50,25 @@ class LLMJudge:
             prompt_template (str, optional): Prompt template for the judge
             generation_config (dict, optional): Generation configuration for the judge
         """
-        self.api_key = api_key or os.environ.get('
-        self.api_url = api_url or os.environ.get('
-        self.model_id = model_id or os.environ.get('
+        self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
+        self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
+        self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
-        self.generation_config = generation_config
+        self.generation_config = generation_config or {}
 
         from evalscope.models import ServerModelAdapter
 
         # Initialize ServerModelAdapter
         self.server_adapter = ServerModelAdapter(api_url=self.api_url, model_id=self.model_id, api_key=self.api_key)
 
-    def __call__(self, prompt: str, system_prompt: Optional[str] = None) ->
+    def __call__(self, prompt: str, system_prompt: Optional[str] = None) -> str:
        """
         Args:
             prompt (str): The prompt to evaluate
             system_prompt (str, optional): The system prompt to use for the evaluation
         Returns:
-
+            str: The response from the LLM
         """
         input_data = {'data': [prompt], 'system_prompt': system_prompt or self.system_prompt}
 
@@ -74,6 +77,10 @@ class LLMJudge:
         if self.generation_config:
             infer_cfg.update(self.generation_config)
 
+        if self.model_id == DEFAULT_JUDGE_MODEL:
+            # Disable thinking for the default judge model
+            infer_cfg['enable_thinking'] = self.generation_config.get('enable_thinking', False)
+
         try:
             # Send request using ServerModelAdapter
             response = self.server_adapter.process_single_input(input_data, infer_cfg)
@@ -82,8 +89,8 @@ class LLMJudge:
             llm_response = response.get('choices', [{}])[0].get('message', {}).get('content', '')
             return llm_response
         except Exception as e:
-            logger.error(f'Error during LLM evaluation: {e}')
-            return
+            logger.error(f'Error occurred during {self.model_id}@{self.api_url} LLM judge evaluation: {e}')
+            return ''
 
     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
         if question is None:
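
A sketch of driving the judge directly (constructor defaults and method signatures are inferred from the docstring and assignments above; the question and answers are placeholders, and a ModelScope token is expected in MODELSCOPE_SDK_TOKEN unless api_key is passed):

from evalscope.metrics.llm_judge import LLMJudge

judge = LLMJudge()  # falls back to Qwen/Qwen3-235B-A22B on api-inference.modelscope.cn
prompt = judge.build_prompt(
    pred='The capital of France is Paris.',
    gold='Paris',
    question='What is the capital of France?',
)
verdict = judge(prompt)  # now returns '' instead of None when the request fails
print(verdict)           # expected to be 'A' or 'B' per DEFAULT_PROMPT_TEMPLATE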
evalscope/metrics/math_parser.py
CHANGED
@@ -4,7 +4,7 @@ The logic in this file largely borrows from Qwen2.5-Math codebase at https://git
 # flake8: noqa
 import re
 import regex
-from
+from latex2sympy2_extended import latex2sympy
 from math import isclose
 from sympy import N, simplify
 from sympy.parsing.latex import parse_latex
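
The only change here is the import source for `latex2sympy`; a quick sanity check under that assumption (the sympy call is the same `simplify` the module already imports):

from latex2sympy2_extended import latex2sympy
from sympy import simplify

expr = latex2sympy(r'\frac{3}{6}')
print(simplify(expr))  # 1/2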
evalscope/metrics/rouge_metric.py
CHANGED
@@ -19,10 +19,6 @@ class DummyTokenizer:
         return text.split()
 
 
-scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
-zh_scorer = Rouge()
-
-
 def is_contains_chinese(strs):
     for _char in strs:
         if '\u4e00' <= _char <= '\u9fa5':
@@ -51,6 +47,7 @@ def compute_rouge_score(predict_l, reference_l):
 
 def compute_rouge_score_one_sample_zh(predict, reference):
     result = dict()
+    zh_scorer = Rouge()
     for p, r in zip(predict, reference):
         p = ' '.join(jieba.cut(p)) if is_contains_chinese(p) else p
         r = ' '.join(jieba.cut(r)) if is_contains_chinese(r) else r
@@ -60,21 +57,22 @@ def compute_rouge_score_one_sample_zh(predict, reference):
         except Exception as e:
             logger.warning(f'rouge score error: {p} {r} {e}')
             continue
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
-        result['
+        result['Rouge-1-R'] = score['rouge-1']['r']
+        result['Rouge-1-P'] = score['rouge-1']['p']
+        result['Rouge-1-F'] = score['rouge-1']['f']
+        result['Rouge-2-R'] = score['rouge-2']['r']
+        result['Rouge-2-P'] = score['rouge-2']['p']
+        result['Rouge-2-F'] = score['rouge-2']['f']
+        result['Rouge-L-R'] = score['rouge-l']['r']
+        result['Rouge-L-P'] = score['rouge-l']['p']
+        result['Rouge-L-F'] = score['rouge-l']['f']
 
     return result
 
 
 def compute_rouge_score_one_sample(predict, reference):
     result = dict()
+    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], tokenizer=DummyTokenizer())
     for p, r in zip(predict, reference):
         try:
             score = scorer.score(p, r)