evalscope 0.16.2__py3-none-any.whl → 0.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of evalscope has been flagged as possibly problematic by the registry scanner.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +20 -5
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +42 -29
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +1 -1
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +1 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/evaluator/evaluator.py +15 -12
- evalscope/metrics/__init__.py +6 -0
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +68 -180
- evalscope/metrics/llm_judge.py +105 -20
- evalscope/metrics/metrics.py +1 -1
- evalscope/models/adapters/base_adapter.py +0 -2
- evalscope/models/adapters/server_adapter.py +2 -2
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/perf/arguments.py +2 -16
- evalscope/perf/main.py +1 -1
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +1 -1
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +45 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/METADATA +55 -26
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/RECORD +90 -101
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +50 -2
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +13 -37
- tests/perf/test_perf.py +2 -2
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/LICENSE +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/WHEEL +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.2.dist-info → evalscope-0.17.0.dist-info}/top_level.txt +0 -0
@@ -25,13 +25,21 @@ logger = get_logger()
         prompt_template='请回答问题\n{query}',
     )
 class GeneralQAAdapter(DataAdapter):
-    # TODO: set few_shot_num
 
     def __init__(self, **kwargs):
-
         super().__init__(**kwargs)
 
     def load(self, dataset_name_or_path: str = None, subset_list: list = None, **kwargs) -> dict:
+        """
+        Load dataset from the given path or dataset name.
+
+        Args:
+            dataset_name_or_path (str): Path to dataset directory or file.
+            subset_list (list): List of subset names to load.
+
+        Returns:
+            dict: Loaded dataset organized by subset.
+        """
         dataset_name_or_path = dataset_name_or_path or self.dataset_id
         subset_list = subset_list or self.subset_list
 
@@ -61,58 +69,64 @@ class GeneralQAAdapter(DataAdapter):
 
     def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
         """
+        Generate prompt for the model based on input data.
+
         Args:
-            input_d:
-
-
+            input_d (dict): Input data dictionary.
+            subset_name (str): Name of the subset.
+            few_shot_list (list): List of few-shot examples.
 
         Returns:
-
-
+            dict: Dictionary containing the generated prompt.
         """
-
-        history = input_d.get('history', [])  # history: [['q1', 'a1'], ['q2', 'a2'], ...]
-        if len(history) > 0:
-            logger.warning('The history is not included in the prompt for GeneralQA. \
-                To be supported in the future.')
-
+        messages = input_d.get('messages')
         query = input_d.get('question', '') or input_d.get('query', '')
         system_prompt = input_d.get('system')
         prompt = self.prompt_template.format(query=query)
-        return self.gen_prompt_data(prompt, system_prompt=system_prompt)
+        return self.gen_prompt_data(prompt, system_prompt=system_prompt, messages=messages)
 
     def get_gold_answer(self, input_d: dict) -> str:
         """
+        Extract the gold (reference) answer from the input data.
+
         Args:
-            input_d
+            input_d (dict): Input data dictionary.
 
         Returns:
-
-
+            str: Gold answer string.
         """
-        return input_d.get('answer'
+        return input_d.get('answer') or input_d.get('response')
 
     def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
         """
+        Parse the prediction result.
+
         Args:
-            result:
+            result (str): Model prediction result.
+            raw_input_d (dict, optional): Original input data.
+            eval_type (str): Evaluation type.
 
         Returns:
-
-
+            str: Parsed prediction result.
         """
         return result
 
     def match(self, gold: str, pred: str) -> dict:
         """
+        Compute metric scores between gold and predicted answers.
+
         Args:
-            gold:
-            pred:
+            gold (str): Gold answer.
+            pred (str): Predicted answer.
 
         Returns:
-
-
+            dict: Dictionary of computed metric scores.
         """
+        # reference free metrics
+        if gold is None:
+            return {'AverageAccuracy': -1}
+
+        # calculate rouge and bleu scores
         res = dict()
         if 'AverageRouge' in self.metric_list:
             from evalscope.metrics.rouge_metric import compute_rouge_score_one_sample_zh
@@ -128,14 +142,13 @@ class GeneralQAAdapter(DataAdapter):
 
     def compute_metric(self, review_res_list: Union[List[dict], List[List[dict]]], **kwargs) -> List[dict]:
         """
-
+        Compute weighted mean of the metric scores for all samples.
 
         Args:
-            review_res_list:
+            review_res_list (list): List of metric score dictionaries.
 
         Returns:
-
-
+            list: List of dictionaries with averaged metric results.
         """
         items = super().compute_dict_metric(review_res_list, **kwargs)
         return [{'metric_name': k, 'score': mean(v), 'num': len(v)} for k, v in items.items()]
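Taken together, the GeneralQAAdapter hunks above change the dataset contract: chat-style messages are now forwarded to gen_prompt_data, the legacy history field is dropped, the gold answer may come from either 'answer' or 'response', and records with no gold at all short-circuit match with a reference-free placeholder score. Below is a minimal standalone sketch of how such a record is consumed; it only mirrors the input_d.get(...) calls visible in the diff, and the record itself is hypothetical, not part of evalscope.

# Standalone sketch, not the evalscope API: mirrors the field access shown in
# the hunks above on a hypothetical general_qa record.
record = {
    'query': 'What is the capital of France?',  # 'question' is also accepted
    'system': 'You are a concise assistant.',   # optional system prompt
    'messages': None,                           # optional chat messages, now passed through
    'response': 'Paris',                        # gold answer; 'answer' works too
}

prompt_template = '请回答问题\n{query}'
query = record.get('question', '') or record.get('query', '')
prompt = prompt_template.format(query=query)    # template with the query substituted
gold = record.get('answer') or record.get('response')

print(prompt)
print('gold:', gold)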
@@ -6,9 +6,9 @@ import re
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser
 
 # flake8: noqa
 
@@ -2,7 +2,6 @@ from collections import defaultdict
 from typing import Any, Dict, List
 
 from evalscope.benchmarks import Benchmark, DataAdapter
-from evalscope.benchmarks.ifeval.utils import process_results
 from evalscope.constants import EvalType
 from evalscope.metrics import Metric, mean, metric_registry
 
@@ -43,10 +42,9 @@ class IFEvalAdapter(DataAdapter):
     def get_gold_answer(self, input_d: dict) -> str:
         return input_d
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
-        return result
-
     def match(self, gold: Any, pred: Any) -> Dict:
+        from evalscope.benchmarks.ifeval.utils import process_results
+
         return process_results(gold, [pred])
 
     def compute_metric(self, review_res_list: List[dict], **kwargs) -> Any:
@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 
 @Benchmark.register(
@@ -69,12 +69,6 @@ class LiveCodeBenchAdapter(DataAdapter):
         # Extract the gold answer from the input dict.
         return input_d
 
-    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
-        """
-        Parse the model output to get the answer. Could be the best choice index.
-        """
-        return result
-
     def match(self, gold: dict, pred: str) -> float:
         from .evaluate_utils import codegen_metrics
         from .extract_utils import extract_code_generation
@@ -3,7 +3,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 SUBSET_LIST = ['default']
 
@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger
 
 # flake8: noqa
@@ -4,7 +4,7 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 SUBSET_LIST = [
     'computer science', 'math', 'chemistry', 'engineering', 'law', 'biology', 'health', 'physics', 'business',
@@ -4,8 +4,8 @@ from typing import Any, Dict
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import ResponseParser
 
 logger = get_logger()
 
@@ -4,7 +4,7 @@ from typing import Any
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 
 @Benchmark.register(
@@ -5,7 +5,7 @@ import os
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 from evalscope.utils.io_utils import jsonl_to_list
 from evalscope.utils.logger import get_logger
 
@@ -96,13 +96,16 @@ class TriviaQaAdapter(DataAdapter):
         def get_sys_prompt(inp: dict) -> str:
             return inp['input'][0]['content']
 
-
+        if self.few_shot_num > 0:
+            sys_prompt = get_sys_prompt(input_d)
+        else:
+            sys_prompt = None
         few_shot_prompts = [self._generate_prompt(input_d=sample, include_answer=True) for sample in few_shot_list]
-        context
+        context = '\n'.join(few_shot_prompts) + '\n'
         context += self._generate_prompt(input_d=input_d, include_answer=False)
         full_prompt = context
 
-        return self.gen_prompt_data(full_prompt)
+        return self.gen_prompt_data(full_prompt, system_prompt=sys_prompt)
 
     def get_gold_answer(self, input_d: dict) -> list:
         # Get the gold choice
@@ -124,7 +127,9 @@ class TriviaQaAdapter(DataAdapter):
         return result
 
     def match(self, gold: list, pred: str) -> float:
-
+        lower_pred = pred.lower()
+        gold = [g.lower() for g in gold]
+        is_correct = any([cand in lower_pred for cand in gold])
         return 1 if is_correct else 0
 
     @classmethod
evalscope/benchmarks/utils.py CHANGED
@@ -1,7 +1,7 @@
 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.constants import EvalType, OutputType
 from evalscope.metrics import exact_match
-from evalscope.
+from evalscope.metrics.completion_parsers import ResponseParser
 
 
 @Benchmark.register(
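This import swap recurs across the adapter hunks above: ResponseParser now lives in evalscope.metrics.completion_parsers (the file list shows evalscope/utils/utils.py being folded into evalscope/metrics/completion_parsers.py). Downstream code that imported it from the old location would be updated the same way; a minimal before/after, assuming evalscope 0.17.0 is installed:

# Old location (0.16.2), removed in this release:
#   from evalscope.utils.utils import ResponseParser
# New location (0.17.0), as used throughout the adapter hunks above:
from evalscope.metrics.completion_parsers import ResponseParser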
evalscope/config.py CHANGED
@@ -1,7 +1,6 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 
 import copy
-import json
 import os
 from argparse import Namespace
 from dataclasses import dataclass, field
@@ -10,18 +9,15 @@ from typing import Dict, List, Optional, Union
 from evalscope.constants import (DEFAULT_DATASET_CACHE_DIR, DEFAULT_WORK_DIR, EvalBackend, EvalStage, EvalType, HubType,
                                  JudgeStrategy, ModelTask, OutputType)
 from evalscope.models import CustomModel, DummyCustomModel
-from evalscope.utils import
-from evalscope.utils.io_utils import dict_to_yaml,
+from evalscope.utils.argument_utils import BaseArgument, parse_int_or_float
+from evalscope.utils.io_utils import dict_to_yaml, gen_hash
 from evalscope.utils.logger import get_logger
-from evalscope.utils.utils import parse_int_or_float
 
 logger = get_logger()
 
-cur_path = os.path.dirname(os.path.abspath(__file__))
-
 
 @dataclass
-class TaskConfig:
+class TaskConfig(BaseArgument):
     # Model-related arguments
     model: Union[str, 'CustomModel', None] = None
     model_id: Optional[str] = None
@@ -132,15 +128,6 @@ class TaskConfig:
         'precision': 'torch.float16',
     }
 
-    def to_dict(self):
-        result = self.__dict__.copy()
-        if isinstance(self.model, CustomModel):
-            result['model'] = self.model.__class__.__name__
-        return result
-
-    def __str__(self):
-        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)
-
     def update(self, other: Union['TaskConfig', dict]):
         if isinstance(other, TaskConfig):
             other = other.to_dict()
@@ -155,91 +142,11 @@ class TaskConfig:
         except Exception as e:
             logger.warning(f'Failed to dump overall task config: {e}')
 
-
-
-
-
-
-    def from_yaml(yaml_file: str):
-        return TaskConfig.from_dict(yaml_to_dict(yaml_file))
-
-    @staticmethod
-    def from_dict(d: dict):
-        return TaskConfig(**d)
-
-    @staticmethod
-    def from_json(json_file: str):
-        return TaskConfig.from_dict(json_to_dict(json_file))
-
-    @staticmethod
-    def from_args(args: Namespace):
-        # Convert Namespace to a dictionary and filter out None values
-        args_dict = {k: v for k, v in vars(args).items() if v is not None}
-
-        if 'func' in args_dict:
-            del args_dict['func']  # Note: compat CLI arguments
-
-        return TaskConfig.from_dict(args_dict)
-
-    @staticmethod
-    def load(custom_model: CustomModel, tasks: List[str]) -> List['TaskConfig']:
-        res_list = []
-        for task_name in tasks:
-            task = registry_tasks.get(task_name, None)
-            if task is None:
-                logger.error(f'No task found in tasks: {list(registry_tasks.keys())}, got task_name: {task_name}')
-                continue
-
-            task.model = custom_model
-            task.model_args = custom_model.config
-            task.model_id = type(custom_model).__name__
-            res_list.append(task)
-
-        return res_list
-
-    @staticmethod
-    def registry(name: str, data_pattern: str, dataset_dir: str = None, subset_list: list = None) -> None:
-        """
-        Register a new task (dataset) for evaluation.
-
-        Args:
-            name: str, the dataset name.
-            data_pattern: str, the data pattern for the task.
-                e.g. `mmlu`, `ceval`, `gsm8k`, ...
-                refer to task_config.list() for all available datasets.
-            dataset_dir: str, the directory to store multiple datasets files. e.g. /path/to/data,
-                then your specific custom dataset directory will be /path/to/data/{name}
-            subset_list: list, the subset list for the dataset.
-                e.g. ['middle_school_politics', 'operating_system']
-                refer to the mmlu for example. https://github.com/hendrycks/test/blob/master/categories.py
-        """
-        available_datasets = list(registry_tasks.keys())
-        if data_pattern not in available_datasets:
-            logger.error(
-                f'No dataset found in available datasets: {available_datasets}, got data_pattern: {data_pattern}')
-            return
-
-        # Reuse the existing task config and update the datasets
-        pattern_config = registry_tasks[data_pattern]
-
-        custom_config = copy.deepcopy(pattern_config)
-        custom_config.datasets = [data_pattern]
-        custom_config.dataset_args = {data_pattern: {}}
-        custom_config.eval_type = EvalType.CHECKPOINT
-
-        if dataset_dir is not None:
-            custom_config.dataset_args[data_pattern].update({'local_path': dataset_dir})
-
-        if subset_list is not None:
-            custom_config.dataset_args[data_pattern].update({'subset_list': subset_list})
-
-        registry_tasks.update({name: custom_config})
-        logger.info(f'** Registered task: {name} with data pattern: {data_pattern}')
-
-
-tasks = ['arc', 'gsm8k', 'mmlu', 'cmmlu', 'ceval', 'bbh', 'general_qa']
-
-registry_tasks = {task: TaskConfig.from_yaml(os.path.join(cur_path, f'registry/tasks/{task}.yaml')) for task in tasks}
+    def to_dict(self):
+        result = self.__dict__.copy()
+        if isinstance(self.model, CustomModel):
+            result['model'] = self.model.__class__.__name__
+        return result
 
 
 def parse_task_config(task_cfg) -> TaskConfig:
@@ -264,25 +171,3 @@ def parse_task_config(task_cfg) -> TaskConfig:
     else:
         raise ValueError('Args: Please provide a valid task config.')
     return task_cfg
-
-
-class TempModel(CustomModel):
-
-    def __init__(self, config: dict):
-        super().__init__(config=config)
-
-    def predict(self, prompts: str, **kwargs):
-        return [item + ': response' for item in prompts]
-
-
-if __name__ == '__main__':
-    model = TempModel(config={'model_id': 'test-swift-dummy-model'})
-    task_config = TaskConfig()
-
-    # Register a new task
-    TaskConfig.registry(name='arc_swift', data_pattern='arc', dataset_dir='/path/to/swift_custom_work')
-
-    swift_eval_task: List[TaskConfig] = TaskConfig.load(custom_model=model, tasks=['gsm8k', 'arc', 'arc_swift'])
-    for item in swift_eval_task:
-        print(item)
-    print()
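The config.py hunks delete TaskConfig's ad-hoc constructors and serialization helpers (from_yaml, from_dict, from_json, from_args, __str__) and make the class inherit from a new BaseArgument imported from evalscope.utils.argument_utils, a module added in this release (+64 lines). The real base class is not shown in this diff; the sketch below only mirrors the methods deleted here and is hypothetical, not evalscope's implementation.

# Hypothetical reconstruction of a BaseArgument-style mixin, based solely on
# the TaskConfig methods removed in the hunks above. The actual
# evalscope.utils.argument_utils.BaseArgument may differ.
import json
from argparse import Namespace


class BaseArgumentSketch:

    @classmethod
    def from_dict(cls, d: dict):
        return cls(**d)

    @classmethod
    def from_json(cls, json_file: str):
        with open(json_file, 'r', encoding='utf-8') as f:
            return cls.from_dict(json.load(f))

    @classmethod
    def from_args(cls, args: Namespace):
        # Drop None values and the CLI dispatch entry, as the deleted code did.
        args_dict = {k: v for k, v in vars(args).items() if v is not None}
        args_dict.pop('func', None)
        return cls.from_dict(args_dict)

    def to_dict(self) -> dict:
        return self.__dict__.copy()

    def __str__(self) -> str:
        return json.dumps(self.to_dict(), indent=4, default=str, ensure_ascii=False)

Note that the removed TaskConfig.registry/TaskConfig.load machinery is dropped outright rather than relocated, matching the deletion of the evalscope/registry YAML task files in the file list above.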
evalscope/evaluator/evaluator.py CHANGED
@@ -14,9 +14,9 @@ from evalscope.config import TaskConfig
 from evalscope.constants import AnswerKeys, DumpMode, EvalStage, EvalType, JudgeStrategy, ReviewKeys
 from evalscope.models import BaseModelAdapter
 from evalscope.report import Report, gen_table
-from evalscope.utils import
-from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, jsonl_to_list
+from evalscope.utils.io_utils import OutputsStructure, dump_jsonl_data, gen_hash, jsonl_to_list
 from evalscope.utils.logger import get_logger
+from evalscope.utils.model_utils import dict_torch_dtype_to_str
 
 logger = get_logger()
 
@@ -237,9 +237,10 @@ class Evaluator(object):
         if use_llm:
             # Use LLM as judge
             assert self.judge is not None, f'Judge model is required for LLM judging {self.data_adapter.name}'
+            pred_content = self.data_adapter.llm_parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             review_result = self.data_adapter.llm_match(
-                gold_content,
-                pred = answer_content
+                gold_content, pred_content, self.judge, raw_input=raw_input_d)
         else:
             # Use rule-based judging
             pred_content = self.data_adapter.parse_pred_result(
@@ -250,15 +251,14 @@ class Evaluator(object):
         if (self.task_cfg.judge_strategy == JudgeStrategy.LLM_RECALL
                 and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
             assert self.judge is not None, f'Judge model is required for LLM_RECALL strategy {self.data_adapter.name}'  # noqa: E501
+            pred_content = self.data_adapter.llm_parse_pred_result(
+                result=answer_content, raw_input_d=raw_input_d, eval_type=self.eval_type)
             review_result = self.data_adapter.llm_match(
-                gold_content,
-                pred = answer_content
-        else:
-            pred = pred_content
+                gold_content, pred_content, self.judge, raw_input=raw_input_d)
 
         choice[ReviewKeys.REVIEW] = {
             ReviewKeys.GOLD: gold_content if gold_content != raw_input_d else '*Same as Input*',
-            ReviewKeys.PRED:
+            ReviewKeys.PRED: pred_content,
             ReviewKeys.RESULT: review_result
         }
         rev_choices.append(choice)
@@ -394,9 +394,6 @@ class Evaluator(object):
         report_map: Report = self.data_adapter.gen_report(
             subset_score_map=reviews_score_all, model_name=self.model_name)
 
-        # Post process report
-        self.data_adapter.post_process_report(report_map, report_path=report_path)
-
         # Make table
         try:
             report_table = gen_table(report_list=[report_map], add_overall_metric=True)
@@ -418,6 +415,12 @@ class Evaluator(object):
         report_map.to_json(report_file)
         logger.info(f'Dump report to: {report_file} \n')
 
+        # Post process report
+        try:
+            self.data_adapter.post_process_report(report_map, report_path=report_path)
+        except Exception as e:
+            logger.error(f'Failed to post process report: {e}')
+
         return report_map
 
     def eval(self, **kwargs) -> dict:
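The three review-related evaluator.py hunks are easier to read as one flow: the LLM-judge path now runs llm_parse_pred_result before llm_match, the rule-based path keeps parse_pred_result/match, the LLM_RECALL strategy re-judges with the LLM only when the rule-based result is falsy, and the parsed pred_content is what gets recorded under ReviewKeys.PRED. Below is a condensed, illustrative sketch of that control flow; method names and keyword arguments come from the diff, while the surrounding plumbing not shown in the diff is simplified.

# Condensed, illustrative sketch of the review flow after these hunks
# (not the actual Evaluator method).
def review_sample(adapter, judge, judge_strategy, use_llm,
                  answer_content, gold_content, raw_input_d, eval_type):
    if use_llm:
        # LLM-as-judge: parse first, then ask the judge model.
        pred_content = adapter.llm_parse_pred_result(
            result=answer_content, raw_input_d=raw_input_d, eval_type=eval_type)
        review_result = adapter.llm_match(
            gold_content, pred_content, judge, raw_input=raw_input_d)
    else:
        # Rule-based first.
        pred_content = adapter.parse_pred_result(
            result=answer_content, raw_input_d=raw_input_d, eval_type=eval_type)
        review_result = adapter.match(gold_content, pred_content)
        # LLM_RECALL: fall back to the judge only when the rule says "wrong".
        if (judge_strategy == 'llm_recall'
                and isinstance(review_result, (bool, int, float)) and not bool(review_result)):
            pred_content = adapter.llm_parse_pred_result(
                result=answer_content, raw_input_d=raw_input_d, eval_type=eval_type)
            review_result = adapter.llm_match(
                gold_content, pred_content, judge, raw_input=raw_input_d)
    return pred_content, review_result

The report hunks make a related ordering change: post_process_report now runs after the report JSON is dumped and inside a try/except, so a failing post-process step no longer aborts report generation.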
evalscope/metrics/__init__.py CHANGED
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
+    from .completion_parsers import ResponseParser, lmsys_parser, ranking_parser
     from .llm_judge import LLMJudge
     from .math_parser import extract_answer, math_equal, strip_answer_string
     from .metrics import (bleu_ngram_one_sample, exact_match, macro_mean, mean, micro_mean, simple_f1_score,
@@ -39,6 +40,11 @@ else:
             'math_equal',
             'strip_answer_string',
         ],
+        'completion_parsers': [
+            'ResponseParser',
+            'lmsys_parser',
+            'ranking_parser',
+        ],
     }
 
     import sys