evalscope 0.15.1__py3-none-any.whl → 0.16.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of evalscope has been flagged as potentially problematic.
- evalscope/app/__init__.py +28 -0
- evalscope/{report → app}/app.py +67 -59
- evalscope/app/constants.py +21 -0
- evalscope/arguments.py +12 -1
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/cmteb/arguments.py +4 -1
- evalscope/backend/rag_eval/cmteb/task_template.py +19 -3
- evalscope/backend/rag_eval/cmteb/tasks/CustomTask.py +1 -1
- evalscope/backend/rag_eval/utils/embedding.py +75 -35
- evalscope/backend/rag_eval/utils/llm.py +1 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +0 -6
- evalscope/benchmarks/benchmark.py +1 -0
- evalscope/benchmarks/chinese_simple_qa/csimple_qa_adapter.py +1 -0
- evalscope/benchmarks/data_adapter.py +101 -18
- evalscope/benchmarks/docmath/__init__.py +0 -0
- evalscope/benchmarks/docmath/docmath_adapter.py +84 -0
- evalscope/benchmarks/docmath/utils.py +220 -0
- evalscope/benchmarks/drop/__init__.py +0 -0
- evalscope/benchmarks/drop/drop_adapter.py +133 -0
- evalscope/benchmarks/drop/utils.py +59 -0
- evalscope/benchmarks/frames/__init__.py +0 -0
- evalscope/benchmarks/frames/frames_adapter.py +90 -0
- evalscope/benchmarks/frames/utils.py +37 -0
- evalscope/benchmarks/general_qa/general_qa_adapter.py +5 -1
- evalscope/benchmarks/needle_haystack/__init__.py +0 -0
- evalscope/benchmarks/needle_haystack/needle_haystack_adapter.py +341 -0
- evalscope/benchmarks/needle_haystack/utils.py +79 -0
- evalscope/benchmarks/simple_qa/simple_qa_adapter.py +1 -0
- evalscope/benchmarks/tool_bench/__init__.py +0 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +70 -0
- evalscope/benchmarks/tool_bench/utils.py +203 -0
- evalscope/benchmarks/utils.py +28 -2
- evalscope/benchmarks/winogrande/__init__.py +0 -0
- evalscope/benchmarks/winogrande/winogrande_adapter.py +57 -0
- evalscope/cli/start_app.py +2 -2
- evalscope/collections/__init__.py +35 -3
- evalscope/collections/evaluator.py +94 -32
- evalscope/config.py +54 -17
- evalscope/evaluator/evaluator.py +80 -41
- evalscope/metrics/__init__.py +3 -1
- evalscope/metrics/bundled_rouge_score/rouge_scorer.py +20 -15
- evalscope/metrics/llm_judge.py +15 -8
- evalscope/metrics/math_parser.py +1 -1
- evalscope/metrics/rouge_metric.py +11 -13
- evalscope/models/adapters/chat_adapter.py +51 -34
- evalscope/models/adapters/server_adapter.py +17 -25
- evalscope/perf/arguments.py +16 -7
- evalscope/perf/benchmark.py +0 -15
- evalscope/perf/main.py +72 -15
- evalscope/perf/plugin/datasets/custom.py +15 -0
- evalscope/perf/utils/benchmark_util.py +34 -16
- evalscope/perf/utils/db_util.py +25 -15
- evalscope/perf/utils/local_server.py +1 -0
- evalscope/perf/utils/log_utils.py +12 -5
- evalscope/perf/utils/rich_display.py +186 -0
- evalscope/report/__init__.py +36 -4
- evalscope/report/combinator.py +8 -0
- evalscope/report/generator.py +33 -9
- evalscope/report/utils.py +61 -4
- evalscope/run.py +12 -0
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/utils/deprecation_utils.py +42 -0
- evalscope/utils/logger.py +1 -1
- evalscope/utils/utils.py +12 -0
- evalscope/version.py +2 -2
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/METADATA +57 -31
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/RECORD +78 -57
- tests/aigc/test_t2i.py +40 -3
- tests/cli/test_all.py +39 -32
- tests/cli/test_collection.py +8 -6
- tests/cli/test_run.py +43 -17
- tests/perf/test_perf.py +23 -0
- tests/rag/test_mteb.py +5 -5
- /evalscope/{report/app_arguments.py → app/arguments.py} +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/LICENSE +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/WHEEL +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.15.1.dist-info → evalscope-0.16.1.dist-info}/top_level.txt +0 -0
evalscope/benchmarks/tool_bench/tool_bench_adapter.py ADDED
@@ -0,0 +1,70 @@
from typing import Dict, List

from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import Metric, mean, metric_registry


@Benchmark.register(
    name='tool_bench',
    pretty_name='ToolBench-Static',
    dataset_id='AI-ModelScope/ToolBench-Static',
    subset_list=['in_domain', 'out_of_domain'],
    metric_list=['Act.EM', 'Plan.EM', 'F1', 'HalluRate', 'Rouge-L'],
    few_shot_num=0,
    train_split=None,
    eval_split='test',
)
class ToolBenchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        metric_registry.register(Metric(name='Rouge-L', object=mean))
        metric_registry.register(Metric(name='Act.EM', object=mean))
        metric_registry.register(Metric(name='Plan.EM', object=mean))
        metric_registry.register(Metric(name='F1', object=mean))
        metric_registry.register(Metric(name='HalluRate', object=mean))

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.
        """
        messages = input_d['messages']
        # use prepared messages and remove the name field
        for message in messages:
            if 'name' in message:
                del message['name']
        return self.gen_prompt_data(prompt='', messages=messages)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        return input_d

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.
        """
        return result

    def match(self, gold: dict, pred: str) -> Dict:
        """
        Match the gold answer and the predicted answer.
        """
        from .utils import calculate_metrics

        data = {
            'target': gold['target'],
            'predictions': pred,
            'tools': gold['tools'],
        }
        metrics = calculate_metrics(data)
        return metrics

    def compute_metric(self, review_res_list: List[dict], **kwargs) -> Dict:
        # aggregate review results
        res_dict = super().compute_dict_metric(review_res_list, **kwargs)

        return super().compute_metric(res_dict, **kwargs)
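For context, the sketch below shows one plausible way to smoke-test the newly registered `tool_bench` dataset through evalscope's task API. The model name, endpoint, and the `api_url`/`api_key` fields are placeholders and assumptions, not taken from this diff.

# Hedged sketch: exercising the new `tool_bench` benchmark via the task API.
# Endpoint, key and model name are placeholders; api_url/api_key are assumed fields.
from evalscope.config import TaskConfig
from evalscope.constants import EvalType
from evalscope.run import run_task

task_cfg = TaskConfig(
    model='my-model',                      # placeholder model identifier
    eval_type=EvalType.SERVICE,            # evaluate an OpenAI-compatible service
    api_url='http://127.0.0.1:8000/v1',    # placeholder endpoint (assumed field)
    api_key='EMPTY',                       # placeholder key (assumed field)
    datasets=['tool_bench'],               # name registered by ToolBenchAdapter above
    limit=10,                              # small smoke test
)
run_task(task_cfg=task_cfg)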
evalscope/benchmarks/tool_bench/utils.py ADDED
@@ -0,0 +1,203 @@
import json

from evalscope.metrics import compute_rouge_score_one_sample


def evaluate_rougel(cand_list: list, ref_list: list):
    if len(ref_list) == 0:
        return 0
    rouge_score = compute_rouge_score_one_sample(cand_list, ref_list)
    rougel = rouge_score.get('rouge-l-f', 0)

    return rougel


def evaluate_action_em(cand_list: list, ref_list: list):
    if len(ref_list) == 0:
        return 0
    em = 0
    for cand, ref in zip(cand_list, ref_list):
        em += (1 if cand == ref else 0)
    return em / len(cand_list)


def evaluate_action_input_f1(action_pred: list, action_ref: list, cand_list: list, ref_list: list):
    easy_f1 = []
    hard_f1 = []
    f1 = []
    for i in range(len(action_pred)):
        ref_action = action_ref[i]
        pred_action = action_pred[i]

        ref_input = ref_list[i]
        cand_input = cand_list[i]

        if ref_action != pred_action:
            easy_f1.append(0)
            hard_f1.append(0)
            f1.append(0)
        else:
            try:
                ref_input_json = json.loads(ref_input)
                try:
                    cand_input_json = json.loads(cand_input)
                    half_match = 0
                    full_match = 0
                    if ref_input_json == {}:
                        if cand_input_json == {}:
                            easy_f1.append(1)
                            f1.append(1)
                        else:
                            easy_f1.append(0)
                            f1.append(0)
                    else:
                        for k, v in ref_input_json.items():
                            if k in cand_input_json.keys():
                                if cand_input_json[k] == v:
                                    full_match += 1
                                else:
                                    half_match += 1

                        recall = (0.5 * half_match + full_match) / (len(ref_input_json) + 1e-30)
                        precision = (0.5 * half_match + full_match) / (len(cand_input_json) + 1e-30)
                        hard_f1.append((2 * recall * precision) / (recall + precision))
                        f1.append((2 * recall * precision) / (recall + precision))
                except Exception:
                    # cand_input = cand_input.replace("\n","").replace("\"","")
                    # ref_input = cand_input.replace("\n","").replace("\"","")
                    # rouge = Rouge()
                    # rouge_score = rouge.get_scores(hyps=[cand_input], refs=[ref_input], avg=True)
                    if ref_input_json == {}:
                        easy_f1.append(0)
                    else:
                        hard_f1.append(0)
                        # hard_f1.append(rouge_score["rouge-l"]["f"])
                        # f1.append(rouge_score["rouge-l"]["f"])
                    f1.append(0)
            except Exception:
                pass

    # If a list is empty, fall back to 0
    easy_f1_avg = sum(easy_f1) / len(easy_f1) if easy_f1 else 0
    hard_f1_avg = sum(hard_f1) / len(hard_f1) if hard_f1 else 0
    f1_avg = sum(f1) / len(f1) if f1 else 0

    return easy_f1_avg, hard_f1_avg, f1_avg


def parse_action(text):
    action = 'None'
    action_input = '{}'
    if 'Action Input:' in text:
        input_idx = text.rindex('Action Input:')
        action_input = text[input_idx + len('Action Input:'):].strip()
    else:
        action_input = '{}'

    if 'Action:' in text:
        action_idx = text.rindex('Action:')
        action = text[action_idx + len('Action:'):].strip()
        if 'Action Input:' in action:
            input_idx = action.index('Action Input:')
            action = action[:input_idx].strip()
    else:
        action = 'none'
    return action, action_input


def parse_output(text):
    action, action_input = parse_action(text)
    if action == 'Finish':
        try:
            action_input = json.loads(action_input)
            # print(action_input)
            # print(json.dumps(action_input,indent=2))
            return_type = action_input['return_type']
            if return_type == 'give_answer':
                if 'final_answer' in action_input.keys():
                    answer = str(action_input['final_answer'])
                    if answer.strip() in ['', '.', ',']:
                        answer = 'None'
                else:
                    answer = 'None'
                return 'finish', action, action_input, answer
            else:
                return 'give up', None, None, None
        except Exception:
            return 'give up', None, None, None
    else:
        plan = 'call'
        answer = None
        return plan, action, action_input, answer


def calculate_metrics(data):
    """
    Calculate the metrics for the given data.
    """
    plan_ref = []
    plan_pred = []
    hallu_cases = []
    answer_ref = []
    action_ref = []
    action_input_ref = []
    answer_pred = []
    action_pred = []
    action_input_pred = []
    hallu_pred = 0

    reference = data['target']
    prediction = data['predictions']
    ref_plan, ref_action, ref_input, ref_ans = parse_output(reference)
    # ref_plan: call
    # ref_action: spott
    # ref_input: {"is_id": "city center" }
    # ref_ans: None

    pred_plan, pred_action, pred_input, pred_ans = parse_output(prediction)
    if ref_action is not None and ref_action == 'invalid_hallucination_function_name':
        return {}
    if pred_action is not None and ref_action != 'none' and ref_action not in [t['name'] for t in data['tools']]:
        return {}

    if pred_action is not None and pred_action != 'none' and pred_action not in [t['name'] for t in data['tools']]:
        hallu_pred += 1
        hallu_cases.append(data)

    plan_ref.append(ref_plan)
    plan_pred.append(pred_plan)
    if ref_plan == 'give up':
        pass
    elif ref_plan == 'finish':
        answer_ref.append(ref_ans)
        if pred_ans is None:
            answer_pred.append('none')
        else:
            answer_pred.append(pred_ans)
    else:
        action_ref.append(ref_action)
        action_input_ref.append(ref_input)
        if pred_action is None:
            action_pred.append('none')
        else:
            action_pred.append(pred_action)

        if pred_input is None:
            action_input_pred.append('{}')
        else:
            action_input_pred.append(pred_input)

    metric = {}
    rouge = evaluate_rougel(answer_pred, answer_ref)
    plan_em = evaluate_action_em(cand_list=plan_pred, ref_list=plan_ref)
    action_em = evaluate_action_em(cand_list=action_pred, ref_list=action_ref)
    easy_f1, hard_f1, f1 = evaluate_action_input_f1(action_pred, action_ref, action_input_pred, action_input_ref)
    hallu_rate = hallu_pred
    metric['Act.EM'] = action_em
    metric['F1'] = f1
    metric['HalluRate'] = hallu_rate
    metric['plan_em'] = plan_em
    metric['Easy_F1'] = easy_f1
    metric['Hard_F1'] = hard_f1
    metric['Rouge-L'] = rouge
    return metric
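The ToolBench utilities above parse a ReAct-style `Action:` / `Action Input:` trace and score it. Below is a minimal, hedged sketch of how one (reference, prediction) pair flows through `parse_output` and `calculate_metrics`; the sample texts and tool list are made up for illustration.

# Hedged sketch: scoring one hypothetical ToolBench-style sample with the
# helpers defined in evalscope/benchmarks/tool_bench/utils.py.
from evalscope.benchmarks.tool_bench.utils import calculate_metrics, parse_output

reference = 'Action: Finish\nAction Input: {"return_type": "give_answer", "final_answer": "42"}'
prediction = 'Action: Finish\nAction Input: {"return_type": "give_answer", "final_answer": "42"}'

plan, action, action_input, answer = parse_output(prediction)
# plan == 'finish', action == 'Finish', answer == '42'

sample = {
    'target': reference,
    'predictions': prediction,
    'tools': [{'name': 'Finish'}],   # hypothetical tool list for this sample
}
# Deterministic parts: plan_em == 1.0, HalluRate == 0; Act.EM and F1 are 0 here
# because a 'finish' reference contributes no action/argument pairs to compare.
print(calculate_metrics(sample))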
evalscope/benchmarks/utils.py CHANGED
@@ -13,6 +13,7 @@ class PromptData:
     system_prompt: Optional[str] = None
     multi_choices: Optional[List[str]] = None
     id: Optional[str] = None
+    messages: Optional[List[dict]] = None

     def to_dict(self) -> Dict:
         return {k: v for k, v in asdict(self).items() if v is not None}
@@ -21,7 +22,7 @@ class PromptData:
 def preprocess_decorator(func):

     @wraps(func)
-    def wrapper(self, result: str, raw_input_d: dict = None, …
+    def wrapper(self, result: str, raw_input_d: dict = None, **kwargs):
         if result is None:
             result = ''
         filters = self.config_kwargs.get('filters', None)
@@ -29,6 +30,31 @@ def preprocess_decorator(func):
         # Apply filters to the result
         for filter_name, filter_value in filters.items():
             result = Filter.apply(filter_name, result, filter_value)
-        return func(self, result, raw_input_d, …
+        return func(self, result, raw_input_d, **kwargs)

     return wrapper
+
+
+def load_file_with_extension(file_path: Union[str, List[str]]) -> List[dict]:
+    """
+    Load a file with a specific extension and return its content as a list of dictionaries.
+    """
+    import json
+    import os
+
+    if isinstance(file_path, str):
+        file_path = [file_path]
+
+    data = []
+    for path in file_path:
+        if not os.path.exists(path):
+            raise FileNotFoundError(f'The file {path} does not exist.')
+
+        with open(path, 'r', encoding='utf-8') as f:
+            if path.endswith('.json'):
+                data.extend(json.load(f))
+            elif path.endswith('.jsonl'):
+                data.extend([json.loads(line) for line in f])
+            elif path.endswith('.txt'):
+                data.extend([{'text': f.read()}])
+    return data
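The new `load_file_with_extension` helper accepts a single path or a list of paths and normalizes `.json`, `.jsonl`, and `.txt` inputs into a list of dicts. A minimal, hedged usage sketch (the file names are hypothetical):

# Hedged sketch: the paths below are hypothetical; a .json file must hold a list
# of objects, .jsonl is read line by line, and .txt is wrapped as [{'text': ...}].
from evalscope.benchmarks.utils import load_file_with_extension

records = load_file_with_extension('data/custom_eval.jsonl')
records += load_file_with_extension(['data/extra.json', 'data/haystack.txt'])
print(len(records), list(records[0].keys()))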
evalscope/benchmarks/winogrande/__init__.py
File without changes
evalscope/benchmarks/winogrande/winogrande_adapter.py ADDED
@@ -0,0 +1,57 @@
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import EvalType, OutputType
from evalscope.metrics import exact_match
from evalscope.utils.utils import ResponseParser


@Benchmark.register(
    name='winogrande',
    pretty_name='Winogrande',
    dataset_id='AI-ModelScope/winogrande_val',
    model_adapter=OutputType.GENERATION,
    output_types=[OutputType.MULTIPLE_CHOICE, OutputType.GENERATION],
    metric_list=['AverageAccuracy'],
    few_shot_num=0,
    train_split=None,
    eval_split='validation',
    prompt_template='Question: {query}\nA. {option1}\nB. {option2}\nAnswer:',  # noqa: E501
)
class WinograndeAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.choices = ['A', 'B']

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        """
        Generate model prompt from input data.
        """
        prompt = self.prompt_template.format(
            query=input_d['sentence'],
            option1=input_d['option1'],
            option2=input_d['option2'],
        )
        return self.gen_prompt_data(prompt)

    def get_gold_answer(self, input_d: dict) -> str:
        """
        Parse the raw input labels (gold).
        """
        answer_index = int(input_d['answer']) - 1
        return self.choices[answer_index]

    def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str = EvalType.CHECKPOINT) -> str:
        """
        Parse the predicted result and extract proper answer.
        """
        if self.model_adapter == OutputType.MULTIPLE_CHOICE:
            return result
        else:
            return ResponseParser.parse_first_option_with_choices(result, self.choices)

    def match(self, gold: str, pred: str) -> float:
        """
        Match the gold answer and the predicted answer.
        """
        return exact_match(gold=gold, pred=pred)
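The Winogrande adapter maps each record to an A/B multiple-choice prompt and converts the dataset's 1-based `answer` field to a letter. A small, hedged sketch of that mapping on a made-up record:

# Hedged sketch: the record below is illustrative, not taken from the dataset.
record = {
    'sentence': 'The trophy does not fit into the suitcase because _ is too small.',
    'option1': 'the trophy',
    'option2': 'the suitcase',
    'answer': '2',
}

template = 'Question: {query}\nA. {option1}\nB. {option2}\nAnswer:'
prompt = template.format(query=record['sentence'], option1=record['option1'], option2=record['option2'])
gold = ['A', 'B'][int(record['answer']) - 1]   # '2' -> 'B'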
evalscope/cli/start_app.py CHANGED
@@ -21,13 +21,13 @@ class StartAppCMD(CLICommand):
     def define_args(parsers: ArgumentParser):
         """ define args for create pipeline template command.
         """
-        from evalscope.…
+        from evalscope.app import add_argument

         parser = parsers.add_parser(StartAppCMD.name)
         add_argument(parser)
         parser.set_defaults(func=subparser_func)

     def execute(self):
-        from evalscope.…
+        from evalscope.app import create_app

         create_app(self.args)
evalscope/collections/__init__.py CHANGED
@@ -1,3 +1,35 @@
-
-from …
-
+# Copyright (c) Alibaba, Inc. and its affiliates.
+from typing import TYPE_CHECKING
+
+from evalscope.utils.import_utils import _LazyModule
+
+if TYPE_CHECKING:
+    from .evaluator import EvaluatorCollection
+    from .sampler import StratifiedSampler, UniformSampler, WeightedSampler
+    from .schema import CollectionSchema, DatasetInfo
+
+else:
+    _import_structure = {
+        'evaluator': [
+            'EvaluatorCollection',
+        ],
+        'sampler': [
+            'StratifiedSampler',
+            'UniformSampler',
+            'WeightedSampler',
+        ],
+        'schema': [
+            'CollectionSchema',
+            'DatasetInfo',
+        ],
+    }
+
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()['__file__'],
+        _import_structure,
+        module_spec=__spec__,
+        extra_objects={},
+    )
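The rewritten `evalscope/collections/__init__.py` switches to a lazy-module pattern: the package exports the same names, but the submodules are imported only on first attribute access. A minimal, hedged usage sketch:

# Hedged sketch: importing the package stays cheap; submodules load lazily.
import evalscope.collections as collections

sampler_cls = collections.WeightedSampler    # first access triggers import of .sampler
schema_cls = collections.CollectionSchema    # first access triggers import of .schema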
evalscope/collections/evaluator.py CHANGED
@@ -7,7 +7,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from copy import deepcopy
 from tabulate import tabulate
 from tqdm import tqdm
-from typing import List
+from typing import Any, Dict, List

 from evalscope.benchmarks import Benchmark, DataAdapter
 from evalscope.collections.sampler import DatasetEntry
@@ -70,9 +70,13 @@ class EvaluatorCollection:
         dataset_name = os.path.splitext(os.path.basename(self.data_adapter.dataset_id))[0]
         raw_dataset = self.data_adapter.load()
         # random limit the dataset
-…
+        limit = len(raw_dataset)
+        if self.task_cfg.limit is not None:
+            if isinstance(self.task_cfg.limit, int):
+                limit = self.task_cfg.limit
+            elif isinstance(self.task_cfg.limit, float):
+                limit = int(len(raw_dataset) * self.task_cfg.limit)
+        raw_dataset = random.sample(raw_dataset, min(limit, len(raw_dataset)))
         # index dataset
         datasets = []
         for sample in raw_dataset:
@@ -179,32 +183,43 @@ class EvaluatorCollection:
         logger.info(f'{level} Report:\n{table}')

         report = ReportGenerator.gen_collection_report(df, self.dataset_name, self.task_cfg.model_id)
+        # Make report analysis
+        if self.task_cfg.analysis_report:
+            logger.info('Generating report analysis, please wait ...')
+            analysis = report.generate_analysis(self.task_cfg.judge_model_args)
+            logger.info('Report analysis:\n%s', analysis)
+        else:
+            logger.info('Skipping report analysis (`analysis_report=False`).')
+
         # save report to JSON file
         report_file_path = os.path.join(self.outputs.reports_dir, self.task_cfg.model_id, f'{self.dataset_name}.json')
-…
+        report.to_json(report_file_path)
+
+        logger.info(f'Report saved to {report_file_path}')
         return report

     def _filter_answer(self, pred_file_path):
         answer_dict = defaultdict(dict)
         if self.task_cfg.use_cache and os.path.exists(pred_file_path):
             answers_list = jsonl_to_list(pred_file_path)
+            # Create a set of sample indices for which we have answers
             indices = set()
             for answer in answers_list:
                 index = answer.get(AnswerKeys.INDEX)
                 answer_dict[index] = answer
                 indices.add(index)

-            for sample in self.dataset…
+            # Filter dataset to only include samples that don't have answers
+            data = [sample for sample in self.dataset if sample.index not in indices]
+
+            # Initialize name map for the filtered dataset
             data_map = self._init_name_map(data)

             logger.info(f'Reuse from {pred_file_path}. Loaded {len(indices)} samples, remain {len(data)} samples.')
             return answer_dict, data, data_map
-…
+        else:
+            # If cache isn't enabled or file doesn't exist, return the full dataset
+            return answer_dict, self.dataset, self.dataset_name_map

     def get_answers(self):
         pred_file_path = os.path.join(self.outputs.predictions_dir, self.task_cfg.model_id,
@@ -214,13 +229,16 @@ class EvaluatorCollection:
         answers, dataset, dataset_name_map = self._filter_answer(pred_file_path)

         eval_batch_size = self.task_cfg.eval_batch_size
+        # Process samples and get answers
         with tqdm(total=len(dataset), desc='Getting answers') as pbar:
             if self.task_cfg.eval_type == EvalType.SERVICE:
+                # Create a thread pool for parallel processing
                 with ThreadPoolExecutor(max_workers=eval_batch_size) as executor:
                     futures = []
                     for sample in dataset:
                         evaluator = self.evaluators[sample.dataset_name]
                         futures.append(executor.submit(evaluator.get_answer, [sample], self.task_cfg.generation_config))
+                    # Process completed tasks
                     for future in as_completed(futures):
                         answer_list, samples = future.result()
                         answers[samples[0].index] = answer_list[0]
@@ -244,35 +262,79 @@ class EvaluatorCollection:
                     pbar.update(len(batch_ids))
         return answers

-    def get_reviews(self, answers):
+    def get_reviews(self, answers: Dict[int, Any]) -> Dict[int, Any]:
+        """
+        Retrieve or generate reviews for given answers.
+
+        Args:
+            answers: Dictionary of answers indexed by sample index.
+
+        Returns:
+            Dictionary of reviews indexed by sample index.
+        """
+        # Set up the review file path
         review_file_path = os.path.join(self.outputs.reviews_dir, self.task_cfg.model_id)
         os.makedirs(review_file_path, exist_ok=True)

-…
-                    if os.path.isfile(file_path):
-                        os.remove(file_path)
-                except Exception as e:
-                    logger.error(f'Error deleting file {file_path}: {e}')
+        review_history_map = defaultdict(dict)
+
+        # Handle caching logic
+        if os.path.exists(review_file_path):
+            if not self.task_cfg.use_cache:
+                # Clear existing reviews if not using cache
+                self._clear_review_files(review_file_path)
             else:
-…
+                # Load existing reviews if using cache
+                self._load_existing_reviews(review_file_path, review_history_map)

-        reviews = …
+        reviews = {}
         for sample in tqdm(self.dataset, desc='Getting reviews'):
-…
+            file_name = f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'
+
+            if self.task_cfg.use_cache and sample.index in review_history_map.get(file_name, {}):
+                # Use cached review if available
+                review_d = review_history_map[file_name][sample.index]
+            else:
+                # Generate new review
+                evaluator = self.evaluators[sample.dataset_name]
+                review_d = evaluator.get_review(answers[sample.index])
+                # Only save the review if it's not in the cache
+                self._save_review(review_file_path, file_name, review_d)
+
             reviews[sample.index] = review_d
-            …(review_d,
-               os.path.join(review_file_path, f'{self.dataset_name}_{sample.dataset_name}_{sample.subset_name}.jsonl'),
-               dump_mode=DumpMode.APPEND)
+
         return reviews

+    def _clear_review_files(self, review_file_path: str) -> None:
+        """Clear existing review files."""
+        if os.path.isdir(review_file_path):
+            for filename in os.listdir(review_file_path):
+                file_path = os.path.join(review_file_path, filename)
+                try:
+                    if os.path.isfile(file_path):
+                        os.remove(file_path)
+                except Exception as e:
+                    logger.error(f'Error deleting file {file_path}: {e}')
+        else:
+            os.remove(review_file_path)
+
+    def _load_existing_reviews(self, review_file_path: str, review_history_map: Dict[str, Dict[int, Any]]) -> None:
+        """Load existing reviews from files."""
+        logger.info(f'use_cache={self.task_cfg.use_cache}, reloading the review file: {review_file_path}')
+        if os.path.isdir(review_file_path):
+            for filename in os.listdir(review_file_path):
+                if '.ipynb_checkpoints' in filename:
+                    continue
+                file_path = os.path.join(review_file_path, filename)
+                with open(file_path, 'r') as f:
+                    review_history = [json.loads(line.strip()) for line in f]
+                review_history_map[filename] = {item['index']: item for item in review_history}
+
+    def _save_review(self, review_file_path: str, file_name: str, review_d: Dict[str, Any]) -> None:
+        """Save a single review to file."""
+        file_path = os.path.join(review_file_path, file_name)
+        dump_jsonl_data(review_d, file_path, dump_mode=DumpMode.APPEND)
+
     def get_scores(self, reviews) -> float:
         scores = defaultdict(dict)
         for sample in tqdm(self.dataset, desc='Getting scores'):
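The `limit` handling added to `EvaluatorCollection` accepts either an absolute sample count (int) or a fraction of the mixed dataset (float). A small, hedged sketch of that arithmetic, assuming a hypothetical 2,000-sample mixture:

# Hedged sketch of the limit logic above, on a made-up 2,000-sample collection.
raw_len = 2000
for limit_cfg in (100, 0.1, None):
    limit = raw_len
    if limit_cfg is not None:
        limit = limit_cfg if isinstance(limit_cfg, int) else int(raw_len * limit_cfg)
    print(limit_cfg, '->', min(limit, raw_len))   # 100 -> 100, 0.1 -> 200, None -> 2000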