evalscope 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of evalscope might be problematic.
- evalscope/backend/base.py +1 -1
- evalscope/backend/rag_eval/utils/clip.py +2 -2
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
- evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
- evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
- evalscope/benchmarks/race/race_adapter.py +2 -1
- evalscope/config.py +35 -1
- evalscope/constants.py +24 -38
- evalscope/evaluator/__init__.py +0 -1
- evalscope/evaluator/evaluator.py +5 -4
- evalscope/evaluator/rating_eval.py +1 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
- evalscope/perf/arguments.py +2 -1
- evalscope/perf/benchmark.py +2 -2
- evalscope/perf/main.py +2 -5
- evalscope/perf/plugin/api/openai_api.py +2 -2
- evalscope/perf/plugin/registry.py +3 -3
- evalscope/perf/utils/benchmark_util.py +4 -4
- evalscope/perf/utils/db_util.py +66 -22
- evalscope/perf/utils/local_server.py +3 -1
- evalscope/run.py +45 -82
- evalscope/run_arena.py +2 -1
- evalscope/summarizer.py +14 -26
- evalscope/third_party/longbench_write/eval.py +2 -1
- evalscope/third_party/longbench_write/longbench_write.py +2 -1
- evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
- evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
- evalscope/tools/combine_reports.py +2 -4
- evalscope/tools/rewrite_eval_results.py +1 -1
- evalscope/utils/__init__.py +1 -0
- evalscope/utils/chat_service.py +1 -1
- evalscope/utils/io_utils.py +162 -0
- evalscope/utils/logger.py +8 -0
- evalscope/utils/utils.py +0 -175
- evalscope/version.py +2 -2
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/METADATA +1 -1
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/RECORD +46 -46
- tests/cli/test_run.py +11 -12
- tests/perf/test_perf.py +2 -1
- tests/vlm/test_vlmeval.py +3 -2
- evalscope/evaluator/humaneval_evaluator.py +0 -158
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
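The two HumanEval entries above describe one refactor: the standalone evaluator evalscope/evaluator/humaneval_evaluator.py is deleted (full hunk below), while the benchmark adapter evalscope/benchmarks/humaneval/humaneval_adapter.py gains roughly 190 lines, presumably absorbing that logic. The metric in play is pass@k, which the deleted evaluator obtains from human_eval.evaluation.evaluate_functional_correctness. For reference, a minimal sketch of the standard unbiased pass@k estimator from the HumanEval paper (the pass_at_k helper below is illustrative only, not part of evalscope's API):

import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: 1 - C(n-c, k) / C(n, k), computed stably as a product."""
    if n - c < k:  # every size-k sample necessarily contains a correct solution
        return 1.0
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

# e.g. 10 samples per problem, 3 of which pass the unit tests
print(pass_at_k(n=10, c=3, k=1))  # -> 0.3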
evalscope/evaluator/humaneval_evaluator.py (deleted)
@@ -1,158 +0,0 @@
-import json
-import os
-import re
-from tqdm import tqdm
-from typing import List, Optional
-
-from evalscope.constants import OutputsStructure
-from evalscope.evaluator.evaluator import logger
-from evalscope.models.model_adapter import BaseModelAdapter
-from evalscope.tools.combine_reports import gen_table
-from evalscope.utils import normalize_score
-
-
-class HumanevalEvaluator(object):
-
-    def __init__(
-            self,
-            problem_file: str,
-            model_id: str,
-            model_revision: str,
-            model_adapter: BaseModelAdapter,
-            outputs: Optional[OutputsStructure] = None,
-            k: List[int] = [1, 10, 100],
-            n_workers: int = 4,
-            timeout: float = 3.0,
-    ):
-        try:
-            from human_eval.data import read_problems, write_jsonl
-            from human_eval.evaluation import evaluate_functional_correctness
-        except ImportError:
-            raise ImportError('Please install human_eval:'
-                              'https://github.com/openai/human-eval/tree/master#installation , '
-                              'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
-        self.problem_file = problem_file
-        self.k = k
-        self.num_workers = n_workers
-        self.timeout = timeout
-        self.model_adapter = model_adapter
-
-        self.read_problems_func = read_problems
-        self.write_jsonl_func = write_jsonl
-        self.eval_func = evaluate_functional_correctness
-
-        # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
-        self.problems = self.read_problems_func(self.problem_file)
-
-        # Deal with the output paths
-        self.outputs_structure = OutputsStructure(outputs)
-
-    def get_answers(self, infer_cfg: dict) -> List[dict]:
-        ans_list: list = []
-        system_prompt: str = 'Complete the following python code:\n'
-        for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-            prompt: str = system_prompt + data_d['prompt']
-            inputs: dict = {'data': [prompt]}
-            # pred_res: dict = self.model_adapter.predict(inputs)
-
-            pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-            pred_ans: str = pred_res['choices'][0]['message']['content']
-            pred_ans = self._postprocess(pred_ans)
-
-            ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-        return ans_list
-
-    def eval(self, infer_cfg: dict, **kwargs):
-
-        # predict
-        ans_list: list = self.get_answers(infer_cfg)
-        ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
-
-        self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-        # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-        logger.info('** Dump predictions successfully.')
-
-        # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-        results = self.eval_func(
-            sample_file=ans_out_file,
-            k=self.k,
-            n_workers=self.num_workers,
-            timeout=self.timeout,
-            problem_file=self.problem_file)
-
-        # output: report
-        report_map: dict = self.gen_report(results=results)
-        report_dir: str = self.outputs_structure.reports_dir
-        report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-        with open(report_file, 'w') as f:
-            f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-        # logger.info(f'** Dump report to {report_file} \n')
-        logger.info('** Dump report \n')
-
-        try:
-            # Make table
-            report_table: str = gen_table([report_dir])
-            logger.info(f'** Report table: \n {report_table} \n')
-        except Exception:
-            logger.error('Failed to generate report table.')
-
-    def gen_report(self, results: dict) -> dict:
-        """
-        Generate report from evaluation results.
-
-        Returns:
-        {
-            "name":"ARC-Challenge",
-            "metric":"WeightedAverageAccuracy",
-            "score":0.3389,
-            "category":[
-                {
-                    "name":"DEFAULT",
-                    "score":0.3389,
-                    "subset":[
-                        {
-                            "name":"ARC-Challenge",
-                            "score":0.3389
-                        },
-                    ]
-                }
-            ],
-            "total_num":100
-        }
-        """
-        results = {k: normalize_score(score=v) for k, v in results.items()}
-
-        category_d = dict(name='DEFAULT', score=results, subset=[])
-
-        res_map = dict(
-            name='HumanEval', metric='pass@k', score=results, category=[category_d], total_num=len(self.problems))
-
-        return res_map
-
-    @classmethod
-    def _postprocess(cls, text: str) -> str:
-        if '```' in text:
-            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-            if len(blocks) == 0:
-                text = text.split('```')[1]  # fall back to default strategy
-            else:
-                text = blocks[0]  # fetch the first code block
-                if not text.startswith('\n'):  # in case starting with ```python
-                    text = text[max(text.find('\n') + 1, 0):]
-        if text.strip().startswith('from') or text.strip().startswith('import'):
-            def_idx = text.find('def')
-            if def_idx != -1:
-                text = text[max(text.find('\n', def_idx) + 1, 0):]
-        text = text.split('\n\n')[0]
-        if text.strip().startswith('def'):
-            text = '\n'.join(text.split('\n')[1:])
-        if not text.startswith('    '):
-            if text.startswith(' '):
-                text = '    ' + text.lstrip()
-            else:
-                text = '\n'.join(['    ' + line for line in text.split('\n')])
-        return text