evalscope 0.8.0__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of evalscope has been flagged as possibly problematic.

Files changed (47)
  1. evalscope/backend/base.py +1 -1
  2. evalscope/backend/rag_eval/utils/clip.py +2 -2
  3. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  4. evalscope/benchmarks/general_qa/general_qa_adapter.py +1 -1
  5. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +2 -1
  6. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +2 -1
  7. evalscope/benchmarks/humaneval/humaneval_adapter.py +193 -7
  8. evalscope/benchmarks/race/race_adapter.py +2 -1
  9. evalscope/config.py +35 -1
  10. evalscope/constants.py +24 -38
  11. evalscope/evaluator/__init__.py +0 -1
  12. evalscope/evaluator/evaluator.py +5 -4
  13. evalscope/evaluator/rating_eval.py +1 -1
  14. evalscope/evaluator/reviewer/auto_reviewer.py +2 -1
  15. evalscope/perf/arguments.py +2 -1
  16. evalscope/perf/benchmark.py +2 -2
  17. evalscope/perf/main.py +2 -5
  18. evalscope/perf/plugin/api/openai_api.py +2 -2
  19. evalscope/perf/plugin/registry.py +3 -3
  20. evalscope/perf/utils/benchmark_util.py +4 -4
  21. evalscope/perf/utils/db_util.py +66 -22
  22. evalscope/perf/utils/local_server.py +3 -1
  23. evalscope/run.py +45 -82
  24. evalscope/run_arena.py +2 -1
  25. evalscope/summarizer.py +14 -26
  26. evalscope/third_party/longbench_write/eval.py +2 -1
  27. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  28. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  29. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  30. evalscope/tools/combine_reports.py +2 -4
  31. evalscope/tools/rewrite_eval_results.py +1 -1
  32. evalscope/utils/__init__.py +1 -0
  33. evalscope/utils/chat_service.py +1 -1
  34. evalscope/utils/io_utils.py +162 -0
  35. evalscope/utils/logger.py +8 -0
  36. evalscope/utils/utils.py +0 -175
  37. evalscope/version.py +2 -2
  38. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/METADATA +1 -1
  39. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/RECORD +46 -46
  40. tests/cli/test_run.py +11 -12
  41. tests/perf/test_perf.py +2 -1
  42. tests/vlm/test_vlmeval.py +3 -2
  43. evalscope/evaluator/humaneval_evaluator.py +0 -158
  44. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/LICENSE +0 -0
  45. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/WHEEL +0 -0
  46. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/entry_points.txt +0 -0
  47. {evalscope-0.8.0.dist-info → evalscope-0.8.1.dist-info}/top_level.txt +0 -0
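
Entry 37 above is the version bump in evalscope/version.py. As a quick sanity check after upgrading, the installed wheel can be inspected as sketched below; this assumes version.py exposes a conventional __version__ string, which the diff stats alone do not confirm.

from evalscope.version import __version__

print(__version__)  # expect '0.8.1' for the new wheel, '0.8.0' for the old one
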
evalscope/evaluator/humaneval_evaluator.py (deleted)
@@ -1,158 +0,0 @@
- import json
- import os
- import re
- from tqdm import tqdm
- from typing import List, Optional
-
- from evalscope.constants import OutputsStructure
- from evalscope.evaluator.evaluator import logger
- from evalscope.models.model_adapter import BaseModelAdapter
- from evalscope.tools.combine_reports import gen_table
- from evalscope.utils import normalize_score
-
-
- class HumanevalEvaluator(object):
-
-     def __init__(
-         self,
-         problem_file: str,
-         model_id: str,
-         model_revision: str,
-         model_adapter: BaseModelAdapter,
-         outputs: Optional[OutputsStructure] = None,
-         k: List[int] = [1, 10, 100],
-         n_workers: int = 4,
-         timeout: float = 3.0,
-     ):
-         try:
-             from human_eval.data import read_problems, write_jsonl
-             from human_eval.evaluation import evaluate_functional_correctness
-         except ImportError:
-             raise ImportError('Please install human_eval:'
-                               'https://github.com/openai/human-eval/tree/master#installation , '
-                               'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
-         self.problem_file = problem_file
-         self.k = k
-         self.num_workers = n_workers
-         self.timeout = timeout
-         self.model_adapter = model_adapter
-
-         self.read_problems_func = read_problems
-         self.write_jsonl_func = write_jsonl
-         self.eval_func = evaluate_functional_correctness
-
-         # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
-         self.problems = self.read_problems_func(self.problem_file)
-
-         # Deal with the output paths
-         self.outputs_structure = OutputsStructure(outputs)
-
-     def get_answers(self, infer_cfg: dict) -> List[dict]:
-         ans_list: list = []
-         system_prompt: str = 'Complete the following python code:\n'
-         for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-             prompt: str = system_prompt + data_d['prompt']
-             inputs: dict = {'data': [prompt]}
-             # pred_res: dict = self.model_adapter.predict(inputs)
-
-             pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-             pred_ans: str = pred_res['choices'][0]['message']['content']
-             pred_ans = self._postprocess(pred_ans)
-
-             ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-         return ans_list
-
-     def eval(self, infer_cfg: dict, **kwargs):
-
-         # predict
-         ans_list: list = self.get_answers(infer_cfg)
-         ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
-
-         self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-         # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-         logger.info('** Dump predictions successfully.')
-
-         # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-         results = self.eval_func(
-             sample_file=ans_out_file,
-             k=self.k,
-             n_workers=self.num_workers,
-             timeout=self.timeout,
-             problem_file=self.problem_file)
-
-         # output: report
-         report_map: dict = self.gen_report(results=results)
-         report_dir: str = self.outputs_structure.reports_dir
-         report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-         with open(report_file, 'w') as f:
-             f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-         # logger.info(f'** Dump report to {report_file} \n')
-         logger.info('** Dump report \n')
-
-         try:
-             # Make table
-             report_table: str = gen_table([report_dir])
-             logger.info(f'** Report table: \n {report_table} \n')
-         except Exception:
-             logger.error('Failed to generate report table.')
-
-     def gen_report(self, results: dict) -> dict:
-         """
-         Generate report from evaluation results.
-
-         Returns:
-             {
-                 "name":"ARC-Challenge",
-                 "metric":"WeightedAverageAccuracy",
-                 "score":0.3389,
-                 "category":[
-                     {
-                         "name":"DEFAULT",
-                         "score":0.3389,
-                         "subset":[
-                             {
-                                 "name":"ARC-Challenge",
-                                 "score":0.3389
-                             },
-                         ]
-                     }
-                 ],
-                 "total_num":100
-             }
-         """
-         results = {k: normalize_score(score=v) for k, v in results.items()}
-
-         category_d = dict(name='DEFAULT', score=results, subset=[])
-
-         res_map = dict(
-             name='HumanEval', metric='pass@k', score=results, category=[category_d], total_num=len(self.problems))
-
-         return res_map
-
-     @classmethod
-     def _postprocess(cls, text: str) -> str:
-         if '```' in text:
-             blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-             if len(blocks) == 0:
-                 text = text.split('```')[1]  # fall back to default strategy
-             else:
-                 text = blocks[0]  # fetch the first code block
-                 if not text.startswith('\n'):  # in case starting with ```python
-                     text = text[max(text.find('\n') + 1, 0):]
-         if text.strip().startswith('from') or text.strip().startswith('import'):
-             def_idx = text.find('def')
-             if def_idx != -1:
-                 text = text[max(text.find('\n', def_idx) + 1, 0):]
-         text = text.split('\n\n')[0]
-         if text.strip().startswith('def'):
-             text = '\n'.join(text.split('\n')[1:])
-         if not text.startswith('    '):
-             if text.startswith(' '):
-                 text = '    ' + text.lstrip()
-             else:
-                 text = '\n'.join(['    ' + line for line in text.split('\n')])
-         return text
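
The hunk above removes the standalone HumanevalEvaluator outright; entry 7 in the file list (evalscope/benchmarks/humaneval/humaneval_adapter.py, +193 -7) suggests the HumanEval logic moved into the benchmark adapter in 0.8.1. For reference, the sketch below reconstructs how the removed 0.8.0 class was driven, using only the signatures visible in the diff. The adapter value, file paths, model id, and infer_cfg keys are illustrative placeholders, not evalscope API guarantees.

# Sketch of the removed 0.8.0 API; placeholder values are marked as such.
from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator  # module deleted in 0.8.1
from evalscope.models.model_adapter import BaseModelAdapter

model_adapter: BaseModelAdapter = ...  # placeholder: any adapter wrapping the model under test

evaluator = HumanevalEvaluator(
    problem_file='data/HumanEval.jsonl.gz',  # placeholder path to the OpenAI human-eval problem set
    model_id='my-model',                     # placeholder identifiers
    model_revision='master',
    model_adapter=model_adapter,
    k=[1, 10, 100],                          # pass@k values forwarded to evaluate_functional_correctness
    n_workers=4,
    timeout=3.0,
)

# get_answers() prompts the model once per task and post-processes each completion;
# eval() writes human_eval_predictions.jsonl, computes pass@k, and dumps
# human_eval_report.json under the configured outputs directory.
evaluator.eval(infer_cfg={'max_new_tokens': 512})  # placeholder generation config

The _postprocess step is the part most worth a concrete example: it extracts the first fenced code block from a chatty reply, drops a leading def line, and returns an indented body that the human-eval harness can append to the original prompt. For instance:

raw = '```python\ndef add(a, b):\n    return a + b\n```'
HumanevalEvaluator._postprocess(raw)  # returns '    return a + b\n'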