evalscope 0.8.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release: this version of evalscope may be problematic.

Files changed (147)
  1. evalscope/__init__.py +2 -0
  2. evalscope/arguments.py +11 -3
  3. evalscope/backend/base.py +1 -1
  4. evalscope/backend/rag_eval/clip_benchmark/tasks/zeroshot_classification.py +0 -1
  5. evalscope/backend/rag_eval/utils/clip.py +2 -2
  6. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  7. evalscope/backend/rag_eval/utils/llm.py +1 -1
  8. evalscope/benchmarks/__init__.py +20 -1
  9. evalscope/benchmarks/arc/__init__.py +0 -5
  10. evalscope/benchmarks/arc/arc_adapter.py +24 -102
  11. evalscope/benchmarks/bbh/__init__.py +0 -4
  12. evalscope/benchmarks/bbh/bbh_adapter.py +20 -90
  13. evalscope/benchmarks/benchmark.py +70 -59
  14. evalscope/benchmarks/ceval/__init__.py +0 -5
  15. evalscope/benchmarks/ceval/ceval_adapter.py +24 -125
  16. evalscope/benchmarks/cmmlu/__init__.py +0 -5
  17. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +22 -117
  18. evalscope/benchmarks/competition_math/__init__.py +0 -5
  19. evalscope/benchmarks/competition_math/competition_math_adapter.py +29 -371
  20. evalscope/benchmarks/data_adapter.py +115 -87
  21. evalscope/benchmarks/general_qa/__init__.py +0 -5
  22. evalscope/benchmarks/general_qa/general_qa_adapter.py +24 -80
  23. evalscope/benchmarks/gpqa/__init__.py +0 -0
  24. evalscope/benchmarks/gpqa/chain_of_thought.txt +81 -0
  25. evalscope/benchmarks/gpqa/gpqa_adapter.py +103 -0
  26. evalscope/benchmarks/gsm8k/__init__.py +0 -4
  27. evalscope/benchmarks/gsm8k/gsm8k_adapter.py +22 -101
  28. evalscope/benchmarks/hellaswag/__init__.py +0 -5
  29. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +33 -99
  30. evalscope/benchmarks/humaneval/__init__.py +0 -4
  31. evalscope/benchmarks/humaneval/humaneval_adapter.py +93 -9
  32. evalscope/benchmarks/ifeval/__init__.py +0 -0
  33. evalscope/benchmarks/ifeval/ifeval_adapter.py +56 -0
  34. evalscope/benchmarks/ifeval/instructions.py +1477 -0
  35. evalscope/benchmarks/ifeval/instructions_registry.py +188 -0
  36. evalscope/benchmarks/ifeval/instructions_util.py +1670 -0
  37. evalscope/benchmarks/ifeval/utils.py +134 -0
  38. evalscope/benchmarks/iquiz/__init__.py +0 -0
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +63 -0
  40. evalscope/benchmarks/mmlu/__init__.py +0 -5
  41. evalscope/benchmarks/mmlu/mmlu_adapter.py +32 -130
  42. evalscope/benchmarks/mmlu_pro/__init__.py +0 -0
  43. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +110 -0
  44. evalscope/benchmarks/race/__init__.py +0 -5
  45. evalscope/benchmarks/race/race_adapter.py +27 -123
  46. evalscope/benchmarks/trivia_qa/__init__.py +0 -5
  47. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +23 -99
  48. evalscope/benchmarks/truthful_qa/__init__.py +0 -5
  49. evalscope/benchmarks/truthful_qa/truthful_qa_adapter.py +29 -88
  50. evalscope/cli/cli.py +2 -0
  51. evalscope/cli/start_app.py +30 -0
  52. evalscope/collections/__init__.py +3 -0
  53. evalscope/collections/evaluator.py +198 -0
  54. evalscope/collections/sampler.py +138 -0
  55. evalscope/collections/schema.py +126 -0
  56. evalscope/config.py +45 -7
  57. evalscope/constants.py +7 -38
  58. evalscope/evaluator/__init__.py +0 -1
  59. evalscope/evaluator/evaluator.py +89 -121
  60. evalscope/evaluator/rating_eval.py +1 -1
  61. evalscope/evaluator/reviewer/auto_reviewer.py +14 -5
  62. evalscope/metrics/__init__.py +3 -0
  63. evalscope/metrics/bundled_rouge_score/rouge_scorer.py +1 -1
  64. evalscope/metrics/math_accuracy.py +193 -50
  65. evalscope/metrics/metrics.py +18 -6
  66. evalscope/metrics/named_metrics.py +17 -0
  67. evalscope/metrics/rouge_metric.py +13 -8
  68. evalscope/models/__init__.py +14 -1
  69. evalscope/models/base_adapter.py +52 -0
  70. evalscope/models/chat_adapter.py +140 -0
  71. evalscope/models/choice_adapter.py +211 -0
  72. evalscope/{tools/rewrite_eval_results.py → models/custom/dummy_model.py} +1 -1
  73. evalscope/models/custom_adapter.py +67 -0
  74. evalscope/models/local_model.py +74 -0
  75. evalscope/models/model.py +141 -0
  76. evalscope/models/server_adapter.py +111 -0
  77. evalscope/perf/__init__.py +1 -0
  78. evalscope/perf/arguments.py +3 -1
  79. evalscope/perf/benchmark.py +3 -3
  80. evalscope/perf/main.py +5 -7
  81. evalscope/perf/plugin/api/custom_api.py +1 -1
  82. evalscope/perf/plugin/api/openai_api.py +54 -50
  83. evalscope/perf/plugin/datasets/flickr8k.py +1 -1
  84. evalscope/perf/plugin/datasets/longalpaca.py +1 -1
  85. evalscope/perf/plugin/registry.py +3 -3
  86. evalscope/perf/utils/benchmark_util.py +4 -4
  87. evalscope/perf/utils/db_util.py +66 -22
  88. evalscope/perf/utils/local_server.py +4 -1
  89. evalscope/report/__init__.py +5 -0
  90. evalscope/report/app.py +693 -0
  91. evalscope/report/combinator.py +73 -0
  92. evalscope/report/generator.py +80 -0
  93. evalscope/report/utils.py +133 -0
  94. evalscope/run.py +64 -125
  95. evalscope/run_arena.py +3 -2
  96. evalscope/summarizer.py +15 -27
  97. evalscope/third_party/longbench_write/eval.py +2 -1
  98. evalscope/third_party/longbench_write/longbench_write.py +2 -1
  99. evalscope/third_party/longbench_write/tools/data_etl.py +1 -1
  100. evalscope/third_party/toolbench_static/toolbench_static.py +2 -1
  101. evalscope/utils/__init__.py +1 -0
  102. evalscope/utils/chat_service.py +6 -5
  103. evalscope/utils/io_utils.py +170 -0
  104. evalscope/utils/logger.py +13 -0
  105. evalscope/utils/model_utils.py +15 -2
  106. evalscope/utils/utils.py +3 -200
  107. evalscope/version.py +2 -2
  108. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/METADATA +129 -23
  109. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/RECORD +119 -115
  110. tests/cli/test_collection.py +57 -0
  111. tests/cli/test_run.py +57 -7
  112. tests/perf/test_perf.py +3 -2
  113. tests/rag/test_mteb.py +3 -2
  114. tests/vlm/test_vlmeval.py +3 -2
  115. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/correctness_prompt_chinese.json +0 -87
  116. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerCorrectness/long_form_answer_prompt_chinese.json +0 -36
  117. evalscope/backend/rag_eval/ragas/prompts/chinese/AnswerRelevancy/question_generation_chinese.json +0 -26
  118. evalscope/backend/rag_eval/ragas/prompts/chinese/ContextPrecision/context_precision_prompt_chinese.json +0 -41
  119. evalscope/backend/rag_eval/ragas/prompts/chinese/CustomNodeFilter/scoring_prompt_chinese.json +0 -7
  120. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/nli_statements_message_chinese.json +0 -60
  121. evalscope/backend/rag_eval/ragas/prompts/chinese/Faithfulness/statement_prompt_chinese.json +0 -36
  122. evalscope/backend/rag_eval/ragas/prompts/chinese/HeadlinesExtractor/prompt_chinese.json +0 -24
  123. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/concept_combination_prompt_chinese.json +0 -35
  124. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  125. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopAbstractQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  126. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -30
  127. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  128. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalFaithfulness/faithfulness_prompt_chinese.json +0 -34
  129. evalscope/backend/rag_eval/ragas/prompts/chinese/MultiModalRelevance/relevance_prompt_chinese.json +0 -36
  130. evalscope/backend/rag_eval/ragas/prompts/chinese/NERExtractor/prompt_chinese.json +0 -25
  131. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/generate_query_reference_prompt_chinese.json +0 -24
  132. evalscope/backend/rag_eval/ragas/prompts/chinese/SingleHopSpecificQuerySynthesizer/theme_persona_matching_prompt_chinese.json +0 -39
  133. evalscope/backend/rag_eval/ragas/prompts/chinese/SummaryExtractor/prompt_chinese.json +0 -16
  134. evalscope/backend/rag_eval/ragas/prompts/chinese/ThemesExtractor/prompt_chinese.json +0 -24
  135. evalscope/evaluator/humaneval_evaluator.py +0 -158
  136. evalscope/models/api/__init__.py +0 -3
  137. evalscope/models/dummy_chat_model.py +0 -49
  138. evalscope/models/model_adapter.py +0 -525
  139. evalscope/models/openai_model.py +0 -103
  140. evalscope/tools/__init__.py +0 -1
  141. evalscope/tools/combine_reports.py +0 -135
  142. evalscope/tools/gen_mmlu_subject_mapping.py +0 -90
  143. /evalscope/{models/api → third_party/longbench_write/tools}/openai_api.py +0 -0
  144. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/LICENSE +0 -0
  145. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/WHEEL +0 -0
  146. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/entry_points.txt +0 -0
  147. {evalscope-0.8.0.dist-info → evalscope-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,39 +0,0 @@
- {
-   "ragas_version": "0.2.7",
-   "original_hash": 4608101540215877909,
-   "language": "chinese",
-   "instruction": "给定一个主题和角色列表,根据角色描述将每个角色与相关主题关联起来。",
-   "examples": [
-     {
-       "input": {
-         "themes": [
-           "同理心",
-           "包容性",
-           "远程工作"
-         ],
-         "personas": [
-           {
-             "name": "人力资源经理",
-             "role_description": "专注于包容性和员工支持。"
-           },
-           {
-             "name": "远程团队负责人",
-             "role_description": "管理远程团队沟通。"
-           }
-         ]
-       },
-       "output": {
-         "mapping": {
-           "HR Manager": [
-             "包容性",
-             "同理心"
-           ],
-           "Remote Team Lead": [
-             "远程工作",
-             "同理心"
-           ]
-         }
-       }
-     }
-   ]
- }
@@ -1,16 +0,0 @@
- {
-   "ragas_version": "0.2.7",
-   "original_hash": -2203889341293275650,
-   "language": "chinese",
-   "instruction": "将给定文本总结为不超过10个句子。",
-   "examples": [
-     {
-       "input": {
-         "text": "人工智能\n\n人工智能正在通过自动化以前需要人类智能的任务来改变各个行业。从医疗保健到金融,人工智能正在被用来快速准确地分析大量数据。这项技术还推动了自动驾驶汽车和个性化推荐等领域的创新。"
-       },
-       "output": {
-         "text": "人工智能通过自动化任务、分析数据和推动自动驾驶汽车和个性化推荐等创新,正在革新各个行业。"
-       }
-     }
-   ]
- }
@@ -1,24 +0,0 @@
- {
-   "ragas_version": "0.2.7",
-   "original_hash": -7344189172470926110,
-   "language": "chinese",
-   "instruction": "从给定的文本中提取主要主题和概念。",
-   "examples": [
-     {
-       "input": {
-         "text": "人工智能通过自动化需要人类智能的任务来改变行业。人工智能快速准确地分析大量数据,推动了自动驾驶汽车和个性化推荐等创新。",
-         "max_num": 10
-       },
-       "output": {
-         "output": [
-           "人工智能",
-           "自动化",
-           "数据分析",
-           "创新",
-           "自动驾驶汽车",
-           "个性化推荐"
-         ]
-       }
-     }
-   ]
- }
@@ -1,158 +0,0 @@
- import json
- import os
- import re
- from tqdm import tqdm
- from typing import List, Optional
-
- from evalscope.constants import OutputsStructure
- from evalscope.evaluator.evaluator import logger
- from evalscope.models.model_adapter import BaseModelAdapter
- from evalscope.tools.combine_reports import gen_table
- from evalscope.utils import normalize_score
-
-
- class HumanevalEvaluator(object):
-
-     def __init__(
-         self,
-         problem_file: str,
-         model_id: str,
-         model_revision: str,
-         model_adapter: BaseModelAdapter,
-         outputs: Optional[OutputsStructure] = None,
-         k: List[int] = [1, 10, 100],
-         n_workers: int = 4,
-         timeout: float = 3.0,
-     ):
-         try:
-             from human_eval.data import read_problems, write_jsonl
-             from human_eval.evaluation import evaluate_functional_correctness
-         except ImportError:
-             raise ImportError('Please install human_eval:'
-                               'https://github.com/openai/human-eval/tree/master#installation , '
-                               'Note that you need to enable the execution code in the human_eval/execution.py first.')
-
-         self.problem_file = problem_file
-         self.k = k
-         self.num_workers = n_workers
-         self.timeout = timeout
-         self.model_adapter = model_adapter
-
-         self.read_problems_func = read_problems
-         self.write_jsonl_func = write_jsonl
-         self.eval_func = evaluate_functional_correctness
-
-         # {'task_id': {'task_id': '', 'prompt': '', 'entry_point': '', 'canonical_solution': '', 'test': ''}, ...}
-         self.problems = self.read_problems_func(self.problem_file)
-
-         # Deal with the output paths
-         self.outputs_structure = OutputsStructure(outputs)
-
-     def get_answers(self, infer_cfg: dict) -> List[dict]:
-         ans_list: list = []
-         system_prompt: str = 'Complete the following python code:\n'
-         for task_id, data_d in tqdm(self.problems.items(), total=len(self.problems), desc='Predicting(problems)'):
-             prompt: str = system_prompt + data_d['prompt']
-             inputs: dict = {'data': [prompt]}
-             # pred_res: dict = self.model_adapter.predict(inputs)
-
-             pred_res: dict = self.model_adapter.predict(inputs=inputs, infer_cfg=infer_cfg)
-
-             pred_ans: str = pred_res['choices'][0]['message']['content']
-             pred_ans = self._postprocess(pred_ans)
-
-             ans_list.append({'task_id': task_id, 'completion': pred_ans})
-
-         return ans_list
-
-     def eval(self, infer_cfg: dict, **kwargs):
-
-         # predict
-         ans_list: list = self.get_answers(infer_cfg)
-         ans_out_file: str = os.path.join(self.outputs_structure.predictions_dir, 'human_eval_predictions.jsonl')
-
-         self.write_jsonl_func(filename=ans_out_file, data=ans_list)
-         # logger.info(f'** Dump predictions to {ans_out_file} successfully.')
-         logger.info('** Dump predictions successfully.')
-
-         # evaluate results: e.g. {'pass@1': 0.333, 'pass@10': 0.111}
-         results = self.eval_func(
-             sample_file=ans_out_file,
-             k=self.k,
-             n_workers=self.num_workers,
-             timeout=self.timeout,
-             problem_file=self.problem_file)
-
-         # output: report
-         report_map: dict = self.gen_report(results=results)
-         report_dir: str = self.outputs_structure.reports_dir
-         report_file: str = os.path.join(report_dir, 'human_eval_report.json')
-
-         with open(report_file, 'w') as f:
-             f.write(json.dumps(report_map, ensure_ascii=False, indent=4))
-         # logger.info(f'** Dump report to {report_file} \n')
-         logger.info('** Dump report \n')
-
-         try:
-             # Make table
-             report_table: str = gen_table([report_dir])
-             logger.info(f'** Report table: \n {report_table} \n')
-         except Exception:
-             logger.error('Failed to generate report table.')
-
-     def gen_report(self, results: dict) -> dict:
-         """
-         Generate report from evaluation results.
-
-         Returns:
-             {
-                 "name":"ARC-Challenge",
-                 "metric":"WeightedAverageAccuracy",
-                 "score":0.3389,
-                 "category":[
-                     {
-                         "name":"DEFAULT",
-                         "score":0.3389,
-                         "subset":[
-                             {
-                                 "name":"ARC-Challenge",
-                                 "score":0.3389
-                             },
-                         ]
-                     }
-                 ],
-                 "total_num":100
-             }
-         """
-         results = {k: normalize_score(score=v) for k, v in results.items()}
-
-         category_d = dict(name='DEFAULT', score=results, subset=[])
-
-         res_map = dict(
-             name='HumanEval', metric='pass@k', score=results, category=[category_d], total_num=len(self.problems))
-
-         return res_map
-
-     @classmethod
-     def _postprocess(cls, text: str) -> str:
-         if '```' in text:
-             blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
-             if len(blocks) == 0:
-                 text = text.split('```')[1]  # fall back to default strategy
-             else:
-                 text = blocks[0]  # fetch the first code block
-                 if not text.startswith('\n'):  # in case starting with ```python
-                     text = text[max(text.find('\n') + 1, 0):]
-         if text.strip().startswith('from') or text.strip().startswith('import'):
-             def_idx = text.find('def')
-             if def_idx != -1:
-                 text = text[max(text.find('\n', def_idx) + 1, 0):]
-         text = text.split('\n\n')[0]
-         if text.strip().startswith('def'):
-             text = '\n'.join(text.split('\n')[1:])
-         if not text.startswith('    '):
-             if text.startswith(' '):
-                 text = '    ' + text.lstrip()
-             else:
-                 text = '\n'.join(['    ' + line for line in text.split('\n')])
-         return text
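
The removed HumanevalEvaluator._postprocess above strips markdown code fences and the function signature from a model completion so that only an indented body remains. A minimal usage sketch, assuming the evalscope 0.8.0 wheel and its dependencies are installed; the raw completion string is invented for illustration:

# Sketch only: exercises the classmethod from the deleted evalscope/evaluator/humaneval_evaluator.py.
from evalscope.evaluator.humaneval_evaluator import HumanevalEvaluator

raw = "```python\ndef add(a, b):\n    return a + b\n```"  # made-up model output, not from the diff
body = HumanevalEvaluator._postprocess(raw)
print(body)  # prints the indented body "    return a + b", ready to append to the HumanEval prompt
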
@@ -1,3 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- from evalscope.models.api.openai_api import OpenaiApi
@@ -1,49 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
-
- import random
- import time
-
- from evalscope.models import ChatBaseModel
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class DummyChatModel(ChatBaseModel):
-
-     MODEL_ID = 'dummy_chat_model_0801'
-     REVISION = 'v1.0.0'
-
-     def __init__(self, model_cfg: dict, **kwargs):
-         model_cfg['model_id'] = self.MODEL_ID
-         model_cfg['revision'] = self.REVISION
-         super(DummyChatModel, self).__init__(model_cfg=model_cfg)
-
-     def predict(self, inputs: dict, **kwargs) -> dict:
-
-         debug: bool = False
-         if debug:
-             messages = inputs['messages']
-             history = inputs['history']
-
-             logger.info(f'** messages: {messages}')
-             logger.info(f'** history: {history}')
-
-         choice = random.choice(['A', 'B', 'C', 'D'])
-
-         # Build response
-         res = {
-             'choices': [{
-                 'index': 0,
-                 'message': {
-                     'content': choice,
-                     'role': 'assistant'
-                 }
-             }],
-             'created': time.time(),
-             'model': self.MODEL_ID + '-' + self.REVISION,
-             'object': 'chat.completion',
-             'usage': {}
-         }
-
-         return res
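
For reference, the dict returned by the removed DummyChatModel.predict follows the OpenAI chat-completion shape, and callers such as the humaneval evaluator above read the answer from choices[0]. A minimal consumption sketch; the response literal is hand-written to mirror the structure in the deleted file and is not real output:

# Sketch only: mirrors the 'Build response' block from the deleted evalscope/models/dummy_chat_model.py.
res = {
    'choices': [{'index': 0, 'message': {'content': 'A', 'role': 'assistant'}}],
    'created': 0.0,                            # time.time() in the original
    'model': 'dummy_chat_model_0801-v1.0.0',   # MODEL_ID + '-' + REVISION
    'object': 'chat.completion',
    'usage': {},
}
print(res['choices'][0]['message']['content'])  # -> 'A'
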