evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of evalscope might be problematic.

Files changed (157)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/models/model.py DELETED
@@ -1,189 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
- import os
- import time
- from abc import ABC, abstractmethod
- from typing import Any, List
-
- from evalscope.utils.logger import get_logger
-
- logger = get_logger()
-
-
- class BaseModel(ABC):
-
-     def __init__(self, model_cfg: dict, **kwargs):
-         """
-         Base model class.
-
-         Args:
-             model_cfg (dict): The model configuration. Depending on the specific model. Example:
-                 {'model_id': 'modelscope/Llama-2-7b-chat-ms', 'revision': 'v1.0.0'}
-
-             **kwargs: kwargs
-         """
-         self.model_cfg: dict = model_cfg
-         self.kwargs = kwargs
-
-     @abstractmethod
-     def predict(self, *args, **kwargs) -> Any:
-         """
-         Model prediction func.
-         """
-         raise NotImplementedError
-
-
- class ChatBaseModel(BaseModel):
-
-     def __init__(self, model_cfg: dict, **kwargs):
-         """
-         Chat base model class. Depending on the specific model.
-
-         Args:
-             model_cfg (dict):
-                 {'model_id': 'modelscope/Llama-2-7b-chat-ms', 'revision': 'v1.0.0', 'device_map': 'auto'}
-
-             **kwargs: kwargs
-         """
-         super(ChatBaseModel, self).__init__(model_cfg=model_cfg, **kwargs)
-
-     @abstractmethod
-     def predict(self, inputs: dict, **kwargs) -> dict:
-         """
-         Model prediction func. The inputs and outputs are compatible with OpenAI Chat Completions APIs.
-         Refer to: https://platform.openai.com/docs/guides/gpt/chat-completions-api
-
-         # TODO: follow latest OpenAI API
-
-         Args:
-             inputs (dict): The input prompts and history. Input format:
-                 {'messages': [
-                     {'role': 'system', 'content': 'You are a helpful assistant.'},
-                     {'role': 'user', 'content': 'Who won the world series in 2020?'},
-                     {'role': 'assistant', 'content': 'The Los Angeles Dodgers won the World Series in 2020.'},
-                     ]
-                  'history': [
-                     {'role': 'system', 'content': 'Hello'},
-                     {'role': 'user', 'content': 'Hi'}]
-                 }
-
-             kwargs (dict): Could be inference configuration. Default: None.
-                 cfg format: {'max_length': 1024}
-
-         Returns: The result format:
-             {
-                 'choices': [
-                     {
-                         'index': 0,
-                         'message': {
-                             'content': 'The 2020 World Series was played in Texas at Globe Life Field in Arlington.',
-                             'role': 'assistant'
-                         }
-                     }
-                 ],
-                 'created': 1677664795,
-                 # For models on the ModelScope or HuggingFace, concat model_id and revision with "-".
-                 'model': 'gpt-3.5-turbo-0613',
-                 'object': 'chat.completion',
-                 'usage': {
-                     'completion_tokens': 17,
-                     'prompt_tokens': 57,
-                     'total_tokens': 74
-                 }
-             }
-         """
-         raise NotImplementedError
-
-
- # TODO: Remove this class after refactoring all models
- class OpenAIModel(ChatBaseModel):
-     """
-     APIs of OpenAI models.
-     Available models: gpt-3.5-turbo, gpt-4
-     """
-
-     MAX_RETRIES = 3
-
-     def __init__(self, model_cfg: dict, **kwargs):
-         super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs)
-
-         openai_api_key = os.environ.get('OPENAI_API_KEY', None)
-         self.api_key = self.model_cfg.get('api_key', openai_api_key)
-
-         if not self.api_key:
-             logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY')
-             # raise ValueError(
-             #     'OpenAI API key is not provided, '
-             #     'please set it in environment variable OPENAI_API_KEY')
-
-     def predict(self, model_id: str, inputs: dict, **kwargs) -> dict:
-
-         sys_prompt: str = inputs.get('sys_prompt', '')
-         user_prompt: str = inputs.get('user_prompt', '')
-
-         # model_id: str = kwargs.get('model_id', '')
-         temperature: float = kwargs.pop('temperature', 0.2)
-         max_tokens: int = kwargs.pop('max_tokens', 1024)
-         mode: str = kwargs.pop('mode', 'chat.completion')
-
-         logger.info(f'Using OpenAI model_id: {model_id}')
-
-         res = self._predict(
-             model_id=model_id,
-             sys_prompt=sys_prompt,
-             user_prompt=user_prompt,
-             temperature=temperature,
-             max_tokens=max_tokens,
-             mode=mode)
-
-         return res
-
-     def _predict(
-         self,
-         model_id,
-         sys_prompt,
-         user_prompt,
-         temperature,
-         max_tokens,
-         mode: str = 'chat.completion',
-     ) -> dict:
-         import openai
-
-         res = {}
-         openai.api_key = self.api_key
-
-         for i in range(self.MAX_RETRIES):
-             try:
-                 if mode == 'chat.completion':
-                     resp = openai.ChatCompletion.create(
-                         model=model_id,
-                         messages=[{
-                             'role': 'system',
-                             'content': sys_prompt
-                         }, {
-                             'role': 'user',
-                             'content': user_prompt
-                         }],
-                         temperature=temperature,
-                         max_tokens=max_tokens)
-
-                     if resp:
-                         ans_text = resp['choices'][0]['message']['content']
-                         model_id = resp['model']
-                     else:
-                         logger.warning(f'OpenAI GPT API call failed: got empty response '
-                                        f'for input {sys_prompt} {user_prompt}')
-                         ans_text = ''
-                         model_id = ''
-
-                     res['ans_text'] = ans_text
-                     res['model_id'] = model_id
-                 else:
-                     raise ValueError(f'Invalid mode: {mode}')
-
-                 return res
-
-             except Exception as e:
-                 logger.warning(f'OpenAI API call failed: {e}')
-                 time.sleep(3)
-         logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries')
-         return res
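
The deleted OpenAIModel above still targeted the legacy pre-1.0 openai SDK (module-level openai.api_key and openai.ChatCompletion.create), which no longer exists in openai>=1.0. For reference, a minimal sketch of the equivalent request against the current client is shown below; it is an illustration only, not code from evalscope, and the predict helper is hypothetical.

# Illustrative sketch only -- not evalscope code. The deleted OpenAIModel relied on
# the legacy pre-1.0 interface; with openai>=1.0 the same request goes through an
# explicit client object instead of module-level state.
import os

from openai import OpenAI

client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))


def predict(model_id: str, sys_prompt: str, user_prompt: str,
            temperature: float = 0.2, max_tokens: int = 1024) -> dict:
    """Return a result dict shaped like the deleted OpenAIModel output (hypothetical helper)."""
    resp = client.chat.completions.create(
        model=model_id,
        messages=[
            {'role': 'system', 'content': sys_prompt},
            {'role': 'user', 'content': user_prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return {'ans_text': resp.choices[0].message.content, 'model_id': resp.model}
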
@@ -1 +0,0 @@
- # Copyright (c) Alibaba, Inc. and its affiliates.
evalscope/registry/config/cfg_arena.yaml DELETED
@@ -1,77 +0,0 @@
- # input raw data
- question_file: registry/data/question.jsonl
-
- # candidate models to be battled
- answers_gen:
-   chatglm3-6b:
-     # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
-     model_id_or_path: ZhipuAI/chatglm3-6b # model_id on modelscope
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model
-     template_type: chatglm3 # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.333
-     # output predicted answer file name
-     output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
-   Baichuan2-7B-Base:
-     model_id_or_path: baichuan-inc/Baichuan2-7B-Base
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: false # enable or disable this model
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
-   Qwen-7B:
-     model_id_or_path: qwen/Qwen-7B
-     revision: v1.1.8 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model # TODO: tokenizer issue
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
- # Auto-reviewer(GPT-4) config
- reviews_gen:
-   enable: true
-   reviewer:
-     # class reference of auto reviewer(GPT-4)
-     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-     args:
-       max_tokens: 1024
-       temperature: 0.2
-       # options: pairwise, pairwise_baseline, single (default is pairwise)
-       mode: pairwise
-       # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
-       position_bias_mitigation: NULL
-       # completion parser config, default is lmsys_parser
-       fn_completion_parser: lmsys_parser
-   # prompt templates for auto reviewer(GPT-4)
-   prompt_file: registry/data/prompt_template/prompt_templates.jsonl
-   # target answer files list to be reviewed,
-   # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
-   # Default is NULL, which means all answers in answers_gen will be reviewed
-   target_answers: NULL
-   # output file name of auto reviewer
-   review_file: registry/data/arena/reviews/review_gpt4.jsonl
-
- # rating results
- rating_gen:
-   enable: true
-   metrics: ['elo']
-   # elo rating report file name
-   report_file: registry/data/arena/reports/elo_rating_origin.csv
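
The reviewer.ref field in the arena configs is a "package.module:ClassName" reference string. A minimal sketch of how such a reference can be resolved with importlib is shown below; the resolve_ref helper is hypothetical rather than evalscope's actual loader, and the commented instantiation assumes the config's args mapping is simply forwarded as keyword arguments.

# Illustrative sketch only -- resolve_ref is a hypothetical helper, not evalscope's loader.
import importlib


def resolve_ref(ref: str):
    """Resolve a 'package.module:ClassName' reference string to the class object."""
    module_path, _, class_name = ref.partition(':')
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


# Only resolves on evalscope versions that still ship the reviewer module (it is removed in 0.17.x):
# reviewer_cls = resolve_ref('evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4')
# reviewer = reviewer_cls(**config['reviews_gen']['reviewer']['args'])  # assumes args map to plain kwargs
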
evalscope/registry/config/cfg_arena_zhihu.yaml DELETED
@@ -1,63 +0,0 @@
- # input raw data
- question_file: registry/data/question.jsonl
-
- # candidate models to be battled
- answers_gen:
-   Qwen2-7B-Instruct:
-     model_id_or_path: /mnt/data/data/user/maoyunlin.myl/models/Qwen2-7B-Instruct # model_id on modelscope
-     revision: NULL # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model
-     template_type: default-generation # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
-     generation_config:
-       do_sample: true
-       max_new_tokens: 512
-       top_k: 20
-       top_p: 0.9
-       temperature: 0.7
-     # output predicted answer file name
-     output_file: registry/data/arena/answers/answer_qwen2.jsonl
-   Qwen-7B:
-     model_id_or_path: /mnt/data/data/user/maoyunlin.myl/output/qwen2-7b-instruct/v25-20240809-113533/checkpoint-309-merged
-     revision: NULL # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 512
-       top_k: 20
-       top_p: 0.9
-       temperature: 0.7
-     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
- # Auto-reviewer(GPT-4) config
- reviews_gen:
-   enable: true
-   reviewer:
-     # class reference of auto reviewer(GPT-4)
-     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-     args:
-       max_tokens: 1024
-       temperature: 0.2
-       # options: pairwise, pairwise_baseline, single (default is pairwise)
-       mode: pairwise
-       # position bias mitigation strategy, options: swap_position, randomize_order, NULL. default is NULL
-       position_bias_mitigation: NULL
-       # completion parser config, default is lmsys_parser
-       fn_completion_parser: lmsys_parser
-   # prompt templates for auto reviewer(GPT-4)
-   prompt_file: registry/data/prompt_template/prompt_templates.jsonl
-   # target answer files list to be reviewed,
-   # could be replaced by your own path: ['/path/to/answers_model_1.jsonl', '/path/to/answers_model_2.jsonl', ...]
-   # Default is NULL, which means all answers in answers_gen will be reviewed
-   target_answers: NULL
-   # output file name of auto reviewer
-   review_file: registry/data/arena/reviews/review_gpt4.jsonl
-
- # rating results
- rating_gen:
-   enable: true
-   metrics: ['elo']
-   # elo rating report file name
-   report_file: registry/data/arena/reports/elo_rating_origin.csv
evalscope/registry/config/cfg_pairwise_baseline.yaml DELETED
@@ -1,83 +0,0 @@
- # input raw data
- question_file: registry/data/question.jsonl
-
- # candidate models to be battled
- answers_gen:
-   chatglm3-6b:
-     # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
-     model_id_or_path: ZhipuAI/chatglm3-6b # model_id on modelscope
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model
-     template_type: chatglm3 # see: https://github.com/modelscope/swift/blob/main/docs/source/LLM/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.md
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     # output predicted answer file name
-     output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
-   Baichuan2-7B-Base:
-     model_id_or_path: baichuan-inc/Baichuan2-7B-Base
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: false # enable or disable this model
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
-   Qwen-7B:
-     model_id_or_path: qwen/Qwen-7B
-     revision: v1.1.8 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model # TODO: tokenizer issue
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
- # model of auto-reviewer
- reviews_gen:
-   enable: true
-   reviewer:
-     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-     args:
-       model: gpt-4
-       max_tokens: 1024
-       temperature: 0
-       # pairwise comparison against baseline
-       mode: pairwise_baseline
-       # position bias mitigation strategy, options: swap_position, randomize_order, None. default is None
-       position_bias_mitigation: swap_position
-       # completion parser config, default is lmsys_parser
-       fn_completion_parser: lmsys_parser
-   # target answers list to be reviewed, could be replaced by your own path: /path/to/answers.jsonl
-   target_answers: [registry/data/arena/answers/answer_chatglm3-6b.jsonl,
-                    registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl]
-   # the path to the outputs of the baseline model
-   baseline_file: registry/data/arena/answers/answer_text_davinci_003.jsonl
-   # the path to the reference answers
-   reference_file:
-   # prompt templates for auto reviewer(GPT-4)
-   prompt_file: registry/data/prompt_template/lmsys_v2.jsonl
-   # output file of auto reviewer
-   review_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
-   # cache file of auto reviewer
-   cache_file: registry/data/arena/reviews/review_gpt4_pair_baseline.jsonl
-
- # rating results
- rating_gen:
-   enable: true
-   metrics: ['pairwise']
-   baseline_model: text_davinci_003
-   # elo rating report file
-   report_file: registry/data/arena/reports/rating_pairwise_baseline.csv
evalscope/registry/config/cfg_single.yaml DELETED
@@ -1,78 +0,0 @@
- # input raw data
- question_file: registry/data/question.jsonl
-
- # candidate models to be battled
- answers_gen:
-   chatglm3-6b:
-     # model_id_or_path could be local absolute path, e.g. /to/path/.cache/modelscope/ZhipuAI/chatglm3-6b
-     model_id_or_path: ZhipuAI/chatglm3-6b # model_id on modelscope
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model
-     template_type: chatglm3
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     # output predicted answer file name
-     output_file: registry/data/arena/answers/answer_chatglm3-6b.jsonl
-   Baichuan2-7B-Base:
-     model_id_or_path: baichuan-inc/Baichuan2-7B-Base
-     revision: v1.0.2 # revision of model, default is NULL
-     precision: torch.float16
-     enable: false # enable or disable this model
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl
-   Qwen-7B:
-     model_id_or_path: qwen/Qwen-7B
-     revision: v1.1.8 # revision of model, default is NULL
-     precision: torch.float16
-     enable: true # enable or disable this model # TODO: tokenizer issue
-     template_type: default-generation
-     generation_config:
-       do_sample: true
-       max_new_tokens: 256
-       top_k: 20
-       top_p: 0.75
-       temperature: 0.3
-     output_file: registry/data/arena/answers/answer_Qwen-7B.jsonl
-
- # model of auto-reviewer
- reviews_gen:
-   enable: true
-   reviewer:
-     ref: evalscope.evaluator.reviewer.auto_reviewer:AutoReviewerGpt4
-     args:
-       model: gpt-4
-       max_tokens: 1024
-       temperature: 0
-       # pairwise comparison against baseline
-       mode: single
-       # completion parser config, default is lmsys_parser
-       fn_completion_parser: lmsys_parser
-   # target answers list to be reviewed, could be replaced by your own path: /path/to/answers.jsonl
-   target_answers: [registry/data/arena/answers/answer_chatglm3-6b.jsonl,
-                    registry/data/arena/answers/answer_Baichuan2-7B-Base.jsonl]
-   # the path to the reference answers
-   reference_file:
-   # prompt templates for auto reviewer(GPT-4)
-   prompt_file: registry/data/prompt_template/lmsys_v2.jsonl
-   # output file of auto reviewer
-   review_file: registry/data/arena/reviews/review_gpt4_single.jsonl
-   # cache file of auto reviewer
-   cache_file: registry/data/arena/reviews/review_gpt4_single.jsonl
-
- # rating results
- rating_gen:
-   enable: true
-   metrics: ['score']
-   # elo rating report file
-   report_file: registry/data/arena/reports/rating_single.csv
@@ -1,8 +0,0 @@
- {"name": "pair-v2", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any positional biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[A]]"}
- {"name": "pair-v2-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. You should choose the assistant that follows the user's instructions and answers the user's questions better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. You should focus on who provides a better answer to the second user question. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any positional biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"}
- {"name": "pair-math-v1", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Identify and correct any mistakes. Avoid any positional biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for math questions", "category": ["math", "reasoning", "coding"], "output_format": "[[A]]"}
- {"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any positional biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": ["math", "reasoning", "coding"], "output_format": "[[A]]"}
- {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
- {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": ["math", "reasoning", "coding"], "output_format": "[[rating]]"}
- {"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
- {"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": ["math", "reasoning", "coding"], "output_format": "[[rating]]"}
@@ -1,8 +0,0 @@
- {"name": "pair-v2", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. You should choose the assistant that follows the user's instructions and answers the user's question better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. Begin your evaluation by comparing the two responses and provide a short explanation. Avoid any positional biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[A]]"}
- {"name": "pair-v2-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. You should choose the assistant that follows the user's instructions and answers the user's questions better. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of their responses. You should focus on who provides a better answer to the second user question. Begin your evaluation by comparing the responses of the two assistants and provide a short explanation. Avoid any positional biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": "general", "output_format": "[[A]]"}
- {"name": "pair-math-v1", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer, assistant A's answer, and assistant B's answer. Your job is to evaluate which assistant's answer is better. Begin your evaluation by comparing both assistants' answers with the reference answer. Identify and correct any mistakes. Avoid any positional biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "[User Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant A's Answer]\n{answer_a}\n[The End of Assistant A's Answer]\n\n[The Start of Assistant B's Answer]\n{answer_b}\n[The End of Assistant B's Answer]", "description": "Prompt for math questions", "category": ["math", "reasoning", "coding"], "output_format": "[[A]]"}
- {"name": "pair-math-v1-multi-turn", "type": "pairwise", "system_prompt": "Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user questions. Your evaluation should consider correctness and helpfulness. You will be given reference answers, the assistant A's answers, the assistant B's answers. Your job is to determine which assistant provides correct and helpful answers to the second user question. Begin your evaluation by comparing both assistants' answers with the reference answers. Identify and correct any mistakes. Avoid any positional biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: \"[[A]]\" if assistant A is better, \"[[B]]\" if assistant B is better, and \"[[C]]\" for a tie.", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_a_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_a_2}\n\n<|The End of Assistant A's Conversation with User|>\n\n\n<|The Start of Assistant B's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant B:\n{answer_b_1}\n\n### User:\n{question_2}\n\n### Assistant B:\n{answer_b_2}\n\n<|The End of Assistant B's Conversation with User|>", "description": "Prompt for multi-turn general questions", "category": ["math", "reasoning", "coding"], "output_format": "[[A]]"}
- {"name": "single-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
- {"name": "single-math-v1", "type": "single", "system_prompt": "You are a helpful assistant.", "prompt_template": "[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n[Question]\n{question}\n\n[The Start of Reference Answer]\n{ref_answer_1}\n[The End of Reference Answer]\n\n[The Start of Assistant's Answer]\n{answer}\n[The End of Assistant's Answer]", "description": "Prompt for general questions", "category": ["math", "reasoning", "coding"], "output_format": "[[rating]]"}
- {"name": "single-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response. You evaluation should focus on the assistant's answer to the second user question. Begin your evaluation by providing a short explanation. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": "general", "output_format": "[[rating]]"}
- {"name": "single-math-v1-multi-turn", "type": "single", "system_prompt": "Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question. Your evaluation should consider correctness and helpfulness. You will be given a reference answer and the assistant's answer. You evaluation should focus on the assistant's answer to the second question. Begin your evaluation by comparing the assistant's answer with the reference answer. Identify and correct any mistakes. Be as objective as possible. After providing your explanation, you must rate the response on a scale of 1 to 10 by strictly following this format: \"[[rating]]\", for example: \"Rating: [[5]]\".\n\n", "prompt_template": "<|The Start of Reference Answer|>\n\n### User:\n{question_1}\n\n### Reference answer:\n{ref_answer_1}\n\n### User:\n{question_2}\n\n### Reference answer:\n{ref_answer_2}\n\n<|The End of Reference Answer|>\n\n\n<|The Start of Assistant A's Conversation with User|>\n\n### User:\n{question_1}\n\n### Assistant A:\n{answer_1}\n\n### User:\n{question_2}\n\n### Assistant A:\n{answer_2}\n\n<|The End of Assistant A's Conversation with User|>", "description": "Prompt for general questions", "category": ["math", "reasoning", "coding"], "output_format": "[[rating]]"}