evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalscope/app/app.py +9 -762
- evalscope/app/constants.py +1 -0
- evalscope/app/ui/__init__.py +20 -0
- evalscope/app/ui/app_ui.py +52 -0
- evalscope/app/ui/multi_model.py +323 -0
- evalscope/app/ui/sidebar.py +42 -0
- evalscope/app/ui/single_model.py +202 -0
- evalscope/app/ui/visualization.py +36 -0
- evalscope/app/utils/data_utils.py +178 -0
- evalscope/app/utils/localization.py +221 -0
- evalscope/app/utils/text_utils.py +119 -0
- evalscope/app/utils/visualization.py +91 -0
- evalscope/backend/opencompass/backend_manager.py +2 -1
- evalscope/backend/rag_eval/backend_manager.py +2 -1
- evalscope/backend/rag_eval/utils/embedding.py +1 -1
- evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
- evalscope/benchmarks/__init__.py +15 -1
- evalscope/benchmarks/aime/aime24_adapter.py +2 -1
- evalscope/benchmarks/aime/aime25_adapter.py +2 -1
- evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
- evalscope/benchmarks/arc/arc_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
- evalscope/benchmarks/arena_hard/utils.py +0 -12
- evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
- evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
- evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
- evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
- evalscope/benchmarks/data_adapter.py +29 -9
- evalscope/benchmarks/general_arena/__init__.py +0 -0
- evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
- evalscope/benchmarks/general_arena/utils.py +226 -0
- evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
- evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
- evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
- evalscope/benchmarks/hle/__init__.py +0 -0
- evalscope/benchmarks/hle/hle_adapter.py +118 -0
- evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
- evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
- evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
- evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
- evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
- evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
- evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
- evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
- evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
- evalscope/benchmarks/musr/musr_adapter.py +1 -1
- evalscope/benchmarks/race/race_adapter.py +1 -1
- evalscope/benchmarks/tau_bench/__init__.py +0 -0
- evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
- evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
- evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
- evalscope/benchmarks/utils.py +2 -2
- evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
- evalscope/config.py +8 -123
- evalscope/constants.py +5 -21
- evalscope/evaluator/__init__.py +1 -1
- evalscope/evaluator/evaluator.py +20 -15
- evalscope/metrics/__init__.py +9 -1
- evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
- evalscope/metrics/llm_judge.py +106 -20
- evalscope/metrics/metrics.py +20 -8
- evalscope/models/__init__.py +4 -8
- evalscope/models/adapters/__init__.py +4 -9
- evalscope/models/adapters/base_adapter.py +4 -0
- evalscope/models/adapters/bfcl_adapter.py +2 -0
- evalscope/models/adapters/chat_adapter.py +3 -0
- evalscope/models/adapters/choice_adapter.py +4 -0
- evalscope/models/adapters/custom_adapter.py +7 -3
- evalscope/models/adapters/server_adapter.py +4 -2
- evalscope/models/adapters/t2i_adapter.py +3 -0
- evalscope/models/adapters/tau_bench_adapter.py +189 -0
- evalscope/models/custom/dummy_model.py +3 -3
- evalscope/models/register.py +0 -14
- evalscope/perf/arguments.py +15 -16
- evalscope/perf/benchmark.py +38 -39
- evalscope/perf/http_client.py +30 -86
- evalscope/perf/main.py +3 -3
- evalscope/perf/plugin/__init__.py +3 -2
- evalscope/perf/plugin/api/__init__.py +4 -3
- evalscope/perf/plugin/api/base.py +22 -4
- evalscope/perf/plugin/api/custom_api.py +212 -55
- evalscope/perf/plugin/api/dashscope_api.py +4 -10
- evalscope/perf/plugin/api/default_api.py +105 -0
- evalscope/perf/plugin/api/openai_api.py +17 -19
- evalscope/perf/plugin/datasets/__init__.py +10 -7
- evalscope/perf/plugin/datasets/base.py +22 -1
- evalscope/perf/plugin/datasets/custom.py +2 -1
- evalscope/perf/plugin/datasets/flickr8k.py +4 -27
- evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
- evalscope/perf/plugin/datasets/line_by_line.py +2 -1
- evalscope/perf/plugin/datasets/longalpaca.py +2 -1
- evalscope/perf/plugin/datasets/openqa.py +2 -1
- evalscope/perf/plugin/datasets/random_dataset.py +15 -4
- evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
- evalscope/perf/plugin/registry.py +36 -16
- evalscope/perf/utils/analysis_result.py +24 -23
- evalscope/perf/utils/benchmark_util.py +14 -20
- evalscope/perf/utils/db_util.py +79 -61
- evalscope/report/__init__.py +1 -1
- evalscope/report/utils.py +34 -15
- evalscope/run.py +1 -1
- evalscope/summarizer.py +1 -2
- evalscope/utils/__init__.py +63 -2
- evalscope/utils/argument_utils.py +64 -0
- evalscope/utils/import_utils.py +16 -0
- evalscope/utils/io_utils.py +55 -4
- evalscope/utils/model_utils.py +37 -1
- evalscope/version.py +2 -2
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
- tests/aigc/test_t2i.py +1 -1
- tests/cli/test_all.py +68 -4
- tests/cli/test_collection.py +1 -1
- tests/cli/test_custom.py +261 -0
- tests/cli/test_run.py +34 -70
- tests/perf/test_perf.py +31 -4
- tests/rag/test_clip_benchmark.py +2 -1
- tests/rag/test_mteb.py +3 -1
- tests/rag/test_ragas.py +3 -1
- tests/swift/test_run_swift_eval.py +2 -1
- tests/swift/test_run_swift_vlm_eval.py +2 -1
- tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
- tests/utils.py +13 -0
- tests/vlm/test_vlmeval.py +8 -2
- evalscope/evaluator/rating_eval.py +0 -157
- evalscope/evaluator/reviewer/__init__.py +0 -1
- evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
- evalscope/models/model.py +0 -189
- evalscope/registry/__init__.py +0 -1
- evalscope/registry/config/cfg_arena.yaml +0 -77
- evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
- evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
- evalscope/registry/config/cfg_single.yaml +0 -78
- evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
- evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
- evalscope/registry/data/qa_browser/battle.jsonl +0 -634
- evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
- evalscope/registry/data/question.jsonl +0 -80
- evalscope/registry/tasks/arc.yaml +0 -28
- evalscope/registry/tasks/bbh.yaml +0 -26
- evalscope/registry/tasks/bbh_mini.yaml +0 -26
- evalscope/registry/tasks/ceval.yaml +0 -27
- evalscope/registry/tasks/ceval_mini.yaml +0 -26
- evalscope/registry/tasks/cmmlu.yaml +0 -27
- evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
- evalscope/registry/tasks/general_qa.yaml +0 -27
- evalscope/registry/tasks/gsm8k.yaml +0 -29
- evalscope/registry/tasks/mmlu.yaml +0 -29
- evalscope/registry/tasks/mmlu_mini.yaml +0 -27
- evalscope/run_arena.py +0 -202
- evalscope/utils/arena_utils.py +0 -217
- evalscope/utils/completion_parsers.py +0 -82
- /evalscope/{utils → benchmarks}/filters.py +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
- {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
evalscope/metrics/llm_judge.py
CHANGED
@@ -2,17 +2,21 @@ import os
 import re
 from typing import Any, Dict, List, Optional
 
+from evalscope.constants import JudgeScoreType
 from evalscope.utils.logger import get_logger
 
 logger = get_logger()
 
 DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.
 
-Question
+[Question]
+{question}
 
-Reference Answer
+[Reference Answer]
+{gold}
 
-
+[Predicted Answer]
+{pred}
 
 Evaluate the model's answer based on correctness compared to the reference answer.
 Grade the predicted answer of this new question as one of:
@@ -22,6 +26,18 @@ B: INCORRECT
 Just return the letters "A" or "B", with no text around it.
 """ # noqa: E501
 
+
+DEFAULT_NUMERIC_SCORE_TEMPLATE = """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
+Begin your evaluation by providing a short explanation. Be as objective as possible.
+After providing your explanation, you must rate the response on a scale of 0 (worst) to 1 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[0.5]]\"
+
+[Question]
+{question}
+
+[Response]
+{pred}
+""" # noqa: E501
+
 DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
 DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
 
@@ -31,14 +47,18 @@ class LLMJudge:
     A metric that uses LLM to judge the quality of model predictions by comparing them with reference answers.
     """
 
-    def __init__(
-
-
-
-
-
-
-
+    def __init__(
+            self,
+            api_key: Optional[str] = None,
+            api_url: Optional[str] = None,
+            model_id: Optional[str] = None,
+            system_prompt: Optional[str] = None,
+            prompt_template: Optional[str] = None,
+            generation_config: Optional[Dict[str, Any]] = None,
+            score_pattern: Optional[str] = None,
+            score_mapping: Optional[Dict[str, float]] = None,
+            score_type: str = JudgeScoreType.PATTERN,  # 'pattern', 'numeric'
+            **kwargs):
         """
         Initialize LLMJudge metric.
 
@@ -49,14 +69,34 @@ class LLMJudge:
             system_prompt (str, optional): System prompt for the judge
             prompt_template (str, optional): Prompt template for the judge
             generation_config (dict, optional): Generation configuration for the judge
+            score_pattern (str, optional): Regex pattern to extract score from LLM response
+            score_mapping (dict, optional): Mapping from extracted score to float value
+            score_type (str, optional): Type of score extraction strategy ('pattern', 'numeric') defaults to 'pattern'.
+                - 'pattern': Use score_pattern and score_mapping to extract categorical scores
+                - 'numeric': Treat the extracted value as a direct numerical score
         """
         self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
         self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
         self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
         self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
-        self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
         self.generation_config = generation_config or {}
 
+        # Default score mapping for A/B pattern
+        self.score_type = score_type
+        if self.score_type == JudgeScoreType.NUMERIC:
+            self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
+            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
+                                                                     DEFAULT_NUMERIC_SCORE_TEMPLATE)
+        elif self.score_type == JudgeScoreType.PATTERN:
+            self.score_pattern = score_pattern or r'(A|B)'
+            self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
+        else:
+            raise ValueError(f"Invalid score_type: {self.score_type}. Must be 'pattern' or 'numeric'.")
+        self.score_mapping = score_mapping or {'A': 1.0, 'B': 0.0}
+
+        self._init_server_adapter()
+
+    def _init_server_adapter(self):
        from evalscope.models import ServerModelAdapter
 
        # Initialize ServerModelAdapter
@@ -95,17 +135,63 @@ class LLMJudge:
     def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
         if question is None:
             question = 'Not provided'
-
+
+        # check variables in prompt_template
+        prompt = self.prompt_template
+        if '{question}' in self.prompt_template:
+            prompt = prompt.replace('{question}', question)
+        if '{pred}' in self.prompt_template:
+            prompt = prompt.replace('{pred}', pred)
+        if '{gold}' in self.prompt_template:
+            prompt = prompt.replace('{gold}', gold)
+        return prompt
 
     def get_score(self, response: str) -> float:
+        """
+        Extract score from LLM response using the configured pattern and mapping.
+
+        Args:
+            response (str): The response from the LLM
+
+        Returns:
+            float: The numeric score extracted from the response
+        """
         if response is None:
-            return 0
-
+            return 0.0
+
+        # choose extraction method based on score_type
+        if self.score_type == JudgeScoreType.NUMERIC:
+            return self._extract_numeric_score(response)
+        elif self.score_type == JudgeScoreType.PATTERN:
+            return self._extract_pattern_score(response)
+
+    def _extract_numeric_score(self, response: str) -> Optional[float]:
+        """extract numeric score from the response using the score_pattern"""
+        match = re.search(self.score_pattern, response)
+
+        if match:
+            # try to convert each captured group to float
+            for group in match.groups():
+                if group is not None:
+                    try:
+                        return float(group)
+                    except (ValueError, TypeError):
+                        continue
+
+            # if not found in groups, try the whole match
+            try:
+                return float(match.group(0))
+            except (ValueError, TypeError):
+                logger.warning(f'Failed to convert any extracted value to float from: {match.group(0)}')
+
+        return None
+
+    def _extract_pattern_score(self, response: str) -> float:
+        """use the score_pattern to extract categorical scores"""
+        match = re.search(self.score_pattern, response)
         if match:
             answer = match.group(0)
-
-                return 1
-            elif answer == 'B':
-                return 0
+            return self.score_mapping.get(answer, 0.0)
         else:
-
+            logger.warning(f"No match found for pattern '{self.score_pattern}' in response: {response}")
+            return 0.0
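Note on the change above: LLMJudge now switches between two scoring modes via score_type. In 'pattern' mode it greps an A/B verdict and maps it through score_mapping; in 'numeric' mode it pulls a float out of a "Rating: [[0.5]]" style response. A minimal standalone sketch of those two extraction paths (the regexes and the {'A': 1.0, 'B': 0.0} mapping are copied from the added code; parse_judge_response is our own helper name for illustration, not part of the evalscope API):

import re
from typing import Dict, Optional

# Defaults copied from the added LLMJudge.__init__ branches.
PATTERN_REGEX = r'(A|B)'
NUMERIC_REGEX = r'\[\[(\d+(?:\.\d+)?)\]\]'
AB_MAPPING: Dict[str, float] = {'A': 1.0, 'B': 0.0}


def parse_judge_response(response: str, score_type: str = 'pattern') -> Optional[float]:
    """Hypothetical helper mirroring the two branches of LLMJudge.get_score."""
    if score_type == 'numeric':
        match = re.search(NUMERIC_REGEX, response)
        return float(match.group(1)) if match else None
    match = re.search(PATTERN_REGEX, response)
    return AB_MAPPING.get(match.group(0), 0.0) if match else 0.0


print(parse_judge_response('A'))                                      # -> 1.0
print(parse_judge_response('Rating: [[0.5]]', score_type='numeric'))  # -> 0.5

Either way the judge reduces the raw LLM response to a float, which is what the evaluator aggregates.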
evalscope/metrics/metrics.py
CHANGED
@@ -9,7 +9,7 @@ import random
 import sacrebleu
 from collections import defaultdict
 from collections.abc import Iterable
-from typing import
+from typing import Dict, List, Union
 
 
 def mean(arr: list):
@@ -22,16 +22,28 @@ def mean(arr: list):
 
 
 def pass_at_k(arr: Union[List[int], List[List[int]]], k: int = 1) -> float:
+    """
+    Calculates the pass@k metric using the calculate_pass_at_k function.
+
+    Args:
+        arr: List of binary values (1 for correct, 0 for incorrect) or list of such lists
+        k: Number of attempts allowed
+
+    Returns:
+        The average pass@k score across all problems
+    """
     if not arr:
         return 0.0
+    if not isinstance(arr[0], list):
+        # If arr is a simple list of binary results, convert it to a list of lists
+        arr = [arr]
 
-
-
+    # For list of lists case, each inner list represents attempts for one problem
+    num_samples = [len(sub_arr) for sub_arr in arr]
+    num_correct = [sum(sub_arr) for sub_arr in arr]
+    pass_at_k_values = calculate_pass_at_k(num_samples, num_correct, k)
 
-
-        return sum(sub_pass_at_k(sub_arr) for sub_arr in arr) / len(arr)
-    else:
-        return sum(arr) / len(arr)
+    return float(np.mean(pass_at_k_values))
 
 
 def pop_stddev(arr):
@@ -223,7 +235,7 @@ def chrf(items):
     Source: https://github.com/m-popovic/chrF
     Paper: https://www.aclweb.org/anthology/W15-3049.pdf
 
-    Higher is better
+    Higher is better
     """
     refs = list(zip(*items))[0]
     preds = list(zip(*items))[1]
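For reference, pass_at_k now reduces each problem to (number of attempts, number of correct attempts) and delegates to calculate_pass_at_k before averaging. A sketch of the standard unbiased pass@k estimator from Chen et al. (2021), which is presumably what calculate_pass_at_k computes (the unbiased_pass_at_k name here is illustrative):

import numpy as np


def unbiased_pass_at_k(n: int, c: int, k: int) -> float:
    """Standard pass@k estimator (Chen et al., 2021): 1 - C(n-c, k) / C(n, k)."""
    if n - c < k:
        return 1.0
    return float(1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))


# Two problems with 4 attempts each: the first solved twice, the second never solved.
attempts = [[1, 0, 1, 0], [0, 0, 0, 0]]
scores = [unbiased_pass_at_k(len(a), sum(a), k=1) for a in attempts]
print(float(np.mean(scores)))  # -> 0.25

With k=1 this reduces to the per-problem fraction of correct attempts, averaged over problems, which matches the old behaviour for the simple list case.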
evalscope/models/__init__.py
CHANGED
@@ -4,12 +4,11 @@ from typing import TYPE_CHECKING
 from evalscope.utils.import_utils import _LazyModule
 
 if TYPE_CHECKING:
-    from .adapters import (BaseModelAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
+    from .adapters import (BaseModelAdapter, BFCLAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
                            CustomModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter, T2IModelAdapter,
-                           initialize_model_adapter)
+                           TauBenchAdapter, initialize_model_adapter)
     from .custom import CustomModel, DummyCustomModel
     from .local_model import LocalModel, get_local_model
-    from .model import BaseModel, ChatBaseModel, OpenAIModel
     from .register import get_model_adapter
 
 else:
@@ -23,6 +22,8 @@ else:
             'CustomModelAdapter',
             'ServerModelAdapter',
             'T2IModelAdapter',
+            'TauBenchAdapter',
+            'BFCLAdapter',
         ],
         'custom': [
             'CustomModel',
@@ -32,11 +33,6 @@ else:
             'LocalModel',
             'get_local_model',
         ],
-        'model': [
-            'BaseModel',
-            'ChatBaseModel',
-            'OpenAIModel',
-        ],
         'register': [
             'get_model_adapter',
         ],
evalscope/models/adapters/__init__.py
CHANGED
@@ -5,15 +5,10 @@ from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdap
 from .custom_adapter import CustomModelAdapter
 from .server_adapter import ServerModelAdapter
 from .t2i_adapter import T2IModelAdapter
+from .tau_bench_adapter import TauBenchAdapter
 
 __all__ = [
-    'initialize_model_adapter',
-    '
-    '
-    'ContinuationLogitsModelAdapter',
-    'MultiChoiceModelAdapter',
-    'CustomModelAdapter',
-    'ServerModelAdapter',
-    'BFCLAdapter',
-    'T2IModelAdapter',
+    'initialize_model_adapter', 'BaseModelAdapter', 'ChatGenerationModelAdapter', 'ContinuationLogitsModelAdapter',
+    'MultiChoiceModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter', 'BFCLAdapter', 'T2IModelAdapter',
+    'TauBenchAdapter'
 ]
evalscope/models/adapters/base_adapter.py
CHANGED
@@ -53,7 +53,10 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
     if task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
 
         if 'server' not in model_adapter_cls_str:
+            logger.warning(f'Output type {model_adapter_cls_str} is not supported for service evaluation. '
+                           f'Using server model adapter instead.')
             model_adapter_cls_str = 'server'
+            benchmark.model_adapter = model_adapter_cls_str
 
         # init server model adapter
         model_adapter_cls = get_model_adapter(model_adapter_cls_str)
@@ -71,6 +74,7 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
         logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}.'
                        f'Using {benchmark.output_types[0]} instead.')
         model_adapter_cls_str = benchmark.output_types[0]
+        benchmark.model_adapter = model_adapter_cls_str
 
     model_adapter_cls = get_model_adapter(model_adapter_cls_str)
     return model_adapter_cls(
evalscope/models/adapters/bfcl_adapter.py
CHANGED
@@ -4,11 +4,13 @@ import uuid
 from typing import Any, List, Optional, Union
 
 from evalscope.utils.logger import get_logger
+from ..register import register_model_adapter
 from .server_adapter import ServerModelAdapter
 
 logger = get_logger()
 
 
+@register_model_adapter(name='bfcl_server')
 class BFCLAdapter(ServerModelAdapter):
     """
     BFCL model adapter to request remote API model and generate results for BFCL evaluation.
evalscope/models/adapters/chat_adapter.py
CHANGED
@@ -3,15 +3,18 @@ import time
 import torch
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+from evalscope.constants import OutputType
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
 from evalscope.utils.logger import get_logger
 from evalscope.utils.model_utils import fix_do_sample_warning
 from ..local_model import LocalModel
+from ..register import register_model_adapter
 from .base_adapter import BaseModelAdapter
 
 logger = get_logger()
 
 
+@register_model_adapter(name=OutputType.GENERATION)
 class ChatGenerationModelAdapter(BaseModelAdapter):
     """
     Chat generation model adapter.
evalscope/models/adapters/choice_adapter.py
CHANGED
@@ -3,11 +3,14 @@ import time
 import torch
 from typing import List
 
+from evalscope.constants import OutputType
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
 from ..local_model import LocalModel
+from ..register import register_model_adapter
 from .base_adapter import BaseModelAdapter
 
 
+@register_model_adapter(name=OutputType.MULTIPLE_CHOICE)
 class MultiChoiceModelAdapter(BaseModelAdapter):
     """ The multi-choice model adapter. """
 
@@ -110,6 +113,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
         return log_probs, {'tokens': tokens}
 
 
+@register_model_adapter(name=OutputType.CONTINUOUS)
 class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
     """
     Continuation-logits model adapter.
evalscope/models/adapters/custom_adapter.py
CHANGED
@@ -1,12 +1,16 @@
-from typing import Any, Dict, List, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Union
 
-from ..
+from ..register import register_model_adapter
 from .base_adapter import BaseModelAdapter
 
+if TYPE_CHECKING:
+    from ..custom import CustomModel
 
+
+@register_model_adapter(name='custom')
 class CustomModelAdapter(BaseModelAdapter):
 
-    def __init__(self, custom_model: CustomModel, **kwargs):
+    def __init__(self, custom_model: 'CustomModel', **kwargs):
         """
         Custom model adapter.
 
evalscope/models/adapters/server_adapter.py
CHANGED
@@ -5,13 +5,15 @@ from openai.types.chat import ChatCompletion, ChatCompletionChunk
 from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
 from typing import List, Optional, Union
 
+from evalscope.utils.argument_utils import get_supported_params
 from evalscope.utils.logger import get_logger
-from
+from ..register import register_model_adapter
 from .base_adapter import BaseModelAdapter
 
 logger = get_logger()
 
 
+@register_model_adapter(name='server')
 class ServerModelAdapter(BaseModelAdapter):
     """
     Server model adapter to request remote API model and generate results.
@@ -29,7 +31,7 @@ class ServerModelAdapter(BaseModelAdapter):
         self.api_key = api_key
 
         self.client = openai.OpenAI(
-            api_key=api_key,
+            api_key=self.api_key,
             base_url=self.api_url,
         )
         self.supported_params = get_supported_params(self.client.chat.completions.create)
evalscope/models/adapters/t2i_adapter.py
CHANGED
@@ -3,15 +3,18 @@ import time
 import torch
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+from evalscope.constants import OutputType
 from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
 from evalscope.utils.io_utils import OutputsStructure
 from evalscope.utils.logger import get_logger
 from ..local_model import LocalModel
+from ..register import register_model_adapter
 from .base_adapter import BaseModelAdapter
 
 logger = get_logger()
 
 
+@register_model_adapter(name=OutputType.IMAGE_GENERATION)
 class T2IModelAdapter(BaseModelAdapter):
     """
     Text to image model adapter.
evalscope/models/adapters/tau_bench_adapter.py
ADDED
@@ -0,0 +1,189 @@
+import json
+import time
+from typing import Any, Dict, List, Optional, Union
+
+from evalscope.utils.logger import get_logger
+from ..register import register_model_adapter
+from .server_adapter import ServerModelAdapter
+
+logger = get_logger()
+
+
+@register_model_adapter(name='tau_bench_server')
+class TauBenchAdapter(ServerModelAdapter):
+    """
+    TauBench model adapter to request remote API model and generate results for TauBench evaluation.
+    Support multi-turn and single-turn function calling tasks.
+    """
+
+    def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+        """
+        Args:
+            api_url: The URL of the remote API model.
+            model_id: The ID of the remote API model.
+            api_key: The API key of the remote API model.
+        """
+        super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
+
+        self._patch_agent_solve()
+
+    def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
+        """
+        Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
+        where each list is a follow up turn in the conversation
+        each turn is a List[List[Message]]
+
+        Args:
+            inputs (List[dict]): The input data.
+            infer_cfg (dict): Inference configuration.
+
+        Returns:
+            res (List[dict]): The model prediction results.
+        """
+        infer_cfg = infer_cfg or {}
+        results = []
+
+        for input_item in inputs:
+            raw_input = input_item.get('raw_input')
+
+            res_d = self.solve(env_name=raw_input['env_name'], task_index=raw_input['task_index'], infer_cfg=infer_cfg)
+
+            wrapper_res = {
+                'choices': [{
+                    'index': 0,
+                    'message': {
+                        'content': json.dumps(res_d, ensure_ascii=False),
+                        'role': 'assistant'
+                    }
+                }],
+                'created':
+                time.time(),
+                'model':
+                self.model_id,
+                'object':
+                'chat.completion',
+                'usage': {
+                    'completion_tokens': 0,
+                    'prompt_tokens': 0,
+                    'total_tokens': 0
+                }
+            }
+
+            results.append(wrapper_res)
+
+        return results
+
+    def _patch_agent_solve(self):
+        """Patch ToolCallingAgent.solve method to use custom model configuration"""
+        from tau_bench.agents.tool_calling_agent import ToolCallingAgent, message_to_action
+        from tau_bench.envs.base import Env
+        from tau_bench.types import RESPOND_ACTION_NAME, SolveResult
+        from typing import List, Optional
+
+        def patched_solve(self,
+                          env: Env,
+                          task_index: Optional[int] = None,
+                          max_num_steps: int = 30,
+                          infer_cfg: Optional[dict] = {}) -> SolveResult:
+            env_reset_res = env.reset(task_index=task_index)
+            obs = env_reset_res.observation
+            info = env_reset_res.info.model_dump()
+            reward = 0.0
+            messages: List[Dict[str, Any]] = [
+                {
+                    'role': 'system',
+                    'content': self.wiki
+                },
+                {
+                    'role': 'user',
+                    'content': obs
+                },
+            ]
+
+            for step_index in range(max_num_steps):
+                # Use adapter's model configuration instead of agent's
+                request_json = adapter_instance.make_request(
+                    input_item={
+                        'messages': messages,
+                        'tools': self.tools_info
+                    }, infer_cfg=infer_cfg)
+                res = adapter_instance.send_request(request_json)
+
+                next_message = res['choices'][0]['message']
+                action = message_to_action(next_message)
+                env_response = env.step(action)
+                reward = env_response.reward
+                info = {**info, **env_response.info.model_dump()}
+
+                if action.name != RESPOND_ACTION_NAME:
+                    next_message['tool_calls'] = next_message['tool_calls'][:1]
+                    messages.extend([
+                        next_message,
+                        {
+                            'role': 'tool',
+                            'tool_call_id': next_message['tool_calls'][0]['id'],
+                            'name': next_message['tool_calls'][0]['function']['name'],
+                            'content': env_response.observation,
+                        },
+                    ])
+                else:
+                    messages.extend([
+                        next_message,
+                        {
+                            'role': 'user',
+                            'content': env_response.observation
+                        },
+                    ])
+                logger.debug(f'Task: {task_index} Step: {step_index} finished')
+
+                if env_response.done:
+                    break
+
+            return SolveResult(
+                reward=reward,
+                info=info,
+                messages=messages,
+                total_cost=0,
+            )
+
+        adapter_instance = self
+
+        ToolCallingAgent.solve = patched_solve
+
+        return 'ToolCallingAgent.solve patched successfully'
+
+    def solve(self, env_name, task_index, infer_cfg, **kwargs):
+        """
+        Solve a specific task in the TauBench environment.
+
+        Args:
+            env_name (str): The name of the TauBench environment.
+            task_index (int): The index of the task to solve.
+            **kwargs: Additional arguments for the task.
+
+        Returns:
+            dict: The result of the task.
+        """
+        from tau_bench.agents.tool_calling_agent import ToolCallingAgent
+        from tau_bench.envs import get_env
+
+        # This method can be implemented to solve specific tasks in the TauBench environment
+        isolated_env = get_env(
+            env_name=env_name,
+            user_strategy='llm',
+            user_model='dummy',  # Use dummy model to prevent errors
+            user_provider='openai',  # Use dummy provider to prevent errors
+            task_split='test',
+            task_index=task_index,
+        )
+        agent = ToolCallingAgent(
+            tools_info=isolated_env.tools_info,
+            wiki=isolated_env.wiki,
+            model='dummy',  # Use dummy model to prevent errors
+            provider='dummy',  # Use dummy provider to prevent errors
+            temperature=0,  # dummy temperature to prevent errors
+        )
+
+        res = agent.solve(env=isolated_env, task_index=task_index, infer_cfg=infer_cfg)
+
+        return res.model_dump()
evalscope/models/custom/dummy_model.py
CHANGED
@@ -50,14 +50,14 @@ class DummyCustomModel(CustomModel):
         # Must return a list of dicts with the same format as the OpenAI API.
         responses = []
         for input_item in original_inputs:
-            message = self.make_request_messages(input_item)
-            response = f'Dummy response for prompt: {message}'
+            # message = self.make_request_messages(input_item)
+            # response = f'Dummy response for prompt: {message}'
 
             res_d = {
                 'choices': [{
                     'index': 0,
                     'message': {
-                        'content':
+                        'content': '*PlaceHolder*',
                         'role': 'assistant'
                     }
                 }],
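The new TauBenchAdapter above relies on monkey-patching: _patch_agent_solve replaces ToolCallingAgent.solve on the class with a closure that captures the adapter (adapter_instance = self) and routes every turn through the adapter's make_request/send_request instead of tau_bench's own client. A toy sketch of that class-level patching pattern (the Agent/Adapter names here are illustrative, not tau_bench's):

class Agent:
    def solve(self, task: str) -> str:
        return f'builtin solver handled {task}'


class Adapter:
    """Toy illustration of the class-level patch used by TauBenchAdapter._patch_agent_solve."""

    def __init__(self) -> None:
        adapter_instance = self  # captured by the closure, as in the diff

        def patched_solve(agent_self, task: str) -> str:
            # Route the call through the adapter instead of the agent's own client.
            return adapter_instance.send(task)

        Agent.solve = patched_solve  # applies to every Agent instance created afterwards

    def send(self, task: str) -> str:
        return f'adapter handled {task}'


Adapter()                              # applying the patch once in the constructor
print(Agent().solve('book a flight'))  # -> adapter handled book a flight

Because the patch lands on the class, it affects all agents created later, which is why the adapter only needs to patch once in its constructor.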
evalscope/models/register.py
CHANGED
@@ -1,6 +1,3 @@
-from evalscope.constants import OutputType
-from .adapters import *
-
 MODEL_ADAPTERS = {}
 
 
@@ -42,14 +39,3 @@ def register_model_adapter_class(cls, name=None):
     if name in MODEL_ADAPTERS:
         raise ValueError(f"Model adapter class '{name}' is already registered.")
     MODEL_ADAPTERS[name] = cls
-
-
-# register all model adapters
-register_model_adapter_class(BaseModelAdapter, name='base')
-register_model_adapter_class(ChatGenerationModelAdapter, name=OutputType.GENERATION)
-register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.CONTINUOUS)
-register_model_adapter_class(MultiChoiceModelAdapter, name=OutputType.MULTIPLE_CHOICE)
-register_model_adapter_class(CustomModelAdapter, name='custom')
-register_model_adapter_class(ServerModelAdapter, name='server')
-register_model_adapter_class(BFCLAdapter, name='bfcl_server')
-register_model_adapter_class(T2IModelAdapter, name=OutputType.IMAGE_GENERATION)