evalscope 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of evalscope might be problematic.

Files changed (157)
  1. evalscope/app/app.py +9 -762
  2. evalscope/app/constants.py +1 -0
  3. evalscope/app/ui/__init__.py +20 -0
  4. evalscope/app/ui/app_ui.py +52 -0
  5. evalscope/app/ui/multi_model.py +323 -0
  6. evalscope/app/ui/sidebar.py +42 -0
  7. evalscope/app/ui/single_model.py +202 -0
  8. evalscope/app/ui/visualization.py +36 -0
  9. evalscope/app/utils/data_utils.py +178 -0
  10. evalscope/app/utils/localization.py +221 -0
  11. evalscope/app/utils/text_utils.py +119 -0
  12. evalscope/app/utils/visualization.py +91 -0
  13. evalscope/backend/opencompass/backend_manager.py +2 -1
  14. evalscope/backend/rag_eval/backend_manager.py +2 -1
  15. evalscope/backend/rag_eval/utils/embedding.py +1 -1
  16. evalscope/backend/vlm_eval_kit/backend_manager.py +4 -1
  17. evalscope/benchmarks/__init__.py +15 -1
  18. evalscope/benchmarks/aime/aime24_adapter.py +2 -1
  19. evalscope/benchmarks/aime/aime25_adapter.py +2 -1
  20. evalscope/benchmarks/alpaca_eval/alpaca_eval_adapter.py +1 -1
  21. evalscope/benchmarks/arc/arc_adapter.py +1 -1
  22. evalscope/benchmarks/arena_hard/arena_hard_adapter.py +1 -1
  23. evalscope/benchmarks/arena_hard/utils.py +0 -12
  24. evalscope/benchmarks/bfcl/bfcl_adapter.py +1 -1
  25. evalscope/benchmarks/ceval/ceval_adapter.py +5 -16
  26. evalscope/benchmarks/cmmlu/cmmlu_adapter.py +9 -21
  27. evalscope/benchmarks/competition_math/competition_math_adapter.py +2 -1
  28. evalscope/benchmarks/data_adapter.py +29 -9
  29. evalscope/benchmarks/general_arena/__init__.py +0 -0
  30. evalscope/benchmarks/general_arena/general_arena_adapter.py +411 -0
  31. evalscope/benchmarks/general_arena/utils.py +226 -0
  32. evalscope/benchmarks/general_mcq/general_mcq_adapter.py +3 -2
  33. evalscope/benchmarks/general_qa/general_qa_adapter.py +44 -30
  34. evalscope/benchmarks/hellaswag/hellaswag_adapter.py +1 -1
  35. evalscope/benchmarks/hle/__init__.py +0 -0
  36. evalscope/benchmarks/hle/hle_adapter.py +118 -0
  37. evalscope/benchmarks/humaneval/humaneval_adapter.py +5 -21
  38. evalscope/benchmarks/ifeval/ifeval_adapter.py +2 -4
  39. evalscope/benchmarks/iquiz/iquiz_adapter.py +1 -1
  40. evalscope/benchmarks/live_code_bench/live_code_bench_adapter.py +0 -6
  41. evalscope/benchmarks/maritime_bench/maritime_bench_adapter.py +1 -1
  42. evalscope/benchmarks/math_500/math_500_adapter.py +2 -1
  43. evalscope/benchmarks/mmlu/mmlu_adapter.py +2 -2
  44. evalscope/benchmarks/mmlu_pro/mmlu_pro_adapter.py +1 -1
  45. evalscope/benchmarks/mmlu_redux/mmlu_redux_adapter.py +1 -1
  46. evalscope/benchmarks/musr/musr_adapter.py +1 -1
  47. evalscope/benchmarks/race/race_adapter.py +1 -1
  48. evalscope/benchmarks/tau_bench/__init__.py +0 -0
  49. evalscope/benchmarks/tau_bench/tau_bench_adapter.py +110 -0
  50. evalscope/benchmarks/tool_bench/tool_bench_adapter.py +7 -1
  51. evalscope/benchmarks/trivia_qa/trivia_qa_adapter.py +9 -4
  52. evalscope/benchmarks/utils.py +2 -2
  53. evalscope/benchmarks/winogrande/winogrande_adapter.py +1 -1
  54. evalscope/config.py +8 -123
  55. evalscope/constants.py +5 -21
  56. evalscope/evaluator/__init__.py +1 -1
  57. evalscope/evaluator/evaluator.py +20 -15
  58. evalscope/metrics/__init__.py +9 -1
  59. evalscope/{utils/utils.py → metrics/completion_parsers.py} +71 -176
  60. evalscope/metrics/llm_judge.py +106 -20
  61. evalscope/metrics/metrics.py +20 -8
  62. evalscope/models/__init__.py +4 -8
  63. evalscope/models/adapters/__init__.py +4 -9
  64. evalscope/models/adapters/base_adapter.py +4 -0
  65. evalscope/models/adapters/bfcl_adapter.py +2 -0
  66. evalscope/models/adapters/chat_adapter.py +3 -0
  67. evalscope/models/adapters/choice_adapter.py +4 -0
  68. evalscope/models/adapters/custom_adapter.py +7 -3
  69. evalscope/models/adapters/server_adapter.py +4 -2
  70. evalscope/models/adapters/t2i_adapter.py +3 -0
  71. evalscope/models/adapters/tau_bench_adapter.py +189 -0
  72. evalscope/models/custom/dummy_model.py +3 -3
  73. evalscope/models/register.py +0 -14
  74. evalscope/perf/arguments.py +15 -16
  75. evalscope/perf/benchmark.py +38 -39
  76. evalscope/perf/http_client.py +30 -86
  77. evalscope/perf/main.py +3 -3
  78. evalscope/perf/plugin/__init__.py +3 -2
  79. evalscope/perf/plugin/api/__init__.py +4 -3
  80. evalscope/perf/plugin/api/base.py +22 -4
  81. evalscope/perf/plugin/api/custom_api.py +212 -55
  82. evalscope/perf/plugin/api/dashscope_api.py +4 -10
  83. evalscope/perf/plugin/api/default_api.py +105 -0
  84. evalscope/perf/plugin/api/openai_api.py +17 -19
  85. evalscope/perf/plugin/datasets/__init__.py +10 -7
  86. evalscope/perf/plugin/datasets/base.py +22 -1
  87. evalscope/perf/plugin/datasets/custom.py +2 -1
  88. evalscope/perf/plugin/datasets/flickr8k.py +4 -27
  89. evalscope/perf/plugin/datasets/kontext_bench.py +28 -0
  90. evalscope/perf/plugin/datasets/line_by_line.py +2 -1
  91. evalscope/perf/plugin/datasets/longalpaca.py +2 -1
  92. evalscope/perf/plugin/datasets/openqa.py +2 -1
  93. evalscope/perf/plugin/datasets/random_dataset.py +15 -4
  94. evalscope/perf/plugin/datasets/random_vl_dataset.py +80 -0
  95. evalscope/perf/plugin/registry.py +36 -16
  96. evalscope/perf/utils/analysis_result.py +24 -23
  97. evalscope/perf/utils/benchmark_util.py +14 -20
  98. evalscope/perf/utils/db_util.py +79 -61
  99. evalscope/report/__init__.py +1 -1
  100. evalscope/report/utils.py +34 -15
  101. evalscope/run.py +1 -1
  102. evalscope/summarizer.py +1 -2
  103. evalscope/utils/__init__.py +63 -2
  104. evalscope/utils/argument_utils.py +64 -0
  105. evalscope/utils/import_utils.py +16 -0
  106. evalscope/utils/io_utils.py +55 -4
  107. evalscope/utils/model_utils.py +37 -1
  108. evalscope/version.py +2 -2
  109. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/METADATA +100 -51
  110. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/RECORD +129 -133
  111. tests/aigc/test_t2i.py +1 -1
  112. tests/cli/test_all.py +68 -4
  113. tests/cli/test_collection.py +1 -1
  114. tests/cli/test_custom.py +261 -0
  115. tests/cli/test_run.py +34 -70
  116. tests/perf/test_perf.py +31 -4
  117. tests/rag/test_clip_benchmark.py +2 -1
  118. tests/rag/test_mteb.py +3 -1
  119. tests/rag/test_ragas.py +3 -1
  120. tests/swift/test_run_swift_eval.py +2 -1
  121. tests/swift/test_run_swift_vlm_eval.py +2 -1
  122. tests/swift/test_run_swift_vlm_jugde_eval.py +2 -1
  123. tests/utils.py +13 -0
  124. tests/vlm/test_vlmeval.py +8 -2
  125. evalscope/evaluator/rating_eval.py +0 -157
  126. evalscope/evaluator/reviewer/__init__.py +0 -1
  127. evalscope/evaluator/reviewer/auto_reviewer.py +0 -391
  128. evalscope/models/model.py +0 -189
  129. evalscope/registry/__init__.py +0 -1
  130. evalscope/registry/config/cfg_arena.yaml +0 -77
  131. evalscope/registry/config/cfg_arena_zhihu.yaml +0 -63
  132. evalscope/registry/config/cfg_pairwise_baseline.yaml +0 -83
  133. evalscope/registry/config/cfg_single.yaml +0 -78
  134. evalscope/registry/data/prompt_template/lmsys_v2.jsonl +0 -8
  135. evalscope/registry/data/prompt_template/prompt_templates.jsonl +0 -8
  136. evalscope/registry/data/qa_browser/battle.jsonl +0 -634
  137. evalscope/registry/data/qa_browser/category_mapping.yaml +0 -10
  138. evalscope/registry/data/question.jsonl +0 -80
  139. evalscope/registry/tasks/arc.yaml +0 -28
  140. evalscope/registry/tasks/bbh.yaml +0 -26
  141. evalscope/registry/tasks/bbh_mini.yaml +0 -26
  142. evalscope/registry/tasks/ceval.yaml +0 -27
  143. evalscope/registry/tasks/ceval_mini.yaml +0 -26
  144. evalscope/registry/tasks/cmmlu.yaml +0 -27
  145. evalscope/registry/tasks/eval_qwen-7b-chat_v100.yaml +0 -28
  146. evalscope/registry/tasks/general_qa.yaml +0 -27
  147. evalscope/registry/tasks/gsm8k.yaml +0 -29
  148. evalscope/registry/tasks/mmlu.yaml +0 -29
  149. evalscope/registry/tasks/mmlu_mini.yaml +0 -27
  150. evalscope/run_arena.py +0 -202
  151. evalscope/utils/arena_utils.py +0 -217
  152. evalscope/utils/completion_parsers.py +0 -82
  153. /evalscope/{utils → benchmarks}/filters.py +0 -0
  154. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/LICENSE +0 -0
  155. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/WHEEL +0 -0
  156. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/entry_points.txt +0 -0
  157. {evalscope-0.16.3.dist-info → evalscope-0.17.1.dist-info}/top_level.txt +0 -0
@@ -2,17 +2,21 @@ import os
  import re
  from typing import Any, Dict, List, Optional
 
+ from evalscope.constants import JudgeScoreType
  from evalscope.utils.logger import get_logger
 
  logger = get_logger()
 
  DEFAULT_PROMPT_TEMPLATE = """Your job is to look at a question, a gold target, and a predicted answer, and return a letter "A" or "B" to indicate whether the predicted answer is correct or incorrect.
 
- Question: {question}
+ [Question]
+ {question}
 
- Reference Answer: {gold}
+ [Reference Answer]
+ {gold}
 
- Model Answer: {pred}
+ [Predicted Answer]
+ {pred}
 
  Evaluate the model's answer based on correctness compared to the reference answer.
  Grade the predicted answer of this new question as one of:
@@ -22,6 +26,18 @@ B: INCORRECT
  Just return the letters "A" or "B", with no text around it.
  """ # noqa: E501
 
+
+ DEFAULT_NUMERIC_SCORE_TEMPLATE = """Please act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. Your evaluation should consider factors such as the helpfulness, relevance, accuracy, depth, creativity, and level of detail of the response.
+ Begin your evaluation by providing a short explanation. Be as objective as possible.
+ After providing your explanation, you must rate the response on a scale of 0 (worst) to 1 (best) by strictly following this format: \"[[rating]]\", for example: \"Rating: [[0.5]]\"
+
+ [Question]
+ {question}
+
+ [Response]
+ {pred}
+ """ # noqa: E501
+
  DEFAULT_JUDGE_MODEL = 'Qwen/Qwen3-235B-A22B'
  DEFAULT_API_URL = 'https://api-inference.modelscope.cn/v1/'
 
@@ -31,14 +47,18 @@ class LLMJudge:
      A metric that uses LLM to judge the quality of model predictions by comparing them with reference answers.
      """
 
-     def __init__(self,
-                  api_key: Optional[str] = None,
-                  api_url: Optional[str] = None,
-                  model_id: Optional[str] = None,
-                  system_prompt: Optional[str] = None,
-                  prompt_template: Optional[str] = None,
-                  generation_config: Optional[Dict[str, Any]] = None,
-                  **kwargs):
+     def __init__(
+             self,
+             api_key: Optional[str] = None,
+             api_url: Optional[str] = None,
+             model_id: Optional[str] = None,
+             system_prompt: Optional[str] = None,
+             prompt_template: Optional[str] = None,
+             generation_config: Optional[Dict[str, Any]] = None,
+             score_pattern: Optional[str] = None,
+             score_mapping: Optional[Dict[str, float]] = None,
+             score_type: str = JudgeScoreType.PATTERN, # 'pattern', 'numeric'
+             **kwargs):
          """
          Initialize LLMJudge metric.
 
@@ -49,14 +69,34 @@ class LLMJudge:
              system_prompt (str, optional): System prompt for the judge
              prompt_template (str, optional): Prompt template for the judge
              generation_config (dict, optional): Generation configuration for the judge
+             score_pattern (str, optional): Regex pattern to extract score from LLM response
+             score_mapping (dict, optional): Mapping from extracted score to float value
+             score_type (str, optional): Type of score extraction strategy ('pattern', 'numeric') defaults to 'pattern'.
+                 - 'pattern': Use score_pattern and score_mapping to extract categorical scores
+                 - 'numeric': Treat the extracted value as a direct numerical score
          """
          self.api_key = api_key or os.environ.get('MODELSCOPE_SDK_TOKEN', 'EMPTY')
          self.api_url = api_url or os.environ.get('MODELSCOPE_API_BASE', DEFAULT_API_URL)
          self.model_id = model_id or os.environ.get('MODELSCOPE_JUDGE_LLM', DEFAULT_JUDGE_MODEL)
          self.system_prompt = system_prompt or os.environ.get('JUDGE_SYSTEM_PROMPT', None)
-         self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
          self.generation_config = generation_config or {}
 
+         # Default score mapping for A/B pattern
+         self.score_type = score_type
+         if self.score_type == JudgeScoreType.NUMERIC:
+             self.score_pattern = score_pattern or r'\[\[(\d+(?:\.\d+)?)\]\]'
+             self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE',
+                                                                      DEFAULT_NUMERIC_SCORE_TEMPLATE)
+         elif self.score_type == JudgeScoreType.PATTERN:
+             self.score_pattern = score_pattern or r'(A|B)'
+             self.prompt_template = prompt_template or os.environ.get('JUDGE_PROMPT_TEMPLATE', DEFAULT_PROMPT_TEMPLATE)
+         else:
+             raise ValueError(f"Invalid score_type: {self.score_type}. Must be 'pattern' or 'numeric'.")
+         self.score_mapping = score_mapping or {'A': 1.0, 'B': 0.0}
+
+         self._init_server_adapter()
+
+     def _init_server_adapter(self):
          from evalscope.models import ServerModelAdapter
 
          # Initialize ServerModelAdapter
@@ -95,17 +135,63 @@ class LLMJudge:
      def build_prompt(self, pred: str, gold: str, question: Optional[str] = None):
          if question is None:
              question = 'Not provided'
-         return self.prompt_template.format(question=question, pred=pred, gold=gold)
+
+         # check variables in prompt_template
+         prompt = self.prompt_template
+         if '{question}' in self.prompt_template:
+             prompt = prompt.replace('{question}', question)
+         if '{pred}' in self.prompt_template:
+             prompt = prompt.replace('{pred}', pred)
+         if '{gold}' in self.prompt_template:
+             prompt = prompt.replace('{gold}', gold)
+         return prompt
 
      def get_score(self, response: str) -> float:
+         """
+         Extract score from LLM response using the configured pattern and mapping.
+
+         Args:
+             response (str): The response from the LLM
+
+         Returns:
+             float: The numeric score extracted from the response
+         """
          if response is None:
-             return 0
-         match = re.search(r'(A|B)', response)
+             return 0.0
+
+         # choose extraction method based on score_type
+         if self.score_type == JudgeScoreType.NUMERIC:
+             return self._extract_numeric_score(response)
+         elif self.score_type == JudgeScoreType.PATTERN:
+             return self._extract_pattern_score(response)
+
+     def _extract_numeric_score(self, response: str) -> Optional[float]:
+         """extract numeric score from the response using the score_pattern"""
+         match = re.search(self.score_pattern, response)
+
+         if match:
+             # try to convert each captured group to float
+             for group in match.groups():
+                 if group is not None:
+                     try:
+                         return float(group)
+                     except (ValueError, TypeError):
+                         continue
+
+             # if not found in groups, try the whole match
+             try:
+                 return float(match.group(0))
+             except (ValueError, TypeError):
+                 logger.warning(f'Failed to convert any extracted value to float from: {match.group(0)}')
+
+         return None
+
+     def _extract_pattern_score(self, response: str) -> float:
+         """use the score_pattern to extract categorical scores"""
+         match = re.search(self.score_pattern, response)
          if match:
              answer = match.group(0)
-             if answer == 'A':
-                 return 1
-             elif answer == 'B':
-                 return 0
+             return self.score_mapping.get(answer, 0.0)
          else:
-             return 0
+             logger.warning(f"No match found for pattern '{self.score_pattern}' in response: {response}")
+             return 0.0
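
The llm_judge.py hunks above add a numeric judging mode (JudgeScoreType.NUMERIC) next to the existing A/B pattern mode. A minimal usage sketch, assuming JudgeScoreType.NUMERIC compares equal to the string 'numeric' and that the judge-related environment variables are unset; the actual judge request goes through the ServerModelAdapter created in _init_server_adapter(), which is outside this hunk, so only prompt building and score parsing are exercised here:

from evalscope.metrics.llm_judge import LLMJudge

# Numeric mode: the judge reply is expected to contain a "[[rating]]" marker.
judge = LLMJudge(score_type='numeric')  # assumption: JudgeScoreType.NUMERIC == 'numeric'
prompt = judge.build_prompt(
    pred='Paris is the capital of France.',
    gold='',  # the numeric template has no {gold} placeholder
    question='What is the capital of France?')
print(judge.get_score('The answer is correct and concise. Rating: [[0.9]]'))  # -> 0.9

# Default pattern mode: A/B verdicts are mapped through score_mapping.
pattern_judge = LLMJudge()
print(pattern_judge.get_score('A'))  # -> 1.0
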
@@ -9,7 +9,7 @@ import random
  import sacrebleu
  from collections import defaultdict
  from collections.abc import Iterable
- from typing import TYPE_CHECKING, Dict, List, Union
+ from typing import Dict, List, Union
 
 
  def mean(arr: list):
@@ -22,16 +22,28 @@ def mean(arr: list):
 
 
  def pass_at_k(arr: Union[List[int], List[List[int]]], k: int = 1) -> float:
+     """
+     Calculates the pass@k metric using the calculate_pass_at_k function.
+
+     Args:
+         arr: List of binary values (1 for correct, 0 for incorrect) or list of such lists
+         k: Number of attempts allowed
+
+     Returns:
+         The average pass@k score across all problems
+     """
      if not arr:
          return 0.0
+     if not isinstance(arr[0], list):
+         # If arr is a simple list of binary results, convert it to a list of lists
+         arr = [arr]
 
-     def sub_pass_at_k(sub_arr: List[int]) -> float:
-         return 1.0 if any(sub_arr[:k]) else 0.0
+     # For list of lists case, each inner list represents attempts for one problem
+     num_samples = [len(sub_arr) for sub_arr in arr]
+     num_correct = [sum(sub_arr) for sub_arr in arr]
+     pass_at_k_values = calculate_pass_at_k(num_samples, num_correct, k)
 
-     if isinstance(arr[0], list):
-         return sum(sub_pass_at_k(sub_arr) for sub_arr in arr) / len(arr)
-     else:
-         return sum(arr) / len(arr)
+     return float(np.mean(pass_at_k_values))
 
 
  def pop_stddev(arr):
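
The rewritten pass_at_k above now delegates to calculate_pass_at_k, which is not shown in this hunk. Assuming it implements the standard unbiased estimator 1 - C(n-c, k) / C(n, k) from the HumanEval paper, the arithmetic works out as in this self-contained sketch (estimate_pass_at_k is a stand-in name, not the evalscope function):

import numpy as np


def estimate_pass_at_k(num_samples, num_correct, k):
    """Unbiased pass@k per problem: 1 - C(n - c, k) / C(n, k)."""

    def estimator(n: int, c: int) -> float:
        if n - c < k:
            # fewer than k incorrect samples: every size-k subset contains a correct one
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    return np.array([estimator(n, c) for n, c in zip(num_samples, num_correct)])


# Two problems, 4 attempts each: one solved twice, one never solved.
results = [[1, 0, 1, 0], [0, 0, 0, 0]]
scores = estimate_pass_at_k([len(r) for r in results], [sum(r) for r in results], k=2)
print(float(scores.mean()))  # (5/6 + 0) / 2 ≈ 0.4167
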
@@ -223,7 +235,7 @@ def chrf(items):
      Source: https://github.com/m-popovic/chrF
      Paper: https://www.aclweb.org/anthology/W15-3049.pdf
 
-     Higher is better # TODO I think
+     Higher is better
      """
      refs = list(zip(*items))[0]
      preds = list(zip(*items))[1]
@@ -4,12 +4,11 @@ from typing import TYPE_CHECKING
  from evalscope.utils.import_utils import _LazyModule
 
  if TYPE_CHECKING:
-     from .adapters import (BaseModelAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
+     from .adapters import (BaseModelAdapter, BFCLAdapter, ChatGenerationModelAdapter, ContinuationLogitsModelAdapter,
                             CustomModelAdapter, MultiChoiceModelAdapter, ServerModelAdapter, T2IModelAdapter,
-                            initialize_model_adapter)
+                            TauBenchAdapter, initialize_model_adapter)
      from .custom import CustomModel, DummyCustomModel
      from .local_model import LocalModel, get_local_model
-     from .model import BaseModel, ChatBaseModel, OpenAIModel
      from .register import get_model_adapter
 
  else:
@@ -23,6 +22,8 @@ else:
              'CustomModelAdapter',
              'ServerModelAdapter',
              'T2IModelAdapter',
+             'TauBenchAdapter',
+             'BFCLAdapter',
          ],
          'custom': [
              'CustomModel',
@@ -32,11 +33,6 @@ else:
              'LocalModel',
              'get_local_model',
          ],
-         'model': [
-             'BaseModel',
-             'ChatBaseModel',
-             'OpenAIModel',
-         ],
          'register': [
              'get_model_adapter',
          ],
@@ -5,15 +5,10 @@ from .choice_adapter import ContinuationLogitsModelAdapter, MultiChoiceModelAdap
  from .custom_adapter import CustomModelAdapter
  from .server_adapter import ServerModelAdapter
  from .t2i_adapter import T2IModelAdapter
+ from .tau_bench_adapter import TauBenchAdapter
 
  __all__ = [
-     'initialize_model_adapter',
-     'BaseModelAdapter',
-     'ChatGenerationModelAdapter',
-     'ContinuationLogitsModelAdapter',
-     'MultiChoiceModelAdapter',
-     'CustomModelAdapter',
-     'ServerModelAdapter',
-     'BFCLAdapter',
-     'T2IModelAdapter',
+     'initialize_model_adapter', 'BaseModelAdapter', 'ChatGenerationModelAdapter', 'ContinuationLogitsModelAdapter',
+     'MultiChoiceModelAdapter', 'CustomModelAdapter', 'ServerModelAdapter', 'BFCLAdapter', 'T2IModelAdapter',
+     'TauBenchAdapter'
  ]
@@ -53,7 +53,10 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
      if task_cfg.eval_type == EvalType.SERVICE or task_cfg.api_url is not None:
 
          if 'server' not in model_adapter_cls_str:
+             logger.warning(f'Output type {model_adapter_cls_str} is not supported for service evaluation. '
+                            f'Using server model adapter instead.')
              model_adapter_cls_str = 'server'
+             benchmark.model_adapter = model_adapter_cls_str
 
          # init server model adapter
          model_adapter_cls = get_model_adapter(model_adapter_cls_str)
@@ -71,6 +74,7 @@ def initialize_model_adapter(task_cfg: 'TaskConfig', benchmark: 'DataAdapter', b
          logger.warning(f'Output type {model_adapter_cls_str} is not supported for benchmark {benchmark.name}.'
                         f'Using {benchmark.output_types[0]} instead.')
          model_adapter_cls_str = benchmark.output_types[0]
+         benchmark.model_adapter = model_adapter_cls_str
 
      model_adapter_cls = get_model_adapter(model_adapter_cls_str)
      return model_adapter_cls(
@@ -4,11 +4,13 @@ import uuid
  from typing import Any, List, Optional, Union
 
  from evalscope.utils.logger import get_logger
+ from ..register import register_model_adapter
  from .server_adapter import ServerModelAdapter
 
  logger = get_logger()
 
 
+ @register_model_adapter(name='bfcl_server')
  class BFCLAdapter(ServerModelAdapter):
      """
      BFCL model adapter to request remote API model and generate results for BFCL evaluation.
@@ -3,15 +3,18 @@ import time
  import torch
  from typing import Any, Dict, List, Optional, Tuple, Union
 
+ from evalscope.constants import OutputType
  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage, Usage
  from evalscope.utils.logger import get_logger
  from evalscope.utils.model_utils import fix_do_sample_warning
  from ..local_model import LocalModel
+ from ..register import register_model_adapter
  from .base_adapter import BaseModelAdapter
 
  logger = get_logger()
 
 
+ @register_model_adapter(name=OutputType.GENERATION)
  class ChatGenerationModelAdapter(BaseModelAdapter):
      """
      Chat generation model adapter.
@@ -3,11 +3,14 @@ import time
  import torch
  from typing import List
 
+ from evalscope.constants import OutputType
  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
  from ..local_model import LocalModel
+ from ..register import register_model_adapter
  from .base_adapter import BaseModelAdapter
 
 
+ @register_model_adapter(name=OutputType.MULTIPLE_CHOICE)
  class MultiChoiceModelAdapter(BaseModelAdapter):
      """ The multi-choice model adapter. """
 
@@ -110,6 +113,7 @@ class MultiChoiceModelAdapter(BaseModelAdapter):
          return log_probs, {'tokens': tokens}
 
 
+ @register_model_adapter(name=OutputType.CONTINUOUS)
  class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter):
      """
      Continuation-logits model adapter.
@@ -1,12 +1,16 @@
- from typing import Any, Dict, List, Union
+ from typing import TYPE_CHECKING, Any, Dict, List, Union
 
- from ..custom import CustomModel
+ from ..register import register_model_adapter
  from .base_adapter import BaseModelAdapter
 
+ if TYPE_CHECKING:
+     from ..custom import CustomModel
 
+
+ @register_model_adapter(name='custom')
  class CustomModelAdapter(BaseModelAdapter):
 
-     def __init__(self, custom_model: CustomModel, **kwargs):
+     def __init__(self, custom_model: 'CustomModel', **kwargs):
          """
          Custom model adapter.
 
@@ -5,13 +5,15 @@ from openai.types.chat import ChatCompletion, ChatCompletionChunk
  from openai.types.chat.chat_completion import ChatCompletionMessage, Choice
  from typing import List, Optional, Union
 
+ from evalscope.utils.argument_utils import get_supported_params
  from evalscope.utils.logger import get_logger
- from evalscope.utils.utils import get_supported_params
+ from ..register import register_model_adapter
  from .base_adapter import BaseModelAdapter
 
  logger = get_logger()
 
 
+ @register_model_adapter(name='server')
  class ServerModelAdapter(BaseModelAdapter):
      """
      Server model adapter to request remote API model and generate results.
@@ -29,7 +31,7 @@ class ServerModelAdapter(BaseModelAdapter):
          self.api_key = api_key
 
          self.client = openai.OpenAI(
-             api_key=api_key,
+             api_key=self.api_key,
              base_url=self.api_url,
          )
          self.supported_params = get_supported_params(self.client.chat.completions.create)
@@ -3,15 +3,18 @@ import time
  import torch
  from typing import Any, Dict, List, Optional, Tuple, Union
 
+ from evalscope.constants import OutputType
  from evalscope.utils.chat_service import ChatCompletionResponse, ChatCompletionResponseChoice, ChatMessage
  from evalscope.utils.io_utils import OutputsStructure
  from evalscope.utils.logger import get_logger
  from ..local_model import LocalModel
+ from ..register import register_model_adapter
  from .base_adapter import BaseModelAdapter
 
  logger = get_logger()
 
 
+ @register_model_adapter(name=OutputType.IMAGE_GENERATION)
  class T2IModelAdapter(BaseModelAdapter):
      """
      Text to image model adapter.
@@ -0,0 +1,189 @@
+ import json
+ import time
+ from typing import Any, Dict, List, Optional, Union
+
+ from evalscope.utils.logger import get_logger
+ from ..register import register_model_adapter
+ from .server_adapter import ServerModelAdapter
+
+ logger = get_logger()
+
+
+ @register_model_adapter(name='tau_bench_server')
+ class TauBenchAdapter(ServerModelAdapter):
+     """
+     TauBench model adapter to request remote API model and generate results for TauBench evaluation.
+     Support multi-turn and single-turn function calling tasks.
+     """
+
+     def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
+         """
+         Args:
+             api_url: The URL of the remote API model.
+             model_id: The ID of the remote API model.
+             api_key: The API key of the remote API model.
+         """
+         super().__init__(api_url=api_url, model_id=model_id, api_key=api_key, **kwargs)
+
+         self._patch_agent_solve()
+
+     def predict(self, inputs: List[dict], infer_cfg: Optional[dict] = None) -> List[dict]:
+         """
+         Model prediction func. For multi-turn evals, we pass a list[list[message]] to the model
+         where each list is a follow up turn in the conversation
+         each turn is a List[List[Message]]
+
+         Args:
+             inputs (List[dict]): The input data.
+             infer_cfg (dict): Inference configuration.
+
+         Returns:
+             res (List[dict]): The model prediction results.
+         """
+         infer_cfg = infer_cfg or {}
+         results = []
+
+         for input_item in inputs:
+             raw_input = input_item.get('raw_input')
+
+             res_d = self.solve(env_name=raw_input['env_name'], task_index=raw_input['task_index'], infer_cfg=infer_cfg)
+
+             wrapper_res = {
+                 'choices': [{
+                     'index': 0,
+                     'message': {
+                         'content': json.dumps(res_d, ensure_ascii=False),
+                         'role': 'assistant'
+                     }
+                 }],
+                 'created':
+                 time.time(),
+                 'model':
+                 self.model_id,
+                 'object':
+                 'chat.completion',
+                 'usage': {
+                     'completion_tokens': 0,
+                     'prompt_tokens': 0,
+                     'total_tokens': 0
+                 }
+             }
+
+             results.append(wrapper_res)
+
+         return results
+
+     def _patch_agent_solve(self):
+         """Patch ToolCallingAgent.solve method to use custom model configuration"""
+         from tau_bench.agents.tool_calling_agent import ToolCallingAgent, message_to_action
+         from tau_bench.envs.base import Env
+         from tau_bench.types import RESPOND_ACTION_NAME, SolveResult
+         from typing import List, Optional
+
+         def patched_solve(self,
+                           env: Env,
+                           task_index: Optional[int] = None,
+                           max_num_steps: int = 30,
+                           infer_cfg: Optional[dict] = {}) -> SolveResult:
+             env_reset_res = env.reset(task_index=task_index)
+             obs = env_reset_res.observation
+             info = env_reset_res.info.model_dump()
+             reward = 0.0
+             messages: List[Dict[str, Any]] = [
+                 {
+                     'role': 'system',
+                     'content': self.wiki
+                 },
+                 {
+                     'role': 'user',
+                     'content': obs
+                 },
+             ]
+
+             for step_index in range(max_num_steps):
+                 # Use adapter's model configuration instead of agent's
+                 request_json = adapter_instance.make_request(
+                     input_item={
+                         'messages': messages,
+                         'tools': self.tools_info
+                     }, infer_cfg=infer_cfg)
+                 res = adapter_instance.send_request(request_json)
+
+                 next_message = res['choices'][0]['message']
+                 action = message_to_action(next_message)
+                 env_response = env.step(action)
+                 reward = env_response.reward
+                 info = {**info, **env_response.info.model_dump()}
+
+                 if action.name != RESPOND_ACTION_NAME:
+                     next_message['tool_calls'] = next_message['tool_calls'][:1]
+                     messages.extend([
+                         next_message,
+                         {
+                             'role': 'tool',
+                             'tool_call_id': next_message['tool_calls'][0]['id'],
+                             'name': next_message['tool_calls'][0]['function']['name'],
+                             'content': env_response.observation,
+                         },
+                     ])
+                 else:
+                     messages.extend([
+                         next_message,
+                         {
+                             'role': 'user',
+                             'content': env_response.observation
+                         },
+                     ])
+                 logger.debug(f'Task: {task_index} Step: {step_index} finished')
+
+                 if env_response.done:
+                     break
+
+             return SolveResult(
+                 reward=reward,
+                 info=info,
+                 messages=messages,
+                 total_cost=0,
+             )
+
+         adapter_instance = self
+
+         ToolCallingAgent.solve = patched_solve
+
+         return 'ToolCallingAgent.solve patched successfully'
+
+     def solve(self, env_name, task_index, infer_cfg, **kwargs):
+         """
+         Solve a specific task in the TauBench environment.
+
+         Args:
+             env_name (str): The name of the TauBench environment.
+             task_index (int): The index of the task to solve.
+             **kwargs: Additional arguments for the task.
+
+         Returns:
+             dict: The result of the task.
+         """
+         from tau_bench.agents.tool_calling_agent import ToolCallingAgent
+         from tau_bench.envs import get_env
+
+         # This method can be implemented to solve specific tasks in the TauBench environment
+         isolated_env = get_env(
+             env_name=env_name,
+             user_strategy='llm',
+             user_model='dummy', # Use dummy model to prevent errors
+             user_provider='openai', # Use dummy provider to prevent errors
+             task_split='test',
+             task_index=task_index,
+         )
+         agent = ToolCallingAgent(
+             tools_info=isolated_env.tools_info,
+             wiki=isolated_env.wiki,
+             model='dummy', # Use dummy model to prevent errors
+             provider='dummy', # Use dummy provider to prevent errors
+             temperature=0, # dummy temperature to prevent errors
+         )
+
+         res = agent.solve(env=isolated_env, task_index=task_index, infer_cfg=infer_cfg)
+
+         return res.model_dump()
@@ -50,14 +50,14 @@ class DummyCustomModel(CustomModel):
          # Must return a list of dicts with the same format as the OpenAI API.
          responses = []
          for input_item in original_inputs:
-             message = self.make_request_messages(input_item)
-             response = f'Dummy response for prompt: {message}'
+             # message = self.make_request_messages(input_item)
+             # response = f'Dummy response for prompt: {message}'
 
              res_d = {
                  'choices': [{
                      'index': 0,
                      'message': {
-                         'content': response,
+                         'content': '*PlaceHolder*',
                          'role': 'assistant'
                      }
                  }],
@@ -1,6 +1,3 @@
- from evalscope.constants import OutputType
- from .adapters import *
-
  MODEL_ADAPTERS = {}
 
 
@@ -42,14 +39,3 @@ def register_model_adapter_class(cls, name=None):
      if name in MODEL_ADAPTERS:
          raise ValueError(f"Model adapter class '{name}' is already registered.")
      MODEL_ADAPTERS[name] = cls
-
-
- # register all model adapters
- register_model_adapter_class(BaseModelAdapter, name='base')
- register_model_adapter_class(ChatGenerationModelAdapter, name=OutputType.GENERATION)
- register_model_adapter_class(ContinuationLogitsModelAdapter, name=OutputType.CONTINUOUS)
- register_model_adapter_class(MultiChoiceModelAdapter, name=OutputType.MULTIPLE_CHOICE)
- register_model_adapter_class(CustomModelAdapter, name='custom')
- register_model_adapter_class(ServerModelAdapter, name='server')
- register_model_adapter_class(BFCLAdapter, name='bfcl_server')
- register_model_adapter_class(T2IModelAdapter, name=OutputType.IMAGE_GENERATION)
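
Taken together, these hunks replace the eager `from .adapters import *` registration deleted from register.py with @register_model_adapter decorators applied directly to each adapter class (see the bfcl, chat, choice, custom, server, t2i and tau_bench adapter hunks above), inverting the dependency so that adapter modules import register.py rather than the other way around. The register_model_adapter decorator itself is not included in this diff; a minimal sketch of how such a decorator-based registry plausibly fits together, with simplified stand-in helpers:

# Hypothetical, simplified registry: register_model_adapter_class mirrors the function
# kept in register.py; register_model_adapter and get_model_adapter are assumed to be
# thin wrappers and may differ from the real implementation.
MODEL_ADAPTERS = {}


def register_model_adapter_class(cls, name=None):
    name = name or cls.__name__
    if name in MODEL_ADAPTERS:
        raise ValueError(f"Model adapter class '{name}' is already registered.")
    MODEL_ADAPTERS[name] = cls


def register_model_adapter(name=None):
    """Class decorator so each adapter module registers itself at import time."""

    def decorator(cls):
        register_model_adapter_class(cls, name=name)
        return cls

    return decorator


def get_model_adapter(name):
    return MODEL_ADAPTERS[name]


@register_model_adapter(name='server')
class ServerModelAdapter:  # stand-in for the real adapter class
    pass


assert get_model_adapter('server') is ServerModelAdapter
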