eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130) hide show
  1. development/__init__.py +1 -0
  2. development/normalize_sandbox_fusion.py +628 -0
  3. development/utils/__init__.py +1 -0
  4. development/utils/generate_api_key.py +31 -0
  5. development/utils/subprocess_manager.py +481 -0
  6. eval_protocol/__init__.py +86 -0
  7. eval_protocol/__main__.py +10 -0
  8. eval_protocol/_version.py +21 -0
  9. eval_protocol/adapters/__init__.py +1 -0
  10. eval_protocol/adapters/braintrust.py +8 -0
  11. eval_protocol/adapters/trl.py +8 -0
  12. eval_protocol/agent/__init__.py +29 -0
  13. eval_protocol/agent/models.py +69 -0
  14. eval_protocol/agent/orchestrator.py +893 -0
  15. eval_protocol/agent/resource_abc.py +89 -0
  16. eval_protocol/agent/resource_pool.py +184 -0
  17. eval_protocol/agent/resources/__init__.py +44 -0
  18. eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
  19. eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
  20. eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
  21. eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
  22. eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
  23. eval_protocol/agent/resources/docker_resource.py +479 -0
  24. eval_protocol/agent/resources/filesystem_resource.py +371 -0
  25. eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
  26. eval_protocol/agent/resources/http_rollout_resource.py +325 -0
  27. eval_protocol/agent/resources/python_state_resource.py +170 -0
  28. eval_protocol/agent/resources/sql_resource.py +271 -0
  29. eval_protocol/agent/task_manager.py +1064 -0
  30. eval_protocol/agent/tool_registry.py +111 -0
  31. eval_protocol/auth.py +156 -0
  32. eval_protocol/cli.py +425 -0
  33. eval_protocol/cli_commands/__init__.py +1 -0
  34. eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
  35. eval_protocol/cli_commands/common.py +242 -0
  36. eval_protocol/cli_commands/deploy.py +486 -0
  37. eval_protocol/cli_commands/deploy_mcp.py +287 -0
  38. eval_protocol/cli_commands/preview.py +186 -0
  39. eval_protocol/cli_commands/run_eval_cmd.py +202 -0
  40. eval_protocol/common_utils.py +36 -0
  41. eval_protocol/config.py +180 -0
  42. eval_protocol/datasets/__init__.py +1 -0
  43. eval_protocol/datasets/loader.py +521 -0
  44. eval_protocol/evaluation.py +1045 -0
  45. eval_protocol/execution/__init__.py +1 -0
  46. eval_protocol/execution/pipeline.py +920 -0
  47. eval_protocol/gcp_tools.py +484 -0
  48. eval_protocol/generation/cache.py +141 -0
  49. eval_protocol/generation/clients/base.py +67 -0
  50. eval_protocol/generation/clients.py +248 -0
  51. eval_protocol/generic_server.py +165 -0
  52. eval_protocol/integrations/__init__.py +12 -0
  53. eval_protocol/integrations/braintrust.py +51 -0
  54. eval_protocol/integrations/deepeval.py +106 -0
  55. eval_protocol/integrations/openeval.py +40 -0
  56. eval_protocol/integrations/trl.py +187 -0
  57. eval_protocol/mcp/__init__.py +48 -0
  58. eval_protocol/mcp/adapter.py +131 -0
  59. eval_protocol/mcp/client/__init__.py +12 -0
  60. eval_protocol/mcp/client/connection.py +499 -0
  61. eval_protocol/mcp/clients.py +195 -0
  62. eval_protocol/mcp/execution/__init__.py +23 -0
  63. eval_protocol/mcp/execution/base_policy.py +227 -0
  64. eval_protocol/mcp/execution/fireworks_policy.py +209 -0
  65. eval_protocol/mcp/execution/manager.py +506 -0
  66. eval_protocol/mcp/execution/policy.py +421 -0
  67. eval_protocol/mcp/grid_renderer.py +54 -0
  68. eval_protocol/mcp/mcpgym.py +637 -0
  69. eval_protocol/mcp/process_manager.py +177 -0
  70. eval_protocol/mcp/session/__init__.py +11 -0
  71. eval_protocol/mcp/session/manager.py +228 -0
  72. eval_protocol/mcp/simple_process_manager.py +291 -0
  73. eval_protocol/mcp/simulation_server.py +458 -0
  74. eval_protocol/mcp/types.py +80 -0
  75. eval_protocol/mcp_agent/__init__.py +1 -0
  76. eval_protocol/mcp_agent/config.py +147 -0
  77. eval_protocol/mcp_agent/intermediary_server.py +542 -0
  78. eval_protocol/mcp_agent/main.py +210 -0
  79. eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
  80. eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
  81. eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
  82. eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
  83. eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
  84. eval_protocol/mcp_agent/session.py +79 -0
  85. eval_protocol/mcp_env.py +304 -0
  86. eval_protocol/models.py +366 -0
  87. eval_protocol/packaging.py +219 -0
  88. eval_protocol/platform_api.py +360 -0
  89. eval_protocol/playback_policy.py +396 -0
  90. eval_protocol/resources.py +128 -0
  91. eval_protocol/reward_function.py +410 -0
  92. eval_protocol/rewards/__init__.py +94 -0
  93. eval_protocol/rewards/accuracy.py +454 -0
  94. eval_protocol/rewards/accuracy_length.py +173 -0
  95. eval_protocol/rewards/apps_coding_reward.py +331 -0
  96. eval_protocol/rewards/apps_execution_utils.py +149 -0
  97. eval_protocol/rewards/apps_testing_util.py +559 -0
  98. eval_protocol/rewards/bfcl_reward.py +313 -0
  99. eval_protocol/rewards/code_execution.py +1620 -0
  100. eval_protocol/rewards/code_execution_utils.py +72 -0
  101. eval_protocol/rewards/cpp_code.py +861 -0
  102. eval_protocol/rewards/deepcoder_reward.py +161 -0
  103. eval_protocol/rewards/format.py +129 -0
  104. eval_protocol/rewards/function_calling.py +541 -0
  105. eval_protocol/rewards/json_schema.py +422 -0
  106. eval_protocol/rewards/language_consistency.py +700 -0
  107. eval_protocol/rewards/lean_prover.py +479 -0
  108. eval_protocol/rewards/length.py +375 -0
  109. eval_protocol/rewards/list_comparison_math_reward.py +221 -0
  110. eval_protocol/rewards/math.py +762 -0
  111. eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
  112. eval_protocol/rewards/reasoning_steps.py +249 -0
  113. eval_protocol/rewards/repetition.py +342 -0
  114. eval_protocol/rewards/tag_count.py +162 -0
  115. eval_protocol/rl_processing.py +82 -0
  116. eval_protocol/server.py +271 -0
  117. eval_protocol/typed_interface.py +260 -0
  118. eval_protocol/utils/__init__.py +8 -0
  119. eval_protocol/utils/batch_evaluation.py +217 -0
  120. eval_protocol/utils/batch_transformation.py +205 -0
  121. eval_protocol/utils/dataset_helpers.py +112 -0
  122. eval_protocol/utils/module_loader.py +56 -0
  123. eval_protocol/utils/packaging_utils.py +108 -0
  124. eval_protocol/utils/static_policy.py +305 -0
  125. eval_protocol-0.0.3.dist-info/METADATA +635 -0
  126. eval_protocol-0.0.3.dist-info/RECORD +130 -0
  127. eval_protocol-0.0.3.dist-info/WHEEL +5 -0
  128. eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
  129. eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
  130. eval_protocol-0.0.3.dist-info/top_level.txt +2 -0
@@ -0,0 +1,232 @@
1
+ """
2
+ Multiple Choice Question (MCQ) reward function.
3
+
4
+ This module provides a reward function specifically for evaluating
5
+ answers to multiple-choice questions, where the answer is typically
6
+ a single letter (e.g., A, B, C, D, E).
7
+ """
8
+
9
+ import re
10
+ from typing import Any, Dict, List, Optional, Tuple, TypedDict, Union
11
+
12
+ from ..models import EvaluateResult, Message, MetricResult
13
+ from ..typed_interface import reward_function
14
+
15
+
16
class MatchInfo(TypedDict):
    """One candidate MCQ-option match collected by ``extract_mcq_option``."""

    # Full matched text including any delimiters, e.g. "(A)" or "A.".
    text: str
    # The option letter, upper-cased.
    letter: str
    # (start, end) offsets of ``text`` within the scanned string.
    span: Tuple[int, int]
    # Index of the pattern that produced the match; lower wins on ties.
    priority: int
21
+
22
+
23
def extract_mcq_option(text: str) -> List[Tuple[str, str]]:
    """
    Pull multiple-choice option letters (A-E, matched case-insensitively)
    out of ``text``.

    Five notations are recognised, listed from highest to lowest priority:
    ``(A)``, ``[A]``, ``{A}``, ``A.`` and a bare standalone ``A``. When
    candidates overlap, the one that starts first wins (ties are broken by
    notation priority, then by the longer span). Each letter is reported at
    most once, at its first accepted occurrence.

    Args:
        text: The text to scan.

    Returns:
        ``(matched_text, uppercase_letter)`` tuples in order of appearance.
        The list is empty when nothing matches.
    """
    option_patterns = (
        r"(\(([A-E])\))",  # (A)
        r"(\[([A-E])\])",  # [A]
        r"(\{([A-E])\})",  # {A}
        r"((?<![a-zA-Z0-9_])([A-E])\.(?!\w))",  # A.
        r"((?<![a-zA-Z0-9_])([A-E])(?![a-zA-Z0-9_]))",  # Standalone A
    )

    # Group 1 is the whole notation and group 2 the letter; both take part in
    # every pattern, so neither can be None for a successful match.
    # NOTE(review): matching is case-insensitive, so a standalone English
    # word "a" is also collected as option A.
    candidates: List[MatchInfo] = []
    for rank, regex in enumerate(option_patterns):
        for hit in re.finditer(regex, text, re.IGNORECASE):
            candidates.append(
                {
                    "text": hit.group(1),
                    "letter": hit.group(2).upper(),
                    "span": hit.span(1),
                    "priority": rank,
                }
            )

    # Earliest start first; then the higher-priority notation; then the longer span.
    candidates.sort(key=lambda c: (c["span"][0], c["priority"], c["span"][0] - c["span"][1]))

    extracted: List[Tuple[str, str]] = []
    seen_letters = set()
    covered_until = -1
    for candidate in candidates:
        begin, finish = candidate["span"]
        if begin < covered_until:
            # Lies inside a span that was already accepted (e.g. the bare
            # "A" inside "(A)"), so it is not a separate answer.
            continue
        covered_until = finish
        option_letter = candidate["letter"]
        if option_letter in seen_letters:
            continue
        seen_letters.add(option_letter)
        extracted.append((candidate["text"], option_letter))
    return extracted
82
+
83
+
84
@reward_function  # type: ignore[arg-type]
def multiple_choice_math_reward(
    messages: List[Message],
    ground_truth: List[Message],
    **kwargs: Any,
) -> EvaluateResult:
    """
    Score a generated multiple-choice answer against the ground truth.

    MCQ options (A-E) are extracted with ``extract_mcq_option`` from the last
    message of ``messages`` and from the first message of ``ground_truth``;
    both must have the ``assistant`` role and non-empty content. The score is
    1.0 when the first extracted letters agree and 0.0 otherwise. A generated
    answer with several options scores 0.0 when the ground truth has exactly
    one option.

    Args:
        messages: Generated conversation; the last entry is the response
            being evaluated.
        ground_truth: List whose first entry is the reference assistant
            message.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        EvaluateResult carrying the score, a reason string, and per-stage
        metrics (extraction, ambiguity, comparison, or an error entry).
    """

    def describe(options: List[Tuple[str, str]]) -> str:
        # Render options as "'(A)' (A), 'B.' (B)"; "None" for an empty list.
        if options:
            return ", ".join(f"'{raw}' ({letter})" for raw, letter in options)
        return "None"

    def invalid(why: str) -> MetricResult:
        return MetricResult(score=0.0, is_score_valid=False, reason=why)

    def zero(why: str, collected: Dict[str, MetricResult]) -> EvaluateResult:
        return EvaluateResult(score=0.0, reason=why, metrics=collected)

    if not messages:
        return zero("Missing generated messages", {"error": invalid("Missing generated messages")})
    if not ground_truth:
        return zero("Missing ground truth message", {"error": invalid("Missing ground truth message")})

    metrics: Dict[str, MetricResult] = {}

    # Only an assistant message with content counts as a generated answer.
    last_message = messages[-1]
    gen_content = (last_message.content or "") if last_message.role == "assistant" else ""
    if not gen_content:
        metrics["error_generated_message"] = invalid(
            "Invalid generated message: Last message not from assistant or has no content."
        )
        return zero("Last generated message not from assistant or has no content.", metrics)

    reference = ground_truth[0]
    orig_content = (reference.content or "") if reference.role == "assistant" else ""
    if not orig_content:
        metrics["error_original_message"] = invalid(
            "Invalid ground truth message: Not an assistant message or has no content."
        )
        return zero("Invalid ground truth message: Not an assistant message or has no content.", metrics)

    gen_options = extract_mcq_option(gen_content)
    orig_options = extract_mcq_option(orig_content)

    metrics["extracted_original_mcq"] = MetricResult(
        score=float(bool(orig_options)),
        is_score_valid=bool(orig_options),
        reason=f"Extracted from original: {describe(orig_options)}",
    )
    metrics["extracted_generated_mcq"] = MetricResult(
        score=float(bool(gen_options)),
        is_score_valid=bool(gen_options),
        reason=f"Extracted from generated: {describe(gen_options)}",
    )

    if not orig_options:
        return zero(
            "Could not extract MCQ option from original message (ground truth). Assumed not an MCQ.",
            metrics,
        )
    if not gen_options:
        return zero(
            "Could not extract MCQ option from generated message, but original message has an MCQ option.",
            metrics,
        )

    # An MCQ should have one clear answer; record ambiguity on either side.
    orig_is_ambiguous = len(orig_options) > 1
    if orig_is_ambiguous:
        metrics["ambiguous_original_mcq"] = invalid(
            f"Original message has multiple MCQ options extracted: {describe(orig_options)}"
        )

    if len(gen_options) > 1:
        metrics["ambiguous_generated_mcq"] = invalid(
            f"Generated message has multiple MCQ options extracted: {describe(gen_options)}"
        )
        if not orig_is_ambiguous:
            # The ground truth is specific, so an ambiguous answer is penalised.
            return zero(
                "Generated answer is ambiguous (multiple MCQ options) while ground truth is specific.",
                metrics,
            )

    # Compare the first extracted option from each side.
    orig_text, orig_letter = orig_options[0]
    gen_text, gen_letter = gen_options[0]
    is_match = orig_letter == gen_letter
    score = 1.0 if is_match else 0.0
    reason = f"Match: {is_match}. Gen: '{gen_text}' ({gen_letter}) vs Orig: '{orig_text}' ({orig_letter})"

    metrics["mcq_comparison"] = MetricResult(score=score, is_score_valid=is_match, reason=reason)
    return EvaluateResult(score=score, reason=reason, metrics=metrics)
@@ -0,0 +1,249 @@
1
+ """
2
+ Reward functions for evaluating reasoning steps.
3
+
4
+ This module provides reward functions that evaluate whether a model's response
5
+ contains adequate step-by-step reasoning, rewarding structured thinking.
6
+ """
7
+
8
+ import re
9
+ from typing import Any, Dict, List, Optional, Pattern, Set, Union
10
+
11
+ from ..models import EvaluateResult, Message, MetricResult
12
+ from ..typed_interface import reward_function
13
+
14
+
15
@reward_function
def reasoning_steps_reward(
    messages: List[Message],
    pattern: Optional[str] = None,
    min_steps: int = 3,
    max_steps: Optional[int] = None,
    exclusive_patterns: bool = False,
    **kwargs: Any,
) -> EvaluateResult:
    """
    Reward function that evaluates step-by-step reasoning in model responses.

    The last message must be an assistant message with content. Its text is
    scanned for step indicators: "Step N:" markers, numbered list items,
    "-" / "*" bullets, and transitional phrases such as "First," or
    "Finally:". The number of indicators found drives the score.

    Scoring:
        * no indicators found: 0.0
        * ``max_steps`` given: ``(num_steps - min_steps + 1) /
          (max_steps - min_steps + 1)`` clamped to [0, 1]; the denominator is
          floored at 1 so ``max_steps < min_steps`` cannot divide by zero or
          flip the sign
        * otherwise: ``num_steps / min_steps`` capped at 1.0; a non-positive
          ``min_steps`` counts as already satisfied (1.0)

    Args:
        messages: List of conversation messages; only the last one is scored.
        pattern: Optional custom regex for detecting reasoning steps.
        min_steps: Minimum number of steps required for full score.
        max_steps: Optional maximum number of steps (default: None).
        exclusive_patterns: Use only ``pattern`` (True) or combine it with
            the default patterns (False). Ignored when ``pattern`` is empty.
        **kwargs: Additional arguments (unused).

    Returns:
        EvaluateResult with the overall score plus one metric per indicator
        family (explicit steps, numbered lists, bullets, transitions) that
        occurred at least once.

    Raises:
        re.error: If ``pattern`` is not a valid regular expression.
    """
    if not messages:
        return EvaluateResult(
            score=0.0,
            reason="No messages provided",
            metrics={"reasoning_steps": MetricResult(score=0.0, is_score_valid=False, reason="No messages provided")},
        )

    response = messages[-1]

    if response.role != "assistant" or not response.content:
        return EvaluateResult(
            score=0.0,
            reason="No assistant response found or response has no content",
            metrics={
                "reasoning_steps": MetricResult(
                    score=0.0,
                    is_score_valid=False,
                    reason="Message not from assistant or has no content",
                )
            },
        )
    text: str = response.content

    # Shared by the overall count and the per-family metrics so both always
    # recognise the same phrases.
    explicit_step_pattern = r"Step\s+\d+[:.]\s+"
    transition_pattern = r"\b(?:First|Second|Third|Fourth|Fifth|Next|Then|Finally)[,:]"

    # Default patterns for detecting reasoning steps
    default_patterns = [
        explicit_step_pattern,
        r"^\s*\d+\.\s+",
        r"\n\s*\d+\.\s+",
        r"\n\s*-\s+",
        r"\n\s*\*\s+",
        transition_pattern,
        r"\b(?:Let's|I will|To solve this|To begin)[,:]",
    ]

    if pattern and exclusive_patterns:
        patterns_to_use = [pattern]
    elif pattern:
        patterns_to_use = [pattern] + default_patterns
    else:
        patterns_to_use = default_patterns

    # One alternation means overlapping indicators are counted only once.
    combined_pattern = "|".join(f"(?:{p})" for p in patterns_to_use)
    num_steps = len(re.findall(combined_pattern, text, re.MULTILINE))

    def fraction_of_minimum(count: int) -> float:
        # Share of ``min_steps`` reached, capped at 1.0. A non-positive
        # minimum is trivially met; this avoids dividing by zero.
        if min_steps <= 0:
            return 1.0
        return min(1.0, count / min_steps)

    if num_steps == 0:
        score = 0.0
    elif max_steps is not None:
        # Floor the range at 1 so max_steps < min_steps cannot produce a
        # zero or negative denominator.
        step_range = max(1, max_steps - min_steps + 1)
        score = min(1.0, max(0.0, (num_steps - min_steps + 1) / step_range))
    else:
        score = fraction_of_minimum(num_steps)

    success = num_steps >= min_steps

    # (metric key, regex, label used in the reason text) per indicator family.
    indicator_families = [
        ("explicit_steps", explicit_step_pattern, "explicit steps"),
        ("numbered_lists", r"(?:^|\n)\s*\d+\.\s+", "numbered list items"),
        ("bullet_points", r"(?:^|\n)\s*[-*]\s+", "bullet points"),
        ("transition_phrases", transition_pattern, "transition phrases"),
    ]

    step_metrics: Dict[str, MetricResult] = {}
    for metric_name, family_pattern, label in indicator_families:
        count = len(re.findall(family_pattern, text, re.MULTILINE))
        if count > 0:
            step_metrics[metric_name] = MetricResult(
                score=fraction_of_minimum(count),
                is_score_valid=count >= min_steps,
                reason=f"Found {count} {label}",
            )

    metrics = {
        "reasoning_steps": MetricResult(
            score=score,
            is_score_valid=success,
            reason=f"Found {num_steps} reasoning steps (minimum required: {min_steps})",
        ),
        **step_metrics,
    }

    reason = f"Detected {num_steps} reasoning steps (required: {min_steps})"
    # Same test as the scoring branch, so max_steps=0 is reported too.
    if max_steps is not None:
        reason += f", max: {max_steps}"

    return EvaluateResult(score=score, reason=reason, metrics=metrics)
153
+
154
+
155
@reward_function
def sequence_reward(
    messages: List[Message],
    sequence_terms: Optional[List[str]] = None,
    min_matches: int = 3,
    case_sensitive: bool = False,
    **kwargs: Any,
) -> EvaluateResult:
    """
    Reward function that evaluates sequential reasoning in model responses.

    The last message must be an assistant message with content. Each term of
    ``sequence_terms`` is searched for, in list order, starting just after
    the position where the previous found term began. A term that does not
    occur past that point is skipped and the search continues with the next
    term. Matching is by plain substring (``str.find``), not by whole word.

    Args:
        messages: List of conversation messages; only the last one is scored.
        sequence_terms: Terms that should appear in sequence. When empty or
            None, a default list ("First" ... "Finally", "Conclusion") is used.
        min_matches: Minimum number of sequence terms required for full
            score. A non-positive value counts as already satisfied.
        case_sensitive: Whether matching should be case-sensitive. When
            False, both the text and the terms are lower-cased first, so the
            reported terms are lower-case as well.
        **kwargs: Additional arguments (unused).

    Returns:
        EvaluateResult whose score is ``num_matches / min_matches`` capped at
        1.0, with metrics reporting the count and the terms found.
    """
    if not messages:
        return EvaluateResult(
            score=0.0,
            reason="No messages provided",
            metrics={
                "sequence_reasoning": MetricResult(score=0.0, is_score_valid=False, reason="No messages provided")
            },
        )

    response = messages[-1]

    if response.role != "assistant" or not response.content:
        return EvaluateResult(
            score=0.0,
            reason="No assistant response found or response has no content",
            metrics={
                "sequence_reasoning": MetricResult(
                    score=0.0,
                    is_score_valid=False,
                    reason="Message not from assistant or has no content",
                )
            },
        )
    text: str = response.content

    if not sequence_terms:
        sequence_terms = [
            "First",
            "Second",
            "Third",
            "Fourth",
            "Fifth",
            "Next",
            "Then",
            "Finally",
            "Conclusion",
        ]

    if not case_sensitive:
        text = text.lower()
        # Builds a new list; the caller's list is left untouched.
        sequence_terms = [term.lower() for term in sequence_terms]

    found_terms = []
    last_position = -1
    for term in sequence_terms:
        position = text.find(term, last_position + 1)
        # find() returns -1 on a miss, which can never exceed last_position.
        if position > last_position:
            found_terms.append(term)
            last_position = position

    num_matches = len(found_terms)
    success = num_matches >= min_matches
    if min_matches <= 0:
        # Nothing is required, so the requirement is trivially met; this also
        # avoids a ZeroDivisionError or a negative score.
        score = 1.0
    else:
        score = min(1.0, num_matches / min_matches)

    metrics = {
        "sequence_reasoning": MetricResult(
            score=score,
            is_score_valid=success,
            reason=f"Found {num_matches} sequential terms (minimum required: {min_matches})",
        ),
        "sequential_terms_found": MetricResult(
            score=score,
            is_score_valid=success,
            reason=f"Sequential terms found: {', '.join(found_terms)}",
        ),
    }

    reason = f"Detected {num_matches} sequential reasoning terms in order (required: {min_matches})"

    return EvaluateResult(score=score, reason=reason, metrics=metrics)