PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/rewards/length.py ADDED Viewed

@@ -0,0 +1,375 @@
+"""
+Reward functions for evaluating response length.
+This module provides reward functions that evaluate the length of model responses,
+either by simple token/character count or using cosine-scaled rewards to promote
+token efficiency.
+"""
+import math
+import re
+from typing import Any, Callable, Dict, List, Optional, Union
+from ..models import EvaluateResult, Message, MetricResult
+from ..typed_interface import reward_function
+def count_tokens(text: str, method: str = "whitespace") -> int:
+    """
+    Count tokens in text using different methods.
+    Args:
+        text: The text to tokenize
+        method: Tokenization method to use ('whitespace', 'character', or 'words').
+            Any unrecognized value falls back to 'whitespace'.
+    Returns:
+        Token count based on the selected method. Empty or whitespace-only text
+        yields 0 for the 'whitespace' and 'words' methods.
+    """
+    if method == "character":
+        return len(text)
+    if method == "words":
+        return len(re.findall(r"\b[\w\d]+\b", text))
+    # 'whitespace' and any unknown method. str.split() with no separator splits on
+    # runs of whitespace and returns [] for empty/blank text, whereas
+    # re.split(r"\s+", "") returns [""] and would report one token for no content.
+    return len(text.split())
+@reward_function  # type: ignore[arg-type]
+def length_reward(
+    messages: Union[List[Message], List[Dict[str, Any]]],
+    *,
+    ground_truth: Optional[
+        Union[List[Message], List[Dict[str, Any]]]
+    ] = None,  # Not used by this function but part of standard signature
+    target_length: Optional[int] = None,
+    min_length: Optional[int] = None,
+    max_length: Optional[int] = None,
+    token_method: str = "whitespace",
+    scaling: str = "linear",
+    reward_range: Optional[List[float]] = None,
+    **kwargs: Any,
+) -> EvaluateResult:
+    """
+    Score the last message of a conversation by its token count.
+    The last entry of `messages` must be an assistant message (a `Message` or a dict)
+    with non-empty content; otherwise a zero score with an invalid "length" metric
+    is returned. The length constraints are consulted in this order of precedence:
+    `target_length`, then `min_length`/`max_length` (together or alone), and finally a
+    built-in reference length of 100 tokens where shorter responses score higher.
+    Args:
+        messages: Conversation messages; `messages[-1]` is the model's response.
+        ground_truth: Accepted for signature compatibility; never read here.
+        target_length: Optimal token count. The score falls as the relative deviation grows.
+        min_length: Smallest acceptable token count.
+        max_length: Largest acceptable token count.
+        token_method: Passed to `count_tokens` ('whitespace', 'character', or 'words').
+        scaling: 'cosine' selects the cosine curves; any other value selects linear scaling.
+        reward_range: Two-element [lowest, highest] score range, default [0.0, 1.0].
+        **kwargs: Additional arguments (ignored).
+    Returns:
+        EvaluateResult whose score reflects the length evaluation, with "length" and
+        "token_count" metrics.
+    """
+    def _rejected(summary: str, detail: str) -> EvaluateResult:
+        # Zero-score result whose "length" metric is flagged as invalid.
+        return EvaluateResult(
+            score=0.0,
+            reason=summary,
+            metrics={"length": MetricResult(score=0.0, is_score_valid=False, reason=detail)},
+        )
+    if not messages:
+        return _rejected("No messages provided", "No messages provided")
+    last = messages[-1]
+    if isinstance(last, Message):
+        role, content = last.role, last.content
+    elif isinstance(last, dict):
+        role, content = last.get("role"), last.get("content")
+    else:
+        return _rejected("Last message is of unexpected type.", "Invalid message type in messages.")
+    if role != "assistant" or not content:
+        return _rejected("No assistant response found", "Message not from assistant or has no content")
+    token_count = count_tokens(content, method=token_method)
+    low, high = [0.0, 1.0] if reward_range is None else reward_range
+    span = high - low
+    use_cosine = scaling == "cosine"
+    def _rising(fraction: float) -> float:
+        # Climbs from `low` at fraction 0 to `high` at fraction 1.
+        if use_cosine:
+            return low + span * (1.0 - math.cos(fraction * math.pi / 2.0))
+        return low + span * fraction
+    def _falling(fraction: float) -> float:
+        # Drops from `high` at fraction 0 to `low` at fraction 1.
+        if use_cosine:
+            return high - span * (1.0 - math.cos(fraction * math.pi / 2.0))
+        return high - span * fraction
+    def _half_cosine(fraction: float) -> float:
+        # Half cosine wave: `high` at fraction 0, `low` at fraction 1.
+        return low + span * (1.0 + math.cos(fraction * math.pi)) / 2.0
+    if target_length is not None:
+        # Relative distance from the target; a non-positive target counts as a full miss.
+        deviation = abs(token_count - target_length) / target_length if target_length > 0 else 1.0
+        if use_cosine:
+            score = _half_cosine(min(1.0, deviation))
+        else:
+            score = max(low, high - deviation * span)
+        reason = (
+            f"Response length ({token_count} tokens) deviated by {deviation:.2f} from target ({target_length})"
+        )
+        success = deviation < 0.2
+    elif min_length is not None and token_count < min_length:
+        # Too short, with or without an upper bound: partial credit by fraction of the minimum.
+        score = _rising(token_count / min_length)
+        reason = f"Response length ({token_count} tokens) is below minimum ({min_length})"
+        success = False
+    elif max_length is not None and token_count > max_length:
+        overshoot = token_count - max_length
+        if min_length is not None:
+            # Measure the overshoot against the width of the allowed window
+            # (1 when the window is empty, so the division is always defined).
+            width = max_length - min_length if max_length > min_length else 1
+            fraction = min(1.0, overshoot / width)
+        else:
+            # Measure the overshoot against the limit itself; a non-positive limit saturates.
+            fraction = min(1.0, overshoot / max_length if max_length > 0 else 1.0)
+        score = _falling(fraction)
+        reason = f"Response length ({token_count} tokens) exceeds maximum ({max_length})"
+        success = False
+    elif min_length is not None and max_length is not None:
+        score = high
+        reason = f"Response length ({token_count} tokens) is within acceptable range ({min_length}-{max_length})"
+        success = True
+    elif min_length is not None:
+        score = high
+        reason = f"Response length ({token_count} tokens) meets minimum requirement ({min_length})"
+        success = True
+    elif max_length is not None:
+        score = high
+        reason = f"Response length ({token_count} tokens) is within maximum limit ({max_length})"
+        success = True
+    else:
+        # No constraints given: shorter is better, which is useful when combined with
+        # correctness metrics (shorter correct > longer correct > incorrect answers).
+        reference_length = 100  # Default length for normalization
+        fraction = min(1.0, token_count / reference_length)
+        if use_cosine:
+            score = _half_cosine(fraction)
+        else:
+            score = high - fraction * span
+        reason = f"Response length: {token_count} tokens"
+        success = True
+    # Normalize the raw count against the first usable (non-zero) configured length.
+    denominator = target_length or max_length or min_length or 100
+    metrics = {
+        "length": MetricResult(score=score, is_score_valid=success, reason=reason),
+        "token_count": MetricResult(
+            score=min(1.0, float(token_count) / denominator),
+            is_score_valid=success,
+            reason=f"Token count: {token_count}",
+        ),
+    }
+    return EvaluateResult(score=score, reason=reason, metrics=metrics)
+@reward_function  # type: ignore[arg-type]
+def cosine_length_reward(
+    messages: Union[List[Message], List[Dict[str, Any]]],
+    *,
+    ground_truth: Optional[
+        Union[List[Message], List[Dict[str, Any]]]
+    ] = None,  # Not used by this function but part of standard signature
+    correctness: Optional[float] = None,
+    is_correct: Optional[bool] = None,
+    max_length: int = 1000,
+    min_value_wrong: float = 0.0,
+    max_value_wrong: float = 0.3,
+    min_value_correct: float = 0.5,
+    max_value_correct: float = 1.0,
+    token_method: str = "whitespace",
+    **kwargs: Any,
+) -> EvaluateResult:
+    """
+    Reward function that scales based on completion length using a cosine schedule.
+    The model's response is assumed to be the last message in the `messages` list.
+    Inspired by the OpenR1 implementation (https://github.com/OpenRL-Lab/open-r1) and
+    Kimi Technical Report (https://arxiv.org/abs/2501.12599).
+    Shorter correct solutions are rewarded more than longer ones: the score moves from
+    `max_value_correct` at zero tokens down to `min_value_correct` at `max_length`.
+    Longer incorrect solutions are penalized less than shorter ones: the score moves
+    from `min_value_wrong` at zero tokens up to `max_value_wrong` at `max_length`.
+    Args:
+        messages: List of conversation messages, where `messages[-1]` is the model's response.
+        ground_truth: Optional. Expected assistant response trajectory. Not directly used by this length reward.
+        correctness: Optional float (0-1) indicating solution correctness; values >= 0.9
+            count as correct. Only consulted when `is_correct` is None.
+        is_correct: Optional boolean indicating if the solution is correct. Takes
+            precedence over `correctness`. When both are None the solution is
+            treated as incorrect.
+        max_length: Maximum length for scaling. Responses at or beyond it get the
+            end-of-schedule value; a non-positive value puts every response there.
+        min_value_wrong: Minimum reward for wrong answers (typically negative)
+        max_value_wrong: Maximum reward for wrong answers (typically negative but closer to zero)
+        min_value_correct: Minimum reward for correct answers (typically positive)
+        max_value_correct: Maximum reward for correct answers (typically more positive)
+        token_method: Method to count tokens
+        **kwargs: Additional arguments
+    Returns:
+        EvaluateResult with score based on cosine-scaled length evaluation, plus
+        "cosine_length", "token_count" and "correctness" metrics.
+    """
+    def _invalid(summary: str, detail: str) -> EvaluateResult:
+        # Zero-score result whose "cosine_length" metric is flagged as invalid.
+        return EvaluateResult(
+            score=0.0,
+            reason=summary,
+            metrics={"cosine_length": MetricResult(score=0.0, is_score_valid=False, reason=detail)},
+        )
+    if not messages:
+        return _invalid("No messages provided", "No messages provided")
+    response = messages[-1]
+    if isinstance(response, Message):
+        role, text = response.role, response.content
+    elif isinstance(response, dict):
+        role, text = response.get("role"), response.get("content")
+    else:
+        return _invalid("Last message is of unexpected type.", "Invalid message type in messages.")
+    if role != "assistant" or not text:
+        return _invalid("No assistant response found", "Message not from assistant or has no content")
+    token_count = count_tokens(text, method=token_method)
+    if is_correct is not None:
+        solution_is_correct = is_correct
+    elif correctness is not None:
+        solution_is_correct = correctness >= 0.9
+    else:
+        solution_is_correct = False
+    # A non-positive max_length leaves no length budget. Saturate the progress at 1.0
+    # (as length_reward does for its max-only case) instead of raising
+    # ZeroDivisionError or feeding a negative progress into the cosine.
+    progress = min(1.0, token_count / max_length) if max_length > 0 else 1.0
+    cosine_factor = math.cos(progress * math.pi)
+    if solution_is_correct:
+        min_value, max_value = min_value_correct, max_value_correct
+    else:
+        # Bounds are swapped on purpose so that the schedule rises with length:
+        # short wrong answers get min_value_wrong, long ones approach max_value_wrong.
+        min_value, max_value = max_value_wrong, min_value_wrong
+    score = min_value + 0.5 * (max_value - min_value) * (1.0 + cosine_factor)
+    if solution_is_correct:
+        success = True
+        reason = f"Correct solution with length penalty: {token_count} tokens"
+    else:
+        success = False
+        reason = f"Incorrect solution with length consideration: {token_count} tokens"
+    detailed_reason = (
+        f"Length-based {'reward' if solution_is_correct else 'penalty'}: "
+        f"{token_count}/{max_length} tokens, cosine factor: {cosine_factor:.2f}"
+    )
+    metrics = {
+        "cosine_length": MetricResult(
+            score=score,
+            is_score_valid=success,
+            reason=detailed_reason,
+        ),
+        "token_count": MetricResult(
+            # Same saturated fraction as used for the schedule.
+            score=progress,
+            is_score_valid=success,
+            reason=f"Token count: {token_count}/{max_length}",
+        ),
+        "correctness": MetricResult(
+            score=1.0 if solution_is_correct else 0.0,
+            is_score_valid=solution_is_correct,
+            reason=f"Solution is {'correct' if solution_is_correct else 'incorrect'}",
+        ),
+    }
+    return EvaluateResult(score=score, reason=reason, metrics=metrics)

eval_protocol/rewards/list_comparison_math_reward.py ADDED Viewed

@@ -0,0 +1,221 @@
+"""
+Reward function for comparing lists of numbers, often found in math answers
+like sets of divisors, roots, etc.
+"""
+import re
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from ..models import EvaluateResult, Message, MetricResult
+from ..typed_interface import reward_function
+def parse_number_list_from_string(s: str) -> Optional[List[float]]:
+    """
+    Convert a comma-separated string of numbers into a list of floats.
+    Dollar signs are removed first and empty items between commas are skipped.
+    e.g., "1, 2, 3.5, 4" -> [1.0, 2.0, 3.5, 4.0]
+    Returns None when there are no items at all or when any item is not
+    accepted by `float()`.
+    """
+    cleaned = s.replace("$", "").strip()
+    pieces = (piece.strip() for piece in re.split(r"\s*,\s*", cleaned))
+    tokens = [piece for piece in pieces if piece]
+    if not tokens:
+        return None
+    try:
+        # One unparsable item invalidates the whole list.
+        return [float(token) for token in tokens]
+    except ValueError:
+        return None
+def extract_number_list(text: str) -> List[List[float]]:
+    """
+    Pull every parsable list of numbers out of a piece of text.
+    Sources are tried in order and the first one that yields at least one list wins:
+    the contents of \\boxed{...}, then the contents of $$...$$ / $...$, and finally
+    the whole text taken as a single comma-separated list.
+    Args:
+        text: The text to extract number lists from.
+    Returns:
+        A list of extracted number lists (possibly empty). Each inner list contains floats.
+        Example: "\\boxed{1,2,3}, $4,5$" -> [[1.0, 2.0, 3.0]] (the boxed match takes priority)
+    """
+    def _parse_all(fragments):
+        # Keep only the fragments that parse into a non-empty list of numbers.
+        parsed = (parse_number_list_from_string(fragment) for fragment in fragments)
+        return [numbers for numbers in parsed if numbers]
+    # Priority 1: Boxed LaTeX expressions (one level of nested braces allowed)
+    from_boxed = _parse_all(re.findall(r"\\boxed\{((?:[^{}]|\{[^{}]*\})*)\}", text))
+    if from_boxed:
+        return from_boxed
+    # Priority 2: Content within $...$ or $$...$$; each match fills only one of the two groups
+    dollar_matches = re.findall(r"\$\$(.*?)\$\$|\$(.*?)\$", text, re.DOTALL)
+    dollar_bodies = (double or single for double, single in dollar_matches)
+    from_dollars = _parse_all(body.strip() for body in dollar_bodies if body)
+    if from_dollars:
+        return from_dollars
+    # Priority 3: Fall back to parsing the whole text as one list.
+    # This is a fallback and might be less reliable.
+    whole_text_list = parse_number_list_from_string(text)
+    return [whole_text_list] if whole_text_list else []
+@reward_function  # type: ignore[arg-type]
+def list_comparison_math_reward(
+    messages: List[Message],
+    *,
+    ground_truth: str,
+    order_matters: bool = False,
+    **kwargs: Any,
+) -> EvaluateResult:
+    """
+    Score an answer that is a list/set of numbers against an expected list.
+    Number lists are extracted from the assistant's reply (messages[-1].content) and
+    from the ground_truth string; only the first list found on each side is compared.
+    The score is 1.0 on a match and 0.0 otherwise.
+    Args:
+        messages: List of conversation messages. The last message is the assistant's response.
+        ground_truth: String representation of the expected list of numbers.
+        order_matters: If True, the two lists must be equal element by element
+                       (order and count matter). If False (default), they are
+                       compared as sets, so order and duplicates are ignored.
+        **kwargs: Additional keyword arguments (ignored).
+    Returns:
+        EvaluateResult with score and metrics.
+    """
+    def _failure(summary: str, detail: str) -> EvaluateResult:
+        # Zero-score result carrying a single invalid "error" metric.
+        return EvaluateResult(
+            score=0.0,
+            reason=summary,
+            metrics={"error": MetricResult(score=0.0, is_score_valid=False, reason=detail)},
+        )
+    last = messages[-1] if messages else None
+    if not (isinstance(last, Message) and last.role == "assistant" and last.content is not None):
+        return _failure(
+            "Invalid or missing assistant response in messages.",
+            "Last message not a valid assistant response.",
+        )
+    gen_content = last.content
+    if not gen_content:
+        return _failure("Assistant response content is empty.", "Empty generated message content.")
+    if not ground_truth:
+        return _failure("Ground truth string (expected list) is empty.", "Empty ground truth string.")
+    gen_lists = extract_number_list(gen_content)
+    orig_lists = extract_number_list(ground_truth)
+    metrics = {
+        "extracted_original_lists": MetricResult(
+            score=1.0 if orig_lists else 0.0,
+            is_score_valid=bool(orig_lists),
+            reason=f"Original lists: {orig_lists}",
+        ),
+        "extracted_generated_lists": MetricResult(
+            score=1.0 if gen_lists else 0.0,
+            is_score_valid=bool(gen_lists),
+            reason=f"Generated lists: {gen_lists}",
+        ),
+    }
+    if not orig_lists:
+        return EvaluateResult(
+            score=0.0,
+            reason="Could not extract any number list from original message (ground truth).",
+            metrics=metrics,
+        )
+    if not gen_lists:
+        return EvaluateResult(
+            score=0.0,
+            reason="Could not extract any number list from generated message.",
+            metrics=metrics,
+        )
+    # For simplicity, only the first valid list on each side is compared.
+    # Future improvement: handle multiple lists (e.g., if solution has multiple boxed lists)
+    expected = orig_lists[0]
+    produced = gen_lists[0]
+    if order_matters:
+        # Note: exact float equality; an element-wise tolerance would be more robust.
+        matched = produced == expected
+        if matched:
+            match_reason = f"Exact list match (order matters). Gen: {produced} vs Orig: {expected}"
+        else:
+            match_reason = f"List mismatch (order matters). Gen: {produced} vs Orig: {expected}"
+    else:
+        # Note: exact float membership; a tolerance-aware set comparison would be more robust.
+        produced_set = set(produced)
+        expected_set = set(expected)
+        shown_gen = sorted(produced_set)
+        shown_orig = sorted(expected_set)
+        matched = produced_set == expected_set
+        if matched:
+            match_reason = f"Set match (order does not matter). Gen: {shown_gen} vs Orig: {shown_orig}"
+        else:
+            missing_in_gen = expected_set - produced_set
+            extra_in_gen = produced_set - expected_set
+            parts = [f"Set mismatch (order does not matter). Gen: {shown_gen} vs Orig: {shown_orig}."]
+            if missing_in_gen:
+                parts.append(f"Missing in generated: {sorted(missing_in_gen)}.")
+            if extra_in_gen:
+                parts.append(f"Extra in generated: {sorted(extra_in_gen)}.")
+            match_reason = " ".join(parts)
+    score = 1.0 if matched else 0.0
+    metrics["list_comparison"] = MetricResult(score=score, is_score_valid=matched, reason=match_reason)
+    return EvaluateResult(score=score, reason=match_reason, metrics=metrics)