PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/mcp_env.py ADDED Viewed

@@ -0,0 +1,304 @@
+"""
+MCP Environment API for reward-kit - Backward Compatibility Facade
+This module has been refactored into modular components for better maintainability.
+This file now serves as a backward compatibility facade.
+New modular structure:
+- mcp.client.connection: MCP client connection management
+- mcp.execution.policy: LLMBasePolicy and FireworksPolicy for tool calling
+- mcp.execution.manager: Rollout coordination and lifecycle (ExecutionManager)
+- mcp.session.manager: Session and environment management
+Usage remains the same:
+    import eval_protocol as ep
+    # Load dataset with environment configuration and prompts
+    dataset = load_jsonl("dataset.jsonl")
+    # Create general policy (environment-agnostic)
+    policy = ep.FireworksPolicy(model_id="accounts/fireworks/models/qwen3-235b-a22b")
+    # Create environments with dataset-driven configuration
+    envs = ep.make("http://localhost:8000/mcp", dataset=dataset)
+    # Execute tool-calling rollouts
+    trajectories = await ep.rollout(envs, policy=policy, steps=512)
+Key Features:
+- General tool-calling interface that works with any MCP environment
+- Dataset-driven configuration with system prompts and user prompt templates
+- Automatic MCP tool discovery from servers
+- **PROPER MCP PATTERN**: Initial state obtained from MCP resources during session establishment
+- Tools used only for actions/interactions, not for getting initial state
+- Dynamic user prompt formatting based on current observations
+- Environment-agnostic policy that receives tool schemas and makes structured calls
+- Backward compatibility with servers that don't expose resources
+- **NEW**: LLMBasePolicy abstraction enables easy OpenAI integration
+MCP Integration:
+- Session establishment creates MCP connection and discovers resources and tools
+- Initial state comes from MCP resources (list_resources + read_resource calls)
+- Tools are used for subsequent actions during rollout steps
+- Resources provide static/configuration data, tools provide dynamic actions
+"""
+# For legacy compatibility - import the facade functions
+import logging
+import os
+import random
+from typing import Any, Callable, Dict, List, Optional, Union
+# Import all functionality from the new modular components
+from .mcp.execution.manager import ExecutionManager
+from .mcp.execution.policy import AnthropicPolicy, LLMBasePolicy, OpenAIPolicy
+from .mcp.session.manager import GeneralMCPVectorEnv
+from .mcp.types import DatasetRow, MCPSession, MCPToolCall, Trajectory
+# Try to import FireworksPolicy - it may fail if fireworks-ai is not installed
+# or if a different 'fireworks' package is installed
+try:
+    from .mcp.execution.policy import FireworksPolicy
+except:
+    # Silently skip if import fails for any reason
+    pass
+logger = logging.getLogger(__name__)
+# Keep the old MCPVectorEnv for backward compatibility
+MCPVectorEnv = GeneralMCPVectorEnv
+def make(
+    env_spec: str,
+    dataset: Optional[List[Dict]] = None,
+    n: Optional[int] = None,
+    seeds: Optional[List[int]] = None,
+    model_id: str = "unknown",
+    user_prompt_formatter: Optional[Callable] = None,
+) -> GeneralMCPVectorEnv:
+    """
+    Create general MCP environments driven by dataset configuration.
+    Args:
+        env_spec: MCP server URL
+        dataset: List of dataset rows with prompts and context (preferred)
+        n: Number of environments (for backward compatibility)
+        seeds: List of seeds (for backward compatibility)
+        model_id: Model identifier
+        user_prompt_formatter: Optional callback for formatting user prompts
+    Returns:
+        General MCP environment that works with any MCP server
+    Example:
+        # New dataset-driven approach (preferred)
+        dataset = load_jsonl("dataset.jsonl")
+        envs = ep.make("http://localhost:8000/mcp", dataset=dataset)
+        # Legacy approach (backward compatibility)
+        envs = ep.make("http://localhost:8000/mcp", n=10, seeds=seeds)
+    """
+    # Parse environment specification - make sure URL format is correct
+    base_url = env_spec
+    if not base_url.startswith("http"):
+        raise ValueError("Environment spec must be a valid HTTP URL")
+    # Ensure we HAVE a trailing slash to avoid 307 redirects that break POST requests
+    if not base_url.endswith("/"):
+        base_url += "/"
+    # Handle dataset-driven vs legacy approaches
+    if dataset is not None:
+        # New dataset-driven approach
+        dataset_rows = []
+        sessions = []
+        for row in dataset:
+            # Parse dataset row
+            if isinstance(row, dict):
+                # Handle seed from both old location (backward compatibility) and new location
+                environment_context = row.get("environment_context", {})
+                seed = environment_context.get("seed")
+                dataset_row = DatasetRow(
+                    id=row["id"],
+                    seed=seed,
+                    system_prompt=row["system_prompt"],
+                    user_prompt_template=row["user_prompt_template"],
+                    environment_context=environment_context,
+                    user_simulation=(row["user_simulation"] if "user_simulation" in row else None),
+                )
+            else:
+                dataset_row = row  # Assume it's already a DatasetRow
+            dataset_rows.append(dataset_row)
+            # Create MCP session
+            session = MCPSession(
+                session_id=dataset_row.id,
+                base_url=base_url,
+                seed=dataset_row.seed,
+                model_id=model_id,
+                dataset_row=dataset_row,
+            )
+            sessions.append(session)
+        return GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
+    else:
+        # Legacy approach for backward compatibility
+        if n is None:
+            raise ValueError("Either 'dataset' or 'n' must be provided")
+        # Generate seeds if not provided
+        if seeds is None:
+            seeds = [random.randint(0, 2**31 - 1) for _ in range(n)]
+        elif len(seeds) != n:
+            raise ValueError(f"Expected {n} seeds, got {len(seeds)}")
+        # Create default dataset rows for legacy mode
+        dataset_rows = []
+        sessions = []
+        for i in range(n):
+            # Create a default dataset row (environment-agnostic)
+            dataset_row = DatasetRow(
+                id=f"session_{i}",
+                seed=seeds[i],
+                system_prompt="You are an AI agent interacting with an environment via available tools.",
+                user_prompt_template="Current observation: {observation}. Use available tools to interact with the environment.",
+                environment_context={},
+            )
+            dataset_rows.append(dataset_row)
+            # Create MCP session
+            session = MCPSession(
+                session_id=f"session_{i}",
+                base_url=base_url,
+                seed=seeds[i],
+                model_id=model_id,
+                dataset_row=dataset_row,
+            )
+            sessions.append(session)
+        return GeneralMCPVectorEnv(sessions, dataset_rows, user_prompt_formatter)
+async def rollout(
+    envs: Union[GeneralMCPVectorEnv, "MCPVectorEnv"],
+    policy: Union[FireworksPolicy, LLMBasePolicy, Callable],
+    steps: int = 512,
+    openai_format_log_file: Optional[str] = None,
+    max_concurrent_rollouts: int = 8,
+) -> List[Trajectory]:
+    """
+    Execute general rollouts using tool calling interface with automatic record/playback.
+    Uses concurrent execution with semaphore-based concurrency control for efficiency.
+    This works with ANY MCP environment because:
+    1. Policy receives tool schemas and makes tool calls
+    2. Environment prompts come from dataset
+    3. No hardcoded environment logic
+    Args:
+        envs: GeneralMCPVectorEnv instance
+        policy: Policy that takes tool schemas, observations, prompts and returns tool calls
+        steps: Maximum steps per rollout
+        openai_format_log_file: Optional file to log clean OpenAI format for terminated trajectories only
+        max_concurrent_rollouts: Maximum number of concurrent rollouts to run
+    Environment Variable Control:
+        EP_PLAYBACK_FILE: Controls record/playback mode
+        - Not set: Normal live mode
+        - Set but file doesn't exist: Record mode (file will be created)
+        - Set and file exists: Playback mode (uses recorded data)
+    Returns:
+        List of Trajectory objects with complete rollout data
+    Example:
+        # Live mode
+        trajectories = await ep.rollout(envs, policy)
+        # Recording mode
+        os.environ["EP_PLAYBACK_FILE"] = "record.jsonl"
+        trajectories = await ep.rollout(envs, policy, openai_format_log_file="sft_data.jsonl")
+        # Playback mode (after recording file exists)
+        trajectories = await ep.rollout(envs, policy)
+    """
+    # Use the new ExecutionManager for execution
+    execution_manager = ExecutionManager()
+    return await execution_manager.execute_rollouts(
+        envs, policy, steps, openai_format_log_file, max_concurrent_rollouts
+    )
+async def test_mcp(base_url: str, seeds: List[int]) -> Dict[str, Any]:
+    """
+    Test function for validating MCP server as mentioned in north star document.
+    Args:
+        base_url: Base URL of MCP server (e.g., "http://localhost:8000/mcp")
+        seeds: List of seeds to test
+    Returns:
+        Test results dictionary
+    """
+    print(f"🧪 Testing MCP server at {base_url} with {len(seeds)} seeds...")
+    results = {"total_tests": len(seeds), "successful": 0, "failed": 0, "results": []}
+    for seed in seeds:
+        try:
+            # Create single environment
+            envs = make(base_url, n=1, seeds=[seed], model_id="test-model")
+            # Simple policy for testing
+            policy = FireworksPolicy("test-model")
+            # Run short rollout
+            trajectories = await rollout(envs, policy=policy, steps=10)
+            if trajectories and len(trajectories[0].observations) > 1:
+                results["successful"] += 1
+                results["results"].append(
+                    {
+                        "seed": seed,
+                        "status": "success",
+                        "steps": trajectories[0].steps,
+                        "total_reward": trajectories[0].total_reward,
+                    }
+                )
+            else:
+                results["failed"] += 1
+                results["results"].append({"seed": seed, "status": "failed", "error": "empty_trajectory"})
+        except Exception as e:
+            results["failed"] += 1
+            results["results"].append({"seed": seed, "status": "failed", "error": str(e)})
+    success_rate = results["successful"] / results["total_tests"] * 100
+    print(f"✅ Test complete: {results['successful']}/{results['total_tests']} successful ({success_rate:.1f}%)")
+    return results
+# Public names exported by this facade module (what `from .mcp_env import *` binds).
+# NOTE(review): the original note here asked for these names to be added to the
+# exports in eval_protocol/__init__.py - confirm that has been done.
+# NOTE(review): "FireworksPolicy" is imported under a try/except earlier in this
+# module, so it may be unavailable when the optional dependency is missing.
+__all__ = [
+    "make",
+    "rollout",
+    "AnthropicPolicy",
+    "FireworksPolicy",
+    "OpenAIPolicy",
+    "LLMBasePolicy",  # New base class for OpenAI integration
+    "MCPVectorEnv",
+    "GeneralMCPVectorEnv",
+    "MCPToolCall",
+    "DatasetRow",
+    "Trajectory",
+    "test_mcp",
+]

eval_protocol/models.py ADDED Viewed

@@ -0,0 +1,366 @@
+import json
+from typing import Any, Dict, List, Optional, Union
+from openai.types.chat.chat_completion_message import (
+    ChatCompletionMessageToolCall,
+    FunctionCall,
+)
+from pydantic import BaseModel, Field
+class Message(BaseModel):
+    """Chat message model compatible with OpenAI's interface."""
+    role: str
+    content: Optional[str] = ""  # Content can be None for tool calls in OpenAI API
+    name: Optional[str] = None
+    tool_call_id: Optional[str] = None
+    tool_calls: Optional[List[ChatCompletionMessageToolCall]] = None
+    function_call: Optional[FunctionCall] = None
+    @classmethod
+    def model_validate(cls, obj, *args, **kwargs):
+        if isinstance(obj, dict) and "role" not in obj:
+            raise ValueError("Role is required")
+        return super().model_validate(obj, *args, **kwargs)
+class MetricResult(BaseModel):
+    """Result of a single metric evaluation.
+    Attributes:
+        is_score_valid (bool): Whether the score is valid for this metric (required).
+        score (float): The score for this metric.
+        reason (str): Explanation for the score.
+    """
+    is_score_valid: bool = True
+    score: float = Field(..., ge=0.0, le=1.0)
+    reason: str
+    def __getitem__(self, key: str) -> Any:
+        if key in self.__fields__:  # Changed to __fields__ for Pydantic v1 compatibility
+            value = getattr(self, key)
+            return value
+        raise KeyError(f"'{key}'")
+    def __contains__(self, key: str) -> bool:
+        return key in self.__fields__  # Changed to __fields__
+    def get(self, key: str, default: Any = None) -> Any:
+        return getattr(self, key, default)
+    def keys(self):
+        return self.__fields__.keys()  # Changed to __fields__
+    def values(self):
+        # For consistency with __getitem__ returning raw attribute values (including nested models)
+        return [getattr(self, key) for key in self.__fields__.keys()]  # Changed to __fields__
+    def items(self):
+        return [(key, getattr(self, key)) for key in self.__fields__.keys()]  # Changed to __fields__
+    def __iter__(self):
+        return iter(self.__fields__.keys())  # Changed to __fields__
+class StepOutput(BaseModel):
+    """Defines the base reward and other metrics for a single conceptual step within a rollout,
+    as determined by the user's reward function.
+    """
+    # Required. May be an int or a str; the description says the system uses it
+    # to map this output back to its internal StepData.
+    step_index: Union[int, str] = Field(
+        description="User-defined index for the step (e.g., assistant message index, turn number). This is used by the system to map this output to the internal StepData."
+    )
+    # Required. No range constraint is declared on this value.
+    base_reward: float = Field(description="Base reward calculated by the user's reward function for this step.")
+    # Optional, defaults to False.
+    terminated: bool = Field(
+        default=False,
+        description="Whether the environment signaled termination at this step."
+    )
+    # Optional free-form dict, defaults to None.
+    control_plane_info: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Structured info from the environment's control plane."
+    )
+    # default_factory gives every instance its own empty dict.
+    metrics: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Optional dictionary of custom metrics for this step.",
+    )
+    # Optional, defaults to None.
+    reason: Optional[str] = Field(
+        default=None,
+        description="Optional explanation for the step's base reward or metrics.",
+    )
+class EvaluateResult(BaseModel):
+    """The complete result of an evaluator.
+    For standard evaluation, it provides an overall score and component metrics.
+    For Reinforcement Learning, it can also provide per-step base rewards via 'step_outputs'.
+    This unified model serves both per-turn and per-trajectory evaluation scenarios.
+    Attributes:
+        score (float): The overall evaluation score.
+        is_score_valid (bool): Whether the overall score is valid. Defaults to True.
+        reason (Optional[str]): Optional explanation for the overall score.
+        metrics (Dict[str, MetricResult]): Dictionary of component metrics for detailed evaluation.
+        step_outputs (Optional[List[StepOutput]]): For RL, a list of outputs for each conceptual step,
+                                                  providing base rewards.
+        error (Optional[str]): Optional error message if evaluation failed.
+        trajectory_info (Optional[Dict[str, Any]]): Additional trajectory-level information.
+        final_control_plane_info (Optional[Dict[str, Any]]): The final control plane state that led to termination.
+    """
+    score: float = Field(..., description="The overall evaluation score, typically between 0.0 and 1.0.")
+    is_score_valid: bool = Field(default=True, description="Whether the overall score is valid.")
+    reason: Optional[str] = Field(default=None, description="Optional explanation for the overall score.")
+    metrics: Dict[str, MetricResult] = Field(
+        default_factory=dict,
+        description="Dictionary of component metrics for detailed breakdown.",
+    )
+    # New field for RL per-step base rewards
+    step_outputs: Optional[List[StepOutput]] = Field(
+        default=None,
+        description="For RL, a list of outputs for each conceptual step, providing base rewards.",
+    )
+    error: Optional[str] = Field(
+        default=None,
+        description="Optional error message if the evaluation itself encountered an issue.",
+    )
+    # New fields for unified trajectory and row-wise results
+    trajectory_info: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Additional trajectory-level information (duration, steps, termination_reason, etc.)."
+    )
+    final_control_plane_info: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="The final control plane state that led to termination."
+    )
+    def __getitem__(self, key: str) -> Any:
+        if key in self.__fields__:  # Changed to __fields__
+            value = getattr(self, key)
+            # If the value is a dict of MetricResult, and we want __getitem__ on metrics
+            # to return a dict of dicts (rather than dict of MetricResult objects),
+            # we'd need special handling here.
+            # For now, return the raw attribute value, consistent with MetricResult.__getitem__
+            return value
+        raise KeyError(f"'{key}'")
+    def __contains__(self, key: str) -> bool:
+        return key in self.__fields__  # Changed to __fields__
+    def get(self, key: str, default: Any = None) -> Any:
+        return getattr(self, key, default)
+    def keys(self):
+        return self.__fields__.keys()  # Changed to __fields__
+    def values(self):
+        # For consistency with __getitem__ returning raw attribute values
+        return [getattr(self, key) for key in self.__fields__.keys()]  # Changed to __fields__
+    def items(self):
+        return [(key, getattr(self, key)) for key in self.__fields__.keys()]  # Changed to __fields__
+    def __iter__(self):
+        return iter(self.__fields__.keys())  # Changed to __fields__
+class EvaluationRow(BaseModel):
+    """
+    Unified data structure for a single evaluation unit that contains messages,
+    tools, and evaluation results. This can represent either a single turn evaluation
+    or a complete trajectory evaluation.
+    This model serves as the canonical format for evaluation data across the system,
+    supporting both row-wise batch evaluation and trajectory-based RL evaluation.
+    """
+    # Core conversation data
+    messages: List[Message] = Field(
+        description="List of messages in the conversation/trajectory."
+    )
+    # Tool and function call information
+    tools: Optional[List[Dict[str, Any]]] = Field(
+        default=None,
+        description="Available tools/functions that were provided to the agent."
+    )
+    # Input-related metadata (grouped together for cleaner organization)
+    input_metadata: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description="Metadata related to the input (dataset info, model config, session data, etc.)."
+    )
+    # Unified evaluation result
+    evaluation_result: Optional[EvaluateResult] = Field(
+        default=None,
+        description="The evaluation result for this row/trajectory."
+    )
+    def is_trajectory_evaluation(self) -> bool:
+        """
+        Returns True if this represents a trajectory evaluation (has step_outputs),
+        False if it represents a single turn evaluation.
+        """
+        return (
+            self.evaluation_result is not None
+            and self.evaluation_result.step_outputs is not None
+            and len(self.evaluation_result.step_outputs) > 0
+        )
+    def get_conversation_length(self) -> int:
+        """Returns the number of messages in the conversation."""
+        return len(self.messages)
+    def get_assistant_messages(self) -> List[Message]:
+        """Returns only the assistant messages from the conversation."""
+        return [msg for msg in self.messages if msg.role == "assistant"]
+    def get_user_messages(self) -> List[Message]:
+        """Returns only the user messages from the conversation."""
+        return [msg for msg in self.messages if msg.role == "user"]
+    def get_input_metadata(self, key: str, default: Any = None) -> Any:
+        """Helper method to get a specific value from input_metadata."""
+        if self.input_metadata is None:
+            return default
+        return self.input_metadata.get(key, default)
+# Original dataclass-based models for backwards compatibility
+# These are deprecated and will be removed in a future version
+# Use EvaluateResult and MetricResult instead
+# MetricRewardOutput and RewardOutput are fully removed.
+# --- Models for New Agent Evaluation Framework (V2) ---
+class ResourceServerConfig(BaseModel):
+    """
+    Configuration for a resource server required by a task.
+    """
+    # Both fields are required strings. The '{port}' substitution described
+    # below is not performed by this model; it is left to whatever consumes it.
+    start_command: str = Field(
+        description="The command to start the server. The string '{port}' will be replaced with a dynamically allocated free port."
+    )
+    health_check_url: str = Field(
+        description="The URL to poll to check if the server is ready. The string '{port}' will be replaced with the allocated port."
+    )
+class EvaluationCriteriaModel(BaseModel):
+    """
+    Defines criteria for evaluating task success, often by querying the final state of a resource.
+    """
+    # Every field is optional and defaults to None; this model only stores the
+    # values and does not run queries or transforms itself.
+    final_state_query: Optional[str] = Field(
+        default=None,
+        description="A query (e.g., SQL) to run on the final state of the resource.",
+    )
+    # NOTE(review): this is Python source held in a string. If the consumer
+    # evaluates it (eval/exec), task definitions must come from trusted
+    # sources only - confirm against the consumer.
+    expected_query_result_transform: Optional[str] = Field(
+        default=None,
+        description="A Python lambda string (e.g., 'lambda x: x > 0') to transform and evaluate the query result to a boolean.",
+    )
+    # Explicit fields for ground truth data for BFCL evaluation
+    ground_truth_function_calls: Optional[List[List[str]]] = Field(
+        default=None, description="Ground truth function calls for BFCL evaluation."
+    )
+    ground_truth_comparable_state: Optional[Dict[str, Any]] = Field(
+        default=None, description="Ground truth comparable state for BFCL evaluation."
+    )
+    # Future: Could include other complex evaluation logic or references
+class TaskDefinitionModel(BaseModel):
+    """
+    Pydantic model for validating the structure of a V2 agent evaluation task definition file (YAML/JSON).
+    """
+    name: str = Field(description="Unique name for the task.")
+    description: Optional[str] = Field(default=None, description="A brief description of the task.")
+    resource_type: str = Field(
+        description="The type of ForkableResource to use (e.g., 'SQLResource', 'PythonStateResource', 'FileSystemResource', 'DockerResource')."
+    )
+    base_resource_config: Dict[str, Any] = Field(
+        default_factory=dict,
+        description="Configuration dictionary passed to the base resource's setup() method.",
+    )
+    tools_module_path: Optional[str] = Field(
+        default=None,
+        description="Optional Python import path to a module containing custom tool functions for this task.",
+    )
+    reward_function_path: str = Field(
+        description="Python import path to the reward function (e.g., 'my_module.my_reward_func')."
+    )
+    goal_description: Optional[str] = Field(
+        default=None,
+        description="A human-readable description of the agent's goal for this task.",
+    )
+    evaluation_criteria: Optional[EvaluationCriteriaModel] = Field(
+        default=None,
+        description="Criteria used by the Orchestrator to determine if the primary goal was achieved.",
+    )
+    initial_user_prompt: Optional[str] = Field(
+        default=None,
+        description="The initial prompt or message to start the agent interaction. Deprecated if 'messages' field is used for multi-turn.",
+    )
+    messages: Optional[List[Dict[str, Any]]] = Field(  # Explicit field for initial/multi-turn messages
+        default=None,
+        description="A list of messages to start the conversation, can represent multiple user turns for sequential processing.",
+    )
+    # PoC / Task specific parameters
+    poc_max_turns: int = Field(
+        default=3,
+        ge=1,
+        description="For PoC Orchestrator, the maximum number of interaction turns.",
+    )
+    # Allow other custom fields to be captured if needed by specific tasks or resources
+    # These will be accessible via `model_extra` if `model_config` has `extra = 'allow'`
+    # Or define a specific field:
+    # custom_task_params: Dict[str, Any] = Field(default_factory=dict)
+    resource_server: Optional[ResourceServerConfig] = Field(
+        default=None,
+        description="Configuration for a background server required for the task.",
+    )
+    num_rollouts: int = Field(
+        default=1,
+        ge=1,
+        description="Number of parallel rollouts to execute for this task definition.",
+    )
+    # Data-driven evaluation fields
+    dataset_path: Optional[str] = Field(
+        default=None,
+        description="Path to dataset file (JSONL) containing experimental conditions for data-driven evaluation.",
+    )
+    num_rollouts_per_sample: int = Field(
+        default=1,
+        ge=1,
+        description="Number of rollouts to execute per sample from the dataset.",
+    )
+    class Config:
+        extra = "allow"  # Allow and capture extra fields not explicitly defined
+        # For Pydantic v2, it's model_config = {"extra": "allow"}
+        # Assuming Pydantic v1 style for now based on existing file, can update if needed.
+        # If using Pydantic v1, `Config.extra = "allow"` is correct.
+        # For Pydantic v2, this should be:
+        # from pydantic import ConfigDict
+        # model_config = ConfigDict(extra='allow')
+        # For Pydantic v1, `Config.extra = "allow"` is correct.