PyPI - eval-protocol - Versions diffs - 0.0.3__py3-none-any.whl - Mend

eval-protocol 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

development/__init__.py +1 -0
development/normalize_sandbox_fusion.py +628 -0
development/utils/__init__.py +1 -0
development/utils/generate_api_key.py +31 -0
development/utils/subprocess_manager.py +481 -0
eval_protocol/__init__.py +86 -0
eval_protocol/__main__.py +10 -0
eval_protocol/_version.py +21 -0
eval_protocol/adapters/__init__.py +1 -0
eval_protocol/adapters/braintrust.py +8 -0
eval_protocol/adapters/trl.py +8 -0
eval_protocol/agent/__init__.py +29 -0
eval_protocol/agent/models.py +69 -0
eval_protocol/agent/orchestrator.py +893 -0
eval_protocol/agent/resource_abc.py +89 -0
eval_protocol/agent/resource_pool.py +184 -0
eval_protocol/agent/resources/__init__.py +44 -0
eval_protocol/agent/resources/bfcl_envs/__init__.py +1 -0
eval_protocol/agent/resources/bfcl_envs/gorilla_file_system.py +342 -0
eval_protocol/agent/resources/bfcl_envs/math_api.py +40 -0
eval_protocol/agent/resources/bfcl_envs/posting_api.py +157 -0
eval_protocol/agent/resources/bfcl_sim_api_resource.py +314 -0
eval_protocol/agent/resources/docker_resource.py +479 -0
eval_protocol/agent/resources/filesystem_resource.py +371 -0
eval_protocol/agent/resources/http_rollout_protocol.py +85 -0
eval_protocol/agent/resources/http_rollout_resource.py +325 -0
eval_protocol/agent/resources/python_state_resource.py +170 -0
eval_protocol/agent/resources/sql_resource.py +271 -0
eval_protocol/agent/task_manager.py +1064 -0
eval_protocol/agent/tool_registry.py +111 -0
eval_protocol/auth.py +156 -0
eval_protocol/cli.py +425 -0
eval_protocol/cli_commands/__init__.py +1 -0
eval_protocol/cli_commands/agent_eval_cmd.py +264 -0
eval_protocol/cli_commands/common.py +242 -0
eval_protocol/cli_commands/deploy.py +486 -0
eval_protocol/cli_commands/deploy_mcp.py +287 -0
eval_protocol/cli_commands/preview.py +186 -0
eval_protocol/cli_commands/run_eval_cmd.py +202 -0
eval_protocol/common_utils.py +36 -0
eval_protocol/config.py +180 -0
eval_protocol/datasets/__init__.py +1 -0
eval_protocol/datasets/loader.py +521 -0
eval_protocol/evaluation.py +1045 -0
eval_protocol/execution/__init__.py +1 -0
eval_protocol/execution/pipeline.py +920 -0
eval_protocol/gcp_tools.py +484 -0
eval_protocol/generation/cache.py +141 -0
eval_protocol/generation/clients/base.py +67 -0
eval_protocol/generation/clients.py +248 -0
eval_protocol/generic_server.py +165 -0
eval_protocol/integrations/__init__.py +12 -0
eval_protocol/integrations/braintrust.py +51 -0
eval_protocol/integrations/deepeval.py +106 -0
eval_protocol/integrations/openeval.py +40 -0
eval_protocol/integrations/trl.py +187 -0
eval_protocol/mcp/__init__.py +48 -0
eval_protocol/mcp/adapter.py +131 -0
eval_protocol/mcp/client/__init__.py +12 -0
eval_protocol/mcp/client/connection.py +499 -0
eval_protocol/mcp/clients.py +195 -0
eval_protocol/mcp/execution/__init__.py +23 -0
eval_protocol/mcp/execution/base_policy.py +227 -0
eval_protocol/mcp/execution/fireworks_policy.py +209 -0
eval_protocol/mcp/execution/manager.py +506 -0
eval_protocol/mcp/execution/policy.py +421 -0
eval_protocol/mcp/grid_renderer.py +54 -0
eval_protocol/mcp/mcpgym.py +637 -0
eval_protocol/mcp/process_manager.py +177 -0
eval_protocol/mcp/session/__init__.py +11 -0
eval_protocol/mcp/session/manager.py +228 -0
eval_protocol/mcp/simple_process_manager.py +291 -0
eval_protocol/mcp/simulation_server.py +458 -0
eval_protocol/mcp/types.py +80 -0
eval_protocol/mcp_agent/__init__.py +1 -0
eval_protocol/mcp_agent/config.py +147 -0
eval_protocol/mcp_agent/intermediary_server.py +542 -0
eval_protocol/mcp_agent/main.py +210 -0
eval_protocol/mcp_agent/orchestration/__init__.py +1 -0
eval_protocol/mcp_agent/orchestration/base_client.py +132 -0
eval_protocol/mcp_agent/orchestration/local_docker_client.py +702 -0
eval_protocol/mcp_agent/orchestration/remote_http_client.py +304 -0
eval_protocol/mcp_agent/orchestration/stdio_mcp_client_helper.py +3 -0
eval_protocol/mcp_agent/session.py +79 -0
eval_protocol/mcp_env.py +304 -0
eval_protocol/models.py +366 -0
eval_protocol/packaging.py +219 -0
eval_protocol/platform_api.py +360 -0
eval_protocol/playback_policy.py +396 -0
eval_protocol/resources.py +128 -0
eval_protocol/reward_function.py +410 -0
eval_protocol/rewards/__init__.py +94 -0
eval_protocol/rewards/accuracy.py +454 -0
eval_protocol/rewards/accuracy_length.py +173 -0
eval_protocol/rewards/apps_coding_reward.py +331 -0
eval_protocol/rewards/apps_execution_utils.py +149 -0
eval_protocol/rewards/apps_testing_util.py +559 -0
eval_protocol/rewards/bfcl_reward.py +313 -0
eval_protocol/rewards/code_execution.py +1620 -0
eval_protocol/rewards/code_execution_utils.py +72 -0
eval_protocol/rewards/cpp_code.py +861 -0
eval_protocol/rewards/deepcoder_reward.py +161 -0
eval_protocol/rewards/format.py +129 -0
eval_protocol/rewards/function_calling.py +541 -0
eval_protocol/rewards/json_schema.py +422 -0
eval_protocol/rewards/language_consistency.py +700 -0
eval_protocol/rewards/lean_prover.py +479 -0
eval_protocol/rewards/length.py +375 -0
eval_protocol/rewards/list_comparison_math_reward.py +221 -0
eval_protocol/rewards/math.py +762 -0
eval_protocol/rewards/multiple_choice_math_reward.py +232 -0
eval_protocol/rewards/reasoning_steps.py +249 -0
eval_protocol/rewards/repetition.py +342 -0
eval_protocol/rewards/tag_count.py +162 -0
eval_protocol/rl_processing.py +82 -0
eval_protocol/server.py +271 -0
eval_protocol/typed_interface.py +260 -0
eval_protocol/utils/__init__.py +8 -0
eval_protocol/utils/batch_evaluation.py +217 -0
eval_protocol/utils/batch_transformation.py +205 -0
eval_protocol/utils/dataset_helpers.py +112 -0
eval_protocol/utils/module_loader.py +56 -0
eval_protocol/utils/packaging_utils.py +108 -0
eval_protocol/utils/static_policy.py +305 -0
eval_protocol-0.0.3.dist-info/METADATA +635 -0
eval_protocol-0.0.3.dist-info/RECORD +130 -0
eval_protocol-0.0.3.dist-info/WHEEL +5 -0
eval_protocol-0.0.3.dist-info/entry_points.txt +4 -0
eval_protocol-0.0.3.dist-info/licenses/LICENSE +201 -0
eval_protocol-0.0.3.dist-info/top_level.txt +2 -0

eval_protocol/server.py ADDED Viewed

@@ -0,0 +1,271 @@
+import importlib
+import json
+import logging
+import os
+from typing import Any, Callable, Dict, List, Optional, Union
+import uvicorn
+from fastapi import FastAPI, HTTPException, Request
+from pydantic import BaseModel, Field
+from .models import EvaluateResult
+# Setup logging
+# NOTE: basicConfig runs as an import-time side effect. It configures the root
+# logger at INFO level, and does nothing if the root logger already has handlers.
+logging.basicConfig(level=logging.INFO)
+# Module-level logger, named after this module, used by everything below.
+logger = logging.getLogger(__name__)
+class Message(BaseModel):
+    """
+    Model for a conversation message.
+    Declares two required string fields, ``role`` and ``content``. Any additional
+    fields present in the incoming payload are accepted (see ``Config.extra``).
+    """
+    # Required fields of every message in a request body.
+    role: str
+    content: str
+    class Config:
+        extra = "allow"  # Allow extra fields
+class RewardRequest(BaseModel):
+    """
+    Request model for reward endpoints.
+    ``messages`` is required. ``ground_truth`` is optional and may be either a
+    string or a list of messages. Any other top-level fields in the payload are
+    accepted and are forwarded by the /reward handlers in this module to the
+    reward function as keyword arguments.
+    """
+    # Required conversation to score.
+    messages: List[Message] = Field(..., description="List of conversation messages")
+    # Optional reference data; defaults to None when the client omits it.
+    ground_truth: Optional[Union[str, List[Message]]] = Field(
+        None, description="Ground truth data (string or list of messages) for context"
+    )
+    class Config:
+        extra = "allow"  # Allow extra fields for arbitrary kwargs
+class RewardServer:
+    """
+    Server for hosting reward functions.
+    This class creates a FastAPI server that can host reward functions.
+    Args:
+        func_path: Path to the reward function to host (e.g., "module.path:function_name")
+        host: Host to bind the server to
+        port: Port to bind the server to
+    """
+    def __init__(
+        self,
+        func_path: str,
+        host: str = "0.0.0.0",
+        port: int = 8000,
+    ):
+        self.func_path = func_path
+        self.host = host
+        self.port = port
+        self.app = FastAPI(title="Reward Function Server")
+        # Load the reward function
+        self.reward_func = self._load_function()
+        # Register the endpoints
+        self._setup_routes()
+    def _load_function(self):
+        """Load the reward function from the provided path."""
+        try:
+            if ":" not in self.func_path:
+                raise ValueError(f"Invalid func_path format: {self.func_path}, expected 'module.path:function_name'")
+            module_path, func_name = self.func_path.split(":", 1)
+            module = importlib.import_module(module_path)
+            func = getattr(module, func_name)
+            logger.info(f"Loaded reward function {func_name} from {module_path}")
+            return func
+        except (ImportError, AttributeError) as e:
+            raise ImportError(f"Failed to load function from path {self.func_path}: {str(e)}")
+    def _setup_routes(self):
+        """Set up the API routes."""
+        @self.app.get("/")
+        async def root():
+            """Get server info."""
+            return {
+                "status": "ok",
+                "reward_function": self.func_path,
+                "endpoints": ["/reward"],
+            }
+        @self.app.post("/reward")
+        async def reward(request: RewardRequest):
+            """
+            Get reward score for messages.
+            Args:
+                request: RewardRequest object with messages and optional parameters
+            Returns:
+                EvaluateResult object with score and metrics
+            """
+            try:
+                # Extract kwargs from the request
+                kwargs = request.dict(exclude={"messages", "ground_truth"})
+                # Set default for ground_truth if not provided and expected as list
+                ground_truth_data = request.ground_truth
+                if ground_truth_data is None:
+                    # This default applies if ground_truth is expected to be a list of messages for context
+                    ground_truth_data = request.messages[:-1] if request.messages else []
+                # Call the reward function
+                result = self.reward_func(
+                    messages=request.messages,
+                    ground_truth=ground_truth_data,
+                    **kwargs,
+                )
+                # Handle different return types
+                # The self.reward_func is expected to be decorated by the new @reward_function,
+                # which returns a dictionary.
+                if isinstance(result, dict) and "score" in result:
+                    return result
+                elif isinstance(result, EvaluateResult):  # Should not happen if func is from new decorator
+                    logger.warning("Reward function returned EvaluateResult object directly to server; expected dict.")
+                    return result.model_dump()
+                elif isinstance(result, tuple) and len(result) == 2:  # Legacy tuple
+                    logger.warning("Reward function returned legacy tuple format to server.")
+                    score, components = result
+                    return {"score": score, "metrics": components}
+                else:
+                    raise TypeError(f"Invalid return type from reward function after decoration: {type(result)}")
+            except Exception as e:
+                logger.error(f"Error processing reward request: {str(e)}")
+                raise HTTPException(status_code=500, detail=str(e))
+        @self.app.get("/health")
+        async def health():
+            """Health check endpoint."""
+            return {"status": "ok"}
+    def run(self):
+        """Run the server."""
+        logger.info(f"Starting reward server on {self.host}:{self.port}")
+        uvicorn.run(self.app, host=self.host, port=self.port)
+def serve(func_path: str, host: str = "0.0.0.0", port: int = 8000):
+    """
+    Serve a reward function as an HTTP API.
+    Args:
+        func_path: Path to the reward function to serve (e.g., "module.path:function_name")
+        host: Host to bind the server to
+        port: Port to bind the server to
+    """
+    server = RewardServer(func_path=func_path, host=host, port=port)
+    server.run()
+# ngrok-based serve_tunnel is deprecated in favor of Serveo via subprocess_manager.
+# def serve_tunnel(func_path: str, port: int = 8000):
+#     """
+#     Serve a reward function with an ngrok tunnel.
+#     DEPRECATED.
+#     """
+#     try:
+#         import pyngrok.ngrok as ngrok  # type: ignore
+#     except ImportError:
+#         raise ImportError(
+#             "The 'pyngrok' package is required to use serve_tunnel. "
+#             "Please install it with 'pip install pyngrok'."
+#         )
+#
+#     # Open the tunnel
+#     tunnel = ngrok.connect(port)
+#     public_url = tunnel.public_url
+#
+#     # Print the tunnel URL
+#     logger.info(f"Reward function available at: {public_url}/reward")
+#
+#     # Start the server
+#     serve(func_path=func_path, host="0.0.0.0", port=port)
+def create_app(reward_func: Callable[..., EvaluateResult]) -> FastAPI:
+    """
+    Create a FastAPI app for the given reward function.
+    This function creates a FastAPI app that can be used to serve a reward function.
+    It's particularly useful for testing or when you want to manage the lifecycle
+    of the app yourself.
+    Args:
+        reward_func: The reward function to serve
+    Returns:
+        A FastAPI app instance
+    """
+    app = FastAPI(title="Reward Function Server")
+    @app.get("/")
+    async def root():
+        """Get server info."""
+        return {"status": "ok", "endpoints": ["/reward"]}
+    @app.post("/reward")
+    async def reward(request_data: RewardRequest):
+        """
+        Get reward score for messages.
+        Args:
+            request_data: RewardRequest object with messages and optional parameters
+        Returns:
+            EvaluateResult object with score and metrics
+        """
+        try:
+            # Convert Pydantic models to dictionaries using model_dump (Pydantic v2)
+            messages = [msg.model_dump() for msg in request_data.messages]
+            ground_truth_data: Optional[Union[str, List[Dict[str, Any]]]] = None
+            if isinstance(request_data.ground_truth, str):
+                ground_truth_data = request_data.ground_truth
+            elif isinstance(request_data.ground_truth, list):
+                ground_truth_data = [msg.model_dump() for msg in request_data.ground_truth]
+            # Extract kwargs from any extra fields
+            kwargs = {k: v for k, v in request_data.model_dump().items() if k not in ["messages", "ground_truth"]}
+            # Set default for ground_truth if not provided and expected as list
+            if ground_truth_data is None:
+                # This default applies if ground_truth is expected to be a list of messages for context
+                ground_truth_data = messages[:-1] if messages else []
+            # Call the reward function
+            result = reward_func(messages=messages, ground_truth=ground_truth_data, **kwargs)
+            # Handle different return types
+            # The reward_func is expected to be decorated by the new @reward_function,
+            # which returns a dictionary.
+            if isinstance(result, dict) and "score" in result:
+                return result
+            elif isinstance(result, EvaluateResult):  # Should not happen if func is from new decorator
+                logger.warning(
+                    "Reward function passed to create_app returned EvaluateResult object directly; expected dict after decoration."
+                )
+                return result.model_dump()
+            elif isinstance(result, tuple) and len(result) == 2:  # Legacy tuple
+                logger.warning("Reward function passed to create_app returned legacy tuple format.")
+                score, components = result
+                return {"score": score, "metrics": components}
+            else:
+                raise TypeError(f"Invalid return type from reward function after decoration: {type(result)}")
+        except Exception as e:
+            logger.error(f"Error processing reward request: {str(e)}")
+            raise HTTPException(status_code=500, detail=str(e))
+    @app.get("/health")
+    async def health():
+        """Health check endpoint."""
+        return {"status": "ok"}
+    return app

eval_protocol/typed_interface.py ADDED Viewed

@@ -0,0 +1,260 @@
+import inspect
+from functools import wraps
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Protocol,
+    TypeVar,
+    Union,
+    cast,
+    get_args,
+    get_origin,
+)
+from pydantic import TypeAdapter, ValidationError
+# EvaluateResult and StepOutput are now extended/defined in models.py
+from .models import (  # Removed StepOutput as it's not used here directly
+    EvaluateResult,
+    Message,
+)
+# Import resource types
+from .resources import ResourceDict
+# Pydantic adapters built once at import time. reward_function uses them to validate
+# a pointwise return value (one EvaluateResult) or a batch return value (a list of them).
+_single_res_adapter = TypeAdapter(EvaluateResult)
+_list_res_adapter = TypeAdapter(List[EvaluateResult])
+# Define a type for the mode parameter
+EvaluationMode = Literal["pointwise", "batch"]
+# TypeVar for the function being decorated, to preserve its signature as much as possible.
+F = TypeVar("F", bound=Callable[..., Any])
+def reward_function(
+    _func: Optional[F] = None,
+    *,
+    mode: EvaluationMode = "pointwise",
+    id: Optional[str] = None,
+    requirements: Optional[List[str]] = None,  # Changed to List[str]
+    resources: Optional[ResourceDict] = None,  # Resource management
+    concurrency: Optional[int] = None,
+    timeout: Optional[int] = None,
+) -> Union[F, Callable[[F], F]]:
+    """
+    Decorator for user-defined reward and evaluation functions with resource management.
+    It handles:
+    - Coercing input messages (and ground truths if applicable) to Pydantic `Message` objects
+      if the decorated function is type-hinted to receive them. This part currently targets
+      parameters named 'messages' and 'ground_truth'.
+    - Validating that the output conforms to `EvaluateResult` (for pointwise) or `List[EvaluateResult]` (for batch).
+    - Managing declared resources (LLMs, databases, etc.) with automatic setup and cleanup
+    Args:
+        _func: The user's reward/evaluation function. Optional for decorator usage with args.
+        mode: Specifies the operational mode. Defaults to "pointwise".
+              - "pointwise": Function processes one rollout. Expected output: `EvaluateResult`.
+              - "batch": Function processes a batch of rollouts. Expected output: `List[EvaluateResult]`.
+        id: Optional identifier for the reward function, used for deployment
+        requirements: Optional string content for requirements.txt for deployment
+        resources: Optional dictionary of resource types to resource instances.
+                  Example: {"llms": [llm_resource]}
+                  Resources are automatically setup before evaluation and cleaned up after.
+        concurrency: Optional number of concurrent requests to the reward function. This will only take effect if the function is async or there are async resources binded to the reward function (e.g. LLM resource).
+        timeout: Optional timeout for the reward function. This will only take effect if the function is async or there are async resources binded to the reward function (e.g. LLM resource).
+    Returns:
+        A decorator if `_func` is None, or the decorated function.
+    """
+    def decorator(func: F) -> F:
+        sig = inspect.signature(func)
+        params = sig.parameters
+        # Validate that the function accepts **kwargs
+        has_var_keyword = any(param.kind == inspect.Parameter.VAR_KEYWORD for param in params.values())
+        if not has_var_keyword:
+            raise ValueError(
+                f"Function '{func.__name__}' must accept **kwargs parameter. "
+                f"Please add '**kwargs' to the function signature."
+            )
+        # Setup resources once when the decorator is applied
+        resource_managers = {}
+        if resources:
+            for resource_type, resource_list in resources.items():
+                managers = []
+                for resource in resource_list:
+                    resource.setup()
+                    managers.append(resource)
+                resource_managers[resource_type] = managers
+        # Detect if the user supplied function is a coroutine (async def)
+        _is_async_function = inspect.iscoroutinefunction(func)
+        def _prepare_final_args(*args: Any, **kwargs: Any):
+            """Prepare final positional and keyword arguments for the user function call.
+            This includes Pydantic coercion and resource injection. Returns a tuple of
+            (call_args, call_kwargs).
+            """
+            # Bind arguments to handle *args and **kwargs correctly for the wrapped function
+            bound_args = sig.bind_partial(*args, **kwargs)
+            bound_args.apply_defaults()
+            # Create a mutable copy of arguments to modify
+            final_func_args = dict(bound_args.arguments)
+            def _coerce_to_list_message(data_list: Any, arg_name_for_error: str) -> List[Message]:
+                if not isinstance(data_list, list):
+                    raise TypeError(f"Expected a list for '{arg_name_for_error}', got {type(data_list)}")
+                typed_list = []
+                for i, item_data in enumerate(data_list):
+                    if isinstance(item_data, Message):
+                        typed_list.append(item_data)
+                    elif isinstance(item_data, dict):
+                        typed_list.append(Message(**item_data))
+                    else:
+                        raise TypeError(f"Unexpected type for item {i} in '{arg_name_for_error}': {type(item_data)}")
+                return typed_list
+            # 1. Conditional Pydantic conversion for 'messages' (pointwise) or 'rollouts_messages' (batch)
+            if mode == "pointwise" and "messages" in params and "messages" in final_func_args:
+                messages_param_annotation = params["messages"].annotation
+                if (
+                    get_origin(messages_param_annotation) in (list, List)
+                    and get_args(messages_param_annotation)
+                    and get_args(messages_param_annotation)[0] == Message
+                ):
+                    try:
+                        final_func_args["messages"] = _coerce_to_list_message(final_func_args["messages"], "messages")
+                    except Exception as err:
+                        raise ValueError(f"Input 'messages' failed Pydantic validation: {err}") from None
+            elif mode == "batch" and "rollouts_messages" in params and "rollouts_messages" in final_func_args:
+                param_annotation = params["rollouts_messages"].annotation
+                inner = get_args(param_annotation)[0] if get_args(param_annotation) else None
+                if get_origin(param_annotation) == list and inner and get_origin(inner) == list:
+                    if get_args(inner) and get_args(inner)[0] == Message:
+                        try:
+                            coerced_rollouts = []
+                            for i, rollout_data in enumerate(final_func_args["rollouts_messages"]):
+                                coerced_rollouts.append(
+                                    _coerce_to_list_message(rollout_data, f"rollouts_messages[{i}]")
+                                )
+                            final_func_args["rollouts_messages"] = coerced_rollouts
+                        except Exception as err:
+                            raise ValueError(f"Input 'rollouts_messages' failed Pydantic validation: {err}") from None
+            # Ground truth coercion (if needed)
+            if "ground_truth" in params and "ground_truth" in final_func_args:
+                gt_ann = params["ground_truth"].annotation
+                if get_origin(gt_ann) in (list, List) and get_args(gt_ann) and get_args(gt_ann)[0] == Message:
+                    if final_func_args["ground_truth"] is not None:
+                        try:
+                            final_func_args["ground_truth"] = _coerce_to_list_message(
+                                final_func_args["ground_truth"], "ground_truth"
+                            )
+                        except Exception as err:
+                            raise ValueError(
+                                f"Input 'ground_truth' failed Pydantic validation for List[Message]: {err}"
+                            ) from None
+            # Inject resource clients into kwargs (resources are already setup)
+            if resource_managers:
+                final_func_args["resources"] = {
+                    resource_type: [manager.get_client() for manager in managers]
+                    for resource_type, managers in resource_managers.items()
+                }
+            # Call the author's function using the (potentially modified) arguments dictionary.
+            # final_func_args should contain all parameters expected by func, correctly mapped.
+            # Reconstruct args and kwargs for the call to func
+            call_args: List[Any] = []
+            call_kwargs: Dict[str, Any] = {}
+            for (
+                p_name,
+                p_obj,
+            ) in params.items():  # params from inspect.signature(func).parameters
+                if p_obj.kind == inspect.Parameter.VAR_POSITIONAL:
+                    # If original func had *pos_args, final_func_args might contain it as a tuple
+                    call_args.extend(final_func_args.get(p_name, ()))
+                elif p_obj.kind == inspect.Parameter.VAR_KEYWORD:  # **kwargs
+                    # If original func had **kw_args, final_func_args contains the dict of these
+                    call_kwargs.update(final_func_args.get(p_name, {}))
+                elif p_name in final_func_args:  # Named parameters
+                    if p_obj.kind == inspect.Parameter.POSITIONAL_ONLY:
+                        call_args.append(final_func_args[p_name])
+                    else:  # POSITIONAL_OR_KEYWORD, KEYWORD_ONLY
+                        call_kwargs[p_name] = final_func_args[p_name]
+            return call_args, call_kwargs
+        def _validate_output(result: Any):
+            if mode == "pointwise":
+                if isinstance(result, EvaluateResult):
+                    return result
+                return _single_res_adapter.validate_python(result)
+            elif mode == "batch":
+                if isinstance(result, list) and all(isinstance(item, EvaluateResult) for item in result):
+                    return result
+                return _list_res_adapter.validate_python(result)
+            else:
+                raise ValueError(f"Internal error: Invalid mode '{mode}' in wrapper.")
+        if _is_async_function:
+            @wraps(func)
+            async def async_wrapper(
+                *args: Any,
+                **kwargs: Any,
+            ) -> Union[EvaluateResult, List[EvaluateResult]]:
+                call_args, call_kwargs = _prepare_final_args(*args, **kwargs)
+                result = await func(*call_args, **call_kwargs)  # type: ignore[misc]
+                try:
+                    return _validate_output(result)
+                except ValidationError as err:
+                    raise ValueError(
+                        f"Return value from function '{func.__name__}' failed Pydantic validation for mode '{mode}':\n{err}"
+                    ) from None
+            wrapper_fn = async_wrapper
+        else:
+            @wraps(func)
+            def sync_wrapper(
+                *args: Any,
+                **kwargs: Any,
+            ) -> Union[EvaluateResult, List[EvaluateResult]]:
+                call_args, call_kwargs = _prepare_final_args(*args, **kwargs)
+                result = func(*call_args, **call_kwargs)
+                try:
+                    return _validate_output(result)
+                except ValidationError as err:
+                    raise ValueError(
+                        f"Return value from function '{func.__name__}' failed Pydantic validation for mode '{mode}':\n{err}"
+                    ) from None
+            wrapper_fn = sync_wrapper
+        # Set attributes for introspection and deployment
+        wrapper_fn._reward_function_id = id  # type: ignore[attr-defined]
+        wrapper_fn._reward_function_requirements = requirements  # type: ignore[attr-defined]
+        wrapper_fn._reward_function_mode = mode  # type: ignore[attr-defined]
+        wrapper_fn._reward_function_resources = resources  # type: ignore[attr-defined]
+        wrapper_fn._reward_function_timeout = timeout  # type: ignore[attr-defined]
+        wrapper_fn._reward_function_concurrency = concurrency  # type: ignore[attr-defined]
+        return cast(F, wrapper_fn)
+    if _func is None:  # Decorator called with arguments, e.g., @reward_function(mode="batch")
+        return decorator
+    else:  # Decorator called without arguments, e.g., @reward_function (defaults to pointwise)
+        return decorator(_func)

eval_protocol/utils/__init__.py ADDED Viewed

@@ -0,0 +1,8 @@
+# This file makes the 'utils' directory a Python package.
+# You can selectively expose functions or classes from modules within 'utils' here
+# for easier access, e.g.:
+# from .dataset_helpers import load_jsonl_to_hf_dataset
+# For now, allow direct import of modules like:
+# from eval_protocol.utils.dataset_helpers import ...