PyPI - eval-protocol - Versions diffs - 0.2.55.dev1__tar.gz → 0.2.57.dev2__tar.gz - Mend

eval-protocol 0.2.55.dev1__tar.gz → 0.2.57.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (443) hide show

{eval_protocol-0.2.55.dev1/eval_protocol.egg-info → eval_protocol-0.2.57.dev2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.2.55.dev1
+Version: 0.2.57.dev2
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT

{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/__init__.py RENAMED Viewed

@@ -29,12 +29,20 @@ from .playback_policy import PlaybackPolicyBase
 from .resources import create_llm_resource
 from .reward_function import RewardFunction
 from .typed_interface import reward_function
-from .quickstart import aha_judge, multi_turn_assistant_to_ground_truth, assistant_to_ground_truth
-from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor
+from .quickstart.aha_judge import aha_judge
+from .utils.evaluation_row_utils import (
+    multi_turn_assistant_to_ground_truth,
+    assistant_to_ground_truth,
+    filter_longest_conversation,
+)
+from .pytest import evaluation_test, SingleTurnRolloutProcessor, RemoteRolloutProcessor, GithubActionRolloutProcessor
 from .pytest.parameterize import DefaultParameterIdGenerator
 from .log_utils.elasticsearch_direct_http_handler import ElasticsearchDirectHttpHandler
 from .log_utils.rollout_id_filter import RolloutIdFilter
 from .log_utils.util import setup_rollout_logging_for_elasticsearch_handler
+from .log_utils.fireworks_tracing_http_handler import FireworksTracingHttpHandler
+from .log_utils.elasticsearch_client import ElasticsearchConfig
 from .types.remote_rollout_processor import (
     InitRequest,
@@ -81,12 +89,14 @@ except ImportError:
 warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
 __all__ = [
+    "ElasticsearchConfig",
     "ElasticsearchDirectHttpHandler",
     "RolloutIdFilter",
     "setup_rollout_logging_for_elasticsearch_handler",
     "DataLoaderConfig",
     "Status",
     "RemoteRolloutProcessor",
+    "GithubActionRolloutProcessor",
     "InputMetadata",
     "EvaluationRow",
     "DefaultParameterIdGenerator",
@@ -95,6 +105,7 @@ __all__ = [
     "aha_judge",
     "multi_turn_assistant_to_ground_truth",
     "assistant_to_ground_truth",
+    "filter_longest_conversation",
     "evaluation_test",
     "SingleTurnRolloutProcessor",
     "OpenAIResponsesAdapter",
@@ -103,6 +114,7 @@ __all__ = [
     "BraintrustAdapter",
     "create_braintrust_adapter",
     "LangSmithAdapter",
+    "FireworksTracingHttpHandler",
     # Core interfaces
     "Message",
     "MetricResult",

{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/_version.py RENAMED Viewed

@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
- "date": "2025-10-13T20:26:15-0700",
+ "date": "2025-10-21T14:44:45-0700",
  "dirty": false,
  "error": null,
- "full-revisionid": "3c516e0d466d1a1a2d501f7ca0ac6ee7f10cf017",
- "version": "0.2.55-dev1"
+ "full-revisionid": "5a0eb89e557f1362bc17acd8a02c25a072dc3092",
+ "version": "0.2.57-dev2"
 }
 '''  # END VERSION_JSON

{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/fireworks_tracing.py RENAMED Viewed

@@ -265,6 +265,55 @@ class FireworksTracingAdapter(BaseAdapter):
         self.base_url = base_url.rstrip("/")
         self.timeout = timeout
+    def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
+        """Fetch logs from Fireworks tracing gateway /logs endpoint.
+        Returns entries with keys: timestamp, message, severity, tags.
+        """
+        if not tags:
+            raise ValueError("At least one tag is required to fetch logs")
+        headers = {"Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}"}
+        params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
+        # Try /logs first, fall back to /v1/logs if not found
+        urls_to_try = [f"{self.base_url}/logs", f"{self.base_url}/v1/logs"]
+        data: Dict[str, Any] = {}
+        last_error: Optional[str] = None
+        for url in urls_to_try:
+            try:
+                response = requests.get(url, params=params, timeout=self.timeout, headers=headers)
+                if response.status_code == 404:
+                    # Try next variant
+                    last_error = f"404 for {url}"
+                    continue
+                response.raise_for_status()
+                data = response.json() or {}
+                break
+            except requests.exceptions.RequestException as e:
+                last_error = str(e)
+                continue
+        else:
+            # All attempts failed
+            if last_error:
+                logger.error("Failed to fetch logs from Fireworks (tried %s): %s", urls_to_try, last_error)
+            return []
+        entries: List[Dict[str, Any]] = data.get("entries", []) or []
+        # Normalize minimal shape
+        results: List[Dict[str, Any]] = []
+        for e in entries:
+            results.append(
+                {
+                    "timestamp": e.get("timestamp"),
+                    "message": e.get("message"),
+                    "severity": e.get("severity", "INFO"),
+                    "tags": e.get("tags", []),
+                    "status": e.get("status"),
+                }
+            )
+        return results
     def get_evaluation_rows(
         self,
         tags: List[str],

{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/adapters/openai_responses.py RENAMED Viewed

@@ -169,7 +169,9 @@ class OpenAIResponsesAdapter(BaseAdapter):
                         raise NotImplementedError(f"Unsupported content type: {content_item.type}")
             elif item.type == "function_call_output":
                 # Collect tool call outputs to add before assistant message
-                tool_call_outputs.append(Message(role="tool", content=item.output, tool_call_id=item.call_id))
+                tool_call_outputs.append(
+                    Message(role="tool", content=self._coerce_tool_output(item.output), tool_call_id=item.call_id)
+                )
             elif item.type == "function_call":
                 tool_call = ChatCompletionMessageToolCall(
                     id=item.call_id, type="function", function=Function(name=item.name, arguments=item.arguments)
@@ -186,3 +188,29 @@ class OpenAIResponsesAdapter(BaseAdapter):
             messages.append(Message(role="assistant", tool_calls=current_tool_calls))
         return reversed(messages)
+    def _coerce_tool_output(self, output: Any) -> str:
+        """Coerce OpenAI Responses tool output into a string for Message.content.
+        The Responses API may return structured content lists. For our purposes,
+        we stringify non-string outputs to satisfy the Message.content type.
+        """
+        if isinstance(output, str):
+            return output
+        try:
+            # Attempt to join list of objects with any 'text' fields
+            if isinstance(output, list):
+                parts: list[str] = []
+                for part in output:
+                    text = None
+                    if isinstance(part, dict):
+                        text = part.get("text")
+                    if text:
+                        parts.append(str(text))
+                    else:
+                        parts.append(str(part))
+                return "\n".join(parts)
+            # Fallback to string conversion
+            return str(output)
+        except Exception:
+            return str(output)

{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/auth.py RENAMED Viewed

@@ -4,6 +4,8 @@ import os
 from pathlib import Path
 from typing import Dict, Optional  # Added Dict
+import requests
 logger = logging.getLogger(__name__)
 # Default locations (used for tests and as fallback). Actual resolution is dynamic via _get_auth_ini_file().
@@ -218,3 +220,40 @@ def get_fireworks_api_base() -> str:
     else:
         logger.debug("FIREWORKS_API_BASE not set in environment, defaulting to %s.", api_base)
     return api_base
+def verify_api_key_and_get_account_id(
+    api_key: Optional[str] = None,
+    api_base: Optional[str] = None,
+) -> Optional[str]:
+    """
+    Calls the Fireworks API verify endpoint to validate the API key and returns the
+    account id from response headers when available.
+    Args:
+        api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
+        api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
+    Returns:
+        The resolved account id if verification succeeds and the header is present; otherwise None.
+    """
+    try:
+        resolved_key = api_key or get_fireworks_api_key()
+        if not resolved_key:
+            return None
+        resolved_base = api_base or get_fireworks_api_base()
+        url = f"{resolved_base.rstrip('/')}/verifyApiKey"
+        headers = {"Authorization": f"Bearer {resolved_key}"}
+        resp = requests.get(url, headers=headers, timeout=10)
+        if resp.status_code != 200:
+            logger.debug("verifyApiKey returned status %s", resp.status_code)
+            return None
+        # Header keys could vary in case; requests provides case-insensitive dict
+        account_id = resp.headers.get("x-fireworks-account-id") or resp.headers.get("X-Fireworks-Account-Id")
+        if account_id and account_id.strip():
+            logger.debug("Resolved FIREWORKS_ACCOUNT_ID via verifyApiKey: %s", account_id)
+            return account_id.strip()
+        return None
+    except Exception as e:
+        logger.debug("Failed to verify API key for account id resolution: %s", e)
+        return None

{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli.py RENAMED Viewed

@@ -301,6 +301,22 @@ def parse_args(args=None):
     logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates")
     logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)")
     logs_parser.add_argument("--debug", action="store_true", help="Enable debug mode")
+    logs_parser.add_argument("--disable-elasticsearch-setup", action="store_true", help="Disable Elasticsearch setup")
+    logs_parser.add_argument(
+        "--use-env-elasticsearch-config",
+        action="store_true",
+        help="Use env vars for Elasticsearch config (requires ELASTICSEARCH_URL, ELASTICSEARCH_API_KEY, ELASTICSEARCH_INDEX_NAME)",
+    )
+    logs_parser.add_argument(
+        "--use-fireworks",
+        action="store_true",
+        help="Force Fireworks tracing backend for logs UI (overrides env auto-detection)",
+    )
+    logs_parser.add_argument(
+        "--use-elasticsearch",
+        action="store_true",
+        help="Force Elasticsearch backend for logs UI (overrides env auto-detection)",
+    )
     # Upload command
     upload_parser = subparsers.add_parser(

eval_protocol-0.2.57.dev2/eval_protocol/cli_commands/logs.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""
+CLI command for serving logs with file watching and real-time updates.
+"""
+import sys
+from pathlib import Path
+import os
+from ..utils.logs_server import serve_logs
+def logs_command(args):
+    """Serve logs with file watching and real-time updates"""
+    port = args.port
+    print("🚀 Starting Eval Protocol Logs Server")
+    print(f"🌐 URL: http://localhost:{port}")
+    print(f"🔌 WebSocket: ws://localhost:{port}/ws")
+    print(f"👀 Watching paths: {['current directory']}")
+    print(f"🔍 Debug mode: {args.debug}")
+    print("Press Ctrl+C to stop the server")
+    print("-" * 50)
+    # Backend selection: Fireworks first when API key present, unless overridden
+    use_fireworks = False
+    if getattr(args, "use_fireworks", False):
+        use_fireworks = True
+    elif getattr(args, "use_elasticsearch", False):
+        use_fireworks = False
+    else:
+        use_fireworks = bool(os.environ.get("FIREWORKS_API_KEY"))
+    # Setup backend configs
+    elasticsearch_config = None
+    # Prefer explicit FW_TRACING_GATEWAY_BASE_URL, then GATEWAY_URL from env (remote validation),
+    # finally default to public tracing.fireworks.ai
+    fireworks_base_url = (
+        os.environ.get("FW_TRACING_GATEWAY_BASE_URL")
+        or os.environ.get("GATEWAY_URL")
+        or "https://tracing.fireworks.ai"
+    )
+    try:
+        serve_logs(
+            port=args.port,
+            elasticsearch_config=elasticsearch_config,
+            debug=args.debug,
+            backend="fireworks" if use_fireworks else "elasticsearch",
+            fireworks_base_url=fireworks_base_url if use_fireworks else None,
+        )
+        return 0
+    except KeyboardInterrupt:
+        print("\n🛑 Server stopped by user")
+        return 0
+    except Exception as e:
+        print(f"❌ Error starting server: {e}")
+        return 1

{eval_protocol-0.2.55.dev1 → eval_protocol-0.2.57.dev2}/eval_protocol/cli_commands/upload.py RENAMED Viewed

@@ -12,7 +12,12 @@ from pathlib import Path
 from typing import Any, Callable, Iterable, Optional
 import pytest
-from eval_protocol.auth import get_fireworks_account_id, get_fireworks_api_key
+from eval_protocol.auth import (
+    get_fireworks_account_id,
+    get_fireworks_api_key,
+    get_fireworks_api_base,
+    verify_api_key_and_get_account_id,
+)
 from eval_protocol.platform_api import create_or_update_fireworks_secret
 from eval_protocol.evaluation import create_evaluation
@@ -259,81 +264,43 @@ def _parse_entry(entry: str, cwd: str) -> tuple[str, str]:
         raise ValueError("--entry must be in 'module::function', 'path::function', or 'module:function' format")
-def _generate_ts_mode_code_from_entry(entry: str, cwd: str) -> tuple[str, str, str, str]:
+def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]:
     target, func = _parse_entry(entry, cwd)
-    # Check if target looks like a file path
+    # Determine the file path to load
     if "/" in target or "\\" in target or os.path.exists(target):
-        # It's a file path - convert to absolute and load as module
+        # It's a file path - convert to absolute
         if not os.path.isabs(target):
             target = os.path.abspath(os.path.join(cwd, target))
         if not target.endswith(".py"):
             target = target + ".py"
         if not os.path.isfile(target):
             raise ValueError(f"File not found: {target}")
-        # Import module from file path
-        spec = importlib.util.spec_from_file_location(Path(target).stem, target)
-        if not spec or not spec.loader:
-            raise ValueError(f"Unable to load module from path: {target}")
-        module = importlib.util.module_from_spec(spec)
-        sys.modules[spec.name] = module
-        spec.loader.exec_module(module)  # type: ignore[attr-defined]
-        module_name = spec.name
         source_file_path = target
     else:
-        # Treat as module path (e.g., "my_package.my_module")
-        module_name = target
-        module = importlib.import_module(module_name)
-        source_file_path = getattr(module, "__file__", "") or ""
+        # Treat dotted name as a file path
+        dotted_as_path = target.replace(".", "/") + ".py"
+        source_file_path = os.path.join(cwd, dotted_as_path)
+    # Load the module from the file path
+    spec = importlib.util.spec_from_file_location(Path(source_file_path).stem, source_file_path)
+    if not spec or not spec.loader:
+        raise ValueError(f"Unable to load module from path: {source_file_path}")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[spec.name] = module
+    spec.loader.exec_module(module)  # type: ignore[attr-defined]
+    module_name = spec.name
     if not hasattr(module, func):
         raise ValueError(f"Function '{func}' not found in module '{module_name}'")
     qualname = f"{module_name}.{func}"
-    code, file_name = _generate_ts_mode_code(
-        DiscoveredTest(
-            module_path=module_name,
-            module_name=module_name,
-            qualname=qualname,
-            file_path=getattr(module, "__file__", module_name),
-            lineno=None,
-            has_parametrize=False,
-            param_count=0,
-            nodeids=[],
-        )
-    )
-    return code, file_name, qualname, os.path.abspath(source_file_path) if source_file_path else ""
+    return qualname, os.path.abspath(source_file_path) if source_file_path else ""
 def _generate_ts_mode_code(test: DiscoveredTest) -> tuple[str, str]:
-    # Generate a minimal main.py that imports the test module and calls the function
-    module = test.module_name
-    func = test.qualname.split(".")[-1]
-    code = f"""
-from typing import Any, Dict, List, Optional, Union
-from eval_protocol.models import EvaluationRow, Message
-from {module} import {func} as _ep_test
-def evaluate(messages: List[Dict[str, Any]], ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs):
-    row = EvaluationRow(messages=[Message(**m) for m in messages], ground_truth=ground_truth)
-    result = _ep_test(row)  # Supports sync/async via decorator's dual-mode
-    if hasattr(result, "__await__"):
-        import asyncio
-        result = asyncio.get_event_loop().run_until_complete(result)
-    if result.evaluation_result is None:
-        return {{"score": 0.0, "reason": "No evaluation_result set"}}
-    out = {{
-        "score": float(result.evaluation_result.score or 0.0),
-        "reason": result.evaluation_result.reason,
-        "metrics": {{k: (v.model_dump() if hasattr(v, "model_dump") else v) for k, v in (result.evaluation_result.metrics or {{}}).items()}},
-    }}
-    return out
-"""
-    return (code, "main.py")
+    # Deprecated: we no longer generate a shim; keep stub for import compatibility
+    return ("", "main.py")
 def _normalize_evaluator_id(evaluator_id: str) -> str:
@@ -522,10 +489,10 @@ def upload_command(args: argparse.Namespace) -> int:
     entries_arg = getattr(args, "entry", None)
     if entries_arg:
         entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
-        selected_specs: list[tuple[str, str, str, str]] = []
+        selected_specs: list[tuple[str, str]] = []
         for e in entries:
-            code, file_name, qualname, resolved_path = _generate_ts_mode_code_from_entry(e, root)
-            selected_specs.append((code, file_name, qualname, resolved_path))
+            qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
+            selected_specs.append((qualname, resolved_path))
     else:
         print("Scanning for evaluation tests...")
         tests = _discover_tests(root)
@@ -545,11 +512,7 @@ def upload_command(args: argparse.Namespace) -> int:
             print("      handles all parameter combinations. The evaluator will work with")
             print("      the same logic regardless of which model/parameters are used.")
-        selected_specs = []
-        for t in selected_tests:
-            code, file_name = _generate_ts_mode_code(t)
-            # Store test info for better ID generation
-            selected_specs.append((code, file_name, t.qualname, t.file_path))
+        selected_specs = [(t.qualname, t.file_path) for t in selected_tests]
     base_id = getattr(args, "id", None)
     display_name = getattr(args, "display_name", None)
@@ -560,6 +523,14 @@ def upload_command(args: argparse.Namespace) -> int:
     try:
         fw_account_id = get_fireworks_account_id()
         fw_api_key_value = get_fireworks_api_key()
+        if not fw_account_id and fw_api_key_value:
+            # Attempt to verify and resolve account id from server headers
+            resolved = verify_api_key_and_get_account_id(api_key=fw_api_key_value, api_base=get_fireworks_api_base())
+            if resolved:
+                fw_account_id = resolved
+                # Propagate to environment so downstream calls use it if needed
+                os.environ["FIREWORKS_ACCOUNT_ID"] = fw_account_id
+                print(f"Resolved FIREWORKS_ACCOUNT_ID via API verification: {fw_account_id}")
         if fw_account_id and fw_api_key_value:
             print("Ensuring FIREWORKS_API_KEY is registered as a secret on Fireworks for rollout...")
             if create_or_update_fireworks_secret(
@@ -579,8 +550,7 @@ def upload_command(args: argparse.Namespace) -> int:
         print(f"Warning: Skipped Fireworks secret registration due to error: {e}")
     exit_code = 0
-    for i, (code, file_name, qualname, source_file_path) in enumerate(selected_specs):
-        # Use ts_mode to upload evaluator
+    for i, (qualname, source_file_path) in enumerate(selected_specs):
         # Generate a short default ID from just the test function name
         if base_id:
             evaluator_id = base_id
@@ -618,12 +588,11 @@ def upload_command(args: argparse.Namespace) -> int:
         print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
         try:
+            test_dir = root
+            metric_name = os.path.basename(test_dir) or "metric"
             result = create_evaluation(
                 evaluator_id=evaluator_id,
-                python_code_to_evaluate=code,
-                python_file_name_for_code=file_name,
-                criterion_name_for_code=qualname,
-                criterion_description_for_code=description or f"Evaluator for {qualname}",
+                metric_folders=[f"{metric_name}={test_dir}"],
                 display_name=display_name or evaluator_id,
                 description=description or f"Evaluator for {qualname}",
                 force=force,

eval-protocol 0.2.55.dev1__tar.gz → 0.2.57.dev2__tar.gz

eval-protocol 0.2.55.dev1__tar.gz → 0.2.57.dev2__tar.gz