PyPI - eval-protocol - Versions diffs - 0.3.9.dev1__tar.gz → 0.3.10.dev2__tar.gz - Mend

eval-protocol 0.3.9.dev1__tar.gz → 0.3.10.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (473) hide show

{eval_protocol-0.3.9.dev1/eval_protocol.egg-info → eval_protocol-0.3.10.dev2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.3.9.dev1
+Version: 0.3.10.dev2
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT
@@ -29,7 +29,7 @@ Requires-Dist: pytest>=6.0.0
 Requires-Dist: pytest-asyncio>=0.21.0
 Requires-Dist: peewee>=3.18.2
 Requires-Dist: backoff>=2.2.0
-Requires-Dist: fireworks-ai==1.0.0a20
+Requires-Dist: fireworks-ai==1.0.0a22
 Requires-Dist: questionary>=2.0.0
 Requires-Dist: toml>=0.10.0
 Requires-Dist: loguru>=0.6.0

{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.10.dev2}/eval_protocol/_version.py RENAMED Viewed

@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
- "date": "2026-01-08T13:29:17-0800",
+ "date": "2026-01-13T16:25:00-0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "764ac4f132c35fe01c354b4150cbc19c7eedea12",
- "version": "0.3.9.dev.1"
+ "full-revisionid": "66f191a09db5364b9cd9bb21230e1f48e50be724",
+ "version": "0.3.10.dev.2"
 }
 '''  # END VERSION_JSON

{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.10.dev2}/eval_protocol/adapters/fireworks_tracing.py RENAMED Viewed

@@ -253,6 +253,7 @@ class FireworksTracingAdapter(BaseAdapter):
         project_id: Optional[str] = None,
         base_url: str = "https://tracing.fireworks.ai",
         timeout: int = 300,
+        api_key: Optional[str] = None,
     ):
         """Initialize the Fireworks Tracing adapter.
@@ -260,10 +261,16 @@ class FireworksTracingAdapter(BaseAdapter):
             project_id: Optional project ID. If not provided, uses the default project configured on the server.
             base_url: The base URL of the tracing proxy (default: https://tracing.fireworks.ai)
             timeout: Request timeout in seconds (default: 300)
+            api_key: Optional API key. If not provided, falls back to FIREWORKS_API_KEY environment variable.
         """
         self.project_id = project_id
         self.base_url = base_url.rstrip("/")
         self.timeout = timeout
+        self._api_key = api_key
+    def _get_api_key(self) -> Optional[str]:
+        """Get the API key, preferring instance-level key over environment variable."""
+        return self._api_key or os.environ.get("FIREWORKS_API_KEY")
     def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
         """Fetch logs from Fireworks tracing gateway /logs endpoint.
@@ -276,7 +283,7 @@ class FireworksTracingAdapter(BaseAdapter):
         from ..common_utils import get_user_agent
         headers = {
-            "Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}",
+            "Authorization": f"Bearer {self._get_api_key()}",
             "User-Agent": get_user_agent(),
         }
         params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
@@ -407,7 +414,7 @@ class FireworksTracingAdapter(BaseAdapter):
         from ..common_utils import get_user_agent
         headers = {
-            "Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}",
+            "Authorization": f"Bearer {self._get_api_key()}",
             "User-Agent": get_user_agent(),
         }

{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.10.dev2}/eval_protocol/auth.py RENAMED Viewed

@@ -1,12 +1,75 @@
 import logging
 import os
-from typing import Optional
+from typing import Dict, Optional
 import requests
+from dotenv import dotenv_values, find_dotenv, load_dotenv
 logger = logging.getLogger(__name__)
+def find_dotenv_path(search_path: Optional[str] = None) -> Optional[str]:
+    """
+    Find the .env file path, searching .env.dev first, then .env.
+    Args:
+        search_path: Directory to search from. If None, uses current working directory.
+    Returns:
+        Path to the .env file if found, otherwise None.
+    """
+    # If a specific search path is provided, look there first
+    if search_path:
+        env_dev_path = os.path.join(search_path, ".env.dev")
+        if os.path.isfile(env_dev_path):
+            return env_dev_path
+        env_path = os.path.join(search_path, ".env")
+        if os.path.isfile(env_path):
+            return env_path
+        return None
+    # Otherwise use find_dotenv to search up the directory tree
+    env_dev_path = find_dotenv(filename=".env.dev", raise_error_if_not_found=False, usecwd=True)
+    if env_dev_path:
+        return env_dev_path
+    env_path = find_dotenv(filename=".env", raise_error_if_not_found=False, usecwd=True)
+    if env_path:
+        return env_path
+    return None
+def get_dotenv_values(search_path: Optional[str] = None) -> Dict[str, Optional[str]]:
+    """
+    Get all key-value pairs from the .env file.
+    Args:
+        search_path: Directory to search from. If None, uses current working directory.
+    Returns:
+        Dictionary of environment variable names to values.
+    """
+    dotenv_path = find_dotenv_path(search_path)
+    if dotenv_path:
+        return dotenv_values(dotenv_path)
+    return {}
+# --- Load .env files ---
+# Attempt to load .env.dev first, then .env as a fallback.
+# This happens when the module is imported.
+# We use override=False (default) so that existing environment variables
+# (e.g., set in the shell) are NOT overridden by .env files.
+_DOTENV_PATH = find_dotenv_path()
+if _DOTENV_PATH:
+    load_dotenv(dotenv_path=_DOTENV_PATH, override=False)
+    logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_DOTENV_PATH}")
+else:
+    logger.debug(
+        "eval_protocol.auth: No .env.dev or .env file found. Relying on shell/existing environment variables."
+    )
+# --- End .env loading ---
 def get_fireworks_api_key() -> Optional[str]:
     """
     Retrieves the Fireworks API key.
@@ -73,6 +136,8 @@ def verify_api_key_and_get_account_id(
     Args:
         api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
         api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
+            If api_base is api.fireworks.ai, it is used directly. Otherwise, defaults to
+            dev.api.fireworks.ai for the verification call.
     Returns:
         The resolved account id if verification succeeds and the header is present; otherwise None.
@@ -81,7 +146,12 @@ def verify_api_key_and_get_account_id(
         resolved_key = api_key or get_fireworks_api_key()
         if not resolved_key:
             return None
-        resolved_base = api_base or get_fireworks_api_base()
+        provided_base = api_base or get_fireworks_api_base()
+        # Use api.fireworks.ai if explicitly provided, otherwise fall back to dev
+        if "api.fireworks.ai" in provided_base:
+            resolved_base = provided_base
+        else:
+            resolved_base = "https://dev.api.fireworks.ai"
         from .common_utils import get_user_agent

{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.10.dev2}/eval_protocol/cli.py RENAMED Viewed

@@ -81,13 +81,12 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         "--env-file",
         help="Path to .env file containing secrets to upload (default: .env in current directory)",
     )
-    upload_parser.add_argument(
-        "--force",
-        action="store_true",
-        help="Overwrite existing evaluator with the same ID",
-    )
     # Auto-generate flags from SDK Fireworks().evaluators.create() signature
+    # Note: We use Fireworks() directly here instead of create_fireworks_client()
+    # because we only need the method signature for introspection, not a fully
+    # authenticated client. create_fireworks_client() would trigger an HTTP request
+    # to verify the API key, causing delays even for --help invocations.
     create_evaluator_fn = Fireworks().evaluators.create
     upload_skip_fields = {
@@ -137,7 +136,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
     rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
     rft_parser.add_argument("--dry-run", action="store_true", help="Print planned SDK call without sending")
-    rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
     rft_parser.add_argument("--skip-validation", action="store_true", help="Skip local dataset/evaluator validation")
     rft_parser.add_argument(
         "--ignore-docker",
@@ -198,6 +196,10 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         "loss_config.method": "RL loss method for underlying trainers. One of {grpo,dapo}.",
     }
+    # Note: We use Fireworks() directly here instead of create_fireworks_client()
+    # because we only need the method signature for introspection, not a fully
+    # authenticated client. create_fireworks_client() would trigger an HTTP request
+    # to verify the API key, causing delays even for --help invocations.
     create_rft_job_fn = Fireworks().reinforcement_fine_tuning_jobs.create
     add_args_from_callable_signature(

{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.10.dev2}/eval_protocol/cli_commands/create_rft.py RENAMED Viewed

@@ -7,19 +7,18 @@ import sys
 import time
 from typing import Any, Callable, Dict, Optional
 import inspect
-import requests
 import tempfile
 from pydantic import ValidationError
 from ..auth import get_fireworks_api_base, get_fireworks_api_key
-from ..common_utils import get_user_agent, load_jsonl
+from ..fireworks_client import create_fireworks_client
+from ..common_utils import load_jsonl
 from ..fireworks_rft import (
     create_dataset_from_jsonl,
     detect_dataset_builder,
     materialize_dataset_via_builder,
 )
 from ..models import EvaluationRow
-from .upload import upload_command
 from .utils import (
     _build_entry_point,
     _build_trimmed_dataset_id,
@@ -35,8 +34,6 @@ from .utils import (
 )
 from .local_test import run_evaluator_test
-from fireworks import Fireworks
 def _extract_dataset_adapter(
     test_file_path: str, test_func_name: str
@@ -223,64 +220,68 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
         return None
-def _poll_evaluator_status(
-    evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
+def _poll_evaluator_version_status(
+    evaluator_id: str,
+    version_id: str,
+    api_key: str,
+    api_base: str,
+    timeout_minutes: int = 10,
 ) -> bool:
     """
-    Poll evaluator status until it becomes ACTIVE or times out.
+    Poll a specific evaluator version status until it becomes ACTIVE or times out.
+    Uses the Fireworks SDK to get the specified version of the evaluator and checks
+    its build state.
     Args:
-        evaluator_resource_name: Full evaluator resource name (e.g., accounts/xxx/evaluators/yyy)
+        evaluator_id: The evaluator ID (not full resource name)
+        version_id: The specific version ID to poll
         api_key: Fireworks API key
         api_base: Fireworks API base URL
         timeout_minutes: Maximum time to wait in minutes
     Returns:
-        True if evaluator becomes ACTIVE, False if timeout or BUILD_FAILED
+        True if evaluator version becomes ACTIVE, False if timeout or BUILD_FAILED
     """
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-        "User-Agent": get_user_agent(),
-    }
-    check_url = f"{api_base}/v1/{evaluator_resource_name}"
     timeout_seconds = timeout_minutes * 60
     poll_interval = 10  # seconds
     start_time = time.time()
-    print(f"Polling evaluator status (timeout: {timeout_minutes}m, interval: {poll_interval}s)...")
+    print(
+        f"Polling evaluator version '{version_id}' status (timeout: {timeout_minutes}m, interval: {poll_interval}s)..."
+    )
+    client = create_fireworks_client(api_key=api_key, base_url=api_base)
     while time.time() - start_time < timeout_seconds:
         try:
-            response = requests.get(check_url, headers=headers, timeout=30)
-            response.raise_for_status()
-            evaluator_data = response.json()
-            state = evaluator_data.get("state", "STATE_UNSPECIFIED")
-            status = evaluator_data.get("status", "")
+            version = client.evaluator_versions.get(version_id, evaluator_id=evaluator_id)
+            state = version.state or "STATE_UNSPECIFIED"
+            status_msg = ""
+            if version.status and version.status.message:
+                status_msg = version.status.message
             if state == "ACTIVE":
-                print("✅ Evaluator is ACTIVE and ready!")
+                print("✅ Evaluator version is ACTIVE and ready!")
                 return True
             elif state == "BUILD_FAILED":
-                print(f"❌ Evaluator build failed. Status: {status}")
+                print(f"❌ Evaluator version build failed. Status: {status_msg}")
                 return False
             elif state == "BUILDING":
                 elapsed_minutes = (time.time() - start_time) / 60
-                print(f"⏳ Evaluator is still building... ({elapsed_minutes:.1f}m elapsed)")
+                print(f"⏳ Evaluator version is still building... ({elapsed_minutes:.1f}m elapsed)")
             else:
-                print(f"⏳ Evaluator state: {state}, status: {status}")
+                print(f"⏳ Evaluator version state: {state}, status: {status_msg}")
-        except requests.exceptions.RequestException as e:
-            print(f"Warning: Failed to check evaluator status: {e}")
+        except Exception as e:
+            print(f"Warning: Failed to check evaluator version status: {e}")
         # Wait before next poll
         time.sleep(poll_interval)
     # Timeout reached
     elapsed_minutes = (time.time() - start_time) / 60
-    print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator is not yet ACTIVE")
+    print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator version is not yet ACTIVE")
     return False
@@ -565,42 +566,16 @@ def _upload_dataset(
 def _upload_and_ensure_evaluator(
     project_root: str,
     evaluator_id: str,
-    evaluator_resource_name: str,
     api_key: str,
     api_base: str,
-    force: bool,
 ) -> bool:
-    """Ensure the evaluator exists and is ACTIVE, uploading it if needed."""
-    # Optional short-circuit: if evaluator already exists and not forcing, skip upload path
-    if not force:
-        try:
-            headers = {
-                "Authorization": f"Bearer {api_key}",
-                "Content-Type": "application/json",
-                "User-Agent": get_user_agent(),
-            }
-            resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
-            if resp.ok:
-                state = resp.json().get("state", "STATE_UNSPECIFIED")
-                print(f"✓ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).")
-                # Poll for ACTIVE before proceeding
-                print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
-                if not _poll_evaluator_status(
-                    evaluator_resource_name=evaluator_resource_name,
-                    api_key=api_key,
-                    api_base=api_base,
-                    timeout_minutes=10,
-                ):
-                    dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
-                    print("\n❌ Evaluator is not ready within the timeout period.")
-                    print(f"📊 Please check the evaluator status at: {dashboard_url}")
-                    print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
-                    return False
-                return True
-        except requests.exceptions.RequestException:
-            pass
+    """Upload evaluator and ensure its version becomes ACTIVE.
+    Creates/updates the evaluator and uploads the code, then polls the specific
+    version until it becomes ACTIVE.
+    """
+    from eval_protocol.evaluation import create_evaluation
-    # Ensure evaluator exists by invoking the upload flow programmatically
     try:
         tests = _discover_tests(project_root)
         selected_entry: Optional[str] = None
@@ -617,43 +592,37 @@ def _upload_and_ensure_evaluator(
             )
             return False
-        upload_args = argparse.Namespace(
-            path=project_root,
-            entry=selected_entry,
-            id=evaluator_id,
-            display_name=None,
-            description=None,
-            force=force,  # Pass through the --force flag
-            yes=True,
-            env_file=None,  # Add the new env_file parameter
+        print(f"\nUploading evaluator '{evaluator_id}'...")
+        result, version_id = create_evaluation(
+            evaluator_id=evaluator_id,
+            display_name=evaluator_id,
+            description=f"Evaluator for {evaluator_id}",
+            entry_point=selected_entry,
         )
-        if force:
-            print(f"🔄 Force flag enabled - will overwrite existing evaluator '{evaluator_id}'")
+        if not version_id:
+            print("Warning: Evaluator created but version upload failed.")
+            return False
-        rc = upload_command(upload_args)
-        if rc == 0:
-            print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
+        print(f"✓ Uploaded evaluator: {evaluator_id} (version: {version_id})")
-            # Poll for evaluator status
-            print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
-            is_active = _poll_evaluator_status(
-                evaluator_resource_name=evaluator_resource_name,
-                api_key=api_key,
-                api_base=api_base,
-                timeout_minutes=10,
-            )
+        # Poll for the specific evaluator version status
+        print(f"Waiting for evaluator '{evaluator_id}' version '{version_id}' to become ACTIVE...")
+        is_active = _poll_evaluator_version_status(
+            evaluator_id=evaluator_id,
+            version_id=version_id,
+            api_key=api_key,
+            api_base=api_base,
+            timeout_minutes=10,
+        )
-            if not is_active:
-                dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
-                print("\n❌ Evaluator is not ready within the timeout period.")
-                print(f"📊 Please check the evaluator status at: {dashboard_url}")
-                print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
-                return False
-            return True
-        else:
-            print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
+        if not is_active:
+            dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
+            print("\n❌ Evaluator version is not ready within the timeout period.")
+            print(f"📊 Please check the evaluator status at: {dashboard_url}")
+            print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
             return False
+        return True
     except Exception as e:
         print(f"Warning: Failed to upload evaluator automatically: {e}")
         return False
@@ -672,7 +641,7 @@ def _create_rft_job(
 ) -> int:
     """Build and submit the RFT job request (via Fireworks SDK)."""
-    signature = inspect.signature(Fireworks().reinforcement_fine_tuning_jobs.create)
+    signature = inspect.signature(create_fireworks_client().reinforcement_fine_tuning_jobs.create)
     # Build top-level SDK kwargs
     sdk_kwargs: Dict[str, Any] = {
@@ -711,7 +680,7 @@ def _create_rft_job(
         return 0
     try:
-        fw: Fireworks = Fireworks(api_key=api_key, base_url=api_base)
+        fw: Fireworks = create_fireworks_client(api_key=api_key, base_url=api_base)
         job: ReinforcementFineTuningJob = fw.reinforcement_fine_tuning_jobs.create(account_id=account_id, **sdk_kwargs)
         job_name = job.name
         print(f"\n✅ Created Reinforcement Fine-tuning Job: {job_name}")
@@ -739,7 +708,6 @@ def create_rft_command(args) -> int:
     evaluator_arg: Optional[str] = getattr(args, "evaluator", None)
     non_interactive: bool = bool(getattr(args, "yes", False))
     dry_run: bool = bool(getattr(args, "dry_run", False))
-    force: bool = bool(getattr(args, "force", False))
     skip_validation: bool = bool(getattr(args, "skip_validation", False))
     ignore_docker: bool = bool(getattr(args, "ignore_docker", False))
     docker_build_extra: str = getattr(args, "docker_build_extra", "") or ""
@@ -810,14 +778,12 @@ def create_rft_command(args) -> int:
     if not dataset_id or not dataset_resource:
         return 1
-    # 5) Ensure evaluator exists and is ACTIVE (upload + poll if needed)
+    # 5) Ensure evaluator exists and its latest version is ACTIVE (upload + poll if needed)
     if not _upload_and_ensure_evaluator(
         project_root=project_root,
         evaluator_id=evaluator_id,
-        evaluator_resource_name=evaluator_resource_name,
         api_key=api_key,
         api_base=api_base,
-        force=force,
     ):
         return 1

{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.10.dev2}/eval_protocol/cli_commands/local_test.py RENAMED Viewed

@@ -5,6 +5,7 @@ import subprocess
 import sys
 from typing import List
+from ..auth import get_dotenv_values
 from .utils import _build_entry_point, _discover_and_select_tests
@@ -71,6 +72,12 @@ def _run_pytest_in_docker(
         workdir,
     ]
+    # Forward environment variables from .env file to the container
+    dotenv_vars = get_dotenv_values(project_root)
+    for key, value in dotenv_vars.items():
+        if value is not None:
+            cmd += ["-e", f"{key}={value}"]
     # If EP_SUMMARY_JSON is set on the host, mirror it into the container so that
     # pytest evaluation tests can write summary artifacts that are visible to the
     # host. We map paths under the host logs directory (~/.eval_protocol) into the

{eval_protocol-0.3.9.dev1 → eval_protocol-0.3.10.dev2}/eval_protocol/cli_commands/upload.py RENAMED Viewed

@@ -289,7 +289,6 @@ def upload_command(args: argparse.Namespace) -> int:
     base_id = getattr(args, "id", None)
     display_name = getattr(args, "display_name", None)
     description = getattr(args, "description", None)
-    force = bool(getattr(args, "force", False))
     env_file = getattr(args, "env_file", None)
     # Load secrets from .env file and ensure they're available on Fireworks
@@ -378,17 +377,18 @@ def upload_command(args: argparse.Namespace) -> int:
         print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
         try:
-            result = create_evaluation(
+            result, version_id = create_evaluation(
                 evaluator_id=evaluator_id,
                 display_name=display_name or evaluator_id,
                 description=description or f"Evaluator for {qualname}",
-                force=force,
                 entry_point=entry_point,
             )
             name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
             # Print success message with Fireworks dashboard link
             print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
+            if version_id:
+                print(f"   Version: {version_id}")
             print("📊 View in Fireworks Dashboard:")
             dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
             print(f"   {dashboard_url}\n")

eval-protocol 0.3.9.dev1__tar.gz → 0.3.10.dev2__tar.gz

eval-protocol 0.3.9.dev1__tar.gz → 0.3.10.dev2__tar.gz