PyPI - eval-protocol - Versions diffs - 0.3.9.dev3__tar.gz → 0.3.10.dev1__tar.gz - Mend

eval-protocol 0.3.9.dev3__tar.gz → 0.3.10.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (470) hide show

{eval_protocol-0.3.9.dev3/eval_protocol.egg-info → eval_protocol-0.3.10.dev1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eval-protocol
-Version: 0.3.9.dev3
+Version: 0.3.10.dev1
 Summary: The official Python SDK for Eval Protocol (EP.) EP is an open protocol that standardizes how developers author evals for large language model (LLM) applications.
 Author-email: Fireworks AI <info@fireworks.ai>
 License-Expression: MIT
@@ -29,7 +29,7 @@ Requires-Dist: pytest>=6.0.0
 Requires-Dist: pytest-asyncio>=0.21.0
 Requires-Dist: peewee>=3.18.2
 Requires-Dist: backoff>=2.2.0
-Requires-Dist: fireworks-ai==1.0.0a20
+Requires-Dist: fireworks-ai==1.0.0a22
 Requires-Dist: questionary>=2.0.0
 Requires-Dist: toml>=0.10.0
 Requires-Dist: loguru>=0.6.0

{eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/_version.py RENAMED Viewed

@@ -8,11 +8,11 @@ import json
 version_json = '''
 {
- "date": "2026-01-08T14:08:17-0800",
+ "date": "2026-01-13T15:54:22-0800",
  "dirty": false,
  "error": null,
- "full-revisionid": "74e35d4e2e53433124d13671c12a4677078a8b0a",
- "version": "0.3.9.dev.3"
+ "full-revisionid": "3314becfcdf35f771c41988a24f38dcb91593203",
+ "version": "0.3.10.dev.1"
 }
 '''  # END VERSION_JSON

{eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/adapters/fireworks_tracing.py RENAMED Viewed

@@ -253,6 +253,7 @@ class FireworksTracingAdapter(BaseAdapter):
         project_id: Optional[str] = None,
         base_url: str = "https://tracing.fireworks.ai",
         timeout: int = 300,
+        api_key: Optional[str] = None,
     ):
         """Initialize the Fireworks Tracing adapter.
@@ -260,10 +261,16 @@ class FireworksTracingAdapter(BaseAdapter):
             project_id: Optional project ID. If not provided, uses the default project configured on the server.
             base_url: The base URL of the tracing proxy (default: https://tracing.fireworks.ai)
             timeout: Request timeout in seconds (default: 300)
+            api_key: Optional API key. If not provided, falls back to FIREWORKS_API_KEY environment variable.
         """
         self.project_id = project_id
         self.base_url = base_url.rstrip("/")
         self.timeout = timeout
+        self._api_key = api_key
+    def _get_api_key(self) -> Optional[str]:
+        """Get the API key, preferring instance-level key over environment variable."""
+        return self._api_key or os.environ.get("FIREWORKS_API_KEY")
     def search_logs(self, tags: List[str], limit: int = 100, hours_back: int = 24) -> List[Dict[str, Any]]:
         """Fetch logs from Fireworks tracing gateway /logs endpoint.
@@ -276,7 +283,7 @@ class FireworksTracingAdapter(BaseAdapter):
         from ..common_utils import get_user_agent
         headers = {
-            "Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}",
+            "Authorization": f"Bearer {self._get_api_key()}",
             "User-Agent": get_user_agent(),
         }
         params: Dict[str, Any] = {"tags": tags, "limit": limit, "hours_back": hours_back, "program": "eval_protocol"}
@@ -407,7 +414,7 @@ class FireworksTracingAdapter(BaseAdapter):
         from ..common_utils import get_user_agent
         headers = {
-            "Authorization": f"Bearer {os.environ.get('FIREWORKS_API_KEY')}",
+            "Authorization": f"Bearer {self._get_api_key()}",
             "User-Agent": get_user_agent(),
         }

{eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/auth.py RENAMED Viewed

@@ -3,9 +3,30 @@ import os
 from typing import Optional
 import requests
+from dotenv import find_dotenv, load_dotenv
 logger = logging.getLogger(__name__)
+# --- Load .env files ---
+# Attempt to load .env.dev first, then .env as a fallback.
+# This happens when the module is imported.
+# We use override=False (default) so that existing environment variables
+# (e.g., set in the shell) are NOT overridden by .env files.
+_ENV_DEV_PATH = find_dotenv(filename=".env.dev", raise_error_if_not_found=False, usecwd=True)
+if _ENV_DEV_PATH:
+    load_dotenv(dotenv_path=_ENV_DEV_PATH, override=False)
+    logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_DEV_PATH}")
+else:
+    _ENV_PATH = find_dotenv(filename=".env", raise_error_if_not_found=False, usecwd=True)
+    if _ENV_PATH:
+        load_dotenv(dotenv_path=_ENV_PATH, override=False)
+        logger.debug(f"eval_protocol.auth: Loaded environment variables from: {_ENV_PATH}")
+    else:
+        logger.debug(
+            "eval_protocol.auth: No .env.dev or .env file found. Relying on shell/existing environment variables."
+        )
+# --- End .env loading ---
 def get_fireworks_api_key() -> Optional[str]:
     """
@@ -73,6 +94,8 @@ def verify_api_key_and_get_account_id(
     Args:
         api_key: Optional explicit API key. When None, resolves via get_fireworks_api_key().
         api_base: Optional explicit API base. When None, resolves via get_fireworks_api_base().
+            If api_base is api.fireworks.ai, it is used directly. Otherwise, defaults to
+            dev.api.fireworks.ai for the verification call.
     Returns:
         The resolved account id if verification succeeds and the header is present; otherwise None.
@@ -81,7 +104,12 @@ def verify_api_key_and_get_account_id(
         resolved_key = api_key or get_fireworks_api_key()
         if not resolved_key:
             return None
-        resolved_base = api_base or get_fireworks_api_base()
+        provided_base = api_base or get_fireworks_api_base()
+        # Use api.fireworks.ai if explicitly provided, otherwise fall back to dev
+        if "api.fireworks.ai" in provided_base:
+            resolved_base = provided_base
+        else:
+            resolved_base = "https://dev.api.fireworks.ai"
         from .common_utils import get_user_agent

{eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli.py RENAMED Viewed

@@ -81,13 +81,12 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         "--env-file",
         help="Path to .env file containing secrets to upload (default: .env in current directory)",
     )
-    upload_parser.add_argument(
-        "--force",
-        action="store_true",
-        help="Overwrite existing evaluator with the same ID",
-    )
     # Auto-generate flags from SDK Fireworks().evaluators.create() signature
+    # Note: We use Fireworks() directly here instead of create_fireworks_client()
+    # because we only need the method signature for introspection, not a fully
+    # authenticated client. create_fireworks_client() would trigger an HTTP request
+    # to verify the API key, causing delays even for --help invocations.
     create_evaluator_fn = Fireworks().evaluators.create
     upload_skip_fields = {
@@ -137,7 +136,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
     rft_parser.add_argument("--yes", "-y", action="store_true", help="Non-interactive mode")
     rft_parser.add_argument("--dry-run", action="store_true", help="Print planned SDK call without sending")
-    rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
     rft_parser.add_argument("--skip-validation", action="store_true", help="Skip local dataset/evaluator validation")
     rft_parser.add_argument(
         "--ignore-docker",
@@ -198,6 +196,10 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse
         "loss_config.method": "RL loss method for underlying trainers. One of {grpo,dapo}.",
     }
+    # Note: We use Fireworks() directly here instead of create_fireworks_client()
+    # because we only need the method signature for introspection, not a fully
+    # authenticated client. create_fireworks_client() would trigger an HTTP request
+    # to verify the API key, causing delays even for --help invocations.
     create_rft_job_fn = Fireworks().reinforcement_fine_tuning_jobs.create
     add_args_from_callable_signature(

{eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/create_rft.py RENAMED Viewed

@@ -7,19 +7,18 @@ import sys
 import time
 from typing import Any, Callable, Dict, Optional
 import inspect
-import requests
 import tempfile
 from pydantic import ValidationError
 from ..auth import get_fireworks_api_base, get_fireworks_api_key
-from ..common_utils import get_user_agent, load_jsonl
+from ..fireworks_client import create_fireworks_client
+from ..common_utils import load_jsonl
 from ..fireworks_rft import (
     create_dataset_from_jsonl,
     detect_dataset_builder,
     materialize_dataset_via_builder,
 )
 from ..models import EvaluationRow
-from .upload import upload_command
 from .utils import (
     _build_entry_point,
     _build_trimmed_dataset_id,
@@ -35,8 +34,6 @@ from .utils import (
 )
 from .local_test import run_evaluator_test
-from fireworks import Fireworks
 def _extract_dataset_adapter(
     test_file_path: str, test_func_name: str
@@ -223,64 +220,68 @@ def _extract_jsonl_from_input_dataset(test_file_path: str, test_func_name: str)
         return None
-def _poll_evaluator_status(
-    evaluator_resource_name: str, api_key: str, api_base: str, timeout_minutes: int = 10
+def _poll_evaluator_version_status(
+    evaluator_id: str,
+    version_id: str,
+    api_key: str,
+    api_base: str,
+    timeout_minutes: int = 10,
 ) -> bool:
     """
-    Poll evaluator status until it becomes ACTIVE or times out.
+    Poll a specific evaluator version status until it becomes ACTIVE or times out.
+    Uses the Fireworks SDK to get the specified version of the evaluator and checks
+    its build state.
     Args:
-        evaluator_resource_name: Full evaluator resource name (e.g., accounts/xxx/evaluators/yyy)
+        evaluator_id: The evaluator ID (not full resource name)
+        version_id: The specific version ID to poll
         api_key: Fireworks API key
         api_base: Fireworks API base URL
         timeout_minutes: Maximum time to wait in minutes
     Returns:
-        True if evaluator becomes ACTIVE, False if timeout or BUILD_FAILED
+        True if evaluator version becomes ACTIVE, False if timeout or BUILD_FAILED
     """
-    headers = {
-        "Authorization": f"Bearer {api_key}",
-        "Content-Type": "application/json",
-        "User-Agent": get_user_agent(),
-    }
-    check_url = f"{api_base}/v1/{evaluator_resource_name}"
     timeout_seconds = timeout_minutes * 60
     poll_interval = 10  # seconds
     start_time = time.time()
-    print(f"Polling evaluator status (timeout: {timeout_minutes}m, interval: {poll_interval}s)...")
+    print(
+        f"Polling evaluator version '{version_id}' status (timeout: {timeout_minutes}m, interval: {poll_interval}s)..."
+    )
+    client = create_fireworks_client(api_key=api_key, base_url=api_base)
     while time.time() - start_time < timeout_seconds:
         try:
-            response = requests.get(check_url, headers=headers, timeout=30)
-            response.raise_for_status()
-            evaluator_data = response.json()
-            state = evaluator_data.get("state", "STATE_UNSPECIFIED")
-            status = evaluator_data.get("status", "")
+            version = client.evaluator_versions.get(version_id, evaluator_id=evaluator_id)
+            state = version.state or "STATE_UNSPECIFIED"
+            status_msg = ""
+            if version.status and version.status.message:
+                status_msg = version.status.message
             if state == "ACTIVE":
-                print("✅ Evaluator is ACTIVE and ready!")
+                print("✅ Evaluator version is ACTIVE and ready!")
                 return True
             elif state == "BUILD_FAILED":
-                print(f"❌ Evaluator build failed. Status: {status}")
+                print(f"❌ Evaluator version build failed. Status: {status_msg}")
                 return False
             elif state == "BUILDING":
                 elapsed_minutes = (time.time() - start_time) / 60
-                print(f"⏳ Evaluator is still building... ({elapsed_minutes:.1f}m elapsed)")
+                print(f"⏳ Evaluator version is still building... ({elapsed_minutes:.1f}m elapsed)")
             else:
-                print(f"⏳ Evaluator state: {state}, status: {status}")
+                print(f"⏳ Evaluator version state: {state}, status: {status_msg}")
-        except requests.exceptions.RequestException as e:
-            print(f"Warning: Failed to check evaluator status: {e}")
+        except Exception as e:
+            print(f"Warning: Failed to check evaluator version status: {e}")
         # Wait before next poll
         time.sleep(poll_interval)
     # Timeout reached
     elapsed_minutes = (time.time() - start_time) / 60
-    print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator is not yet ACTIVE")
+    print(f"⏰ Timeout after {elapsed_minutes:.1f}m - evaluator version is not yet ACTIVE")
     return False
@@ -565,42 +566,16 @@ def _upload_dataset(
 def _upload_and_ensure_evaluator(
     project_root: str,
     evaluator_id: str,
-    evaluator_resource_name: str,
     api_key: str,
     api_base: str,
-    force: bool,
 ) -> bool:
-    """Ensure the evaluator exists and is ACTIVE, uploading it if needed."""
-    # Optional short-circuit: if evaluator already exists and not forcing, skip upload path
-    if not force:
-        try:
-            headers = {
-                "Authorization": f"Bearer {api_key}",
-                "Content-Type": "application/json",
-                "User-Agent": get_user_agent(),
-            }
-            resp = requests.get(f"{api_base}/v1/{evaluator_resource_name}", headers=headers, timeout=10)
-            if resp.ok:
-                state = resp.json().get("state", "STATE_UNSPECIFIED")
-                print(f"✓ Evaluator exists (state: {state}). Skipping upload (use --force to overwrite).")
-                # Poll for ACTIVE before proceeding
-                print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
-                if not _poll_evaluator_status(
-                    evaluator_resource_name=evaluator_resource_name,
-                    api_key=api_key,
-                    api_base=api_base,
-                    timeout_minutes=10,
-                ):
-                    dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
-                    print("\n❌ Evaluator is not ready within the timeout period.")
-                    print(f"📊 Please check the evaluator status at: {dashboard_url}")
-                    print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
-                    return False
-                return True
-        except requests.exceptions.RequestException:
-            pass
+    """Upload evaluator and ensure its version becomes ACTIVE.
+    Creates/updates the evaluator and uploads the code, then polls the specific
+    version until it becomes ACTIVE.
+    """
+    from eval_protocol.evaluation import create_evaluation
-    # Ensure evaluator exists by invoking the upload flow programmatically
     try:
         tests = _discover_tests(project_root)
         selected_entry: Optional[str] = None
@@ -617,43 +592,37 @@ def _upload_and_ensure_evaluator(
             )
             return False
-        upload_args = argparse.Namespace(
-            path=project_root,
-            entry=selected_entry,
-            id=evaluator_id,
-            display_name=None,
-            description=None,
-            force=force,  # Pass through the --force flag
-            yes=True,
-            env_file=None,  # Add the new env_file parameter
+        print(f"\nUploading evaluator '{evaluator_id}'...")
+        result, version_id = create_evaluation(
+            evaluator_id=evaluator_id,
+            display_name=evaluator_id,
+            description=f"Evaluator for {evaluator_id}",
+            entry_point=selected_entry,
         )
-        if force:
-            print(f"🔄 Force flag enabled - will overwrite existing evaluator '{evaluator_id}'")
+        if not version_id:
+            print("Warning: Evaluator created but version upload failed.")
+            return False
-        rc = upload_command(upload_args)
-        if rc == 0:
-            print(f"✓ Uploaded/ensured evaluator: {evaluator_id}")
+        print(f"✓ Uploaded evaluator: {evaluator_id} (version: {version_id})")
-            # Poll for evaluator status
-            print(f"Waiting for evaluator '{evaluator_id}' to become ACTIVE...")
-            is_active = _poll_evaluator_status(
-                evaluator_resource_name=evaluator_resource_name,
-                api_key=api_key,
-                api_base=api_base,
-                timeout_minutes=10,
-            )
+        # Poll for the specific evaluator version status
+        print(f"Waiting for evaluator '{evaluator_id}' version '{version_id}' to become ACTIVE...")
+        is_active = _poll_evaluator_version_status(
+            evaluator_id=evaluator_id,
+            version_id=version_id,
+            api_key=api_key,
+            api_base=api_base,
+            timeout_minutes=10,
+        )
-            if not is_active:
-                dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
-                print("\n❌ Evaluator is not ready within the timeout period.")
-                print(f"📊 Please check the evaluator status at: {dashboard_url}")
-                print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
-                return False
-            return True
-        else:
-            print("Warning: Evaluator upload did not complete successfully; proceeding to RFT creation.")
+        if not is_active:
+            dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
+            print("\n❌ Evaluator version is not ready within the timeout period.")
+            print(f"📊 Please check the evaluator status at: {dashboard_url}")
+            print("   Wait for it to become ACTIVE, then run 'eval-protocol create rft' again.")
             return False
+        return True
     except Exception as e:
         print(f"Warning: Failed to upload evaluator automatically: {e}")
         return False
@@ -672,7 +641,7 @@ def _create_rft_job(
 ) -> int:
     """Build and submit the RFT job request (via Fireworks SDK)."""
-    signature = inspect.signature(Fireworks().reinforcement_fine_tuning_jobs.create)
+    signature = inspect.signature(create_fireworks_client().reinforcement_fine_tuning_jobs.create)
     # Build top-level SDK kwargs
     sdk_kwargs: Dict[str, Any] = {
@@ -711,7 +680,7 @@ def _create_rft_job(
         return 0
     try:
-        fw: Fireworks = Fireworks(api_key=api_key, base_url=api_base)
+        fw: Fireworks = create_fireworks_client(api_key=api_key, base_url=api_base)
         job: ReinforcementFineTuningJob = fw.reinforcement_fine_tuning_jobs.create(account_id=account_id, **sdk_kwargs)
         job_name = job.name
         print(f"\n✅ Created Reinforcement Fine-tuning Job: {job_name}")
@@ -739,7 +708,6 @@ def create_rft_command(args) -> int:
     evaluator_arg: Optional[str] = getattr(args, "evaluator", None)
     non_interactive: bool = bool(getattr(args, "yes", False))
     dry_run: bool = bool(getattr(args, "dry_run", False))
-    force: bool = bool(getattr(args, "force", False))
     skip_validation: bool = bool(getattr(args, "skip_validation", False))
     ignore_docker: bool = bool(getattr(args, "ignore_docker", False))
     docker_build_extra: str = getattr(args, "docker_build_extra", "") or ""
@@ -810,14 +778,12 @@ def create_rft_command(args) -> int:
     if not dataset_id or not dataset_resource:
         return 1
-    # 5) Ensure evaluator exists and is ACTIVE (upload + poll if needed)
+    # 5) Ensure evaluator exists and its latest version is ACTIVE (upload + poll if needed)
     if not _upload_and_ensure_evaluator(
         project_root=project_root,
         evaluator_id=evaluator_id,
-        evaluator_resource_name=evaluator_resource_name,
         api_key=api_key,
         api_base=api_base,
-        force=force,
     ):
         return 1

{eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/cli_commands/upload.py RENAMED Viewed

@@ -289,7 +289,6 @@ def upload_command(args: argparse.Namespace) -> int:
     base_id = getattr(args, "id", None)
     display_name = getattr(args, "display_name", None)
     description = getattr(args, "description", None)
-    force = bool(getattr(args, "force", False))
     env_file = getattr(args, "env_file", None)
     # Load secrets from .env file and ensure they're available on Fireworks
@@ -378,17 +377,18 @@ def upload_command(args: argparse.Namespace) -> int:
         print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
         try:
-            result = create_evaluation(
+            result, version_id = create_evaluation(
                 evaluator_id=evaluator_id,
                 display_name=display_name or evaluator_id,
                 description=description or f"Evaluator for {qualname}",
-                force=force,
                 entry_point=entry_point,
             )
             name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id
             # Print success message with Fireworks dashboard link
             print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
+            if version_id:
+                print(f"   Version: {version_id}")
             print("📊 View in Fireworks Dashboard:")
             dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
             print(f"   {dashboard_url}\n")

{eval_protocol-0.3.9.dev3 → eval_protocol-0.3.10.dev1}/eval_protocol/evaluation.py RENAMED Viewed

@@ -4,14 +4,15 @@ import time
 from typing import List, Optional
 import fireworks
+from fireworks.types import EvaluatorVersionParam
 import requests
-from fireworks import Fireworks
 from eval_protocol.auth import (
     get_fireworks_account_id,
     get_fireworks_api_key,
     verify_api_key_and_get_account_id,
 )
+from eval_protocol.fireworks_client import create_fireworks_client
 from eval_protocol.get_pep440_version import get_pep440_version
 logger = logging.getLogger(__name__)
@@ -153,7 +154,7 @@ class Evaluator:
         logger.info(f"Created {output_path} ({size_bytes:,} bytes)")
         return size_bytes
-    def create(self, evaluator_id, display_name=None, description=None, force=False):
+    def create(self, evaluator_id, display_name=None, description=None):
         auth_token = self.api_key or get_fireworks_api_key()
         account_id = self.account_id or get_fireworks_account_id()
         if not account_id and auth_token:
@@ -163,7 +164,11 @@ class Evaluator:
             logger.error("Authentication error: API credentials appear to be invalid or incomplete.")
             raise ValueError("Invalid or missing API credentials.")
-        client = Fireworks(api_key=auth_token, base_url=self.api_base, account_id=account_id)
+        client = create_fireworks_client(
+            api_key=auth_token,
+            base_url=self.api_base,
+            account_id=account_id,
+        )
         self.display_name = display_name or evaluator_id
         self.description = description or f"Evaluator created from {evaluator_id}"
@@ -197,28 +202,20 @@ class Evaluator:
         logger.info(f"Creating evaluator '{evaluator_id}' for account '{account_id}'...")
         try:
-            if force:
-                try:
-                    logger.info("Checking if evaluator exists")
-                    existing_evaluator = client.evaluators.get(evaluator_id=evaluator_id)
-                    if existing_evaluator:
-                        logger.info(f"Evaluator '{evaluator_id}' already exists, deleting and recreating...")
-                        try:
-                            client.evaluators.delete(evaluator_id=evaluator_id)
-                            logger.info(f"Successfully deleted evaluator '{evaluator_id}'")
-                        except fireworks.NotFoundError:
-                            logger.info(f"Evaluator '{evaluator_id}' not found, creating...")
-                        except fireworks.APIError as e:
-                            logger.warning(f"Error deleting evaluator: {str(e)}")
-                except fireworks.NotFoundError:
-                    logger.info(f"Evaluator '{evaluator_id}' does not exist, creating...")
-            # Create evaluator using SDK
-            result = client.evaluators.create(
-                evaluator_id=evaluator_id,
-                evaluator=evaluator_params,
-            )
-            logger.info(f"Successfully created evaluator '{evaluator_id}'")
+            # Try to create evaluator using SDK
+            try:
+                result = client.evaluators.create(
+                    evaluator_id=evaluator_id,
+                    evaluator=evaluator_params,
+                )
+                logger.info(f"Successfully created evaluator '{evaluator_id}'")
+            except fireworks.APIStatusError as create_error:
+                if create_error.status_code == 409:
+                    # Evaluator already exists, get the existing one and proceed to create a new version
+                    logger.info(f"Evaluator '{evaluator_id}' already exists, creating new version...")
+                    result = client.evaluators.get(evaluator_id=evaluator_id)
+                else:
+                    raise
             # Upload code as tar.gz to GCS
             evaluator_name = result.name  # e.g., "accounts/pyroworks/evaluators/test-123"
@@ -229,6 +226,25 @@ class Evaluator:
                     f"Cannot proceed with code upload. Response: {result}"
                 )
+            evaluator_version_param: EvaluatorVersionParam = {}
+            if "commit_hash" in evaluator_params:
+                evaluator_version_param["commit_hash"] = evaluator_params["commit_hash"]
+            if "entry_point" in evaluator_params:
+                evaluator_version_param["entry_point"] = evaluator_params["entry_point"]
+            if "requirements" in evaluator_params:
+                evaluator_version_param["requirements"] = evaluator_params["requirements"]
+            evaluator_version = client.evaluator_versions.create(
+                evaluator_id=evaluator_id,
+                evaluator_version=evaluator_version_param,
+            )
+            evaluator_version_id = evaluator_version.name.split("/")[-1] if evaluator_version.name else None
+            if not evaluator_version_id:
+                raise ValueError(
+                    "Create evaluator version response missing 'name' field. "
+                    f"Cannot proceed with code upload. Response: {evaluator_version}"
+                )
             try:
                 # Create tar.gz of current directory
                 cwd = os.getcwd()
@@ -240,7 +256,8 @@ class Evaluator:
                 # Call GetEvaluatorUploadEndpoint using SDK
                 logger.info(f"Requesting upload endpoint for {tar_filename}")
-                upload_response = client.evaluators.get_upload_endpoint(
+                upload_response = client.evaluator_versions.get_upload_endpoint(
+                    version_id=evaluator_version_id,
                     evaluator_id=evaluator_id,
                     filename_to_size={tar_filename: str(tar_size)},
                 )
@@ -321,9 +338,9 @@ class Evaluator:
                             raise
                 # Step 3: Validate upload using SDK
-                client.evaluators.validate_upload(
+                client.evaluator_versions.validate_upload(
+                    version_id=evaluator_version_id,
                     evaluator_id=evaluator_id,
-                    body={},
                 )
                 logger.info("Upload validated successfully")
@@ -334,8 +351,10 @@ class Evaluator:
             except Exception as upload_error:
                 logger.warning(f"Code upload failed (evaluator created but code not uploaded): {upload_error}")
                 # Don't fail - evaluator is created, just code upload failed
+                # Return None for version_id since upload failed
+                return result, None
-            return result  # Return after attempting upload
+            return result, evaluator_version_id  # Return evaluator result and version ID
         except fireworks.APIStatusError as e:
             logger.error(f"Error creating evaluator: {str(e)}")
             logger.error(f"Status code: {e.status_code}, Response: {e.response.text}")
@@ -361,7 +380,6 @@ def create_evaluation(
     evaluator_id: str,
     display_name: Optional[str] = None,
     description: Optional[str] = None,
-    force: bool = False,
     account_id: Optional[str] = None,
     api_key: Optional[str] = None,
     entry_point: Optional[str] = None,
@@ -373,10 +391,13 @@ def create_evaluation(
         evaluator_id: Unique identifier for the evaluator
         display_name: Display name for the evaluator
         description: Description for the evaluator
-        force: If True, delete and recreate if evaluator exists
         account_id: Optional Fireworks account ID
         api_key: Optional Fireworks API key
         entry_point: Optional entry point (module::function or path::function)
+    Returns:
+        A tuple of (evaluator_result, version_id) where version_id is the ID of the
+        created evaluator version, or None if upload failed.
     """
     evaluator = Evaluator(
         account_id=account_id,
@@ -384,4 +405,4 @@ def create_evaluation(
         entry_point=entry_point,
     )
-    return evaluator.create(evaluator_id, display_name, description, force)
+    return evaluator.create(evaluator_id, display_name, description)

eval-protocol 0.3.9.dev3__tar.gz → 0.3.10.dev1__tar.gz

eval-protocol 0.3.9.dev3__tar.gz → 0.3.10.dev1__tar.gz