judgeval 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -5
- judgeval/api/__init__.py +17 -9
- judgeval/api/api_types.py +20 -18
- judgeval/data/evaluation_run.py +13 -12
- judgeval/data/judgment_types.py +25 -14
- judgeval/data/result.py +1 -0
- judgeval/data/scorer_data.py +1 -26
- judgeval/dataset/__init__.py +17 -16
- judgeval/env.py +11 -2
- judgeval/evaluation/__init__.py +20 -63
- judgeval/integrations/langgraph/__init__.py +2 -1
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/agent_scorer.py +15 -15
- judgeval/scorers/base_scorer.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -1
- judgeval/scorers/score.py +1 -1
- judgeval/tracer/__init__.py +6 -9
- judgeval/tracer/local_eval_queue.py +11 -7
- judgeval/trainer/config.py +1 -1
- judgeval/trainer/trainable_model.py +1 -1
- judgeval/trainer/trainer.py +8 -6
- judgeval/utils/async_utils.py +7 -3
- judgeval/utils/testing.py +0 -4
- judgeval/version.py +1 -1
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/METADATA +1 -1
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/RECORD +29 -30
- judgeval/data/tool.py +0 -5
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/WHEEL +0 -0
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED
@@ -5,8 +5,9 @@ from judgeval.evaluation import run_eval
  from judgeval.data.evaluation_run import ExampleEvaluationRun
- from typing import List, Optional, Union
- from judgeval.scorers import …
+ from typing import List, Optional, Union, Sequence
+ from judgeval.scorers import ExampleAPIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.data.example import Example
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +39,7 @@ class JudgmentClient(metaclass=SingletonMeta):
      def run_evaluation(
          self,
          examples: List[Example],
-         scorers: …
+         scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
          project_name: str = "default_project",
          eval_run_name: str = "default_eval_run",
          model: str = JUDGMENT_DEFAULT_GPT_MODEL,
@@ -51,10 +52,9 @@
              examples=examples,
              scorers=scorers,
              model=model,
-             organization_id=self.organization_id,
          )

-         results = run_eval(eval…
+         results = run_eval(eval)
          if assert_test:
              assert_test_results(results)
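With this change, `run_evaluation` accepts any `Sequence` mixing hosted `ExampleAPIScorerConfig` entries and local `ExampleScorer` subclasses, and `run_eval` no longer takes the API key as a separate argument. A rough sketch of a call against the new signature; the `FaithfulnessScorer()` construction and the `Example` fields are assumptions for illustration, not taken from this diff:

    from judgeval import JudgmentClient
    from judgeval.data.example import Example
    from judgeval.scorers import FaithfulnessScorer

    client = JudgmentClient()
    results = client.run_evaluation(
        examples=[Example(input="What is 2+2?", actual_output="4")],  # assumed Example fields
        scorers=[FaithfulnessScorer()],  # Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]]
        project_name="default_project",
        eval_run_name="default_eval_run",
    )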
judgeval/api/__init__.py
CHANGED
@@ -137,12 +137,13 @@ class JudgmentSyncClient:
              payload,
          )

-     def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> …
+     def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
          return self._request(
              "POST",
              url_for("/datasets/pull_all_for_judgeval/"),
              payload,
          )
+
      def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
          return self._request(
              "POST",
@@ -180,12 +181,12 @@
              payload,
          )

-     def …
-         self, payload: …
-     ) -> …
+     def fetch_scorers(
+         self, payload: FetchPromptScorersRequest
+     ) -> FetchPromptScorersResponse:
          return self._request(
              "POST",
-             url_for("/…
+             url_for("/fetch_scorers/"),
              payload,
          )

@@ -345,6 +346,13 @@ class JudgmentAsyncClient:
              payload,
          )

+     async def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
+         return await self._request(
+             "POST",
+             url_for("/datasets/pull_all_for_judgeval/"),
+             payload,
+         )
+
      async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
          return await self._request(
              "POST",
@@ -384,12 +392,12 @@
              payload,
          )

-     async def …
-         self, payload: …
-     ) -> …
+     async def fetch_scorers(
+         self, payload: FetchPromptScorersRequest
+     ) -> FetchPromptScorersResponse:
          return await self._request(
              "POST",
-             url_for("/…
+             url_for("/fetch_scorers/"),
              payload,
          )
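A short sketch of the new and renamed sync-client endpoints shown above; the credential strings and the scorer name are placeholders:

    from judgeval.api import JudgmentSyncClient

    client = JudgmentSyncClient("<api-key>", "<org-id>")  # placeholder credentials
    # Added in this release: pull every dataset for a project.
    datasets = client.datasets_pull_all_for_judgeval({"project_name": "default_project"})
    # Renamed endpoint: fetch prompt scorer configs by name.
    scorers = client.fetch_scorers({"names": ["my_prompt_scorer"]})["scorers"]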
judgeval/api/api_types.py
CHANGED
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: .openapi.json
- # timestamp: 2025-09-…
+ # timestamp: 2025-09-12T16:54:35+00:00

  from __future__ import annotations
  from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -19,6 +19,7 @@ class DatasetFetch(TypedDict):
      dataset_name: str
      project_name: str

+
  class DatasetsFetch(TypedDict):
      project_name: str

@@ -60,8 +61,8 @@ class SavePromptScorerResponse(TypedDict):
      name: str


- class …
- …
+ class FetchPromptScorersRequest(TypedDict):
+     names: NotRequired[Optional[List[str]]]


  class CustomScorerUploadPayload(TypedDict):
@@ -154,7 +155,7 @@ class ScorerData(TypedDict):
      score: NotRequired[Optional[float]]
      reason: NotRequired[Optional[str]]
      strict_mode: NotRequired[Optional[bool]]
-     evaluation_model: NotRequired[str]
+     evaluation_model: NotRequired[Optional[str]]
      error: NotRequired[Optional[str]]
      additional_metadata: NotRequired[Optional[Dict[str, Any]]]

@@ -189,13 +190,13 @@ class OtelTraceSpan(TypedDict):


  class ExampleEvaluationRun(TypedDict):
-     id: NotRequired[…
-     project_name: …
-     eval_name: …
+     id: NotRequired[str]
+     project_name: str
+     eval_name: str
      custom_scorers: NotRequired[List[BaseScorer]]
      judgment_scorers: NotRequired[List[ScorerConfig]]
      model: str
-     created_at: NotRequired[…
+     created_at: NotRequired[str]
      examples: List[Example]
      trace_span_id: NotRequired[Optional[str]]
      trace_id: NotRequired[Optional[str]]
@@ -206,13 +207,13 @@ class HTTPValidationError(TypedDict):


  class TraceEvaluationRun(TypedDict):
-     id: NotRequired[…
-     project_name: …
-     eval_name: …
+     id: NotRequired[str]
+     project_name: str
+     eval_name: str
      custom_scorers: NotRequired[List[BaseScorer]]
      judgment_scorers: NotRequired[List[ScorerConfig]]
      model: str
-     created_at: NotRequired[…
+     created_at: NotRequired[str]
      trace_and_span_ids: List[TraceAndSpanId]
      is_offline: NotRequired[bool]

@@ -228,30 +229,31 @@ class DatasetReturn(TypedDict):
      project_name: str
      examples: NotRequired[Optional[List[Example]]]

+
  class DatasetInfo(TypedDict):
      dataset_id: str
      name: str
      created_at: str
      dataset_kind: DatasetKind
      entries: int
-     creator: str
+     creator: str


  class DatasetCreate(TypedDict):
      name: str
      dataset_kind: DatasetKind
      project_name: str
-     examples: …
-     overwrite: …
+     examples: List[Example]
+     overwrite: bool


- class …
- …
+ class FetchPromptScorersResponse(TypedDict):
+     scorers: List[PromptScorer]


  class ScoringResult(TypedDict):
      success: bool
-     scorers_data: …
+     scorers_data: List[ScorerData]
      name: NotRequired[Optional[str]]
      data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
      trace_id: NotRequired[Optional[str]]
judgeval/data/evaluation_run.py
CHANGED
@@ -1,11 +1,11 @@
- from typing import List, Optional, Union, Tuple
- from …
- from pydantic import field_validator, model_validator, Field
+ from typing import List, Optional, Union, Tuple, Sequence
+ from pydantic import field_validator, model_validator, Field, BaseModel
  from datetime import datetime, timezone
  import uuid

  from judgeval.data import Example
- from judgeval.scorers import …
+ from judgeval.scorers import APIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.constants import ACCEPTABLE_MODELS
  from judgeval.data.judgment_types import (
      ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
@@ -14,19 +14,20 @@ from judgeval.data.judgment_types import (


  class EvaluationRun(BaseModel):
-     id: …
-     created_at: …
+     id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+     created_at: str = Field(
          default_factory=lambda: datetime.now(timezone.utc).isoformat()
      )
-     …
-     …
-     …
-     …
+     custom_scorers: List[ExampleScorer] = Field(default_factory=list)
+     judgment_scorers: Sequence[APIScorerConfig] = Field(default_factory=list)
+     scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
+         default_factory=list
+     )
      model: str

      def __init__(
          self,
-         scorers: Optional[List[Union[…
+         scorers: Optional[List[Union[ExampleScorer, APIScorerConfig]]] = None,
          **kwargs,
      ):
          """
@@ -38,7 +39,7 @@ class EvaluationRun(BaseModel):
          """
          if scorers is not None:
              # Automatically sort scorers into appropriate fields
-             custom_scorers = [s for s in scorers if isinstance(s, …
+             custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
              judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]

              # Always set both fields as lists (even if empty) to satisfy validation
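The scorer-sorting rule added to `EvaluationRun.__init__` boils down to two isinstance filters; a minimal standalone sketch of that rule, using only the imports shown in this diff:

    from judgeval.scorers import APIScorerConfig
    from judgeval.scorers.example_scorer import ExampleScorer

    def split_scorers(scorers):
        # Mirrors the sorting in EvaluationRun.__init__: local scorers vs. hosted configs.
        custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
        judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
        return custom_scorers, judgment_scorers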
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: .openapi.json
- # timestamp: 2025-09-…
+ # timestamp: 2025-09-12T16:54:34+00:00

  from __future__ import annotations
  from typing import Annotated, Any, Dict, List, Optional, Union
@@ -22,6 +22,10 @@ class DatasetFetch(BaseModel):
      project_name: Annotated[str, Field(title="Project Name")]


+ class DatasetsFetch(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
  class ProjectAdd(BaseModel):
      project_name: Annotated[str, Field(title="Project Name")]

@@ -59,8 +63,8 @@ class SavePromptScorerResponse(BaseModel):
      name: Annotated[str, Field(title="Name")]


- class …
- …
+ class FetchPromptScorersRequest(BaseModel):
+     names: Annotated[Optional[List[str]], Field(title="Names")] = None


  class CustomScorerUploadPayload(BaseModel):
@@ -210,8 +214,8 @@ class OtelTraceSpan(BaseModel):

  class ExampleEvaluationRun(BaseModel):
      id: Annotated[Optional[str], Field(title="Id")] = None
-     project_name: Annotated[…
-     eval_name: Annotated[…
+     project_name: Annotated[str, Field(title="Project Name")]
+     eval_name: Annotated[str, Field(title="Eval Name")]
      custom_scorers: Annotated[
          Optional[List[BaseScorer]], Field(title="Custom Scorers")
      ] = []
@@ -231,8 +235,8 @@ class HTTPValidationError(BaseModel):

  class TraceEvaluationRun(BaseModel):
      id: Annotated[Optional[str], Field(title="Id")] = None
-     project_name: Annotated[…
-     eval_name: Annotated[…
+     project_name: Annotated[str, Field(title="Project Name")]
+     eval_name: Annotated[str, Field(title="Eval Name")]
      custom_scorers: Annotated[
          Optional[List[BaseScorer]], Field(title="Custom Scorers")
      ] = []
@@ -259,23 +263,30 @@ class DatasetReturn(BaseModel):
      examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None


+ class DatasetInfo(BaseModel):
+     dataset_id: Annotated[str, Field(title="Dataset Id")]
+     name: Annotated[str, Field(title="Name")]
+     created_at: Annotated[str, Field(title="Created At")]
+     dataset_kind: DatasetKind
+     entries: Annotated[int, Field(title="Entries")]
+     creator: Annotated[str, Field(title="Creator")]
+
+
  class DatasetCreate(BaseModel):
      name: Annotated[str, Field(title="Name")]
      dataset_kind: DatasetKind
      project_name: Annotated[str, Field(title="Project Name")]
-     examples: Annotated[…
-     overwrite: Annotated[…
+     examples: Annotated[List[Example], Field(title="Examples")]
+     overwrite: Annotated[bool, Field(title="Overwrite")]


- class …
- …
+ class FetchPromptScorersResponse(BaseModel):
+     scorers: Annotated[List[PromptScorer], Field(title="Scorers")]


  class ScoringResult(BaseModel):
      success: Annotated[bool, Field(title="Success")]
-     scorers_data: Annotated[…
-         None…
-     )
+     scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]
      name: Annotated[Optional[str], Field(title="Name")] = None
      data_object: Annotated[
          Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
judgeval/data/result.py
CHANGED
judgeval/data/scorer_data.py
CHANGED
@@ -6,36 +6,11 @@ ScorerData holds the information related to a single, completed Scorer evaluation

  from __future__ import annotations

- from judgeval.data.judgment_types import ScorerData
+ from judgeval.data.judgment_types import ScorerData
  from judgeval.scorers import BaseScorer
  from typing import List


- class ScorerData(JudgmentScorerData):
-     """
-     ScorerData holds the information related to a single, completed Scorer evaluation run.
-
-     For example, if running the Judgment Faithfulness scorer on an example, the ScorerData
-     object will contain whether the example passed its threshold expectation, as well as more detailed
-     information surrounding the evaluation run such as the claims and verdicts generated by the
-     judge model(s).
-     """
-
-     def to_dict(self) -> dict:
-         """Convert the ScorerData instance to a JSON-serializable dictionary."""
-         return {
-             "name": self.name,
-             "threshold": self.threshold,
-             "success": self.success,
-             "score": self.score,
-             "reason": self.reason,
-             "strict_mode": self.strict_mode,
-             "evaluation_model": self.evaluation_model,
-             "error": self.error,
-             "additional_metadata": self.additional_metadata,
-         }
-
-
  def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
      """
      After a `scorer` is run, it contains information about the example that was evaluated
judgeval/dataset/__init__.py
CHANGED
@@ -3,7 +3,7 @@ import orjson
  import os
  import yaml
  from dataclasses import dataclass
- from typing import List, Literal
+ from typing import List, Literal

  from judgeval.data import Example
  from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
@@ -13,15 +13,17 @@ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

  from judgeval.api.api_types import DatasetKind

+
  @dataclass
  class DatasetInfo:
      dataset_id: str
-     name: str
+     name: str
      created_at: str
      dataset_kind: DatasetKind
      entries: int
      creator: str

+
  @dataclass
  class Dataset:
      examples: List[Example]
@@ -46,9 +48,12 @@ class Dataset:
          if not dataset:
              raise ValueError(f"Dataset {name} not found in project {project_name}")
          examples = dataset.get("examples", [])
+         if examples is None:
+             examples = []
+
          for e in examples:
-             if isinstance(e, dict) and isinstance(e.get("data"), dict):
-                 e.update(e.pop("data"))
+             if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                 e.update(e.pop("data"))  # type: ignore
              e.pop(
                  "example_id"
              )  # TODO: remove once scorer data migraiton is complete
@@ -64,7 +69,7 @@ class Dataset:
          cls,
          name: str,
          project_name: str,
-         examples: …
+         examples: List[Example] = [],
          overwrite: bool = False,
      ):
          if not examples:
@@ -75,7 +80,7 @@ class Dataset:
              {
                  "name": name,
                  "project_name": project_name,
-                 "examples": …
+                 "examples": examples,  # type: ignore
                  "dataset_kind": "example",
                  "overwrite": overwrite,
              }
@@ -87,18 +92,14 @@ class Dataset:
              project_name=project_name,
              examples=examples,
          )
+
      @classmethod
-     def list(
-         cls,
-         project_name: str
-     ):
+     def list(cls, project_name: str):
          client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
-         datasets = client.datasets_pull_all_for_judgeval(
-             …
-         )
-
+         datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
+
          judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
-
+
          return [DatasetInfo(**dataset_info) for dataset_info in datasets]

      def add_from_json(self, file_path: str) -> None:
@@ -147,7 +148,7 @@ class Dataset:
              {
                  "dataset_name": self.name,
                  "project_name": self.project_name,
-                 "examples": …
+                 "examples": examples,  # type: ignore
              }
          )
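`Dataset.list` now takes a bare project name and wraps each returned record in the `DatasetInfo` dataclass; a brief sketch (the project name is a placeholder):

    from judgeval.dataset import Dataset

    for info in Dataset.list(project_name="default_project"):
        print(info.name, info.entries, info.created_at)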
judgeval/env.py
CHANGED
@@ -19,8 +19,17 @@ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
      return os.getenv(var_name, default)


- …
- …
+ def required_env_var(var_name: str) -> str:
+     value = os.getenv(var_name)
+     if value is None:
+         raise EnvironmentError(
+             f"Environment variable '{var_name}' is required but not set."
+         )
+     return value
+
+
+ JUDGMENT_API_KEY = required_env_var("JUDGMENT_API_KEY")
+ JUDGMENT_ORG_ID = required_env_var("JUDGMENT_ORG_ID")
  JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")

  JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4.1")
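Because `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` are now resolved through `required_env_var` at import time, importing judgeval fails fast when either is missing; a small sketch of setting them up front (the placeholder values are assumptions):

    import os

    # Both must be set before any judgeval import, or judgeval.env raises EnvironmentError.
    os.environ.setdefault("JUDGMENT_API_KEY", "<your-api-key>")  # placeholder
    os.environ.setdefault("JUDGMENT_ORG_ID", "<your-org-id>")    # placeholder

    from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID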
judgeval/evaluation/__init__.py
CHANGED
@@ -3,14 +3,11 @@ from __future__ import annotations
  import asyncio
  import concurrent.futures
  import time
- import orjson
- import sys
  import threading
- from typing import List, …
+ from typing import List, Tuple, TYPE_CHECKING
  from rich import print as rprint

- from judgeval.data import ScorerData, ScoringResult
- from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+ from judgeval.data import ScorerData, ScoringResult
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.api import JudgmentSyncClient
  from judgeval.env import (
@@ -19,9 +16,10 @@ from judgeval.env import (
  from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
  from judgeval.logger import judgeval_logger

+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

  if TYPE_CHECKING:
-     from judgeval.data.evaluation_run import …
+     from judgeval.data.evaluation_run import ExampleEvaluationRun


  def safe_run_async(coro):
@@ -49,8 +47,7 @@ def safe_run_async(coro):

  def log_evaluation_results(
      scoring_results: List[ScoringResult],
-     run: …
-     judgment_api_key: str,
+     run: ExampleEvaluationRun,
  ) -> str:
      """
      Logs evaluation results to the Judgment API database.
@@ -65,10 +62,10 @@
      ValueError: If there's a validation error with the results
      """
      try:
-         if not …
+         if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
              raise ValueError("API key and organization ID are required")

-         api_client = JudgmentSyncClient(…
+         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
          response = api_client.log_eval_results(
              {
                  "results": scoring_results,  # type: ignore
@@ -85,41 +82,8 @@
          )


- def check_examples(
-     examples: List[Example], scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]]
- ) -> None:
-     """
-     Checks if the example contains the necessary parameters for the scorer.
-     """
-     prompt_user = False
-     for scorer in scorers:
-         for example in examples:
-             missing_params = []
-             for param in scorer.required_params:
-                 if getattr(example, param.value) is None:
-                     missing_params.append(f"{param.value}")
-             if missing_params:
-                 rprint(
-                     f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
-                 )
-                 rprint(f"Missing parameters: {', '.join(missing_params)}")
-                 rprint(
-                     f"Example: {orjson.dumps(example.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')}"
-                 )
-                 rprint("-" * 40)
-                 prompt_user = True
-
-     if prompt_user:
-         user_input = input("Do you want to continue? (y/n)")
-         if user_input.lower() != "y":
-             sys.exit(0)
-         else:
-             rprint("[green]Continuing...[/green]")
-
-
  def _poll_evaluation_until_complete(
-     evaluation_run: …
-     judgment_api_key: str,
+     evaluation_run: ExampleEvaluationRun,
      expected_scorer_data_count: int,
      poll_interval_seconds: float = 5,
      max_failures: int = 5,
@@ -140,13 +104,15 @@
      Returns:
          List[ScoringResult]: The evaluation results
      """
-     organization_id = evaluation_run.organization_id
      project_name = evaluation_run.project_name
      experiment_run_id = evaluation_run.id

+     if not project_name or not experiment_run_id:
+         raise ValueError("Project name and experiment run ID are required")
+
      poll_count = 0
      exception_count = 0
-     api_client = JudgmentSyncClient(…
+     api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
      while poll_count < max_poll_count:
          poll_count += 1
          try:
@@ -213,14 +179,13 @@ def progress_logger(stop_event, msg="Working...", interval=5):


  def run_eval(
-     evaluation_run: …
-     judgment_api_key: str,
+     evaluation_run: ExampleEvaluationRun,
  ) -> List[ScoringResult]:
      """
      Executes an evaluation of `Example`s using one or more `Scorer`s

      Args:
-         evaluation_run (…
+         evaluation_run (ExampleEvaluationRun): Stores example and evaluation together for running

      Returns:
          List[ScoringResult]: A list of ScoringResult objects
@@ -258,16 +223,13 @@
          judgeval_logger.error(error_msg)
          raise ValueError(error_msg)

-     check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
      stop_event = threading.Event()
      t = threading.Thread(
          target=progress_logger, args=(stop_event, "Running evaluation...")
      )
      t.start()
      try:
-         api_client = JudgmentSyncClient(
-             judgment_api_key, evaluation_run.organization_id
-         )
+         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
          response = api_client.add_to_run_eval_queue_examples(
              evaluation_run.model_dump(warnings=False)  # type: ignore
          )
@@ -286,7 +248,6 @@
          )
          results, url = _poll_evaluation_until_complete(
              evaluation_run=evaluation_run,
-             judgment_api_key=judgment_api_key,
              expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
          )
      finally:
@@ -306,7 +267,7 @@
      send_results = [
          scoring_result.model_dump(warnings=False) for scoring_result in results
      ]
-     url = log_evaluation_results(send_results, evaluation_run…
+     url = log_evaluation_results(send_results, evaluation_run)
      rprint(
          f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
      )
@@ -323,27 +284,23 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
      Returns:
          None. Raises exceptions for any failed test cases.
      """
-     failed_cases: List[ScorerData] = []
+     failed_cases: List[List[ScorerData]] = []

      for result in scoring_results:
          if not result.success:
              # Create a test case context with all relevant fields
-             test_case: …
+             test_case: List[ScorerData] = []
              if result.scorers_data:
                  # If the result was not successful, check each scorer_data
                  for scorer_data in result.scorers_data:
                      if not scorer_data.success:
-
-                         # Remove threshold, evaluation model for Tool Order scorer
-                         scorer_data.threshold = None
-                         scorer_data.evaluation_model = None
-                         test_case["failed_scorers"].append(scorer_data)
+                         test_case.append(scorer_data)
              failed_cases.append(test_case)

      if failed_cases:
          error_msg = "The following test cases failed: \n"
          for fail_case in failed_cases:
-             for fail_scorer in fail_case…
+             for fail_scorer in fail_case:
                  error_msg += (
                      f"\nScorer Name: {fail_scorer.name}\n"
                      f"Threshold: {fail_scorer.threshold}\n"
judgeval/integrations/langgraph/__init__.py
CHANGED
@@ -507,6 +507,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          )

          # Extract response content
+         output: Any
          if response.generations:
              last_generation = response.generations[-1][-1]
              if (
@@ -547,7 +548,7 @@
              for key, value in usage_attrs.items():
                  span.set_attribute(key, value)

-             self._end_span(run_id=run_id, outputs=output, **usage_attrs)
+             self._end_span(run_id=run_id, outputs=output, **usage_attrs)  # type: ignore

          except Exception as e:
              judgeval_logger.exception(f"Error in on_llm_end: {e}")
judgeval/scorers/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from judgeval.scorers.api_scorer import (
      TraceAPIScorerConfig,
  )
  from judgeval.scorers.base_scorer import BaseScorer
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
      FaithfulnessScorer,
      AnswerRelevancyScorer,
@@ -18,6 +19,7 @@ __all__ = [
      "ExampleAPIScorerConfig",
      "TraceAPIScorerConfig",
      "BaseScorer",
+     "ExampleScorer",
      "TracePromptScorer",
      "PromptScorer",
      "FaithfulnessScorer",
judgeval/scorers/agent_scorer.py
CHANGED
@@ -1,17 +1,17 @@
- from judgeval.scorers.base_scorer import BaseScorer
- from judgeval.data.judgment_types import Trace as JudgmentTrace
- from typing import List, Optional
- from abc import abstractmethod
+ # from judgeval.scorers.base_scorer import BaseScorer
+ # from judgeval.data.judgment_types import Trace as JudgmentTrace
+ # from typing import List, Optional
+ # from abc import abstractmethod


- class TraceScorer(BaseScorer):
-     @abstractmethod
-     async def a_score_trace(
-         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
-     ) -> float:
-         """
-         Asynchronously measures the score on a trace
-         """
-         raise NotImplementedError(
-             "You must implement the `a_score_trace` method in your custom scorer"
-         )
+ # class TraceScorer(BaseScorer):
+ #     @abstractmethod
+ #     async def a_score_trace(
+ #         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
+ #     ) -> float:
+ #         """
+ #         Asynchronously measures the score on a trace
+ #         """
+ #         raise NotImplementedError(
+ #             "You must implement the `a_score_trace` method in your custom scorer"
+ #         )
judgeval/scorers/base_scorer.py
CHANGED
@@ -27,7 +27,7 @@ class BaseScorer(BaseModel):
      threshold: float = 0.5

      # name of your scorer (Faithfulness, PromptScorer-randomslug)
-     name: …
+     name: str = ""

      # The name of the class of the scorer
      class_name: Optional[str] = None
@@ -42,7 +42,7 @@
      using_native_model: Optional[bool] = None

      # Whether the test case passed or failed
-     success: …
+     success: bool = False

      # The name of the model used to evaluate the test case
      model: Optional[str] = None
@@ -55,7 +55,7 @@ def fetch_prompt_scorer(
  ):
      client = JudgmentSyncClient(judgment_api_key, organization_id)
      try:
-         scorer_config = client.…
+         scorer_config = client.fetch_scorers({"names": [name]})["scorers"][0]
          scorer_config.pop("created_at")
          scorer_config.pop("updated_at")
          return scorer_config
judgeval/scorers/score.py
CHANGED
judgeval/tracer/__init__.py
CHANGED
@@ -43,8 +43,8 @@ from judgeval.env (
      JUDGMENT_ORG_ID,
  )
  from judgeval.logger import judgeval_logger
- from judgeval.scorers.api_scorer import …
- from judgeval.scorers.…
+ from judgeval.scorers.api_scorer import TraceAPIScorerConfig, ExampleAPIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
  from judgeval.tracer.managers import (
      sync_span_context,
@@ -358,7 +358,6 @@ class Tracer:
          eval_run_name = f"async_trace_evaluate_{span_id}"

          eval_run = TraceEvaluationRun(
-             organization_id=self.organization_id,
              project_name=self.project_name,
              eval_name=eval_run_name,
              scorers=[scorer],
@@ -862,7 +861,7 @@
          self,
          /,
          *,
-         scorer: Union[ExampleAPIScorerConfig, …
+         scorer: Union[ExampleAPIScorerConfig, ExampleScorer],
          example: Example,
          model: str = JUDGMENT_DEFAULT_GPT_MODEL,
          sampling_rate: float = 1.0,
@@ -871,9 +870,9 @@
              judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
              return

-         if not isinstance(scorer, (ExampleAPIScorerConfig, …
+         if not isinstance(scorer, (ExampleAPIScorerConfig, ExampleScorer)):
              judgeval_logger.error(
-                 "Scorer must be an instance of ExampleAPIScorerConfig or …
+                 "Scorer must be an instance of ExampleAPIScorerConfig or ExampleScorer, got %s, skipping evaluation."
                  % type(scorer)
              )
              return
@@ -903,12 +902,11 @@
          trace_id = format(span_context.trace_id, "032x")
          span_id = format(span_context.span_id, "016x")
          hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
-             isinstance(scorer, …
+             isinstance(scorer, ExampleScorer) and scorer.server_hosted
          )
          eval_run_name = f"async_evaluate_{span_id}"  # note this name doesnt matter because we don't save the experiment only the example and scorer_data
          if hosted_scoring:
              eval_run = ExampleEvaluationRun(
-                 organization_id=self.organization_id,
                  project_name=self.project_name,
                  eval_name=eval_run_name,
                  examples=[example],
@@ -923,7 +921,6 @@
          else:
              # Handle custom scorers using local evaluation queue
              eval_run = ExampleEvaluationRun(
-                 organization_id=self.organization_id,
                  project_name=self.project_name,
                  eval_name=eval_run_name,
                  examples=[example],
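The keyword-only evaluation helper on `Tracer` now accepts `ExampleScorer` alongside `ExampleAPIScorerConfig` and no longer passes `organization_id` into the run. A sketch of calling it; the method name `async_evaluate` is inferred from the eval-run name in this hunk, and the scorer/example objects are assumed to exist already:

    from judgeval.tracer import Tracer

    def score_current_span(tracer: Tracer, scorer, example) -> None:
        # scorer: ExampleAPIScorerConfig | ExampleScorer; example: judgeval.data.Example
        tracer.async_evaluate(        # assumed method name
            scorer=scorer,
            example=example,
            sampling_rate=1.0,        # keyword-only parameters shown in the diff
        )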
judgeval/tracer/local_eval_queue.py
CHANGED
@@ -13,7 +13,7 @@ import time
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_MAX_CONCURRENT_EVALUATIONS
  from judgeval.data import ScoringResult
- from judgeval.data.evaluation_run import …
+ from judgeval.data.evaluation_run import ExampleEvaluationRun
  from judgeval.utils.async_utils import safe_run_async
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.api import JudgmentSyncClient
@@ -34,7 +34,7 @@ class LocalEvaluationQueue:
      ):
          if num_workers <= 0:
              raise ValueError("num_workers must be a positive integer.")
-         self._queue: queue.Queue[Optional[…
+         self._queue: queue.Queue[Optional[ExampleEvaluationRun]] = queue.Queue()
          self._max_concurrent = max_concurrent
          self._num_workers = num_workers  # Number of worker threads
          self._worker_threads: List[threading.Thread] = []
@@ -44,11 +44,11 @@ class LocalEvaluationQueue:
              organization_id=JUDGMENT_ORG_ID,
          )

-     def enqueue(self, evaluation_run: …
+     def enqueue(self, evaluation_run: ExampleEvaluationRun) -> None:
          """Add evaluation run to the queue."""
          self._queue.put(evaluation_run)

-     def _process_run(self, evaluation_run: …
+     def _process_run(self, evaluation_run: ExampleEvaluationRun) -> List[ScoringResult]:
          """Execute evaluation run locally and return results."""

          if not evaluation_run.custom_scorers:
@@ -70,7 +70,9 @@ class LocalEvaluationQueue:

      def run_all(
          self,
-         callback: Optional[…
+         callback: Optional[
+             Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+         ] = None,
      ) -> None:
          """Process all queued runs synchronously.

@@ -134,7 +136,9 @@ class LocalEvaluationQueue:

      def start_worker(
          self,
-         callback: Optional[…
+         callback: Optional[
+             Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+         ] = None,
      ) -> Optional[threading.Thread]:
          """Start a single background thread to process runs (backward compatibility).

@@ -144,7 +148,7 @@ class LocalEvaluationQueue:
          Returns:
              The started thread, or None if no threads were started.
          """
-         threads = self.start_workers(…
+         threads = self.start_workers()
          return threads[0] if threads else None

      def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
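The `callback` parameter on `run_all` and `start_worker` is now fully typed; a sketch of a matching callback, assuming `LocalEvaluationQueue()` can be built with its defaults:

    from typing import List
    from judgeval.data import ScoringResult
    from judgeval.data.evaluation_run import ExampleEvaluationRun
    from judgeval.tracer.local_eval_queue import LocalEvaluationQueue

    def on_run_finished(run: ExampleEvaluationRun, results: List[ScoringResult]) -> None:
        # Matches Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
        print(run.eval_name, len(results))

    queue = LocalEvaluationQueue()               # assumed default constructor
    queue.start_worker(callback=on_run_finished)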
judgeval/trainer/config.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any, TYPE_CHECKING
  import json

  if TYPE_CHECKING:
-     from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral
+     from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral  # type: ignore[import-not-found]


  @dataclass
judgeval/trainer/trainer.py
CHANGED
@@ -2,7 +2,7 @@ import asyncio
  import json
  import time
  from typing import Optional, Callable, Any, List, Union, Dict
- from fireworks import Dataset
+ from fireworks import Dataset  # type: ignore[import-not-found]
  from .config import TrainerConfig, ModelConfig
  from .trainable_model import TrainableModel
  from judgeval.tracer import Tracer
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
  from judgeval.tracer.exporters import InMemorySpanExporter
  from judgeval.tracer.keys import AttributeKeys
  from judgeval import JudgmentClient
- from judgeval.scorers import …
+ from judgeval.scorers import ExampleScorer, ExampleAPIScorerConfig
  from judgeval.data import Example
  from .console import _spinner_progress, _print_progress, _print_progress_update
  from judgeval.exceptions import JudgmentRuntimeError
@@ -85,7 +85,9 @@ class JudgmentTrainer:
                  if not first_found and span_attributes.get(
                      AttributeKeys.JUDGMENT_INPUT
                  ):
-                     input_data = span_attributes.get(…
+                     input_data: Any = span_attributes.get(
+                         AttributeKeys.JUDGMENT_INPUT, {}
+                     )
                      if isinstance(input_data, dict) and "messages" in input_data:
                          input_messages = input_data["messages"]
                          if input_messages:
@@ -154,7 +156,7 @@
      async def generate_rollouts_and_rewards(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[ExampleAPIScorerConfig, …
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
          num_prompts_per_step: Optional[int] = None,
          num_generations_per_prompt: Optional[int] = None,
@@ -264,7 +266,7 @@
      async def run_reinforcement_learning(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[ExampleAPIScorerConfig, …
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
      ) -> ModelConfig:
          """
@@ -370,7 +372,7 @@
      async def train(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[ExampleAPIScorerConfig, …
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
          rft_provider: Optional[str] = None,
      ) -> ModelConfig:
judgeval/utils/async_utils.py
CHANGED
@@ -2,13 +2,13 @@

  import asyncio
  import concurrent.futures
- from typing import Awaitable, TypeVar
+ from typing import Awaitable, TypeVar, Coroutine


  T = TypeVar("T")


- def safe_run_async(coro: Awaitable[T]) -> T:
+ def safe_run_async(coro: Awaitable[T]) -> T:
      """Safely execute an async *coro* from synchronous code.

      This helper handles two common situations:
@@ -24,6 +24,8 @@ def safe_run_async(coro: Awaitable[T]) -> T:  # type: ignore[type-var]
      Returns:
          The result returned by *coro*.
      """
+     if not isinstance(coro, Coroutine):
+         raise TypeError("The provided awaitable must be a coroutine.")

      try:
          asyncio.get_running_loop()
@@ -31,5 +33,7 @@
          return asyncio.run(coro)

      with concurrent.futures.ThreadPoolExecutor() as executor:
-         future = executor.submit(…
+         future: concurrent.futures.Future[T] = executor.submit(
+             lambda: asyncio.run(coro)
+         )
          return future.result()
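`safe_run_async` now rejects anything that is not a real coroutine object; a quick sketch (note that importing judgeval requires the JUDGMENT_API_KEY / JUDGMENT_ORG_ID environment variables introduced above):

    import asyncio
    from judgeval.utils.async_utils import safe_run_async

    async def add(a: int, b: int) -> int:
        await asyncio.sleep(0)
        return a + b

    print(safe_run_async(add(1, 2)))   # works with or without a running event loop
    # safe_run_async(asyncio.sleep)    # would now raise TypeError: not a coroutine object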
judgeval/utils/testing.py
CHANGED
@@ -26,10 +26,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
              # If the result was not successful, check each scorer_data
              for scorer_data in result.scorers_data:
                  if not scorer_data.success:
-                     if scorer_data.name == "Tool Order":
-                         # Remove threshold, evaluation model for Tool Order scorer
-                         scorer_data.threshold = None
-                         scorer_data.evaluation_model = None
                      test_case.append(scorer_data)
              failed_cases.append(test_case)

judgeval/version.py
CHANGED
{judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/RECORD
CHANGED
@@ -1,38 +1,37 @@
- judgeval/__init__.py,sha256=…
+ judgeval/__init__.py,sha256=LDL_vOvI6LmMwbVt6NMPwponDeEOaGHV-nd_0wSCLHM,4957
  judgeval/cli.py,sha256=R5IiIQmSVg21kQHX2kL3sOeXCxvvAMSqyva3Z9AoSXc,1560
  judgeval/constants.py,sha256=h7Cuf_2uvNzHZi8nqRFoMpvsQUZMS3mlNB3s2uduse8,3557
- judgeval/env.py,sha256=…
+ judgeval/env.py,sha256=QO_77E2oX5LLf29XgqLdUoYUIqEaGxd9mcCco6rzS-w,2445
  judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
  judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
- judgeval/version.py,sha256=…
+ judgeval/version.py,sha256=necdb4jxf2rIhW5LPI_UhDC8zSb9h-dNqtKbwoLv6z8,74
  judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
- judgeval/api/__init__.py,sha256=…
- judgeval/api/api_types.py,sha256=…
+ judgeval/api/__init__.py,sha256=3Pm0qQ4ZQj76jUsJVrnuazRnYcqF3pzM_Wv_Z6lOv0w,13216
+ judgeval/api/api_types.py,sha256=AEh_9WpL0wTDUKZ0CwphkiGV3IeysBgTE9FzX4VYPic,6528
  judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
- judgeval/data/evaluation_run.py,sha256=…
+ judgeval/data/evaluation_run.py,sha256=N47waxScMFKvGBxADX2FrfjW4wT5Zqd8n1PZKWb7JMA,4766
  judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
- judgeval/data/judgment_types.py,sha256=…
- judgeval/data/result.py,sha256=…
- judgeval/data/scorer_data.py,sha256=…
- judgeval/data/tool.py,sha256=bj_WxFg22mypUUVR5KqQRxMDHWvKwiE1MMPjLnTCoDU,99
+ judgeval/data/judgment_types.py,sha256=8cGuj6VAHjYPfmHZL_Bb4D0D2bLP0V9-_Wec2WZhjKA,12130
+ judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
+ judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
  judgeval/data/trace.py,sha256=R9RF1kv1JHeOpjXLjErJcxV2RrNrJUSqWcWe73l3f9k,503
  judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
  judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
- judgeval/dataset/__init__.py,sha256=…
- judgeval/evaluation/__init__.py,sha256=…
- judgeval/integrations/langgraph/__init__.py,sha256=…
+ judgeval/dataset/__init__.py,sha256=2B3ifWP_gn_4l0GgZaY2tB9UuV8m7dI1BEWwMgckDOc,6348
+ judgeval/evaluation/__init__.py,sha256=6bSC1Sw-fpJN6OkZTv4UtAoYZqkjUy7OG17lxiRX5qE,13321
+ judgeval/integrations/langgraph/__init__.py,sha256=Ow2rl21SmRQNVVR_WfejCsxFPcLvFFlpvKVgG0_igEQ,27580
  judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
  judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
  judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
  judgeval/judges/together_judge.py,sha256=GzwlXZJzle8hT-vWKmq39JyIeanJqJfHDOkrksUbzk0,4398
  judgeval/judges/utils.py,sha256=ITbYwvjU3o9-FIAReFvxh24yJrx9LV3l9BnSBgKUpxg,2068
- judgeval/scorers/__init__.py,sha256=…
- judgeval/scorers/agent_scorer.py,sha256=…
+ judgeval/scorers/__init__.py,sha256=pomKzEy4YNFyygYp8vbS3co8iB5CMstRkQwdUgi1u4g,744
+ judgeval/scorers/agent_scorer.py,sha256=-qcNSkY6i7ur2LXkM7H1jTKuuFbDuXbjTq42o3vjeQ8,595
  judgeval/scorers/api_scorer.py,sha256=8TUJut9r74v-qMACiSKAUbDI1v3ZItPXrTz8s4_Lrgk,2287
- judgeval/scorers/base_scorer.py,sha256=…
+ judgeval/scorers/base_scorer.py,sha256=hsMuqdW8QtW5n9JzruXyaZC7im2K2sSmz1RDkbMisJ4,2702
  judgeval/scorers/example_scorer.py,sha256=o_BGUztJXjnKnuOqIa9T4PXe0wPoWg63FyH518N1LxA,561
  judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
- judgeval/scorers/score.py,sha256=…
+ judgeval/scorers/score.py,sha256=95tnNRnihrEVvG0yH-RDTQ8KoiBakDijjukclqxH5KE,7183
  judgeval/scorers/utils.py,sha256=iSZONwK0HecxUPz-cMCyra_87DSCag1E8BdpF2a4_44,377
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=wrq7y9I30GZbwDXIrSh81KRO_-j7i-1DjwX5Hc3PScI,728
@@ -40,11 +39,11 @@ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=_qa1s…
  judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
  judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=lIJ3GgOI9tfbrC7voZMvlxXdK3X1bhdj2zNxqdaGIkM,545
  judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=bSwbpVNhpkpEeX3GtCJuyz5vFyY1gbyqYEfaBF2KTVY,697
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=…
- judgeval/tracer/__init__.py,sha256=…
+ judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=djPfHC8NP9srwTAgp075kK_zz6Tbn2WFIh6jOZjqppQ,9688
+ judgeval/tracer/__init__.py,sha256=YLJklv1YfNDV61GiJw3PflLp_cajxAnXHojJVKitbz4,35074
  judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
  judgeval/tracer/keys.py,sha256=qXPoZSkEhVF-YYfQ9-zeDMVdr4GtpPf2W7MPJaN2AQo,2889
- judgeval/tracer/local_eval_queue.py,sha256=…
+ judgeval/tracer/local_eval_queue.py,sha256=KZKvSSli7B-EVzdHa4-CmXUpv0uOjGLLRa2KTPg8lRc,7320
  judgeval/tracer/managers.py,sha256=h2ZHJ61_vf3cS-HlEUiodFzKDUuQWIhYC6n7pMVyM9c,6113
  judgeval/tracer/utils.py,sha256=3_8ZjjF4XgNyAu9LpThq5dVOcwdwI-E3vb-HRl_Px8c,594
  judgeval/tracer/exporters/__init__.py,sha256=lnZXfPGaQH844HAIuZCQqjqhnmZGA98kHY8Xp-Oi4Ws,1220
@@ -55,21 +54,21 @@ judgeval/tracer/llm/__init__.py,sha256=p9uwWPg9k-NcWjj9TbwQj55sHhBOqRYx2-Ld6YHaF…
  judgeval/tracer/llm/providers.py,sha256=QQLJlSNnDjXRAc2Wqw78o254COJUSXX39D7D_mx3NVA,2651
  judgeval/tracer/processors/__init__.py,sha256=tXbQaXGMQeutgM_7d5Y2EFTeSjbVEBky685Dst_v3rg,8672
  judgeval/trainer/__init__.py,sha256=h_DDVV7HFF7HUPAJFpt2d9wjqgnmEVcHxqZyB1k7pPQ,257
- judgeval/trainer/config.py,sha256=…
+ judgeval/trainer/config.py,sha256=sAAVBgeoFDJWYjGIgOvoQoiO0gtqNAOI6MHncwdN_mk,4292
  judgeval/trainer/console.py,sha256=PJ0rCnDwC7aoW-VsLDS96ZyMyagh-l9EOJKff1ATIpo,4342
- judgeval/trainer/trainable_model.py,sha256=…
- judgeval/trainer/trainer.py,sha256=…
- judgeval/utils/async_utils.py,sha256=…
+ judgeval/trainer/trainable_model.py,sha256=T-Sioi_sXtfYlcu3lE0cd60PHs8DrYaZ-Kxb4h1nU04,8993
+ judgeval/trainer/trainer.py,sha256=FBhHq2YPooKADDCC_IEKex81L6a5quCmAMyl9mn3QLk,16675
+ judgeval/utils/async_utils.py,sha256=AF1xdu8Ao5GyhFvfaLOaKJHn1RISyXZ4U70UZe9zfBA,1083
  judgeval/utils/decorators.py,sha256=rdqY1w0zNL6O6GU6Wdeo0-x5EgpFTEhU2vkgiWsRYdc,525
  judgeval/utils/file_utils.py,sha256=3LI1YCZwO5ogTgJreyOgRgDksey3natO2Td1PQqaPyY,3252
  judgeval/utils/guards.py,sha256=QBb6m6KElxdvt2bskLZCKh_zGHbBcqV-VfGzT63o3hY,807
  judgeval/utils/meta.py,sha256=wQFCLJTNKF9yUdXcw37AT6mC-wqzZpAvjn5gP_6flD8,349
  judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6256
- judgeval/utils/testing.py,sha256=…
+ judgeval/utils/testing.py,sha256=4HO4UCZQgeB7wi-LQoKPjiAYMbj4PpeApAnxZdmI_8w,3392
  judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
  judgeval/utils/version_check.py,sha256=kcF6SvB6GbVKI0Gv9QRVm-kvBn9_z-c3jmPORsXO3h0,1015
- judgeval-0.…
- judgeval-0.…
- judgeval-0.…
- judgeval-0.…
- judgeval-0.…
+ judgeval-0.12.0.dist-info/METADATA,sha256=RVS9bm8KrWk-ifawDz1s9oDx_NY3zjGPkbknKKzpjeM,8870
+ judgeval-0.12.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.12.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+ judgeval-0.12.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.12.0.dist-info/RECORD,,
judgeval/data/tool.py
DELETED
{judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/WHEEL
File without changes
{judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/entry_points.txt
File without changes
{judgeval-0.10.1.dist-info → judgeval-0.12.0.dist-info}/licenses/LICENSE.md
File without changes