judgeval 0.14.1__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/api/__init__.py CHANGED
@@ -73,7 +73,7 @@ class JudgmentSyncClient:
 
     def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -86,7 +86,7 @@ class JudgmentSyncClient:
 
     def evaluate_traces(
         self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -111,16 +111,6 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def get_evaluation_status(self, experiment_run_id: str, project_name: str) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
@@ -222,13 +212,6 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-
 
 class JudgmentAsyncClient:
     __slots__ = ("api_key", "organization_id", "client")
@@ -280,7 +263,7 @@ class JudgmentAsyncClient:
 
     async def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
         query_params = {}
         if stream is not None:
             query_params["stream"] = stream
@@ -293,7 +276,7 @@ class JudgmentAsyncClient:
 
     async def evaluate_traces(
        self, payload: TraceEvaluationRun, stream: Optional[str] = None
-    ) -> Any:
+    ) -> EvaluateResponse:
        query_params = {}
        if stream is not None:
            query_params["stream"] = stream
@@ -318,18 +301,6 @@ class JudgmentAsyncClient:
             payload,
         )
 
-    async def get_evaluation_status(
-        self, experiment_run_id: str, project_name: str
-    ) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return await self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     async def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
@@ -433,13 +404,6 @@ class JudgmentAsyncClient:
             payload,
         )
 
-    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/e2e_fetch_trace_scorer_span_score/"),
-            payload,
-        )
-
 
 __all__ = [
     "JudgmentSyncClient",
judgeval/api/api_types.py CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-29T19:54:47+00:00
+#   timestamp: 2025-10-07T20:43:52+00:00
 
 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -24,6 +24,15 @@ class DatasetsFetch(TypedDict):
     project_name: str
 
 
+class DatasetsTableRow(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: Literal["trace", "example"]
+    entries: int
+    creator: str
+
+
 class ProjectAdd(TypedDict):
     project_name: str
 
@@ -137,6 +146,14 @@ class ValidationError(TypedDict):
     type: str
 
 
+class UsageInfo(TypedDict):
+    total_judgees: int
+    regular_use: int
+    pay_as_you_go_use: int
+    remaining_regular: int
+    remaining_after: int
+
+
 DatasetKind = Literal["trace", "example"]
 
 
@@ -180,18 +197,10 @@ class OtelTraceSpan(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[str]]
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
     events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[List[Dict[str, Any]]]]
-    legacy_span_id: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    output: Any
-    error: NotRequired[Optional[Dict[str, Any]]]
-    agent_id: NotRequired[Optional[str]]
-    cumulative_llm_cost: NotRequired[Optional[float]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]
 
 
 class OtelSpanListItemScores(TypedDict):
@@ -206,7 +215,7 @@ class OtelSpanDetailScores(TypedDict):
     score: float
     reason: NotRequired[Optional[str]]
     name: str
-    data: NotRequired[Optional[Dict[str, Any]]]
+    example_id: NotRequired[Optional[str]]
 
 
 class ExampleEvaluationRun(TypedDict):
@@ -244,15 +253,6 @@ class DatasetInsertExamples(TypedDict):
     project_name: str
 
 
-class DatasetInfo(TypedDict):
-    dataset_id: str
-    name: str
-    created_at: str
-    dataset_kind: DatasetKind
-    entries: int
-    creator: str
-
-
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind
@@ -279,16 +279,17 @@ class OtelTraceListItem(TypedDict):
     organization_id: str
     project_id: str
     trace_id: str
-    timestamp: str
+    created_at: str
     duration: NotRequired[Optional[int]]
-    has_notification: NotRequired[Optional[bool]]
     tags: NotRequired[Optional[List[str]]]
     experiment_run_id: NotRequired[Optional[str]]
     span_name: NotRequired[Optional[str]]
-    cumulative_llm_cost: NotRequired[Optional[float]]
-    error: NotRequired[Optional[Dict[str, Any]]]
+    llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[str]
     scores: NotRequired[List[OtelSpanListItemScores]]
     customer_id: NotRequired[Optional[str]]
+    input: NotRequired[Optional[str]]
+    output: NotRequired[Optional[str]]
     input_preview: NotRequired[Optional[str]]
     output_preview: NotRequired[Optional[str]]
     annotation_count: NotRequired[int]
@@ -310,9 +311,9 @@ class OtelSpanDetail(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[str]]
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
-    events: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
     llm_cost: NotRequired[Optional[float]]
     prompt_tokens: NotRequired[Optional[int]]
@@ -320,6 +321,12 @@ class OtelSpanDetail(TypedDict):
     scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
 
 
+class EvaluateResponse(TypedDict):
+    status: str
+    results: List[ScoringResult]
+    resource_usage: NotRequired[Optional[UsageInfo]]
+
+
 class EvalResults(TypedDict):
     results: List[ScoringResult]
     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
@@ -335,5 +342,5 @@ class DatasetReturn(TypedDict):
     name: str
     project_name: str
     dataset_kind: DatasetKind
-    examples: NotRequired[Optional[List[Example]]]
+    examples: NotRequired[List[Example]]
     traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-29T19:54:46+00:00
+#   timestamp: 2025-10-07T20:43:51+00:00
 
 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -26,6 +26,20 @@ class DatasetsFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
 
+class Kind(Enum):
+    trace = "trace"
+    example = "example"
+
+
+class DatasetsTableRow(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    kind: Annotated[Kind, Field(title="Kind")]
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
@@ -148,6 +162,14 @@ class ValidationError(BaseModel):
     type: Annotated[str, Field(title="Error Type")]
 
 
+class UsageInfo(BaseModel):
+    total_judgees: Annotated[int, Field(title="Total Judgees")]
+    regular_use: Annotated[int, Field(title="Regular Use")]
+    pay_as_you_go_use: Annotated[int, Field(title="Pay As You Go Use")]
+    remaining_regular: Annotated[int, Field(title="Remaining Regular")]
+    remaining_after: Annotated[int, Field(title="Remaining After")]
+
+
 class DatasetKind(Enum):
     trace = "trace"
     example = "example"
@@ -199,22 +221,10 @@ class OtelTraceSpan(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
-    legacy_span_id: Annotated[Optional[str], Field(title="Legacy Span Id")] = None
-    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-    output: Annotated[Any, Field(title="Output")]
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-    agent_id: Annotated[Optional[str], Field(title="Agent Id")] = None
-    cumulative_llm_cost: Annotated[
-        Optional[float], Field(title="Cumulative Llm Cost")
-    ] = None
-    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-        None
-    )
 
 
 class OtelSpanListItemScores(BaseModel):
@@ -229,7 +239,7 @@ class OtelSpanDetailScores(BaseModel):
     score: Annotated[float, Field(title="Score")]
     reason: Annotated[Optional[str], Field(title="Reason")] = None
     name: Annotated[str, Field(title="Name")]
-    data: Annotated[Optional[Dict[str, Any]], Field(title="Data")] = None
+    example_id: Annotated[Optional[str], Field(title="Example Id")] = None
 
 
 class ExampleEvaluationRun(BaseModel):
@@ -277,15 +287,6 @@ class DatasetInsertExamples(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
 
-class DatasetInfo(BaseModel):
-    dataset_id: Annotated[str, Field(title="Dataset Id")]
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    dataset_kind: DatasetKind
-    entries: Annotated[int, Field(title="Entries")]
-    creator: Annotated[str, Field(title="Creator")]
-
-
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind
@@ -314,20 +315,19 @@ class OtelTraceListItem(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
     trace_id: Annotated[str, Field(title="Trace Id")]
-    timestamp: Annotated[str, Field(title="Timestamp")]
+    created_at: Annotated[AwareDatetime, Field(title="Created At")]
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
     experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
-    cumulative_llm_cost: Annotated[
-        Optional[float], Field(title="Cumulative Llm Cost")
-    ] = None
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+    error: Annotated[Optional[str], Field(title="Error")] = ""
     scores: Annotated[
         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
     ] = []
     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+    input: Annotated[Optional[str], Field(title="Input")] = None
+    output: Annotated[Optional[str], Field(title="Output")] = None
     input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
     output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
     annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
@@ -338,7 +338,7 @@ class OtelTraceListItem(BaseModel):
 class OtelSpanDetail(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
-    timestamp: Annotated[str, Field(title="Timestamp")]
+    timestamp: Annotated[AwareDatetime, Field(title="Timestamp")]
     trace_id: Annotated[str, Field(title="Trace Id")]
     span_id: Annotated[str, Field(title="Span Id")]
     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
@@ -353,11 +353,9 @@ class OtelSpanDetail(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
-    events: Annotated[
-        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
-    ] = None
+    events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[
         Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
     ] = None
@@ -369,6 +367,12 @@ class OtelSpanDetail(BaseModel):
     )
 
 
+class EvaluateResponse(BaseModel):
+    status: Annotated[str, Field(title="Status")]
+    results: Annotated[List[ScoringResult], Field(title="Results")]
+    resource_usage: Optional[UsageInfo] = None
+
+
 class EvalResults(BaseModel):
     results: Annotated[List[ScoringResult], Field(title="Results")]
     run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
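The pydantic counterparts of the new response types validate server payloads. A hedged sketch of constructing them directly; the module path follows the `judgeval/data/judgment_types.py` entry in the RECORD below, and all field values are illustrative:

```python
# Assumed import path based on the wheel's RECORD; not shown in this diff.
from judgeval.data.judgment_types import EvaluateResponse, UsageInfo

resp = EvaluateResponse(
    status="completed",
    results=[],  # would normally hold ScoringResult models
    resource_usage=UsageInfo(
        total_judgees=100,
        regular_use=40,
        pay_as_you_go_use=0,
        remaining_regular=60,
        remaining_after=60,
    ),
)
print(resp.resource_usage.remaining_regular if resp.resource_usage else None)
```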
@@ -20,7 +20,7 @@ class DatasetInfo:
     dataset_id: str
     name: str
     created_at: str
-    dataset_kind: DatasetKind
+    kind: DatasetKind
     entries: int
     creator: str
 
@@ -84,7 +84,7 @@ def log_evaluation_results(
 
 def _poll_evaluation_until_complete(
     evaluation_run: ExampleEvaluationRun,
-    expected_scorer_data_count: int,
+    expected_examples_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
     max_poll_count: int = 60,  # This should be equivalent to 5 minutes
@@ -117,29 +117,22 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
             # Check status
-            status_response = api_client.get_evaluation_status(
-                experiment_run_id, project_name
-            )
-
-            if status_response.get("status") != "completed":
-                time.sleep(poll_interval_seconds)
-                continue
-
-            example_scorer_pairings = status_response.get("results", [])
-            if len(example_scorer_pairings) != expected_scorer_data_count:
-                time.sleep(poll_interval_seconds)
-                continue
-
             results_response = api_client.fetch_experiment_run(
                 {
                     "experiment_run_id": experiment_run_id,
                     "project_name": project_name,
                 }
            )
+
+            example_scorer_pairings = results_response.get("results", [])
+            if len(example_scorer_pairings) != expected_examples_count:
+                time.sleep(poll_interval_seconds)
+                continue
+
             url = results_response.get("ui_results_url")
 
             scoring_result_list = []
-            for res in results_response.get("results", []):
+            for res in example_scorer_pairings:
                 example = res.get("data", {}).copy()
                 example["example_id"] = res.get("example_id")
                 scoring_result = ScoringResult(
@@ -241,14 +234,9 @@ def run_eval(
             )
             raise JudgmentRuntimeError(error_message)
 
-        num_scorers = (
-            len(evaluation_run.judgment_scorers)
-            if evaluation_run.judgment_scorers
-            else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
-        )
        results, url = _poll_evaluation_until_complete(
            evaluation_run=evaluation_run,
-            expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
+            expected_examples_count=len(evaluation_run.examples),
        )
    finally:
        stop_event.set()
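With the status endpoint removed, completion is inferred entirely from `fetch_experiment_run`: the run is treated as done once one result per example is returned. A condensed sketch of that polling strategy, assuming an `api_client` like the one above (failure counting and the max-poll timeout are omitted here):

```python
import time


def wait_for_results(api_client, experiment_run_id: str, project_name: str,
                     expected_examples_count: int, poll_interval_seconds: float = 5):
    # Simplified illustration of the loop in _poll_evaluation_until_complete,
    # not the full implementation.
    while True:
        results_response = api_client.fetch_experiment_run(
            {"experiment_run_id": experiment_run_id, "project_name": project_name}
        )
        results = results_response.get("results", [])
        if len(results) == expected_examples_count:
            return results, results_response.get("ui_results_url")
        time.sleep(poll_interval_seconds)
```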
@@ -0,0 +1,50 @@
+from abc import ABC
+from judgeval.tracer import Tracer
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+
+
+try:
+    import openlit  # type: ignore
+except ImportError:
+    raise ImportError(
+        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
+    )
+
+
+class Openlit(ABC):
+    @staticmethod
+    def initialize(
+        **kwargs,
+    ):
+        tracer = Tracer.get_instance()
+        if not tracer or not tracer._initialized:
+            raise ValueError(
+                "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
+            )
+
+        api_key = tracer.api_key
+        organization_id = tracer.organization_id
+        project_name = tracer.project_name
+
+        project_id = Tracer._resolve_project_id(project_name, api_key, organization_id)
+        if not project_id:
+            judgeval_logger.warning(
+                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
+            )
+            return
+
+        openlit.init(
+            service_name=project_name,
+            otlp_endpoint=url_for("/otel"),
+            otlp_headers={
+                "Authorization": f"Bearer {api_key}",
+                "X-Organization-Id": organization_id,
+                "X-Project-Id": project_id,
+            },
+            tracer=tracer.get_tracer(),
+            **kwargs,
+        )
+
+
+__all__ = ["Openlit"]
@@ -20,6 +20,7 @@ def push_prompt_scorer(
     threshold: float,
     options: Optional[Dict[str, float]] = None,
     model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+    description: Optional[str] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     is_trace: bool = False,
@@ -33,6 +34,7 @@ def push_prompt_scorer(
             "threshold": threshold,
             "options": options,
             "model": model,
+            "description": description,
             "is_trace": is_trace,
         }
     )
@@ -102,6 +104,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
     score_type: APIScorerType
     prompt: str
     options: Optional[Dict[str, float]] = None
+    description: Optional[str] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
@@ -130,6 +133,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             model=scorer_config.get("model"),
+            description=scorer_config.get("description"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -142,6 +146,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         threshold: float = 0.5,
         options: Optional[Dict[str, float]] = None,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+        description: Optional[str] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
@@ -158,6 +163,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold,
             options,
             model,
+            description,
             judgment_api_key,
             organization_id,
             is_trace,
@@ -170,6 +176,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold=threshold,
             options=options,
             model=model,
+            description=description,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -215,6 +222,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated options for {self.name}")
 
+    def set_description(self, description: Optional[str]):
+        """
+        Updates the description of the scorer.
+        """
+        self.description = description
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated description for {self.name}")
+
     def append_to_prompt(self, prompt_addition: str):
         """
         Appends a string to the prompt.
@@ -248,7 +263,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return copy(self.options) if self.options is not None else None
 
-    def get_name(self) -> str | None:
+    def get_description(self) -> str | None:
+        """
+        Returns the description of the scorer.
+        """
+        return self.description
+
+    def get_name(self) -> str:
         """
         Returns the name of the scorer.
         """
@@ -264,6 +285,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             "prompt": self.prompt,
             "threshold": self.threshold,
             "options": self.options,
+            "description": self.description,
         }
 
     def push_prompt_scorer(self):
@@ -276,13 +298,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.threshold,
             self.options,
             self.model,
+            self.description,
             self.judgment_api_key,
             self.organization_id,
             isinstance(self, TracePromptScorer),
         )
 
     def __str__(self):
-        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"
 
     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
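Descriptions now travel with the scorer config end to end (push, fetch, `to_dict`-style serialization, and `__str__`). A small sketch of setting and reading a description on an already-constructed prompt scorer instance; `scorer` stands in for any concrete `BasePromptScorer` subclass obtained through its usual create/get helpers, and the description text is illustrative:

```python
# `scorer` is assumed to be an existing PromptScorer-style instance.
scorer.set_description("Checks whether answers cite at least one source.")
print(scorer.get_description())

# The description is also included in push_prompt_scorer() payloads and in str(scorer).
print(scorer)
```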
@@ -255,6 +255,10 @@ class Tracer(metaclass=SingletonMeta):
     def get_current_agent_context(self):
         return self.agent_context
 
+    def get_span_processor(self) -> JudgmentSpanProcessor:
+        """Get the internal span processor of this tracer instance."""
+        return self.judgment_processor
+
     def set_customer_id(self, customer_id: str) -> None:
         span = self.get_current_span()
         if span and span.is_recording():
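A short sketch of the new accessor, assuming a tracer has already been initialized. The `force_flush()` call is an assumption: it comes from the standard OpenTelemetry span-processor interface and is shown only to illustrate why direct access to the processor can be useful:

```python
from judgeval.tracer import Tracer

judgment = Tracer(project_name="default_project")
processor = judgment.get_span_processor()  # -> JudgmentSpanProcessor

# e.g. flush any pending spans before process exit (assumed OTel SpanProcessor method).
processor.force_flush()
```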
@@ -137,9 +137,23 @@ def _extract_openai_content(chunk) -> str:
 
 def _extract_anthropic_content(chunk) -> str:
     """Extract content from Anthropic streaming chunk."""
-    if hasattr(chunk, "type") and chunk.type == "content_block_delta":
-        if hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
-            return chunk.delta.text or ""
+    if hasattr(chunk, "type"):
+        if chunk.type == "content_block_delta":
+            if hasattr(chunk, "delta"):
+                if hasattr(chunk.delta, "text"):
+                    return chunk.delta.text or ""
+                elif hasattr(chunk.delta, "partial_json"):
+                    # Tool use input streaming - return raw JSON to accumulate properly
+                    return chunk.delta.partial_json or ""
+        elif chunk.type == "content_block_start":
+            if hasattr(chunk, "content_block") and hasattr(chunk.content_block, "type"):
+                if chunk.content_block.type == "tool_use":
+                    tool_info = {
+                        "type": "tool_use",
+                        "id": getattr(chunk.content_block, "id", None),
+                        "name": getattr(chunk.content_block, "name", None),
+                    }
+                    return f"[TOOL_USE_START: {tool_info}]"
     elif hasattr(chunk, "delta") and hasattr(chunk.delta, "text"):
         return chunk.delta.text or ""
     elif hasattr(chunk, "text"):
@@ -409,7 +423,25 @@ def _format_anthropic_output(
         and usage.cache_creation_input_tokens is not None
         else 0
     )
-    message_content = response.content[0].text if hasattr(response, "content") else None
+    # Extract content from Anthropic response, handling both text and tool use blocks
+    message_content = None
+    if hasattr(response, "content") and response.content:
+        content_parts = []
+        for content_block in response.content:
+            block_type = getattr(content_block, "type", None)
+            if block_type == "text":
+                # Text content block
+                content_parts.append(getattr(content_block, "text", ""))
+            elif block_type == "tool_use":
+                # Tool use block - serialize the tool call information
+                tool_info = {
+                    "type": "tool_use",
+                    "id": getattr(content_block, "id", None),
+                    "name": getattr(content_block, "name", None),
+                    "input": getattr(content_block, "input", None),
+                }
+                content_parts.append(f"[TOOL_USE: {tool_info}]")
+        message_content = "\n".join(content_parts) if content_parts else None
 
     if model_name:
         return message_content, _create_usage(
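The effect of this change is easiest to see on a response that mixes text and tool-use blocks. A rough sketch with stand-in objects (the `SimpleNamespace` content blocks are mocks, not Anthropic SDK types) that mirrors the formatting logic above:

```python
from types import SimpleNamespace

# Mock of an Anthropic message content list with a text block and a tool_use block.
content = [
    SimpleNamespace(type="text", text="Let me check the weather."),
    SimpleNamespace(type="tool_use", id="toolu_123", name="get_weather",
                    input={"city": "Paris"}),
]

content_parts = []
for block in content:
    block_type = getattr(block, "type", None)
    if block_type == "text":
        content_parts.append(getattr(block, "text", ""))
    elif block_type == "tool_use":
        tool_info = {
            "type": "tool_use",
            "id": getattr(block, "id", None),
            "name": getattr(block, "name", None),
            "input": getattr(block, "input", None),
        }
        content_parts.append(f"[TOOL_USE: {tool_info}]")

message_content = "\n".join(content_parts)
# -> "Let me check the weather.\n[TOOL_USE: {'type': 'tool_use', 'id': 'toolu_123', ...}]"
```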
judgeval/version.py CHANGED
@@ -1,4 +1,4 @@
-__version__ = "0.14.1"
+__version__ = "0.16.0"
 
 
 def get_version() -> str:
@@ -0,0 +1,266 @@
+Metadata-Version: 2.4
+Name: judgeval
+Version: 0.16.0
+Summary: Judgeval Package
+Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+License-Expression: Apache-2.0
+License-File: LICENSE.md
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.10
+Requires-Dist: boto3>=1.40.11
+Requires-Dist: click<8.2.0
+Requires-Dist: dotenv
+Requires-Dist: httpx>=0.28.1
+Requires-Dist: litellm<1.75.0
+Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
+Requires-Dist: opentelemetry-sdk>=1.36.0
+Requires-Dist: orjson>=3.9.0
+Requires-Dist: typer>=0.9.0
+Provides-Extra: s3
+Requires-Dist: boto3>=1.40.11; extra == 's3'
+Provides-Extra: trainer
+Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
+Description-Content-Type: text/markdown
+
+<div align="center">
+
+<a href="https://judgmentlabs.ai/">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="assets/logo_darkmode.svg">
+    <img src="assets/logo_lightmode.svg" alt="Judgment Logo" width="400" />
+  </picture>
+</a>
+
+<br>
+
+## Agent Behavior Monitoring (ABM)
+
+Track and judge any agent behavior in online and offline setups. Set up Sentry-style alerts and analyze agent behaviors / topic patterns at scale!
+
+[![Docs](https://img.shields.io/badge/Documentation-blue)](https://docs.judgmentlabs.ai/documentation)
+[![Judgment Cloud](https://img.shields.io/badge/Judgment%20Cloud-brightgreen)](https://app.judgmentlabs.ai/register)
+[![Self-Host](https://img.shields.io/badge/Self--Host-orange)](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+
+
+[![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
+[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
+
+</div>
+
+
+</table>
+
+## [NEW] 🎆 Agent Reinforcement Learning
+
+Train your agents with multi-turn reinforcement learning using judgeval and [Fireworks AI](https://fireworks.ai/)! Judgeval's ABM now integrates with Fireworks' Reinforcement Fine-Tuning (RFT) endpoint, supporting gpt-oss, qwen3, Kimi2, DeepSeek, and more.
+
+Judgeval's agent monitoring infra provides a simple harness for integrating GRPO into any Python agent, giving builders a quick method to **try RL with minimal code changes** to their existing agents!
+
+```python
+await trainer.train(
+    agent_function=your_agent_function,  # entry point to your agent
+    scorers=[RewardScorer()],  # Custom scorer you define based on task criteria, acts as reward
+    prompts=training_prompts,  # Tasks
+    rft_provider="fireworks"
+)
+```
+
+**That's it!** Judgeval automatically manages trajectory collection and reward tagging - your agent can learn from production data with minimal code changes.
+
+👉 Check out the [Wikipedia Racer notebook](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb), where an agent learns to navigate Wikipedia using RL, to see Judgeval in action.
+
+
+You can view and monitor training progress for free via the [Judgment Dashboard](https://app.judgmentlabs.ai/).
+
+
+## Judgeval Overview
+
+Judgeval is an open-source framework for agent behavior monitoring. Judgeval offers a toolkit to track and judge agent behavior in online and offline setups, enabling you to convert interaction data from production/test environments into improved agents. To get started, try running one of the notebooks below or dive deeper in our [docs](https://docs.judgmentlabs.ai/documentation).
+
+Our mission is to unlock the power of production data for agent development, enabling teams to improve their apps by catching real-time failures and optimizing over their users' preferences.
+
+## 📚 Cookbooks
+
+| Try Out | Notebook | Description |
+|:---------|:-----|:------------|
+| RL | [Wikipedia Racer](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/rl/WikiRacingAgent_RL.ipynb) | Train agents with reinforcement learning |
+| Online ABM | [Research Agent](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/monitoring/Research_Agent_Online_Monitoring.ipynb) | Monitor agent behavior in production |
+| Custom Scorers | [HumanEval](https://colab.research.google.com/github/JudgmentLabs/judgment-cookbook/blob/main/custom_scorers/HumanEval_Custom_Scorer.ipynb) | Build custom evaluators for your agents |
+| Offline Testing | [Get Started For Free] | Compare how different prompts, models, or agent configs affect performance across ANY metric |
+
+You can access our [repo of cookbooks](https://github.com/JudgmentLabs/judgment-cookbook).
+
+You can find a list of [video tutorials for Judgeval use cases](https://www.youtube.com/@Alexshander-JL).
+
+## Why Judgeval?
+
+🤖 **Simple to run multi-turn RL**: Optimize your agents with multi-turn RL without managing compute infrastructure or data pipelines. Just add a few lines of code to your existing agent code and train!
+
+⚙️ **Custom Evaluators**: No restriction to only monitoring with prefab scorers. Judgeval provides simple abstractions for custom Python scorers, supporting any LLM-as-a-judge rubrics/models and code-based scorers that integrate to our live agent-tracking infrastructure. [Learn more](https://docs.judgmentlabs.ai/documentation/evaluation/custom-scorers)
+
+🚨 **Production Monitoring**: Run any custom scorer in a hosted, virtualized secure container to flag agent behaviors online in production. Get Slack alerts for failures and add custom hooks to address regressions before they impact users. [Learn more](https://docs.judgmentlabs.ai/documentation/performance/online-evals)
+
+📊 **Behavior/Topic Grouping**: Group agent runs by behavior type or topic for deeper analysis. Drill down into subsets of users, agents, or use cases to reveal patterns of agent behavior.
+<!-- Add link to Bucketing docs once we have it -->
+<!--
+TODO: Once we have trainer code docs, plug in here
+-->
+
+🧪 **Run experiments on your agents**: Compare test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors.
+
+<!--
+Use this once we have AI PM features:
+
+**Run experiments on your agents**: A/B test different prompts, models, or agent configs across customer segments. Measure which changes improve agent performance and decrease bad agent behaviors. [Learn more]
+
+-->
+
+## 🛠️ Quickstart
+
+Get started with Judgeval by installing our SDK using pip:
+
+```bash
+pip install judgeval
+```
+
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
+
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```
+
+**If you don't have keys, [create an account for free](https://app.judgmentlabs.ai/register) on the platform!**
+
+### Start monitoring with Judgeval
+
+```python
+from judgeval.tracer import Tracer, wrap
+from judgeval.data import Example
+from judgeval.scorers import AnswerRelevancyScorer
+from openai import OpenAI
+
+
+judgment = Tracer(project_name="default_project")
+client = wrap(OpenAI())  # tracks all LLM calls
+
+@judgment.observe(span_type="tool")
+def format_question(question: str) -> str:
+    # dummy tool
+    return f"Question : {question}"
+
+@judgment.observe(span_type="function")
+def run_agent(prompt: str) -> str:
+    task = format_question(prompt)
+    response = client.chat.completions.create(
+        model="gpt-5-mini",
+        messages=[{"role": "user", "content": task}]
+    )
+
+    judgment.async_evaluate(  # trigger online monitoring
+        scorer=AnswerRelevancyScorer(threshold=0.5),  # swap with any scorer
+        example=Example(input=task, actual_output=response),  # customize to your data
+        model="gpt-5",
+    )
+    return response.choices[0].message.content
+
+run_agent("What is the capital of the United States?")
+```
+
+Running this code will deliver monitoring results to your [free platform account](https://app.judgmentlabs.ai/register) and should look like this:
+
+![Judgment Platform Trajectory View](assets/quickstart_trajectory_ss.png)
+
+
+### Customizable Scorers Over Agent Behavior
+
+Judgeval's strongest suit is the full customization over the types of scorers you can run online monitoring with. No restrictions to only single-prompt LLM judges or prefab scorers - if you can express your scorer
+in python code, judgeval can monitor it! Under the hood, judgeval hosts your scorer in a virtualized secure container, enabling online monitoring for any scorer.
+
+
+First, create a behavior scorer in a file called `helpfulness_scorer.py`:
+
+```python
+from judgeval.data import Example
+from judgeval.scorers.example_scorer import ExampleScorer
+
+# Define custom example class
+class QuestionAnswer(Example):
+    question: str
+    answer: str
+
+# Define a server-hosted custom scorer
+class HelpfulnessScorer(ExampleScorer):
+    name: str = "Helpfulness Scorer"
+    server_hosted: bool = True  # Enable server hosting
+    async def a_score_example(self, example: QuestionAnswer):
+        # Custom scoring logic for agent behavior
+        # Can be an arbitrary combination of code and LLM calls
+        if len(example.answer) > 10 and "?" not in example.answer:
+            self.reason = "Answer is detailed and provides helpful information"
+            return 1.0
+        else:
+            self.reason = "Answer is too brief or unclear"
+            return 0.0
+```
+
+Then deploy your scorer to Judgment's infrastructure:
+
+```bash
+echo "pydantic" > requirements.txt
+uv run judgeval upload_scorer helpfulness_scorer.py requirements.txt
+```
+
+Now you can instrument your agent with monitoring and online evaluation:
+
+```python
+from judgeval.tracer import Tracer, wrap
+from helpfulness_scorer import HelpfulnessScorer, QuestionAnswer
+from openai import OpenAI
+
+judgment = Tracer(project_name="default_project")
+client = wrap(OpenAI())  # tracks all LLM calls
+
+@judgment.observe(span_type="tool")
+def format_task(question: str) -> str:  # replace with your prompt engineering
+    return f"Please answer the following question: {question}"
+
+@judgment.observe(span_type="tool")
+def answer_question(prompt: str) -> str:  # replace with your LLM system calls
+    response = client.chat.completions.create(
+        model="gpt-5-mini",
+        messages=[{"role": "user", "content": prompt}]
+    )
+    return response.choices[0].message.content
+
+@judgment.observe(span_type="function")
+def run_agent(question: str) -> str:
+    task = format_task(question)
+    answer = answer_question(task)
+
+    # Add online evaluation with server-hosted scorer
+    judgment.async_evaluate(
+        scorer=HelpfulnessScorer(),
+        example=QuestionAnswer(question=question, answer=answer),
+        sampling_rate=0.9  # Evaluate 90% of agent runs
+    )
+
+    return answer
+
+if __name__ == "__main__":
+    result = run_agent("What is the capital of the United States?")
+    print(result)
+```
+
+Congratulations! Your online eval result should look like this:
+
+![Custom Scorer Online ABM](assets/custom_scorer_online_abm.png)
+
+You can now run any online scorer in a secure Firecracker microVMs with no latency impact on your applications.
+
+---
+
+Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
@@ -4,22 +4,23 @@ judgeval/constants.py,sha256=JZZJ1MqzZZDVk-5PRPRbmLnM8mXI-RDL5vxa1JFuscs,3408
 judgeval/env.py,sha256=37Mn4g0OkpFxXCZGlO_CLqKJnyX-jx_R24tC28XJzig,2112
 judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
 judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
-judgeval/version.py,sha256=jxLK8GY7YWWLhTk4egDdn5VKiEty1Qpb-C3dLL2m-To,74
+judgeval/version.py,sha256=UCd6S0KuM6h0ZUz8pm-Ty1EDHaJNSUYM_7PrDz0ov-E,74
 judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
-judgeval/api/__init__.py,sha256=3Pm0qQ4ZQj76jUsJVrnuazRnYcqF3pzM_Wv_Z6lOv0w,13216
-judgeval/api/api_types.py,sha256=mtk9xcgYGj1zXV1w_vZ_fbVu9OI4i2IIDLL37lgYnV4,8979
+judgeval/api/__init__.py,sha256=ho8L4wC9y-STYEpk5zHwc2mZJhC4ezW8jiGgOIERBVY,12058
+judgeval/api/api_types.py,sha256=6wrjvO8XsYbfPxjQ_sHS9EOjqexbn3XDFclWqb4CgZ4,8874
 judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
 judgeval/data/evaluation_run.py,sha256=O41p99wNAuCAf6lsLNKzkZ6W-kL9LlzCYxVls7IcKkA,4727
 judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
-judgeval/data/judgment_types.py,sha256=fNRqiGEG_nJhVkucagoxxgFqmpwK0-GlwWOwjmBtpXk,16603
+judgeval/data/judgment_types.py,sha256=uI4wUiXeA6k8o2ONia506eaZcydHKQKrK1LzccTK-xc,16577
 judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
 judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
 judgeval/data/trace.py,sha256=zSiR3o6xt8Z46XA3M9fJBtViF0BsPO6yKp9jxdscOSc,3881
 judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
 judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
-judgeval/dataset/__init__.py,sha256=4CiV7jQUiJ8_IXnD_E-vS5OfoEr0hghBe3-OSuVoBwE,8277
-judgeval/evaluation/__init__.py,sha256=6bSC1Sw-fpJN6OkZTv4UtAoYZqkjUy7OG17lxiRX5qE,13321
+judgeval/dataset/__init__.py,sha256=kL0_tIMP3qV6t4W17HQU91ybdXMZ5iDZzyUKzyfRdyY,8269
+judgeval/evaluation/__init__.py,sha256=WcqOgQdwgtc_BwEwDz6RDlF2RczyLrNjjIevQp-_NKE,12788
 judgeval/integrations/langgraph/__init__.py,sha256=HwXmtDxaO75Kn4KPErnMb6Ne6FcpRxV_SCYVuwFsve0,332
+judgeval/integrations/openlit/__init__.py,sha256=-8D4D6-fGsWPwoOojw82OaE9X5sUbmb16x1bF-WfOmg,1571
 judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
 judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
 judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
@@ -39,8 +40,8 @@ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=WUeFy
 judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
 judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ucYOI6ztAjfoYmcgTDzN8u5RrehlVqrkeLEfss9b1fk,441
 judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=V3RdrWhnR_vLBrtWw7QbgN9K_A-Och7-v9I2fN4z8gY,506
-judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=FbrXNMedeepYp_bADsysapIIZcr09l9EV9QWfGxvanw,10075
-judgeval/tracer/__init__.py,sha256=iqFvWok4QBW-1bs2zCmkhw4Y_o2d2mVeiPUtQbG9Nvc,35995
+judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=zJ0n3HyZ1FFBnMnTYxBi37m_3Er7ENd4HpqLjNi5Eag,10902
+judgeval/tracer/__init__.py,sha256=uIOx-2P_FVwBKhwVkkIOyEQCv3gouCZ2I8-eApocnKU,36165
 judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
 judgeval/tracer/keys.py,sha256=ho4-_w4ngTVejdSKUH80sG6vtYt4c7FEKrYpFrDfPLs,2105
 judgeval/tracer/local_eval_queue.py,sha256=KZKvSSli7B-EVzdHa4-CmXUpv0uOjGLLRa2KTPg8lRc,7320
@@ -50,7 +51,7 @@ judgeval/tracer/exporters/__init__.py,sha256=3WDXC28iY5gYMM5s7ejmy7P-DVDQ_iIuzwo
 judgeval/tracer/exporters/s3.py,sha256=N9gmw17cnR0VkfAQQkLsNj5BksgNRETThR5qYhWRjP4,4360
 judgeval/tracer/exporters/store.py,sha256=KQV3cyqteesByQjR-9VdPXT9OlUZ-6F08ogqj837_c0,1012
 judgeval/tracer/exporters/utils.py,sha256=JRcoSQuEHxMDJbXfyrUIfA2SHBVkZM82h4bTbYGxkNw,1154
-judgeval/tracer/llm/__init__.py,sha256=6JSF-RaK6tZNzd0rZOK6Don7vvf15EhSPSio_FmS7i8,42564
+judgeval/tracer/llm/__init__.py,sha256=b7toFMVyZU4Pv8jximfneP5gyohUB4DwJDvy8b2_IMw,44217
 judgeval/tracer/llm/providers.py,sha256=UU8xrh2n9p3xZwnlWMUcZoFpog2-F9-YfcV0c2aUNqQ,1432
 judgeval/tracer/llm/anthropic/__init__.py,sha256=DUTkYjMejWLI8inFJ_Ih7vf7_aJFAiCyi1Oxls-ACGo,439
 judgeval/tracer/llm/google/__init__.py,sha256=7j96SPUl61yVl3jCQ-JuPpgVU9GhmcsBzY2vj5wJAVo,506
@@ -72,8 +73,8 @@ judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6
 judgeval/utils/testing.py,sha256=m5Nexv65tmfSj1XvAPK5Ear7aJ7w5xjDtZN0tLZ_RBk,2939
 judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
 judgeval/utils/version_check.py,sha256=ylZQSqV7kLzEOChxvav9SCHUU4OnaCp36tXHLjdzmw0,1072
-judgeval-0.14.1.dist-info/METADATA,sha256=e8rJlBzFrfcadnR6-WiBQaRTKj2LlsnuxAS-Ag_WK1Q,8564
-judgeval-0.14.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.14.1.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
-judgeval-0.14.1.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.14.1.dist-info/RECORD,,
+judgeval-0.16.0.dist-info/METADATA,sha256=kojyijzNE_2gKKvMGrs7E0zHHv3GtOXRjfmIOUQujTY,11512
+judgeval-0.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.16.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.16.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.16.0.dist-info/RECORD,,
@@ -1,158 +0,0 @@
-Metadata-Version: 2.4
-Name: judgeval
-Version: 0.14.1
-Summary: Judgeval Package
-Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
-Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
-Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
-License-Expression: Apache-2.0
-License-File: LICENSE.md
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.10
-Requires-Dist: boto3>=1.40.11
-Requires-Dist: click<8.2.0
-Requires-Dist: dotenv
-Requires-Dist: httpx>=0.28.1
-Requires-Dist: litellm<1.75.0
-Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
-Requires-Dist: opentelemetry-sdk>=1.36.0
-Requires-Dist: orjson>=3.9.0
-Requires-Dist: typer>=0.9.0
-Provides-Extra: s3
-Requires-Dist: boto3>=1.40.11; extra == 's3'
-Provides-Extra: trainer
-Requires-Dist: fireworks-ai>=0.19.18; extra == 'trainer'
-Description-Content-Type: text/markdown
-
-<div align="center">
-
-<img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
-<img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
-
-<br>
-<div style="font-size: 1.5em;">
-    Enable self-learning agents with environment data and evals.
-</div>
-
-## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
-
-[Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
-
-We're hiring! Join us in our mission to enable self-learning agents by providing the data and signals needed for monitoring and post-training.
-
-[![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
-[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
-[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/tGVFf8UBUY)
-
-<img src="assets/product_shot.png" alt="Judgment Platform" width="800" />
-
-</div>
-
-Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
-
-## 🎬 See Judgeval in Action
-
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
-
-<table style="width: 100%; max-width: 800px; table-layout: fixed;">
-<tr>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/agent.gif" alt="Agent Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>🤖 Agents Running</strong>
-</td>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📊 Capturing Environment Data </strong>
-</td>
-</tr>
-<tr>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/document.gif" alt="Agent Completed Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>✅ Agents Completed Running</strong>
-</td>
-<td align="center" style="padding: 8px; width: 50%;">
-<img src="assets/data.gif" alt="Data Export Demo" style="width: 100%; max-width: 350px; height: auto;" />
-<br><strong>📤 Exporting Agent Environment Data</strong>
-</td>
-</tr>
-
-</table>
-
-## 📋 Table of Contents
-- [🛠️ Installation](#️-installation)
-- [🏁 Quickstarts](#-quickstarts)
-- [✨ Features](#-features)
-- [🏢 Self-Hosting](#-self-hosting)
-- [📚 Cookbooks](#-cookbooks)
-- [💻 Development with Cursor](#-development-with-cursor)
-
-## 🛠️ Installation
-
-Get started with Judgeval by installing our SDK using pip:
-
-```bash
-pip install judgeval
-```
-
-Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
-
-```bash
-export JUDGMENT_API_KEY=...
-export JUDGMENT_ORG_ID=...
-```
-
-**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
-
-
-## ✨ Features
-
-| | |
-|:---|:---:|
-| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
-
-## 🏢 Self-Hosting
-
-Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
-
-### Key Features
-* Deploy Judgment on your own AWS account
-* Store data in your own Supabase instance
-* Access Judgment through your own custom domain
-
-### Getting Started
-1. Check out our [self-hosting documentation](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) for detailed setup instructions, along with how your self-hosted instance can be accessed
-2. Use the [Judgment CLI](https://docs.judgmentlabs.ai/documentation/developer-tools/judgment-cli/installation) to deploy your self-hosted environment
-3. After your self-hosted instance is setup, make sure the `JUDGMENT_API_URL` environmental variable is set to your self-hosted backend endpoint
-
-## 📚 Cookbooks
-
-Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/tGVFf8UBUY).
-
-You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook).
-
-## 💻 Development with Cursor
-Building agents and LLM workflows in Cursor works best when your coding assistant has the proper context about Judgment integration. The Cursor rules file contains the key information needed for your assistant to implement Judgment features effectively.
-
-Refer to the official [documentation](https://docs.judgmentlabs.ai/documentation/developer-tools/cursor/cursor-rules) for access to the rules file and more information on integrating this rules file with your codebase.
-
-## ⭐ Star Us on GitHub
-
-If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the repository.
-
-## ❤️ Contributors
-
-There are many ways to contribute to Judgeval:
-
-- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
-- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
-- Speaking or writing about Judgment and letting us know!
-
-<!-- Contributors collage -->
-[![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)
-
----
-
-Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).