judgeval 0.14.1__tar.gz → 0.15.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.14.1 → judgeval-0.15.0}/PKG-INFO +1 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/pyproject.toml +3 -2
- {judgeval-0.14.1 → judgeval-0.15.0}/scripts/api_generator.py +0 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/scripts/openapi_transform.py +0 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/api/__init__.py +0 -22
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/api/api_types.py +18 -26
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/judgment_types.py +23 -34
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/dataset/__init__.py +1 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/evaluation/__init__.py +9 -21
- judgeval-0.15.0/src/judgeval/integrations/openlit/__init__.py +50 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +25 -2
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/__init__.py +4 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/version.py +1 -1
- {judgeval-0.14.1 → judgeval-0.15.0}/uv.lock +339 -187
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/pull_request_template.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/claude-code-review.yml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/claude.yml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/release.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.gitignore +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/.pre-commit-config.yaml +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/LICENSE.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/README.md +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/Screenshot 2025-05-17 at 8.14.27\342\200\257PM.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/agent.gif +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/agent_trace_example.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/company.jpg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/company_banner.jpg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/darkmode.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/full_logo.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/icon.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/lightmode.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/brand/white_background.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/data.gif +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/document.gif +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/errors.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/experiments_page.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/logo-dark.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/logo-light.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/new_darkmode.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/new_lightmode.svg +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/online_eval.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/product_shot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/test.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/tests.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/trace.gif +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/trace_demo.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/trace_screenshot.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/pytest.ini +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/scripts/update_types.sh +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/cli.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/constants.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/evaluation_run.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/example.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/result.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/trace.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/env.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/exceptions.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/integrations/langgraph/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/logger.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/constants.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/exporters/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/exporters/s3.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/exporters/store.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/exporters/utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/keys.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/anthropic/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/google/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/groq/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/openai/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/providers.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/llm/together/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/local_eval_queue.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/managers.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/processors/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/__init__.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/config.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/console.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/trainable_model.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/trainer/trainer.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/async_utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/decorators.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/guards.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/meta.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/serialize.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/testing.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/url.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/utils/version_check.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/warnings.py +0 -0
- {judgeval-0.14.1 → judgeval-0.15.0}/update_version.py +0 -0
{judgeval-0.14.1 → judgeval-0.15.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.14.1"
+version = "0.15.0"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -58,7 +58,7 @@ dev = [
     "langchain-core>=0.3.72",
     "langgraph>=0.6.4",
     "mypy>=1.17.1",
-    "openai>=1.
+    "openai>=1.92.0",
     "opentelemetry-instrumentation-openai>=0.44.1",
     "ruff>=0.9.1,<0.10.0",
     "together>=1.5.21",
@@ -73,6 +73,7 @@ dev = [
     "langchain-tavily>=0.2.11",
     "streamlit>=1.49.1",
     "langchain-community>=0.3.29",
+    "openlit>=1.35.5",
 ]

{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/api/__init__.py

@@ -111,16 +111,6 @@ class JudgmentSyncClient:
             payload,
         )

-    def get_evaluation_status(self, experiment_run_id: str, project_name: str) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
@@ -318,18 +308,6 @@ class JudgmentAsyncClient:
             payload,
         )

-    async def get_evaluation_status(
-        self, experiment_run_id: str, project_name: str
-    ) -> Any:
-        query_params = {}
-        query_params["experiment_run_id"] = experiment_run_id
-        query_params["project_name"] = project_name
-        return await self._request(
-            "GET",
-            url_for("/get_evaluation_status/"),
-            query_params,
-        )
-
     async def datasets_insert_examples_for_judgeval(
         self, payload: DatasetInsertExamples
     ) -> Any:
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/api/api_types.py

@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-30T18:06:51+00:00

 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -24,6 +24,15 @@ class DatasetsFetch(TypedDict):
     project_name: str


+class DatasetsTableRow(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    kind: Literal["trace", "example"]
+    entries: int
+    creator: str
+
+
 class ProjectAdd(TypedDict):
     project_name: str

@@ -180,18 +189,10 @@ class OtelTraceSpan(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
     events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[List[Dict[str, Any]]]]
-    legacy_span_id: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    output: Any
-    error: NotRequired[Optional[Dict[str, Any]]]
-    agent_id: NotRequired[Optional[str]]
-    cumulative_llm_cost: NotRequired[Optional[float]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]


 class OtelSpanListItemScores(TypedDict):
@@ -206,7 +207,7 @@ class OtelSpanDetailScores(TypedDict):
     score: float
     reason: NotRequired[Optional[str]]
     name: str
-
+    example_id: NotRequired[Optional[str]]


 class ExampleEvaluationRun(TypedDict):
@@ -244,15 +245,6 @@ class DatasetInsertExamples(TypedDict):
     project_name: str


-class DatasetInfo(TypedDict):
-    dataset_id: str
-    name: str
-    created_at: str
-    dataset_kind: DatasetKind
-    entries: int
-    creator: str
-
-
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind
@@ -279,14 +271,14 @@ class OtelTraceListItem(TypedDict):
     organization_id: str
     project_id: str
     trace_id: str
-
+    created_at: str
     duration: NotRequired[Optional[int]]
     has_notification: NotRequired[Optional[bool]]
     tags: NotRequired[Optional[List[str]]]
     experiment_run_id: NotRequired[Optional[str]]
     span_name: NotRequired[Optional[str]]
-
-    error: NotRequired[
+    llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[str]
     scores: NotRequired[List[OtelSpanListItemScores]]
     customer_id: NotRequired[Optional[str]]
     input_preview: NotRequired[Optional[str]]
@@ -310,9 +302,9 @@ class OtelSpanDetail(TypedDict):
     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
     span_attributes: NotRequired[Optional[Dict[str, Any]]]
     duration: NotRequired[Optional[int]]
-    status_code: NotRequired[Optional[
+    status_code: NotRequired[Optional[int]]
     status_message: NotRequired[Optional[str]]
-    events: NotRequired[Optional[
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
     links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
     llm_cost: NotRequired[Optional[float]]
     prompt_tokens: NotRequired[Optional[int]]
@@ -335,5 +327,5 @@ class DatasetReturn(TypedDict):
     name: str
     project_name: str
     dataset_kind: DatasetKind
-    examples: NotRequired[
+    examples: NotRequired[List[Example]]
     traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/data/judgment_types.py

@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-30T18:06:50+00:00

 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -26,6 +26,20 @@ class DatasetsFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


+class Kind(Enum):
+    trace = "trace"
+    example = "example"
+
+
+class DatasetsTableRow(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    kind: Annotated[Kind, Field(title="Kind")]
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]

@@ -199,22 +213,10 @@ class OtelTraceSpan(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
-    legacy_span_id: Annotated[Optional[str], Field(title="Legacy Span Id")] = None
-    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-    output: Annotated[Any, Field(title="Output")]
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-    agent_id: Annotated[Optional[str], Field(title="Agent Id")] = None
-    cumulative_llm_cost: Annotated[
-        Optional[float], Field(title="Cumulative Llm Cost")
-    ] = None
-    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-        None
-    )


 class OtelSpanListItemScores(BaseModel):
@@ -229,7 +231,7 @@ class OtelSpanDetailScores(BaseModel):
     score: Annotated[float, Field(title="Score")]
     reason: Annotated[Optional[str], Field(title="Reason")] = None
     name: Annotated[str, Field(title="Name")]
-
+    example_id: Annotated[Optional[str], Field(title="Example Id")] = None


 class ExampleEvaluationRun(BaseModel):
@@ -277,15 +279,6 @@ class DatasetInsertExamples(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


-class DatasetInfo(BaseModel):
-    dataset_id: Annotated[str, Field(title="Dataset Id")]
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    dataset_kind: DatasetKind
-    entries: Annotated[int, Field(title="Entries")]
-    creator: Annotated[str, Field(title="Creator")]
-
-
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind
@@ -314,16 +307,14 @@ class OtelTraceListItem(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
     trace_id: Annotated[str, Field(title="Trace Id")]
-
+    created_at: Annotated[AwareDatetime, Field(title="Created At")]
     duration: Annotated[Optional[int], Field(title="Duration")] = None
     has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
     experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
-
-
-    ] = None
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+    error: Annotated[Optional[str], Field(title="Error")] = ""
     scores: Annotated[
         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
     ] = []
@@ -338,7 +329,7 @@ class OtelTraceListItem(BaseModel):
 class OtelSpanDetail(BaseModel):
     organization_id: Annotated[str, Field(title="Organization Id")]
     project_id: Annotated[str, Field(title="Project Id")]
-    timestamp: Annotated[
+    timestamp: Annotated[AwareDatetime, Field(title="Timestamp")]
     trace_id: Annotated[str, Field(title="Trace Id")]
     span_id: Annotated[str, Field(title="Span Id")]
     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
@@ -353,11 +344,9 @@ class OtelSpanDetail(BaseModel):
         Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
     duration: Annotated[Optional[int], Field(title="Duration")] = None
-    status_code: Annotated[Optional[
+    status_code: Annotated[Optional[int], Field(title="Status Code")] = None
     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
-    events: Annotated[
-        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
-    ] = None
+    events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
     links: Annotated[
         Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
     ] = None
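The DatasetsTableRow model above (and its TypedDict counterpart in api_types.py) describes one row of the datasets listing and appears to take over from the removed DatasetInfo. A small illustrative instantiation; the field names come from the generated model above, while the values are made up:

    from judgeval.data.judgment_types import DatasetsTableRow, Kind

    # Illustrative values only; the model validates types as declared in the diff above.
    row = DatasetsTableRow(
        dataset_id="ds_123",
        name="support-tickets",
        created_at="2025-09-30T18:06:50+00:00",
        kind=Kind.example,
        entries=42,
        creator="andrew@judgmentlabs.ai",
    )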
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/evaluation/__init__.py

@@ -84,7 +84,7 @@ def log_evaluation_results(

 def _poll_evaluation_until_complete(
     evaluation_run: ExampleEvaluationRun,
-
+    expected_examples_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
     max_poll_count: int = 60,  # This should be equivalent to 5 minutes
@@ -117,29 +117,22 @@ def _poll_evaluation_until_complete(
         poll_count += 1
         try:
             # Check status
-            status_response = api_client.get_evaluation_status(
-                experiment_run_id, project_name
-            )
-
-            if status_response.get("status") != "completed":
-                time.sleep(poll_interval_seconds)
-                continue
-
-            example_scorer_pairings = status_response.get("results", [])
-            if len(example_scorer_pairings) != expected_scorer_data_count:
-                time.sleep(poll_interval_seconds)
-                continue
-
             results_response = api_client.fetch_experiment_run(
                 {
                     "experiment_run_id": experiment_run_id,
                     "project_name": project_name,
                 }
             )
+
+            example_scorer_pairings = results_response.get("results", [])
+            if len(example_scorer_pairings) != expected_examples_count:
+                time.sleep(poll_interval_seconds)
+                continue
+
             url = results_response.get("ui_results_url")

             scoring_result_list = []
-            for res in
+            for res in example_scorer_pairings:
                 example = res.get("data", {}).copy()
                 example["example_id"] = res.get("example_id")
                 scoring_result = ScoringResult(
@@ -241,14 +234,9 @@ def run_eval(
            )
            raise JudgmentRuntimeError(error_message)

-        num_scorers = (
-            len(evaluation_run.judgment_scorers)
-            if evaluation_run.judgment_scorers
-            else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
-        )
         results, url = _poll_evaluation_until_complete(
             evaluation_run=evaluation_run,
-
+            expected_examples_count=len(evaluation_run.examples),
         )
     finally:
         stop_event.set()
judgeval-0.15.0/src/judgeval/integrations/openlit/__init__.py (new file)

@@ -0,0 +1,50 @@
+from abc import ABC
+from judgeval.tracer import Tracer
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+
+
+try:
+    import openlit  # type: ignore
+except ImportError:
+    raise ImportError(
+        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
+    )
+
+
+class Openlit(ABC):
+    @staticmethod
+    def initialize(
+        **kwargs,
+    ):
+        tracer = Tracer.get_instance()
+        if not tracer or not tracer._initialized:
+            raise ValueError(
+                "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
+            )
+
+        api_key = tracer.api_key
+        organization_id = tracer.organization_id
+        project_name = tracer.project_name
+
+        project_id = Tracer._resolve_project_id(project_name, api_key, organization_id)
+        if not project_id:
+            judgeval_logger.warning(
+                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
+            )
+            return
+
+        openlit.init(
+            service_name=project_name,
+            otlp_endpoint=url_for("/otel"),
+            otlp_headers={
+                "Authorization": f"Bearer {api_key}",
+                "X-Organization-Id": organization_id,
+                "X-Project-Id": project_id,
+            },
+            tracer=tracer.get_tracer(),
+            **kwargs,
+        )
+
+
+__all__ = ["Openlit"]
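Together with the openlit pin added to the dev dependencies in pyproject.toml, this module routes OpenLIT-instrumented spans through Judgment's OTLP endpoint once the Tracer singleton exists. A minimal usage sketch; the Tracer constructor argument shown here is illustrative and assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment:

    from judgeval.tracer import Tracer
    from judgeval.integrations.openlit import Openlit

    # Openlit.initialize() reads api_key, organization_id, and project_name from the
    # existing Tracer instance, so the Tracer must be created first.
    tracer = Tracer(project_name="my-project")

    # Extra keyword arguments are forwarded to openlit.init().
    Openlit.initialize()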
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py

@@ -20,6 +20,7 @@ def push_prompt_scorer(
     threshold: float,
     options: Optional[Dict[str, float]] = None,
     model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+    description: Optional[str] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     is_trace: bool = False,
@@ -33,6 +34,7 @@ def push_prompt_scorer(
             "threshold": threshold,
             "options": options,
             "model": model,
+            "description": description,
             "is_trace": is_trace,
         }
     )
@@ -102,6 +104,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
     score_type: APIScorerType
     prompt: str
     options: Optional[Dict[str, float]] = None
+    description: Optional[str] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""

@@ -130,6 +133,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold=scorer_config["threshold"],
             options=scorer_config.get("options"),
             model=scorer_config.get("model"),
+            description=scorer_config.get("description"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -142,6 +146,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         threshold: float = 0.5,
         options: Optional[Dict[str, float]] = None,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+        description: Optional[str] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
@@ -158,6 +163,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold,
             options,
             model,
+            description,
             judgment_api_key,
             organization_id,
             is_trace,
@@ -170,6 +176,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             threshold=threshold,
             options=options,
             model=model,
+            description=description,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -215,6 +222,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated options for {self.name}")

+    def set_description(self, description: Optional[str]):
+        """
+        Updates the description of the scorer.
+        """
+        self.description = description
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated description for {self.name}")
+
     def append_to_prompt(self, prompt_addition: str):
         """
         Appends a string to the prompt.
@@ -248,7 +263,13 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return copy(self.options) if self.options is not None else None

-    def get_name(self) -> str:
+    def get_description(self) -> str | None:
+        """
+        Returns the description of the scorer.
+        """
+        return self.description
+
+    def get_name(self) -> str:
         """
         Returns the name of the scorer.
         """
@@ -264,6 +285,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             "prompt": self.prompt,
             "threshold": self.threshold,
             "options": self.options,
+            "description": self.description,
         }

     def push_prompt_scorer(self):
@@ -276,13 +298,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.threshold,
             self.options,
             self.model,
+            self.description,
             self.judgment_api_key,
             self.organization_id,
             isinstance(self, TracePromptScorer),
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
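The new description travels through push_prompt_scorer and the serialized scorer config, so it can be supplied when the scorer is created or changed later with set_description. A rough sketch; the concrete class name, import path, and the exact signature of the create-style classmethod are assumptions based only on the parameters visible in the diff above:

    from judgeval.scorers import PromptScorer  # assumed concrete subclass of BasePromptScorer

    scorer = PromptScorer.create(           # create-style classmethod assumed from the diff
        name="helpfulness",
        prompt="Rate how helpful the response is.",
        threshold=0.5,
        description="Scores responses on helpfulness.",  # new in 0.15.0
    )

    scorer.set_description("Scores helpfulness of agent responses.")  # pushes the update to the server
    print(scorer.get_description())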
{judgeval-0.14.1 → judgeval-0.15.0}/src/judgeval/tracer/__init__.py

@@ -255,6 +255,10 @@ class Tracer(metaclass=SingletonMeta):
     def get_current_agent_context(self):
         return self.agent_context

+    def get_span_processor(self) -> JudgmentSpanProcessor:
+        """Get the internal span processor of this tracer instance."""
+        return self.judgment_processor
+
     def set_customer_id(self, customer_id: str) -> None:
         span = self.get_current_span()
         if span and span.is_recording():
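get_span_processor simply exposes the span processor the tracer already owns, which is handy when wiring external instrumentation (such as the OpenLIT integration above) into the same export pipeline. A brief sketch, assuming an already-configured Tracer; the constructor argument is illustrative:

    from judgeval.tracer import Tracer

    tracer = Tracer(project_name="my-project")   # illustrative constructor argument
    processor = tracer.get_span_processor()      # JudgmentSpanProcessor used for span export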
|