judgeval 0.3.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -0
- judgeval/clients.py +2 -1
- judgeval/common/api/api.py +4 -18
- judgeval/common/api/constants.py +1 -1
- judgeval/common/api/json_encoder.py +242 -0
- judgeval/common/tracer/core.py +498 -215
- judgeval/common/tracer/providers.py +119 -0
- judgeval/common/tracer/span_transformer.py +14 -25
- judgeval/constants.py +1 -0
- judgeval/data/judgment_types.py +2 -1
- judgeval/data/trace.py +5 -122
- judgeval/data/trace_run.py +2 -1
- judgeval/dataset.py +2 -0
- judgeval/evaluation_run.py +6 -2
- judgeval/judges/litellm_judge.py +2 -1
- judgeval/judges/mixture_of_judges.py +2 -1
- judgeval/judges/utils.py +2 -1
- judgeval/judgment_client.py +11 -6
- judgeval/local_eval_queue.py +192 -0
- judgeval/run_evaluation.py +11 -6
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +18 -19
- judgeval/scorers/score.py +34 -11
- judgeval/utils/async_utils.py +36 -0
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/METADATA +9 -12
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/RECORD +27 -23
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/WHEEL +0 -0
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer/providers.py ADDED
@@ -0,0 +1,119 @@
+from __future__ import annotations
+import logging
+from typing import Any, TypeAlias
+
+
+logger = logging.getLogger(__name__)
+# TODO: Have functions that assert and return the relevant exports when the client is installed.
+# The method should raise if the user tries to access client information that doesnt exist.
+
+HAS_OPENAI = False
+openai_OpenAI = None
+openai_AsyncOpenAI = None
+openai_ChatCompletion = None
+openai_Response = None
+openai_ParsedChatCompletion = None
+
+try:
+    from openai import OpenAI, AsyncOpenAI
+    from openai.types.chat.chat_completion import ChatCompletion
+    from openai.types.responses.response import Response
+    from openai.types.chat import ParsedChatCompletion
+
+    openai_OpenAI = OpenAI
+    openai_AsyncOpenAI = AsyncOpenAI
+    openai_ChatCompletion = ChatCompletion
+    openai_Response = Response
+    openai_ParsedChatCompletion = ParsedChatCompletion
+    HAS_OPENAI = True
+except ImportError:
+    pass
+
+
+HAS_TOGETHER = False
+together_Together = None
+together_AsyncTogether = None
+
+try:
+    from together import Together, AsyncTogether
+
+    together_Together = Together
+    together_AsyncTogether = AsyncTogether
+    HAS_TOGETHER = True
+except ImportError:
+    pass
+
+
+HAS_ANTHROPIC = False
+anthropic_Anthropic = None
+anthropic_AsyncAnthropic = None
+
+try:
+    from anthropic import Anthropic, AsyncAnthropic
+
+    anthropic_Anthropic = Anthropic
+    anthropic_AsyncAnthropic = AsyncAnthropic
+    HAS_ANTHROPIC = True
+except ImportError:
+    pass
+
+
+HAS_GOOGLE_GENAI = False
+google_genai_Client = None
+google_genai_cleint_AsyncClient = None
+
+try:
+    from google.genai import Client
+    from google.genai.client import AsyncClient
+
+    google_genai_Client = Client
+    google_genai_AsyncClient = AsyncClient
+    HAS_GOOGLE_GENAI = True
+except ImportError:
+    pass
+
+
+HAS_GROQ = False
+groq_Groq = None
+groq_AsyncGroq = None
+
+try:
+    from groq import Groq, AsyncGroq
+
+    groq_Groq = Groq
+    groq_AsyncGroq = AsyncGroq
+    HAS_GROQ = True
+except ImportError:
+    pass
+
+
+# TODO: if we support dependency groups we can have this better type, but during runtime, we do
+# not know which clients an end user might have installed.
+ApiClient: TypeAlias = Any
+
+__all__ = [
+    "ApiClient",
+    # OpenAI
+    "HAS_OPENAI",
+    "openai_OpenAI",
+    "openai_AsyncOpenAI",
+    "openai_ChatCompletion",
+    "openai_Response",
+    "openai_ParsedChatCompletion",
+    # Together
+    "HAS_TOGETHER",
+    "together_Together",
+    "together_AsyncTogether",
+    # Anthropic
+    "HAS_ANTHROPIC",
+    "anthropic_Anthropic",
+    "anthropic_AsyncAnthropic",
+    # Google GenAI
+    "HAS_GOOGLE_GENAI",
+    "google_genai_Client",
+    "google_genai_AsyncClient",
+    # Groq
+    "HAS_GROQ",
+    "groq_Groq",
+    "groq_AsyncGroq",
+]
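The new providers module centralizes optional third-party SDK imports behind HAS_* flags, with the re-exported handles left as None when an SDK is missing. A minimal consumer sketch (the wrapper function below is hypothetical; only the flag and handle names come from the hunk above):

# Hypothetical consumer of judgeval.common.tracer.providers (sketch only).
from judgeval.common.tracer import providers

def wrap_openai_client():
    # Guard on the capability flag before touching the re-exported handle,
    # since openai_OpenAI stays None when the openai package is absent.
    if not providers.HAS_OPENAI:
        raise RuntimeError("openai is not installed; install it to trace OpenAI calls")
    return providers.openai_OpenAI()  # same constructor as openai.OpenAI()

Note that the None placeholder for the Google async client is spelled google_genai_cleint_AsyncClient in the hunk above, so the google_genai_AsyncClient name listed in __all__ is only bound once the google.genai import succeeds.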
judgeval/common/tracer/span_transformer.py CHANGED
@@ -9,6 +9,7 @@ from typing import Any, Dict, Mapping, Optional, Union
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel
 
+from judgeval.common.api.json_encoder import json_encoder
 from judgeval.data import TraceSpan
 from judgeval.evaluation_run import EvaluationRun
 
@@ -38,21 +39,13 @@ class SpanTransformer:
         return True
 
     @staticmethod
-    def
-        if
-
-
-
-
-
-            return orjson.dumps(str(obj)).decode("utf-8")
-        else:
-            if not isinstance(obj, str):
-                return obj
-            try:
-                return orjson.loads(obj)
-            except (orjson.JSONDecodeError, TypeError, ValueError):
-                return obj
+    def _safe_deserialize(obj: Any) -> Any:
+        if not isinstance(obj, str):
+            return obj
+        try:
+            return orjson.loads(obj)
+        except (orjson.JSONDecodeError, TypeError):
+            return obj
 
     @staticmethod
     def _format_timestamp(timestamp: Optional[Union[float, int, str]]) -> str:
@@ -84,15 +77,13 @@ class SpanTransformer:
             if field_name == "created_at":
                 attributes[attr_name] = SpanTransformer._format_timestamp(value)
             elif field_name == "expected_tools" and value:
-                attributes[attr_name] =
+                attributes[attr_name] = json_encoder(
                     [tool.model_dump() for tool in trace_span.expected_tools]
                 )
             elif field_name == "usage" and value:
-                attributes[attr_name] =
-                    trace_span.usage.model_dump()
-                )
+                attributes[attr_name] = json_encoder(trace_span.usage)
             elif SpanTransformer._needs_json_serialization(value):
-                attributes[attr_name] =
+                attributes[attr_name] = json_encoder(value)
             else:
                 attributes[attr_name] = value
 
@@ -115,7 +106,7 @@ class SpanTransformer:
             field_name = key[9:]
 
             if isinstance(value, str):
-                deserialized = SpanTransformer.
+                deserialized = SpanTransformer._safe_deserialize(value)
                 judgment_data[field_name] = deserialized
             else:
                 judgment_data[field_name] = value
@@ -174,9 +165,7 @@ class SpanTransformer:
         attributes = {
            "judgment.evaluation_run": True,
            "judgment.associated_span_id": span_id,
-           "judgment.span_data":
-               span_data.model_dump()
-           ),
+           "judgment.span_data": json_encoder(span_data),
         }
 
         eval_data = evaluation_run.model_dump()
@@ -186,7 +175,7 @@ class SpanTransformer:
 
             attr_name = f"judgment.{key}"
             if SpanTransformer._needs_json_serialization(value):
-                attributes[attr_name] =
+                attributes[attr_name] = json_encoder(value)
             else:
                 attributes[attr_name] = value
 
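These changes route all attribute serialization in SpanTransformer through the new json_encoder helper and collapse the old serialize-or-deserialize method into a deserialize-only _safe_deserialize. A quick illustration of the new method's behavior (a standalone copy for demonstration; the real method is the staticmethod added above):

# Illustrative behavior of the _safe_deserialize logic added above.
import orjson
from typing import Any

def _safe_deserialize(obj: Any) -> Any:
    if not isinstance(obj, str):
        return obj                      # non-strings pass through untouched
    try:
        return orjson.loads(obj)        # JSON strings round-trip back to Python objects
    except (orjson.JSONDecodeError, TypeError):
        return obj                      # strings that are not valid JSON stay as strings

assert _safe_deserialize('{"tokens": 42}') == {"tokens": 42}
assert _safe_deserialize("not json") == "not json"
assert _safe_deserialize(123) == 123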
judgeval/constants.py CHANGED
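The constants.py hunk is collapsed in this view (+1 line per the summary above), but the DEFAULT_GPT_MODEL constant it introduces is imported throughout the hunks below. Its concrete value is not visible in this diff; the pattern is simply a single shared default, roughly:

# Assumed shape of the new constant in judgeval/constants.py; the value shown is a guess
# based on create_judge()'s "uses GPT4o as the default judge" docstring, not the diff itself.
DEFAULT_GPT_MODEL = "gpt-4o"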
judgeval/data/judgment_types.py CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  openapi_new.json
-#   timestamp: 2025-
+#   timestamp: 2025-08-01T22:19:19+00:00
 
 from __future__ import annotations
 
@@ -99,6 +99,7 @@ class JudgmentEvalJudgmentType(BaseModel):
     append: Annotated[Optional[bool], Field(title="Append")] = False
     override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+    trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
 
 
 class TraceSpanJudgmentType(BaseModel):
judgeval/data/trace.py CHANGED
@@ -1,7 +1,4 @@
-from typing import Any
-import sys
 import threading
-import orjson
 from datetime import datetime, timezone
 from judgeval.data.judgment_types import (
     TraceUsageJudgmentType,
@@ -9,7 +6,7 @@ from judgeval.data.judgment_types import (
     TraceJudgmentType,
 )
 from judgeval.constants import SPAN_LIFECYCLE_END_UPDATE_ID
-from
+from judgeval.common.api.json_encoder import json_encoder
 
 
 class TraceUsage(TraceUsageJudgmentType):
@@ -25,9 +22,9 @@ class TraceSpan(TraceSpanJudgmentType):
             "created_at": datetime.fromtimestamp(
                 self.created_at, tz=timezone.utc
             ).isoformat(),
-            "inputs":
-            "output":
-            "error":
+            "inputs": json_encoder(self.inputs),
+            "output": json_encoder(self.output),
+            "error": json_encoder(self.error),
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
@@ -37,7 +34,7 @@ class TraceSpan(TraceSpanJudgmentType):
             "agent_name": self.agent_name,
             "state_before": self.state_before,
             "state_after": self.state_after,
-            "additional_metadata":
+            "additional_metadata": json_encoder(self.additional_metadata),
             "update_id": self.update_id,
         }
 
@@ -80,120 +77,6 @@ class TraceSpan(TraceSpanJudgmentType):
         )
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
 
-    def _is_json_serializable(self, obj: Any) -> bool:
-        """Helper method to check if an object is JSON serializable."""
-        try:
-            orjson.dumps(obj)
-            return True
-        except (TypeError, OverflowError, ValueError):
-            return False
-
-    def safe_stringify(self, output, function_name):
-        """
-        Safely converts an object to a JSON-serializable structure, handling common object types intelligently.
-        """
-        # Handle Pydantic models
-        if hasattr(output, "model_dump"):
-            try:
-                return output.model_dump()
-            except Exception:
-                pass
-
-        # Handle LangChain messages and similar objects with content/type
-        if hasattr(output, "content") and hasattr(output, "type"):
-            try:
-                result = {"type": output.type, "content": output.content}
-                # Add additional fields if they exist
-                if hasattr(output, "additional_kwargs"):
-                    result["additional_kwargs"] = output.additional_kwargs
-                if hasattr(output, "response_metadata"):
-                    result["response_metadata"] = output.response_metadata
-                if hasattr(output, "name"):
-                    result["name"] = output.name
-                return result
-            except Exception:
-                pass
-
-        if hasattr(output, "dict"):
-            try:
-                return output.dict()
-            except Exception:
-                pass
-
-        if hasattr(output, "to_dict"):
-            try:
-                return output.to_dict()
-            except Exception:
-                pass
-
-        if hasattr(output, "__dataclass_fields__"):
-            try:
-                import dataclasses
-
-                return dataclasses.asdict(output)
-            except Exception:
-                pass
-
-        if hasattr(output, "__dict__"):
-            try:
-                return output.__dict__
-            except Exception:
-                pass
-
-        try:
-            return str(output)
-        except (TypeError, OverflowError, ValueError):
-            pass
-
-        try:
-            return repr(output)
-        except (TypeError, OverflowError, ValueError):
-            pass
-
-        return None
-
-    def _serialize_value(self, value: Any) -> Any:
-        """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
-        if value is None:
-            return None
-
-        recursion_limit = sys.getrecursionlimit()
-        recursion_limit = int(recursion_limit * 0.75)
-
-        def serialize_value(value, current_depth=0):
-            try:
-                if current_depth > recursion_limit:
-                    return {"error": "max_depth_reached: " + type(value).__name__}
-
-                if isinstance(value, BaseModel):
-                    return value.model_dump()
-                elif isinstance(value, dict):
-                    # Recursively serialize dictionary values
-                    return {
-                        k: serialize_value(v, current_depth + 1)
-                        for k, v in value.items()
-                    }
-                elif isinstance(value, (list, tuple)):
-                    # Recursively serialize list/tuple items
-                    return [serialize_value(item, current_depth + 1) for item in value]
-                else:
-                    try:
-                        orjson.dumps(value)
-                        return value
-                    except (TypeError, OverflowError, ValueError):
-                        # Fallback to safe stringification
-                        return self.safe_stringify(value, self.function)
-                    except Exception:
-                        return {"error": "Unable to serialize"}
-            except Exception:
-                return {"error": "Unable to serialize"}
-
-        # Start serialization with the top-level value
-        try:
-            return serialize_value(value, current_depth=0)
-        except Exception:
-            return {"error": "Unable to serialize"}
-
 
 class Trace(TraceJudgmentType):
     pass
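With these removals, TraceSpan no longer carries its own serialization fallbacks (pydantic model_dump, LangChain-style content/type objects, dataclasses, __dict__, str/repr); all of that is delegated to the shared json_encoder. The helper itself is not shown in this diff; a minimal sketch of the kind of fallback chain it presumably replaces, based on the deleted safe_stringify, could look like:

# Minimal sketch of a json_encoder-style fallback chain (assumed; the real helper
# lives in judgeval/common/api/json_encoder.py and is not shown in this diff).
import dataclasses
from typing import Any

def sketch_json_encoder(value: Any) -> Any:
    if hasattr(value, "model_dump"):                  # pydantic models
        return value.model_dump()
    if dataclasses.is_dataclass(value) and not isinstance(value, type):
        return dataclasses.asdict(value)              # dataclasses
    if isinstance(value, dict):
        return {k: sketch_json_encoder(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [sketch_json_encoder(v) for v in value]
    if isinstance(value, (str, int, float, bool)) or value is None:
        return value
    return str(value)                                 # last-resort stringification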
judgeval/data/trace_run.py CHANGED
@@ -3,6 +3,7 @@ from typing import List, Optional, Dict, Any, Union
 from judgeval.data import Trace
 from judgeval.scorers import APIScorerConfig, BaseScorer
 from judgeval.rules import Rule
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 
 class TraceRun(BaseModel):
@@ -26,7 +27,7 @@ class TraceRun(BaseModel):
     eval_name: Optional[str] = None
     traces: Optional[List[Trace]] = None
     scorers: List[Union[APIScorerConfig, BaseScorer]]
-    model: Optional[str] =
+    model: Optional[str] = DEFAULT_GPT_MODEL
     trace_span_id: Optional[str] = None
     append: Optional[bool] = False
     override: Optional[bool] = False
judgeval/dataset.py CHANGED
@@ -35,6 +35,7 @@ class Dataset:
         for e in examples:
             if isinstance(e, dict) and isinstance(e.get("data"), dict):
                 e.update(e.pop("data"))
+        judgeval_logger.info(f"Succesfully retrieved dataset {name}!")
         return cls(
             name=name,
             project_name=project_name,
@@ -68,6 +69,7 @@ class Dataset:
             traces=[t.model_dump() for t in traces],
             overwrite=overwrite,
         )
+        judgeval_logger.info(f"Succesfull created dataset {name}!")
         return cls(
             name=name,
             project_name=project_name,
judgeval/evaluation_run.py CHANGED
@@ -3,7 +3,7 @@ from pydantic import BaseModel, field_validator, Field
 
 from judgeval.data import Example
 from judgeval.scorers import BaseScorer, APIScorerConfig
-from judgeval.constants import ACCEPTABLE_MODELS
+from judgeval.constants import ACCEPTABLE_MODELS, DEFAULT_GPT_MODEL
 
 
 class EvaluationRun(BaseModel):
@@ -24,8 +24,9 @@ class EvaluationRun(BaseModel):
     eval_name: Optional[str] = Field(default=None, validate_default=True)
     examples: List[Example]
     scorers: List[Union[APIScorerConfig, BaseScorer]]
-    model: Optional[str] =
+    model: Optional[str] = DEFAULT_GPT_MODEL
     trace_span_id: Optional[str] = None
+    trace_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     override: Optional[bool] = False
     append: Optional[bool] = False
@@ -44,6 +45,9 @@ class EvaluationRun(BaseModel):
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
+        for item in v:
+            if not isinstance(item, Example):
+                raise ValueError(f"Item of type {type(item)} is not a Example")
         return v
 
     @field_validator("scorers", mode="before")
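The added loop makes EvaluationRun reject anything in examples that is not an Example instance at construction time, instead of failing later in the scoring pipeline. A small sketch of what the new validator enforces (not the library's own test code):

# Sketch of the check the new examples validator performs.
from judgeval.data import Example

def validate_examples(v):
    if not v:
        raise ValueError("Examples cannot be empty.")
    for item in v:
        if not isinstance(item, Example):
            raise ValueError(f"Item of type {type(item)} is not a Example")
    return v

# Passing a raw dict where an Example is expected now fails fast:
try:
    validate_examples([{"input": "not an Example instance"}])
except ValueError as err:
    print(err)  # -> Item of type <class 'dict'> is not a Example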
judgeval/judges/litellm_judge.py CHANGED
@@ -7,6 +7,7 @@ from judgeval.common.utils import (
     fetch_litellm_api_response,
 )
 from judgeval.common.logger import judgeval_logger
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -14,7 +15,7 @@ BASE_CONVERSATION = [
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str =
+    def __init__(self, model: str = DEFAULT_GPT_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
judgeval/judges/mixture_of_judges.py CHANGED
@@ -14,6 +14,7 @@ from judgeval.common.utils import (
     aget_chat_completion,
 )
 from judgeval.common.logger import judgeval_logger
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 
 def build_dynamic_mixture_prompt(
@@ -161,7 +162,7 @@ class MixtureOfJudges(JudgevalJudge):
             "LLAMA3_70B_INSTRUCT_TURBO",
             "MISTRAL_8x22B_INSTRUCT",
         ],
-        aggregator: str =
+        aggregator: str = DEFAULT_GPT_MODEL,
         **kwargs,
     ):
         """
judgeval/judges/utils.py CHANGED
@@ -7,6 +7,7 @@ from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
 from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+from judgeval.constants import DEFAULT_GPT_MODEL
 from judgeval.constants import (
     TOGETHER_SUPPORTED_MODELS,
     JUDGMENT_SUPPORTED_MODELS,
@@ -30,7 +31,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model=
+        return LiteLLMJudge(model=DEFAULT_GPT_MODEL), True
     if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(
             f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
judgeval/judgment_client.py CHANGED
@@ -2,9 +2,10 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 
+from __future__ import annotations
 import os
 from uuid import uuid4
-from typing import Optional, List, Dict, Any, Union, Callable
+from typing import Optional, List, Dict, Any, Union, Callable, TYPE_CHECKING
 
 from judgeval.data import (
     ScoringResult,
@@ -28,7 +29,11 @@ from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
-
+
+
+if TYPE_CHECKING:
+    from judgeval.integrations.langgraph import JudgevalCallbackHandler
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 
 class EvalRunRequestBody(BaseModel):
@@ -89,7 +94,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         tools: Optional[List[Dict[str, Any]]] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_trace",
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         append: bool = False,
         override: bool = False,
     ) -> List[ScoringResult]:
@@ -127,7 +132,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
@@ -214,7 +219,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
         override: bool = False,
@@ -255,7 +260,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
         override: bool = False,
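Moving the JudgevalCallbackHandler import under TYPE_CHECKING, together with the new `from __future__ import annotations`, keeps the langgraph integration out of the client's runtime import graph while annotations like `Optional[Union[Tracer, JudgevalCallbackHandler]]` still type-check. A generic sketch of the same pattern (module and class names below are placeholders, not judgeval's):

# Generic sketch of the TYPE_CHECKING import pattern used above (placeholder names).
from __future__ import annotations  # annotations stay strings, so the name need not exist at runtime
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Only seen by static type checkers; never executed at runtime, so the
    # optional dependency is not required just to import this module.
    from heavy_optional_dependency import CallbackHandler  # hypothetical module

def run(handler: Optional[CallbackHandler] = None) -> None:
    if handler is not None:
        # Real code would import lazily or duck-type here before using the handler.
        ...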