judgeval 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/api.py +38 -7
- judgeval/common/api/constants.py +9 -1
- judgeval/common/storage/s3_storage.py +2 -3
- judgeval/common/tracer/core.py +66 -32
- judgeval/common/tracer/otel_span_processor.py +4 -50
- judgeval/common/tracer/span_transformer.py +16 -10
- judgeval/common/utils.py +46 -38
- judgeval/constants.py +2 -0
- judgeval/data/example.py +9 -37
- judgeval/data/judgment_types.py +23 -45
- judgeval/data/result.py +8 -14
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +3 -4
- judgeval/dataset.py +192 -0
- judgeval/evaluation_run.py +1 -0
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +6 -6
- judgeval/judges/together_judge.py +6 -3
- judgeval/judgment_client.py +9 -71
- judgeval/run_evaluation.py +41 -9
- judgeval/scorers/score.py +11 -7
- judgeval/scorers/utils.py +3 -3
- judgeval/utils/file_utils.py +40 -25
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/METADATA +10 -6
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/RECORD +27 -29
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/WHEEL +0 -0
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py
CHANGED
@@ -16,7 +16,7 @@ from types import TracebackType
 from judgeval.common.api.constants import ROOT_API
 from judgeval.utils.requests import requests
 import pprint
-from typing import Any, Dict, List,
+from typing import Any, Dict, List, Optional, TypeAlias, Union, TypeGuard

 # Third-party imports
 import litellm
@@ -138,7 +138,9 @@ def validate_api_key(judgment_api_key: str):


 def fetch_together_api_response(
-    model: str,
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     Fetches a single response from the Together API for a given model and messages.
@@ -167,7 +169,9 @@ def fetch_together_api_response(


 async def afetch_together_api_response(
-    model: str,
+    model: str,
+    messages: List[Dict],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     ASYNCHRONOUSLY Fetches a single response from the Together API for a given model and messages.
@@ -192,8 +196,8 @@ async def afetch_together_api_response(

 def query_together_api_multiple_calls(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
+    messages: List[List[Dict]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
 ) -> List[Union[str, None]]:
     """
     Queries the Together API for multiple calls in parallel
@@ -230,7 +234,7 @@ def query_together_api_multiple_calls(

     num_workers = int(os.getenv("NUM_WORKER_THREADS", MAX_WORKER_THREADS))
     # Initialize results to maintain ordered outputs
-    out: List[str
+    out: List[Union[str, None]] = [None] * len(messages)
     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
         # Submit all queries to together API with index, gets back the response content
         futures = {
@@ -255,8 +259,8 @@ def query_together_api_multiple_calls(

 async def aquery_together_api_multiple_calls(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
+    messages: List[List[Dict]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
 ) -> List[Union[str, None]]:
     """
     Queries the Together API for multiple calls in parallel
@@ -314,7 +318,9 @@ async def aquery_together_api_multiple_calls(


 def fetch_litellm_api_response(
-    model: str,
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     Fetches a single response from the Litellm API for a given model and messages.
@@ -339,8 +345,8 @@ def fetch_litellm_api_response(

 def fetch_custom_litellm_api_response(
     custom_model_parameters: CustomModelParameters,
-    messages: List[
-    response_format: pydantic.BaseModel = None,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     if messages is None or messages == []:
         raise ValueError("Messages cannot be empty")
@@ -372,7 +378,9 @@ def fetch_custom_litellm_api_response(


 async def afetch_litellm_api_response(
-    model: str,
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     ASYNCHRONOUSLY Fetches a single response from the Litellm API for a given model and messages.
@@ -402,8 +410,8 @@ async def afetch_litellm_api_response(

 async def afetch_custom_litellm_api_response(
     custom_model_parameters: CustomModelParameters,
-    messages: List[
-    response_format: pydantic.BaseModel = None,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     ASYNCHRONOUSLY Fetches a single response from the Litellm API for a given model and messages.
@@ -439,8 +447,8 @@ async def afetch_custom_litellm_api_response(

 def query_litellm_api_multiple_calls(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
+    messages: List[List[Dict]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
 ) -> List[Union[str, None]]:
     """
     Queries the Litellm API for multiple calls in parallel
@@ -480,8 +488,8 @@ def query_litellm_api_multiple_calls(

 async def aquery_litellm_api_multiple_calls(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
+    messages: List[List[Dict[str, str]]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
 ) -> List[Union[str, None]]:
     """
     Queries the Litellm API for multiple calls in parallel
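For reference, a minimal sketch of how the reworked signatures read at a call site. The Answer schema and the stub function body are illustrative only; the parameter and return annotations mirror the hunks above, and the model string is the DEFAULT_TOGETHER_MODEL added in constants.py.

from typing import Dict, List, Union

import pydantic


class Answer(pydantic.BaseModel):
    # Hypothetical structured-output schema, not part of the package.
    city: str


def fetch_together_api_response(
    model: str,
    messages: List[Dict[str, str]],
    response_format: Union[pydantic.BaseModel, None] = None,
) -> str:
    # Stub body: the real function calls the Together API; this one just echoes its inputs.
    return f"{model} got {len(messages)} messages (response_format={response_format})"


messages: List[Dict[str, str]] = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]
print(fetch_together_api_response("meta-llama/Meta-Llama-3-8B-Instruct-Lite", messages))
# response_format is typically handed over as a model class at call sites.
print(fetch_together_api_response("meta-llama/Meta-Llama-3-8B-Instruct-Lite", messages, Answer))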
@@ -565,14 +573,14 @@ def validate_batched_chat_messages(messages):


 def is_batched_messages(
-    messages: Union[List[
-) -> TypeGuard[List[List[
+    messages: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
+) -> TypeGuard[List[List[Dict[str, str]]]]:
     return isinstance(messages, list) and all(isinstance(msg, list) for msg in messages)


 def is_simple_messages(
-    messages: Union[List[
-) -> TypeGuard[List[
+    messages: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
+) -> TypeGuard[List[Dict[str, str]]]:
     return isinstance(messages, list) and all(
         not isinstance(msg, list) for msg in messages
     )
@@ -580,10 +588,10 @@ def is_simple_messages(

 def get_chat_completion(
     model_type: str,
-    messages: Union[List[
-    response_format: pydantic.BaseModel = None,
+    messages: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
+    response_format: Union[pydantic.BaseModel, None] = None,
     batched: bool = False,
-) -> Union[str, List[str
+) -> Union[str, List[Union[str, None]]]:
     """
     Generates chat completions using a single model and potentially several messages. Supports closed-source and OSS models.

@@ -653,10 +661,10 @@ def get_chat_completion(

 async def aget_chat_completion(
     model_type: str,
-    messages: Union[List[
-    response_format: pydantic.BaseModel = None,
+    messages: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
+    response_format: Union[pydantic.BaseModel, None] = None,
     batched: bool = False,
-) -> Union[str, List[str
+) -> Union[str, List[Union[str, None]]]:
     """
     ASYNCHRONOUSLY generates chat completions using a single model and potentially several messages. Supports closed-source and OSS models.

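The two TypeGuard helpers let get_chat_completion narrow the messages union before dispatching. A self-contained sketch of that narrowing; the Message alias and count_conversations are illustrative, while the is_batched_messages body is taken from the hunk above.

from typing import Dict, List, TypeGuard, Union

Message = Dict[str, str]  # assumption: shorthand for the chat-message dicts used above


def is_batched_messages(
    messages: Union[List[Message], List[List[Message]]],
) -> TypeGuard[List[List[Message]]]:
    # True only when every element is itself a list of messages.
    return isinstance(messages, list) and all(isinstance(msg, list) for msg in messages)


def count_conversations(messages: Union[List[Message], List[List[Message]]]) -> int:
    if is_batched_messages(messages):
        # Statically narrowed to List[List[Message]] inside this branch.
        return len(messages)
    return 1


print(count_conversations([{"role": "user", "content": "hi"}]))        # 1
print(count_conversations([[{"role": "user", "content": "hi"}]] * 3))  # 3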
@@ -722,9 +730,9 @@ async def aget_chat_completion(

 def get_completion_multiple_models(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
-) -> List[str
+    messages: List[List[Dict[str, str]]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
+) -> List[Union[str, None]]:
     """
     Retrieves completions for a single prompt from multiple models in parallel. Supports closed-source and OSS models.

@@ -801,9 +809,9 @@ def get_completion_multiple_models(

 async def aget_completion_multiple_models(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
-) -> List[str
+    messages: List[List[Dict[str, str]]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
+) -> List[Union[str, None]]:
     """
     ASYNCHRONOUSLY retrieves completions for a single prompt from multiple models in parallel. Supports closed-source and OSS models.

@@ -875,7 +883,7 @@ async def aget_completion_multiple_models(


 if __name__ == "__main__":
-    batched_messages: List[List[
+    batched_messages: List[List[Dict[str, str]]] = [
         [
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "What is the capital of France?"},
@@ -886,12 +894,12 @@ if __name__ == "__main__":
         ],
     ]

-    non_batched_messages: List[
+    non_batched_messages: List[Dict[str, str]] = [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "What is the capital of France?"},
     ]

-    batched_messages_2: List[List[
+    batched_messages_2: List[List[Dict[str, str]]] = [
         [
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "What is the capital of China?"},
@@ -937,4 +945,4 @@ if __name__ == "__main__":
     )

 ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
-OptExcInfo: TypeAlias = ExcInfo
+OptExcInfo: TypeAlias = Union[ExcInfo, tuple[None, None, None]]
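The widened OptExcInfo alias now covers both shapes sys.exc_info() can return, i.e. the all-None tuple outside an active exception. A short illustration; current_exc_info is a hypothetical wrapper, not library code.

import sys
from types import TracebackType
from typing import TypeAlias, Union

ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
OptExcInfo: TypeAlias = Union[ExcInfo, tuple[None, None, None]]


def current_exc_info() -> OptExcInfo:
    # sys.exc_info() returns (None, None, None) when no exception is being handled,
    # which is exactly the case the widened alias adds.
    return sys.exc_info()


print(current_exc_info())  # (None, None, None) outside an except block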
judgeval/constants.py
CHANGED
@@ -104,6 +104,8 @@ TOGETHER_SUPPORTED_MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1",
 ]

+DEFAULT_TOGETHER_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
+
 JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}

 ACCEPTABLE_MODELS = (
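The diff only adds the constant; one plausible use is as a fallback when no Together model is specified. A hypothetical sketch, resolve_together_model is not part of the package and the shortened model list is illustrative.

from typing import Optional

TOGETHER_SUPPORTED_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
    "mistralai/Mistral-7B-Instruct-v0.1",
]
DEFAULT_TOGETHER_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct-Lite"


def resolve_together_model(requested: Optional[str]) -> str:
    # Fall back to the package default when the caller does not pick a model.
    model = requested or DEFAULT_TOGETHER_MODEL
    if model not in TOGETHER_SUPPORTED_MODELS:
        raise ValueError(f"Unsupported Together model: {model}")
    return model


print(resolve_together_model(None))  # meta-llama/Meta-Llama-3-8B-Instruct-Lite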
judgeval/data/example.py
CHANGED
@@ -4,6 +4,7 @@ Classes for representing examples in a dataset.

 from enum import Enum
 from datetime import datetime
+from typing import Dict, Any, Optional
 from judgeval.data.judgment_types import ExampleJudgmentType


@@ -15,47 +16,18 @@ class ExampleParams(str, Enum):
     RETRIEVAL_CONTEXT = "retrieval_context"
     TOOLS_CALLED = "tools_called"
     EXPECTED_TOOLS = "expected_tools"
-    REASONING = "reasoning"
     ADDITIONAL_METADATA = "additional_metadata"


 class Example(ExampleJudgmentType):
     example_id: str = ""
+    created_at: str = datetime.now().isoformat()
+    name: Optional[str] = None

-    def
-
-
-        super().__init__(**data)
-        self.example_id = None
+    def to_dict(self) -> Dict[str, Any]:
+        data = super().model_dump(warnings=False)
+        return data

-    def
-
-
-            "actual_output": self.actual_output,
-            "expected_output": self.expected_output,
-            "context": self.context,
-            "retrieval_context": self.retrieval_context,
-            "additional_metadata": self.additional_metadata,
-            "tools_called": self.tools_called,
-            "expected_tools": self.expected_tools,
-            "name": self.name,
-            "example_id": self.example_id,
-            "example_index": self.example_index,
-            "created_at": self.created_at,
-        }
-
-    def __str__(self):
-        return (
-            f"Example(input={self.input}, "
-            f"actual_output={self.actual_output}, "
-            f"expected_output={self.expected_output}, "
-            f"context={self.context}, "
-            f"retrieval_context={self.retrieval_context}, "
-            f"additional_metadata={self.additional_metadata}, "
-            f"tools_called={self.tools_called}, "
-            f"expected_tools={self.expected_tools}, "
-            f"name={self.name}, "
-            f"example_id={self.example_id}, "
-            f"example_index={self.example_index}, "
-            f"created_at={self.created_at}, "
-        )
+    def get_fields(self):
+        excluded = {"example_id", "name", "created_at"}
+        return self.model_dump(exclude=excluded)
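A minimal sketch of the slimmed-down Example behavior. ExampleSketch is a stand-in, not the library class; it only mirrors the fields and the get_fields exclusion set shown in the hunk above (the parent's extra="allow" config is what lets arbitrary example fields ride along).

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, ConfigDict


class ExampleSketch(BaseModel):
    # Mirrors the reworked Example: metadata fields are declared,
    # everything else is accepted because of extra="allow".
    model_config = ConfigDict(extra="allow")
    example_id: str = ""
    created_at: str = datetime.now().isoformat()
    name: Optional[str] = None

    def get_fields(self):
        excluded = {"example_id", "name", "created_at"}
        return self.model_dump(exclude=excluded)


ex = ExampleSketch(input="What is 2 + 2?", actual_output="4")
print(ex.get_fields())  # {'input': 'What is 2 + 2?', 'actual_output': '4'}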
judgeval/data/judgment_types.py
CHANGED
@@ -1,12 +1,12 @@
 # generated by datamodel-codegen:
 # filename: openapi_new.json
-# timestamp: 2025-07-
+# timestamp: 2025-07-29T18:13:07+00:00

 from __future__ import annotations

 from typing import Annotated, Any, Dict, List, Optional, Union

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field


 class ValidationErrorJudgmentType(BaseModel):
@@ -31,6 +31,15 @@ class ScorerDataJudgmentType(BaseModel):
     ] = None


+class ExampleJudgmentType(BaseModel):
+    model_config = ConfigDict(
+        extra="allow",
+    )
+    example_id: Annotated[str, Field(title="Example Id")]
+    created_at: Annotated[str, Field(title="Created At")]
+    name: Annotated[Optional[str], Field(title="Name")] = None
+
+
 class ScorerConfigJudgmentType(BaseModel):
     score_type: Annotated[str, Field(title="Score Type")]
     name: Annotated[Optional[str], Field(title="Name")] = None
@@ -81,6 +90,17 @@ class HTTPValidationErrorJudgmentType(BaseModel):
     ] = None


+class JudgmentEvalJudgmentType(BaseModel):
+    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
+    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
+    examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
+    scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
+    model: Annotated[str, Field(title="Model")]
+    append: Annotated[Optional[bool], Field(title="Append")] = False
+    override: Annotated[Optional[bool], Field(title="Override")] = False
+    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+
+
 class TraceSpanJudgmentType(BaseModel):
     span_id: Annotated[str, Field(title="Span Id")]
     trace_id: Annotated[str, Field(title="Trace Id")]
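The regenerated ExampleJudgmentType keeps only three declared fields and relies on ConfigDict(extra="allow") instead of enumerating every example attribute. A small self-contained illustration of what that setting does; the Probe model is hypothetical.

from pydantic import BaseModel, ConfigDict


class Probe(BaseModel):
    model_config = ConfigDict(extra="allow")
    example_id: str = ""


p = Probe(example_id="e-1", input="2 + 2?", actual_output="4")
print(p.input)         # extra keys are kept as attributes on the instance
print(p.model_dump())  # {'example_id': 'e-1', 'input': '2 + 2?', 'actual_output': '4'}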
@@ -109,43 +129,12 @@ class TraceSpanJudgmentType(BaseModel):
     update_id: Annotated[Optional[int], Field(title="Update Id")] = 1


-class ExampleJudgmentType(BaseModel):
-    input: Annotated[Optional[Union[str, Dict[str, Any]]], Field(title="Input")] = None
-    actual_output: Annotated[
-        Optional[Union[str, List[str]]], Field(title="Actual Output")
-    ] = None
-    expected_output: Annotated[
-        Optional[Union[str, List[str]]], Field(title="Expected Output")
-    ] = None
-    context: Annotated[Optional[List[str]], Field(title="Context")] = None
-    retrieval_context: Annotated[
-        Optional[List[str]], Field(title="Retrieval Context")
-    ] = None
-    additional_metadata: Annotated[
-        Optional[Dict[str, Any]], Field(title="Additional Metadata")
-    ] = None
-    tools_called: Annotated[Optional[List[str]], Field(title="Tools Called")] = Field(
-        default_factory=list
-    )
-    expected_tools: Annotated[
-        Optional[List[ToolJudgmentType]], Field(title="Expected Tools")
-    ] = Field(default_factory=list)
-    name: Annotated[Optional[str], Field(title="Name")] = None
-    example_id: Annotated[str, Field(title="Example Id")]
-    example_index: Annotated[Optional[int], Field(title="Example Index")] = None
-    created_at: Annotated[Optional[str], Field(title="Created At")] = None
-    trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
-    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
-    dataset_id: Annotated[Optional[str], Field(title="Dataset Id")] = None
-
-
 class TraceJudgmentType(BaseModel):
     trace_id: Annotated[str, Field(title="Trace Id")]
     name: Annotated[str, Field(title="Name")]
     created_at: Annotated[str, Field(title="Created At")]
     duration: Annotated[float, Field(title="Duration")]
     trace_spans: Annotated[List[TraceSpanJudgmentType], Field(title="Trace Spans")]
-    overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
     offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
     rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] = Field(
         default_factory=dict
@@ -165,7 +154,7 @@ class ScoringResultJudgmentType(BaseModel):
     success: Annotated[bool, Field(title="Success")]
     scorers_data: Annotated[
         Optional[List[ScorerDataJudgmentType]], Field(title="Scorers Data")
-    ]
+    ] = None
     name: Annotated[Optional[str], Field(title="Name")] = None
     data_object: Annotated[
         Optional[Union[TraceSpanJudgmentType, ExampleJudgmentType]],
@@ -188,17 +177,6 @@ class TraceRunJudgmentType(BaseModel):
     tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None


-class JudgmentEvalJudgmentType(BaseModel):
-    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
-    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
-    examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
-    scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
-    model: Annotated[str, Field(title="Model")]
-    append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[Optional[bool], Field(title="Override")] = False
-    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
-
-
 class EvalResultsJudgmentType(BaseModel):
     results: Annotated[List[ScoringResultJudgmentType], Field(title="Results")]
     run: Annotated[
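One behavioral consequence of the `] = None` change on ScoringResultJudgmentType.scorers_data is that the field no longer has to be supplied at construction time. A sketch with stand-in models, not the generated classes.

from typing import List, Optional

from pydantic import BaseModel


class ScorerDataSketch(BaseModel):  # stand-in for ScorerDataJudgmentType
    name: str
    score: float


class ScoringResultSketch(BaseModel):  # stand-in for ScoringResultJudgmentType
    success: bool
    scorers_data: Optional[List[ScorerDataSketch]] = None  # now defaults to None


# Valid under the 0.3.1-style schema even before any scorer data is attached:
print(ScoringResultSketch(success=False))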
judgeval/data/result.py
CHANGED
@@ -17,15 +17,14 @@ class ScoringResult(ScoringResultJudgmentType):

     """

-
-
-
-
-
-
-
-
-    }
+    data_object: (
+        Example  # Need to override this so that it uses this repo's Example class
+    )
+
+    def model_dump(self, **kwargs):
+        data = super().model_dump(**kwargs)
+        data["data_object"] = self.data_object.model_dump()
+        return data

     def __str__(self) -> str:
         return f"ScoringResult(\
@@ -47,12 +46,7 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-    if hasattr(data_object, "name") and data_object.name is not None:
-        name = data_object.name
-    else:
-        name = "Test Case Placeholder"
     scoring_result = ScoringResult(
-        name=name,
         data_object=data_object,
         success=success,
         scorers_data=scorers_data,
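The model_dump override above exists because pydantic v2 serializes nested models through its core serializer, so a subclass's customized model_dump would otherwise be bypassed for the nested field. A self-contained sketch of the same pattern; Inner and Outer are illustrative, not judgeval classes.

from pydantic import BaseModel


class Inner(BaseModel):
    value: int

    def model_dump(self, **kwargs):
        data = super().model_dump(**kwargs)
        data["extra_view"] = self.value * 2  # subclass-specific serialization
        return data


class Outer(BaseModel):
    data_object: Inner

    def model_dump(self, **kwargs):
        # Same pattern as ScoringResult above: dump normally, then replace the
        # nested object with its own (overridden) model_dump output.
        data = super().model_dump(**kwargs)
        data["data_object"] = self.data_object.model_dump()
        return data


print(Outer(data_object=Inner(value=2)).model_dump())
# {'data_object': {'value': 2, 'extra_view': 4}}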
judgeval/data/scripts/openapi_transform.py
CHANGED
@@ -1,7 +1,7 @@
-import
+import orjson
 import sys
 from typing import Any, Dict, Generator, List
-
+import requests

 spec_file = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8000/openapi.json"

@@ -10,8 +10,8 @@ if spec_file.startswith("http"):
     r.raise_for_status()
     SPEC = r.json()
 else:
-    with open(spec_file, "
-    SPEC =
+    with open(spec_file, "rb") as f:
+        SPEC = orjson.loads(f.read())

 JUDGEVAL_PATHS: List[str] = [
     "/log_eval_results/",
@@ -120,4 +120,4 @@ spec = {
     },
 }

-print(
+print(orjson.dumps(spec, option=orjson.OPT_INDENT_2).decode("utf-8"))
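The script now reads the spec in binary mode and round-trips it through orjson, which works on bytes rather than str. A minimal sketch of that round trip; requires the orjson package, and the payload dict is made up.

import orjson

payload = {"openapi": "3.1.0", "paths": {"/log_eval_results/": {}}}
encoded = orjson.dumps(payload, option=orjson.OPT_INDENT_2)  # returns bytes
decoded = orjson.loads(encoded)                              # accepts bytes or str
print(encoded.decode("utf-8"))
print(decoded == payload)  # True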
judgeval/data/trace.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import Any
-import json
 import sys
 import threading
+import orjson
 from datetime import datetime, timezone
 from judgeval.data.judgment_types import (
     TraceUsageJudgmentType,
@@ -83,7 +83,7 @@ class TraceSpan(TraceSpanJudgmentType):
     def _is_json_serializable(self, obj: Any) -> bool:
         """Helper method to check if an object is JSON serializable."""
         try:
-
+            orjson.dumps(obj)
             return True
         except (TypeError, OverflowError, ValueError):
             return False
@@ -177,9 +177,8 @@ class TraceSpan(TraceSpanJudgmentType):
             # Recursively serialize list/tuple items
             return [serialize_value(item, current_depth + 1) for item in value]
         else:
-            # Try direct JSON serialization first
             try:
-
+                orjson.dumps(value)
                 return value
             except (TypeError, OverflowError, ValueError):
                 # Fallback to safe stringification
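The try/except around orjson.dumps is the serializability probe TraceSpan uses; the same check reduced to a standalone helper for illustration (is_orjson_serializable is not part of the package). orjson raises JSONEncodeError, a TypeError subclass, for unsupported types, so the except clause above still catches it.

from typing import Any

import orjson


def is_orjson_serializable(obj: Any) -> bool:
    # Probe serializability the same way TraceSpan._is_json_serializable does.
    try:
        orjson.dumps(obj)
        return True
    except (TypeError, OverflowError, ValueError):
        return False


print(is_orjson_serializable({"ok": 1}))  # True
print(is_orjson_serializable({1, 2, 3}))  # False: orjson does not serialize sets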
|