judgeval 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/api.py +38 -7
- judgeval/common/api/constants.py +9 -1
- judgeval/common/storage/s3_storage.py +2 -3
- judgeval/common/tracer/core.py +66 -32
- judgeval/common/tracer/otel_span_processor.py +4 -50
- judgeval/common/tracer/span_transformer.py +16 -10
- judgeval/common/utils.py +46 -38
- judgeval/constants.py +2 -0
- judgeval/data/example.py +9 -37
- judgeval/data/judgment_types.py +23 -45
- judgeval/data/result.py +8 -14
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +3 -4
- judgeval/dataset.py +192 -0
- judgeval/evaluation_run.py +1 -0
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +6 -6
- judgeval/judges/together_judge.py +6 -3
- judgeval/judgment_client.py +9 -71
- judgeval/run_evaluation.py +41 -9
- judgeval/scorers/score.py +11 -7
- judgeval/scorers/utils.py +3 -3
- judgeval/utils/file_utils.py +40 -25
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/METADATA +10 -6
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/RECORD +27 -29
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/WHEEL +0 -0
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/api/api.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Literal, List, Dict, Any
+from typing import Literal, List, Dict, Any, Union
 from requests import exceptions
 from judgeval.common.api.constants import (
     JUDGMENT_TRACES_FETCH_API_URL,
@@ -25,6 +25,8 @@ from judgeval.common.api.constants import (
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
+    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -48,9 +50,12 @@ from judgeval.common.api.constants import (
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
+    CheckExampleKeysPayload,
 )
 from judgeval.utils.requests import requests
 
+import orjson
+
 
 class JudgmentAPIException(exceptions.HTTPError):
     """
@@ -65,7 +70,7 @@ class JudgmentAPIException(exceptions.HTTPError):
         self.request = request
 
     @property
-    def status_code(self) -> int
+    def status_code(self) -> Union[int, None]:
         """Get the HTTP status code from the response."""
         return self.response.status_code if self.response else None
 
@@ -114,8 +119,15 @@ class JudgmentApiClient:
         try:
             r.raise_for_status()
         except exceptions.HTTPError as e:
+            try:
+                detail = r.json().get("detail", "")
+            except Exception:
+                detail = r.text
+
             raise JudgmentAPIException(
-                f"HTTP {r.status_code}: {r.reason}
+                f"HTTP {r.status_code}: {r.reason}, {detail}",
+                response=r,
+                request=e.request,
             )
 
         return r.json()
@@ -218,6 +230,14 @@ class JudgmentApiClient:
         }
         return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
 
+    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
+        payload: CheckExampleKeysPayload = {
+            "keys": keys,
+            "eval_name": eval_name,
+            "project_name": project_name,
+        }
+        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
+
     def save_scorer(self, name: str, prompt: str, options: dict):
         payload: ScorerSavePayload = {
             "name": name,
@@ -279,7 +299,7 @@ class JudgmentApiClient:
         project_name: str,
         examples: List[Dict[str, Any]],
         traces: List[Dict[str, Any]],
-        overwrite: bool,
+        overwrite: bool = False,
     ):
         payload: DatasetPushPayload = {
             "dataset_alias": dataset_alias,
@@ -302,6 +322,18 @@ class JudgmentApiClient:
             "POST", JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL, payload
         )
 
+    def append_traces(
+        self, dataset_alias: str, project_name: str, traces: List[Dict[str, Any]]
+    ):
+        payload: DatasetAppendPayload = {
+            "dataset_alias": dataset_alias,
+            "project_name": project_name,
+            "traces": traces,
+        }
+        return self._do_request(
+            "POST", JUDGMENT_DATASETS_APPEND_TRACES_API_URL, payload
+        )
+
     def pull_dataset(self, dataset_alias: str, project_name: str):
         payload: DatasetPullPayload = {
             "dataset_alias": dataset_alias,
@@ -347,6 +379,5 @@ class JudgmentApiClient:
         except Exception as e:
            return f"<Unserializable object of type {type(obj).__name__}: {e}>"
 
-
-
-        return json.dumps(data, default=fallback_encoder)
+        # orjson returns bytes, so we need to decode to str
+        return orjson.dumps(data, default=fallback_encoder).decode("utf-8")
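The serialization change at the end of this file swaps `json.dumps` for `orjson.dumps`, which returns `bytes` rather than `str`, hence the added `.decode("utf-8")`. A minimal standalone illustration of that difference (not judgeval code):

```python
import json

import orjson

data = {"keys": ["input", "expected_output"], "eval_name": "demo"}

# json.dumps returns a str; orjson.dumps returns compact UTF-8 bytes
assert isinstance(json.dumps(data), str)
assert isinstance(orjson.dumps(data), bytes)

# decoding the bytes yields the string the rest of the code expects
assert orjson.dumps(data).decode("utf-8") == '{"keys":["input","expected_output"],"eval_name":"demo"}'
```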
judgeval/common/api/constants.py
CHANGED
@@ -51,6 +51,7 @@ JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
 JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
 JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
+JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
 
 
 # Evaluation API Payloads
@@ -90,9 +91,16 @@ class EvalRunNameExistsPayload(TypedDict):
     judgment_api_key: str
 
 
+class CheckExampleKeysPayload(TypedDict):
+    keys: List[str]
+    eval_name: str
+    project_name: str
+
+
 # Datasets API
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
+JUDGMENT_DATASETS_APPEND_TRACES_API_URL = f"{ROOT_API}/traces/add_to_dataset/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
 JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
 JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
@@ -134,7 +142,7 @@ class DatasetStatsPayload(TypedDict):
 
 
 # Projects API
-JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete_from_judgeval/"
 JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 
 
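A short sketch of how the new endpoint constant and payload type fit together. The field values below are invented for illustration; in judgeval the request itself is issued by `JudgmentApiClient.check_example_keys()` shown in the api.py diff above:

```python
from judgeval.common.api.constants import (
    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
    CheckExampleKeysPayload,
)

# TypedDicts are plain dicts at runtime; the annotation only drives static checks
payload: CheckExampleKeysPayload = {
    "keys": ["input", "expected_output"],  # hypothetical example keys
    "eval_name": "demo-eval-run",          # hypothetical eval run name
    "project_name": "demo-project",        # hypothetical project name
}

# JudgmentApiClient.check_example_keys() POSTs this payload to
# JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL via its _do_request helper.
```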
judgeval/common/storage/s3_storage.py
CHANGED
@@ -1,6 +1,6 @@
 import os
-import json
 import boto3
+import orjson
 from typing import Optional
 from datetime import datetime, UTC
 from botocore.exceptions import ClientError
@@ -85,8 +85,7 @@ class S3Storage:
         timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
         s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
 
-
-        trace_json = json.dumps(trace_data)
+        trace_json = orjson.dumps(trace_data).decode("utf-8")
 
         self.s3_client.put_object(
             Bucket=self.bucket_name,
judgeval/common/tracer/core.py
CHANGED
@@ -32,6 +32,7 @@ from typing import (
 )
 import types
 
+
 from judgeval.common.tracer.constants import _TRACE_FILEPATH_BLOCKLIST
 
 from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
@@ -45,6 +46,7 @@ from openai.types.chat import ParsedChatCompletion
 from together import Together, AsyncTogether
 from anthropic import Anthropic, AsyncAnthropic
 from google import genai
+from groq import Groq, AsyncGroq
 
 from judgeval.data import Example, Trace, TraceSpan, TraceUsage
 from judgeval.scorers import APIScorerConfig, BaseScorer
@@ -67,6 +69,8 @@ ApiClient: TypeAlias = Union[
     AsyncTogether,
     genai.Client,
     genai.client.AsyncClient,
+    Groq,
+    AsyncGroq,
 ]
 SpanType: TypeAlias = str
 
@@ -79,7 +83,7 @@ class TraceClient:
         tracer: Tracer,
         trace_id: Optional[str] = None,
         name: str = "default",
-        project_name: str
+        project_name: Union[str, None] = None,
         enable_monitoring: bool = True,
         enable_evaluations: bool = True,
         parent_trace_id: Optional[str] = None,
@@ -414,8 +418,6 @@ class TraceClient:
                 self.start_time or time.time(), timezone.utc
             ).isoformat(),
             "duration": total_duration,
-            "trace_spans": [span.model_dump() for span in self.trace_spans],
-            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs],
             "offline_mode": self.tracer.offline_mode,
             "parent_trace_id": self.parent_trace_id,
             "parent_name": self.parent_name,
@@ -850,9 +852,9 @@ class Tracer:
 
     def __init__(
         self,
-        api_key: str
-        organization_id: str
-        project_name: str
+        api_key: Union[str, None] = os.getenv("JUDGMENT_API_KEY"),
+        organization_id: Union[str, None] = os.getenv("JUDGMENT_ORG_ID"),
+        project_name: Union[str, None] = None,
         deep_tracing: bool = False,  # Deep tracing is disabled by default
         enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower()
         == "true",
@@ -905,8 +907,8 @@ class Tracer:
         self.class_identifiers: Dict[
             str, str
         ] = {}  # Dictionary to store class identifiers
-        self.span_id_to_previous_span_id: Dict[str, str
-        self.trace_id_to_previous_trace: Dict[str, TraceClient
+        self.span_id_to_previous_span_id: Dict[str, Union[str, None]] = {}
+        self.trace_id_to_previous_trace: Dict[str, Union[TraceClient, None]] = {}
         self.current_span_id: Optional[str] = None
         self.current_trace: Optional[TraceClient] = None
         self.trace_across_async_contexts: bool = trace_across_async_contexts
@@ -958,7 +960,9 @@ class Tracer:
             self.enable_monitoring = False
             self.enable_evaluations = False
 
-    def set_current_span(
+    def set_current_span(
+        self, span_id: str
+    ) -> Optional[contextvars.Token[Union[str, None]]]:
         self.span_id_to_previous_span_id[span_id] = self.current_span_id
         self.current_span_id = span_id
         Tracer.current_span_id = span_id
@@ -981,7 +985,7 @@ class Tracer:
 
     def reset_current_span(
         self,
-        token: Optional[contextvars.Token[str
+        token: Optional[contextvars.Token[Union[str, None]]] = None,
         span_id: Optional[str] = None,
     ):
         try:
@@ -997,7 +1001,7 @@ class Tracer:
 
     def set_current_trace(
         self, trace: TraceClient
-    ) -> Optional[contextvars.Token[TraceClient
+    ) -> Optional[contextvars.Token[Union[TraceClient, None]]]:
         """
         Set the current trace context in contextvars
         """
@@ -1030,7 +1034,7 @@ class Tracer:
 
     def reset_current_trace(
         self,
-        token: Optional[contextvars.Token[TraceClient
+        token: Optional[contextvars.Token[Union[TraceClient, None]]] = None,
        trace_id: Optional[str] = None,
     ):
         try:
@@ -1046,7 +1050,7 @@ class Tracer:
 
     @contextmanager
     def trace(
-        self, name: str, project_name: str
+        self, name: str, project_name: Union[str, None] = None
     ) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
@@ -1692,25 +1696,31 @@ def wrap(
         return wrapper
 
     if isinstance(client, (OpenAI)):
-        client.chat.completions
-        client.responses
-        client.beta.chat.completions
+        setattr(client.chat.completions, "create", wrapped(original_create))
+        setattr(client.responses, "create", wrapped(original_responses_create))
+        setattr(client.beta.chat.completions, "parse", wrapped(original_beta_parse))
     elif isinstance(client, (AsyncOpenAI)):
-        client.chat.completions
-        client.responses
-
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
+        setattr(client.responses, "create", wrapped_async(original_responses_create))
+        setattr(
+            client.beta.chat.completions, "parse", wrapped_async(original_beta_parse)
+        )
     elif isinstance(client, (Together)):
-        client.chat.completions
+        setattr(client.chat.completions, "create", wrapped(original_create))
     elif isinstance(client, (AsyncTogether)):
-        client.chat.completions
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
     elif isinstance(client, (Anthropic)):
-        client.messages
+        setattr(client.messages, "create", wrapped(original_create))
     elif isinstance(client, (AsyncAnthropic)):
-        client.messages
+        setattr(client.messages, "create", wrapped_async(original_create))
     elif isinstance(client, (genai.Client)):
-        client.models
+        setattr(client.models, "generate_content", wrapped(original_create))
     elif isinstance(client, (genai.client.AsyncClient)):
-        client.models
+        setattr(client.models, "generate_content", wrapped_async(original_create))
+    elif isinstance(client, (Groq)):
+        setattr(client.chat.completions, "create", wrapped(original_create))
+    elif isinstance(client, (AsyncGroq)):
+        setattr(client.chat.completions, "create", wrapped_async(original_create))
 
     return client
 
@@ -1745,6 +1755,8 @@ def _get_client_config(
             None,
             client.beta.chat.completions.parse,
         )
+    elif isinstance(client, (Groq, AsyncGroq)):
+        return "GROQ_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Together, AsyncTogether)):
         return "TOGETHER_API_CALL", client.chat.completions.create, None, None, None
     elif isinstance(client, (Anthropic, AsyncAnthropic)):
@@ -1783,9 +1795,17 @@ def _format_output_data(
     if isinstance(client, (OpenAI, AsyncOpenAI)):
         if isinstance(response, ChatCompletion):
             model_name = response.model
-            prompt_tokens = response.usage.prompt_tokens
-            completion_tokens =
-
+            prompt_tokens = response.usage.prompt_tokens if response.usage else 0
+            completion_tokens = (
+                response.usage.completion_tokens if response.usage else 0
+            )
+            cache_read_input_tokens = (
+                response.usage.prompt_tokens_details.cached_tokens
+                if response.usage
+                and response.usage.prompt_tokens_details
+                and response.usage.prompt_tokens_details.cached_tokens
+                else 0
+            )
 
             if isinstance(response, ParsedChatCompletion):
                 message_content = response.choices[0].message.parsed
@@ -1793,10 +1813,19 @@ def _format_output_data(
                 message_content = response.choices[0].message.content
         elif isinstance(response, Response):
             model_name = response.model
-            prompt_tokens = response.usage.input_tokens
-            completion_tokens = response.usage.output_tokens
-            cache_read_input_tokens =
-
+            prompt_tokens = response.usage.input_tokens if response.usage else 0
+            completion_tokens = response.usage.output_tokens if response.usage else 0
+            cache_read_input_tokens = (
+                response.usage.input_tokens_details.cached_tokens
+                if response.usage and response.usage.input_tokens_details
+                else 0
+            )
+            if hasattr(response.output[0], "content"):
+                message_content = "".join(
+                    seg.text
+                    for seg in response.output[0].content
+                    if hasattr(seg, "text")
+                )
 
     # Note: LiteLLM seems to use cache_read_input_tokens to calculate the cost for OpenAI
     elif isinstance(client, (Together, AsyncTogether)):
@@ -1821,6 +1850,11 @@ def _format_output_data(
         cache_read_input_tokens = response.usage.cache_read_input_tokens
         cache_creation_input_tokens = response.usage.cache_creation_input_tokens
         message_content = response.content[0].text
+    elif isinstance(client, (Groq, AsyncGroq)):
+        model_name = "groq/" + response.model
+        prompt_tokens = response.usage.prompt_tokens
+        completion_tokens = response.usage.completion_tokens
+        message_content = response.choices[0].message.content
     else:
         judgeval_logger.warning(f"Unsupported client type: {type(client)}")
         return None, None
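A sketch of what the new Groq branches enable, assuming `wrap` and `Tracer` are imported from the module this diff modifies (`judgeval.common.tracer.core`); the model id, environment variables, and project name below are illustrative, not taken from judgeval documentation:

```python
import os

from groq import Groq

from judgeval.common.tracer.core import Tracer, wrap

# Tracer now falls back to JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment
tracer = Tracer(project_name="demo-project")  # hypothetical project name

# wrap() now recognizes Groq/AsyncGroq and swaps chat.completions.create for the
# traced wrapper; calls are classified as "GROQ_API_CALL" by _get_client_config
client = wrap(Groq(api_key=os.environ["GROQ_API_KEY"]))

response = client.chat.completions.create(
    model="llama-3.1-8b-instant",  # illustrative model id
    messages=[{"role": "user", "content": "Say hello"}],
)
print(response.choices[0].message.content)
```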
judgeval/common/tracer/otel_span_processor.py
CHANGED
@@ -11,11 +11,10 @@ import threading
 from typing import Any, Dict, Optional
 
 from opentelemetry.context import Context
-from opentelemetry.sdk.trace import ReadableSpan
+from opentelemetry.sdk.trace import ReadableSpan, Span
 from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanProcessor
-from opentelemetry.trace import
+from opentelemetry.trace import Status, StatusCode, SpanContext, TraceFlags
 from opentelemetry.trace.span import TraceState, INVALID_SPAN_CONTEXT
-from opentelemetry.util.types import Attributes
 
 from judgeval.common.logger import judgeval_logger
 from judgeval.common.tracer.otel_exporter import JudgmentAPISpanExporter
@@ -51,8 +50,8 @@ class SimpleReadableSpan(ReadableSpan):
             Status(StatusCode.ERROR) if trace_span.error else Status(StatusCode.OK)
         )
 
-        self._attributes =
-            trace_span, span_state
+        self._attributes: Dict[str, Any] = (
+            SpanTransformer.trace_span_to_otel_attributes(trace_span, span_state)
         )
 
         try:
@@ -81,53 +80,8 @@ class SimpleReadableSpan(ReadableSpan):
         self._parent: Optional[SpanContext] = None
         self._events: list[Any] = []
         self._links: list[Any] = []
-        self._resource: Optional[Any] = None
         self._instrumentation_info: Optional[Any] = None
 
-    @property
-    def name(self) -> str:
-        return self._name
-
-    @property
-    def context(self) -> SpanContext:
-        return self._context
-
-    @property
-    def parent(self) -> Optional[SpanContext]:
-        return self._parent
-
-    @property
-    def start_time(self) -> Optional[int]:
-        return self._start_time
-
-    @property
-    def end_time(self) -> Optional[int]:
-        return self._end_time
-
-    @property
-    def status(self) -> Status:
-        return self._status
-
-    @property
-    def attributes(self) -> Optional[Attributes]:
-        return self._attributes
-
-    @property
-    def events(self):
-        return self._events
-
-    @property
-    def links(self):
-        return self._links
-
-    @property
-    def resource(self) -> Optional[Any]:
-        return self._resource
-
-    @property
-    def instrumentation_info(self) -> Optional[Any]:
-        return self._instrumentation_info
-
 
 class JudgmentSpanProcessor(SpanProcessor, SpanProcessorBase):
     """
judgeval/common/tracer/span_transformer.py
CHANGED
@@ -1,10 +1,10 @@
 from __future__ import annotations
 
-import json
 import time
 import uuid
+import orjson
 from datetime import datetime, timezone
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Mapping, Optional, Union
 
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel
@@ -16,11 +16,15 @@ from judgeval.evaluation_run import EvaluationRun
 class SpanTransformer:
     @staticmethod
     def _needs_json_serialization(value: Any) -> bool:
+        """
+        Check if the value needs JSON serialization.
+        Returns True if the value is complex and needs serialization.
+        """
         if value is None:
             return False
 
-
-        if isinstance(value,
+        # Basic JSON-serializable types don't need serialization
+        if isinstance(value, (str, int, float, bool)):
             return False
 
         complex_types = (dict, list, tuple, set, BaseModel)
@@ -28,7 +32,7 @@ class SpanTransformer:
             return True
 
         try:
-
+            orjson.dumps(value)
             return False
         except (TypeError, ValueError):
             return True
@@ -39,15 +43,15 @@ class SpanTransformer:
             if obj is None:
                 return None
             try:
-                return
+                return orjson.dumps(obj, default=str).decode("utf-8")
             except Exception:
-                return
+                return orjson.dumps(str(obj)).decode("utf-8")
         else:
             if not isinstance(obj, str):
                 return obj
             try:
-                return
-            except (
+                return orjson.loads(obj)
+            except (orjson.JSONDecodeError, TypeError, ValueError):
                 return obj
 
     @staticmethod
@@ -99,7 +103,9 @@ class SpanTransformer:
         return attributes
 
     @staticmethod
-    def otel_attributes_to_judgment_data(
+    def otel_attributes_to_judgment_data(
+        attributes: Mapping[str, Any],
+    ) -> Dict[str, Any]:
         judgment_data: Dict[str, Any] = {}
 
         for key, value in attributes.items():
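The transformer now round-trips complex attribute values through orjson, stringifying anything orjson cannot encode via `default=str` and parsing attribute strings back with `orjson.loads`. A standalone sketch of that behavior (not judgeval code):

```python
import orjson


class Opaque:
    def __repr__(self) -> str:
        return "Opaque()"


value = {"count": 3, "obj": Opaque()}

# default=str mirrors the serializer above: unknown objects are stringified
encoded = orjson.dumps(value, default=str).decode("utf-8")
print(encoded)  # {"count":3,"obj":"Opaque()"}

# reading attributes back applies the inverse with orjson.loads
decoded = orjson.loads(encoded)
assert decoded == {"count": 3, "obj": "Opaque()"}
```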