PyPI - judgeval - Versions diffs - 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

judgeval/__init__.py +2 -0
judgeval/cli.py +65 -0
judgeval/clients.py +2 -1
judgeval/common/api/api.py +46 -54
judgeval/common/api/constants.py +18 -5
judgeval/common/api/json_encoder.py +241 -0
judgeval/common/tracer/core.py +772 -467
judgeval/common/tracer/otel_span_processor.py +1 -1
judgeval/common/tracer/providers.py +119 -0
judgeval/common/tracer/span_processor.py +1 -1
judgeval/common/tracer/span_transformer.py +16 -26
judgeval/constants.py +1 -0
judgeval/data/evaluation_run.py +104 -0
judgeval/data/judgment_types.py +38 -8
judgeval/data/trace.py +6 -122
judgeval/data/trace_run.py +2 -3
judgeval/dataset.py +2 -0
judgeval/integrations/langgraph.py +2 -1
judgeval/judges/litellm_judge.py +2 -1
judgeval/judges/mixture_of_judges.py +2 -1
judgeval/judges/utils.py +2 -1
judgeval/judgment_client.py +113 -53
judgeval/local_eval_queue.py +190 -0
judgeval/run_evaluation.py +43 -197
judgeval/scorers/base_scorer.py +9 -10
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
judgeval/scorers/score.py +33 -11
judgeval/utils/async_utils.py +36 -0
{judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +11 -12
{judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +33 -27
judgeval-0.6.0.dist-info/entry_points.txt +2 -0
judgeval/evaluation_run.py +0 -76
{judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
{judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0

judgeval/__init__.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from judgeval.clients import client, together_client
 from judgeval.judgment_client import JudgmentClient
 from judgeval.version_check import check_latest_version
+from judgeval.local_eval_queue import LocalEvaluationQueue
 check_latest_version()
@@ -10,4 +11,5 @@ __all__ = [
     "client",
     "together_client",
     "JudgmentClient",
+    "LocalEvaluationQueue",
 ]

judgeval/cli.py ADDED Viewed

@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+import typer
+from pathlib import Path
+from dotenv import load_dotenv
+from judgeval.common.logger import judgeval_logger
+from judgeval.judgment_client import JudgmentClient
+load_dotenv()
+app = typer.Typer(
+    no_args_is_help=True,
+    rich_markup_mode=None,
+    rich_help_panel=None,
+    pretty_exceptions_enable=False,
+    pretty_exceptions_show_locals=False,
+    pretty_exceptions_short=False,
+)
+@app.command("upload_scorer")
+def upload_scorer(
+    scorer_file_path: str,
+    requirements_file_path: str,
+    unique_name: str = typer.Option(
+        None, help="Custom name for the scorer (auto-detected if not provided)"
+    ),
+):
+    # Validate file paths
+    if not Path(scorer_file_path).exists():
+        judgeval_logger.error(f"Scorer file not found: {scorer_file_path}")
+        raise typer.Exit(1)
+    if not Path(requirements_file_path).exists():
+        judgeval_logger.error(f"Requirements file not found: {requirements_file_path}")
+        raise typer.Exit(1)
+    try:
+        client = JudgmentClient()
+        result = client.save_custom_scorer(
+            scorer_file_path=scorer_file_path,
+            requirements_file_path=requirements_file_path,
+            unique_name=unique_name,
+        )
+        if not result:
+            judgeval_logger.error("Failed to upload custom scorer")
+            raise typer.Exit(1)
+        raise typer.Exit(0)
+    except Exception:
+        raise
+@app.command()
+def version():
+    """Show version info"""
+    judgeval_logger.info("JudgEval CLI v0.0.0")
+if __name__ == "__main__":
+    app()
+# judgeval upload_scorer /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/profile_match_scorer.py /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/requirements.txt

judgeval/clients.py CHANGED Viewed

@@ -2,7 +2,6 @@ import os
 from dotenv import load_dotenv
 from openai import OpenAI
 from typing import Optional
-from together import Together, AsyncTogether
 PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
 load_dotenv(dotenv_path=PATH_TO_DOTENV)
@@ -28,6 +27,8 @@ async_together_client: Optional["AsyncTogether"] = None
 together_api_key = os.getenv("TOGETHERAI_API_KEY") or os.getenv("TOGETHER_API_KEY")
 if together_api_key:
     try:
+        from together import Together, AsyncTogether
         together_client = Together(api_key=together_api_key)
         async_together_client = AsyncTogether(api_key=together_api_key)
     except Exception:

judgeval/common/api/api.py CHANGED Viewed

@@ -20,13 +20,11 @@ from judgeval.common.api.constants import (
     JUDGMENT_EVAL_DELETE_API_URL,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
     JUDGMENT_GET_EVAL_STATUS_API_URL,
-    JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL,
-    JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL,
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
     JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
-    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -45,16 +43,14 @@ from judgeval.common.api.constants import (
     DeleteEvalRunRequestBody,
     EvalLogPayload,
     EvalStatusPayload,
-    CheckExperimentTypePayload,
-    EvalRunNameExistsPayload,
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
-    CheckExampleKeysPayload,
+    CustomScorerUploadPayload,
+    CustomScorerTemplateResponse,
 )
 from judgeval.utils.requests import requests
-import orjson
+from judgeval.common.api.json_encoder import json_encoder
 class JudgmentAPIException(exceptions.HTTPError):
@@ -98,22 +94,28 @@ class JudgmentApiClient:
         method: Literal["POST", "PATCH", "GET", "DELETE"],
         url: str,
         payload: Any,
+        timeout: Optional[Union[float, tuple]] = None,
     ) -> Any:
+        # Prepare request kwargs with optional timeout
+        request_kwargs = self._request_kwargs()
+        if timeout is not None:
+            request_kwargs["timeout"] = timeout
         if method == "GET":
             r = requests.request(
                 method,
                 url,
                 params=payload,
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )
         else:
             r = requests.request(
                 method,
                 url,
-                data=self._serialize(payload),
+                json=json_encoder(payload),
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )
         try:
@@ -187,10 +189,10 @@ class JudgmentApiClient:
         payload: EvalLogPayload = {"results": results, "run": run}
         return self._do_request("POST", JUDGMENT_EVAL_LOG_API_URL, payload)
-    def fetch_evaluation_results(self, project_name: str, eval_name: str):
+    def fetch_evaluation_results(self, experiment_run_id: str, project_name: str):
         payload: EvalRunRequestBody = {
             "project_name": project_name,
-            "eval_name": eval_name,
+            "experiment_run_id": experiment_run_id,
         }
         return self._do_request("POST", JUDGMENT_EVAL_FETCH_API_URL, payload)
@@ -205,43 +207,21 @@ class JudgmentApiClient:
     def add_to_evaluation_queue(self, payload: Dict[str, Any]):
         return self._do_request("POST", JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL, payload)
-    def get_evaluation_status(self, eval_name: str, project_name: str):
+    def get_evaluation_status(self, experiment_run_id: str, project_name: str):
         payload: EvalStatusPayload = {
-            "eval_name": eval_name,
+            "experiment_run_id": experiment_run_id,
             "project_name": project_name,
             "judgment_api_key": self.api_key,
         }
         return self._do_request("GET", JUDGMENT_GET_EVAL_STATUS_API_URL, payload)
-    def check_experiment_type(self, eval_name: str, project_name: str, is_trace: bool):
-        payload: CheckExperimentTypePayload = {
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-            "is_trace": is_trace,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL, payload)
-    def check_eval_run_name_exists(self, eval_name: str, project_name: str):
-        payload: EvalRunNameExistsPayload = {
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-        }
-        return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
-    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
-        payload: CheckExampleKeysPayload = {
-            "keys": keys,
-            "eval_name": eval_name,
-            "project_name": project_name,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
-    def save_scorer(self, name: str, prompt: str, options: Optional[dict] = None):
+    def save_scorer(
+        self, name: str, prompt: str, threshold: float, options: Optional[dict] = None
+    ):
         payload: ScorerSavePayload = {
             "name": name,
             "prompt": prompt,
+            "threshold": threshold,
             "options": options,
         }
         try:
@@ -293,6 +273,31 @@ class JudgmentApiClient:
                 request=e.request,
             )
+    def upload_custom_scorer(
+        self,
+        scorer_name: str,
+        scorer_code: str,
+        requirements_text: str,
+    ) -> CustomScorerTemplateResponse:
+        """Upload custom scorer to backend"""
+        payload: CustomScorerUploadPayload = {
+            "scorer_name": scorer_name,
+            "scorer_code": scorer_code,
+            "requirements_text": requirements_text,
+        }
+        try:
+            # Use longer timeout for custom scorer upload (5 minutes)
+            response = self._do_request(
+                "POST",
+                JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
+                payload,
+                timeout=(10, 300),
+            )
+            return response
+        except JudgmentAPIException as e:
+            raise e
     def push_dataset(
         self,
         dataset_alias: str,
@@ -368,16 +373,3 @@ class JudgmentApiClient:
             "verify": True,
             "timeout": 30,
         }
-    def _serialize(self, data: Any) -> str:
-        def fallback_encoder(obj):
-            try:
-                return repr(obj)
-            except Exception:
-                try:
-                    return str(obj)
-                except Exception as e:
-                    return f"<Unserializable object of type {type(obj).__name__}: {e}>"
-        # orjson returns bytes, so we need to decode to str
-        return orjson.dumps(data, default=fallback_encoder).decode("utf-8")

judgeval/common/api/constants.py CHANGED Viewed

@@ -49,9 +49,9 @@ JUDGMENT_EVAL_DELETE_API_URL = (
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
-JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
-JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
-JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
+# Custom Scorers API
+JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/build_sandbox_template/"
 # Evaluation API Payloads
@@ -73,9 +73,9 @@ class EvalLogPayload(TypedDict):
 class EvalStatusPayload(TypedDict):
-    eval_name: str
-    project_name: str
+    experiment_run_id: str
     judgment_api_key: str
+    project_name: str
 class CheckExperimentTypePayload(TypedDict):
@@ -162,6 +162,7 @@ JUDGMENT_SCORER_EXISTS_API_URL = f"{ROOT_API}/scorer_exists/"
 class ScorerSavePayload(TypedDict):
     name: str
     prompt: str
+    threshold: float
     options: Optional[dict]
@@ -171,3 +172,15 @@ class ScorerFetchPayload(TypedDict):
 class ScorerExistsPayload(TypedDict):
     name: str
+class CustomScorerUploadPayload(TypedDict):
+    scorer_name: str
+    scorer_code: str
+    requirements_text: str
+class CustomScorerTemplateResponse(TypedDict):
+    scorer_name: str
+    status: str
+    message: str

judgeval/common/api/json_encoder.py ADDED Viewed

@@ -0,0 +1,241 @@
+"""
+This is a modified version of https://docs.powertools.aws.dev/lambda/python/2.35.1/api/event_handler/openapi/encoders.html
+"""
+import dataclasses
+import datetime
+from collections import defaultdict, deque
+from decimal import Decimal
+from enum import Enum
+from pathlib import Path, PurePath
+from re import Pattern
+from types import GeneratorType
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Type, Union
+from uuid import UUID
+from pydantic import BaseModel
+from pydantic.types import SecretBytes, SecretStr
+"""
+This module contains the encoders used by jsonable_encoder to convert Python objects to JSON serializable data types.
+"""
+def _model_dump(
+    model: BaseModel, mode: Literal["json", "python"] = "json", **kwargs: Any
+) -> Any:
+    return model.model_dump(mode=mode, **kwargs)
+def json_encoder(
+    obj: Any,
+    custom_serializer: Optional[Callable[[Any], str]] = None,
+) -> Any:
+    """
+    JSON encodes an arbitrary Python object into JSON serializable data types.
+    This is a modified version of fastapi.encoders.jsonable_encoder that supports
+    encoding of pydantic.BaseModel objects.
+    Parameters
+    ----------
+    obj : Any
+        The object to encode
+    custom_serializer : Callable, optional
+        A custom serializer to use for encoding the object, when everything else fails.
+    Returns
+    -------
+    Any
+        The JSON serializable data types
+    """
+    # Pydantic models
+    if isinstance(obj, BaseModel):
+        return _dump_base_model(
+            obj=obj,
+        )
+    # Dataclasses
+    if dataclasses.is_dataclass(obj):
+        obj_dict = dataclasses.asdict(obj)
+        return json_encoder(
+            obj_dict,
+        )
+    # Enums
+    if isinstance(obj, Enum):
+        return obj.value
+    # Paths
+    if isinstance(obj, PurePath):
+        return str(obj)
+    # Scalars
+    if isinstance(obj, (str, int, float, type(None))):
+        return obj
+    # Dictionaries
+    if isinstance(obj, dict):
+        return _dump_dict(
+            obj=obj,
+        )
+    # Sequences
+    if isinstance(obj, (list, set, frozenset, tuple, deque)):
+        return _dump_sequence(
+            obj=obj,
+        )
+    # Other types
+    if type(obj) in ENCODERS_BY_TYPE:
+        return ENCODERS_BY_TYPE[type(obj)](obj)
+    for encoder, classes_tuple in encoders_by_class_tuples.items():
+        if isinstance(obj, classes_tuple):
+            return encoder(obj)
+    # Use custom serializer if present
+    if custom_serializer:
+        return custom_serializer(obj)
+    # Default
+    return _dump_other(
+        obj=obj,
+    )
+def _dump_base_model(
+    *,
+    obj: Any,
+):
+    """
+    Dump a BaseModel object to a dict, using the same parameters as jsonable_encoder
+    """
+    obj_dict = _model_dump(
+        obj,
+        mode="json",
+    )
+    if "__root__" in obj_dict:
+        obj_dict = obj_dict["__root__"]
+    return json_encoder(
+        obj_dict,
+    )
+def _dump_dict(
+    *,
+    obj: Any,
+) -> Dict[str, Any]:
+    """
+    Dump a dict to a dict, using the same parameters as jsonable_encoder
+    """
+    encoded_dict = {}
+    allowed_keys = set(obj.keys())
+    for key, value in obj.items():
+        if key in allowed_keys:
+            encoded_key = json_encoder(
+                key,
+            )
+            encoded_value = json_encoder(
+                value,
+            )
+            encoded_dict[encoded_key] = encoded_value
+    return encoded_dict
+def _dump_sequence(
+    *,
+    obj: Any,
+) -> List[Any]:
+    """
+    Dump a sequence to a list, using the same parameters as jsonable_encoder
+    """
+    encoded_list = []
+    for item in obj:
+        encoded_list.append(
+            json_encoder(
+                item,
+            ),
+        )
+    return encoded_list
+def _dump_other(
+    *,
+    obj: Any,
+) -> Any:
+    """
+    Dump an object to a representation without iterating it.
+    Avoids calling dict(obj) which can consume iterators/generators or
+    invoke user-defined iteration protocols.
+    """
+    try:
+        return repr(obj)
+    except Exception:
+        return str(obj)
+def iso_format(o: Union[datetime.date, datetime.time]) -> str:
+    """
+    ISO format for date and time
+    """
+    return o.isoformat()
+def decimal_encoder(dec_value: Decimal) -> Union[int, float]:
+    """
+    Encodes a Decimal as int of there's no exponent, otherwise float
+    This is useful when we use ConstrainedDecimal to represent Numeric(x,0)
+    where an integer (but not int typed) is used. Encoding this as a float
+    results in failed round-tripping between encode and parse.
+    >>> decimal_encoder(Decimal("1.0"))
+    1.0
+    >>> decimal_encoder(Decimal("1"))
+    1
+    """
+    if dec_value.as_tuple().exponent >= 0:  # type: ignore[operator]
+        return int(dec_value)
+    else:
+        return float(dec_value)
+ENCODERS_BY_TYPE: Dict[Type[Any], Callable[[Any], Any]] = {
+    bytes: lambda o: o.decode(),
+    datetime.date: iso_format,
+    datetime.datetime: iso_format,
+    datetime.time: iso_format,
+    datetime.timedelta: lambda td: td.total_seconds(),
+    Decimal: decimal_encoder,
+    Enum: lambda o: o.value,
+    frozenset: list,
+    deque: list,
+    GeneratorType: repr,
+    Path: str,
+    Pattern: lambda o: o.pattern,
+    SecretBytes: str,
+    SecretStr: str,
+    set: list,
+    UUID: str,
+}
+# Generates a mapping of encoders to a tuple of classes that they can encode
+def generate_encoders_by_class_tuples(
+    type_encoder_map: Dict[Any, Callable[[Any], Any]],
+) -> Dict[Callable[[Any], Any], Tuple[Any, ...]]:
+    encoders: Dict[Callable[[Any], Any], Tuple[Any, ...]] = defaultdict(tuple)
+    for type_, encoder in type_encoder_map.items():
+        encoders[encoder] += (type_,)
+    return encoders
+# Mapping of encoders to a tuple of classes that they can encode
+encoders_by_class_tuples = generate_encoders_by_class_tuples(ENCODERS_BY_TYPE)

judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl