judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/evaluation_run.py ADDED
@@ -0,0 +1,125 @@
+ from typing import List, Optional, Union, Tuple, Sequence
+ from pydantic import field_validator, model_validator, Field, BaseModel
+ from datetime import datetime, timezone
+ import uuid
+
+ from judgeval.data import Example
+ from judgeval.scorers import APIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
+ from judgeval.constants import ACCEPTABLE_MODELS
+ from judgeval.data.judgment_types import (
+     ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
+     TraceEvaluationRun as TraceEvaluationRunJudgmentType,
+ )
+
+
+ class EvaluationRun(BaseModel):
+     id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+     created_at: str = Field(
+         default_factory=lambda: datetime.now(timezone.utc).isoformat()
+     )
+     custom_scorers: List[ExampleScorer] = Field(default_factory=list)
+     judgment_scorers: Sequence[APIScorerConfig] = Field(default_factory=list)
+     scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
+         default_factory=list
+     )
+     model: Optional[str] = None
+
+     def __init__(
+         self,
+         scorers: Optional[List[Union[ExampleScorer, APIScorerConfig]]] = None,
+         **kwargs,
+     ):
+         """
+         Initialize EvaluationRun with automatic scorer classification.
+
+         Args:
+             scorers: List of scorers that will be automatically sorted into custom_scorers or judgment_scorers
+             **kwargs: Other initialization arguments
+         """
+         if scorers is not None:
+             # Automatically sort scorers into appropriate fields
+             custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
+             judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
+
+             # Always set both fields as lists (even if empty) to satisfy validation
+             kwargs["custom_scorers"] = custom_scorers
+             kwargs["judgment_scorers"] = judgment_scorers
+
+         super().__init__(**kwargs)
+
+     def model_dump(self, **kwargs):
+         data = super().model_dump(**kwargs)
+         data["custom_scorers"] = [s.model_dump() for s in self.custom_scorers]
+         data["judgment_scorers"] = [s.model_dump() for s in self.judgment_scorers]
+
+         return data
+
+     @model_validator(mode="after")
+     @classmethod
+     def validate_scorer_lists(cls, values):
+         custom_scorers = values.custom_scorers
+         judgment_scorers = values.judgment_scorers
+
+         # Check that both lists are not empty
+         if not custom_scorers and not judgment_scorers:
+             raise ValueError(
+                 "At least one of custom_scorers or judgment_scorers must be provided."
+             )
+
+         # Check that only one list is filled
+         if custom_scorers and judgment_scorers:
+             raise ValueError(
+                 "Only one of custom_scorers or judgment_scorers can be provided, not both."
+             )
+
+         return values
+
+     @field_validator("model")
+     def validate_model(cls, v, values):
+         # Check if model is string or list of strings
+         if v is not None and isinstance(v, str):
+             if v not in ACCEPTABLE_MODELS:
+                 raise ValueError(
+                     f"Model name {v} not recognized. Please select a valid model name.)"
+                 )
+         return v
+
+
+ class ExampleEvaluationRun(EvaluationRun, ExampleEvaluationRunJudgmentType):  # type: ignore
+     """
+     Stores example and evaluation scorers together for running an eval task
+
+     Args:
+         project_name (str): The name of the project the evaluation results belong to
+         eval_name (str): A name for this evaluation run
+         examples (List[Example]): The examples to evaluate
+         scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
+         model (str): The model used as a judge when using LLM as a Judge
+     """
+
+     examples: List[Example]  # type: ignore
+
+     @field_validator("examples")
+     def validate_examples(cls, v):
+         if not v:
+             raise ValueError("Examples cannot be empty.")
+         for item in v:
+             if not isinstance(item, Example):
+                 raise ValueError(f"Item of type {type(item)} is not a Example")
+         return v
+
+     def model_dump(self, **kwargs):
+         data = super().model_dump(**kwargs)
+         data["examples"] = [example.model_dump() for example in self.examples]
+         return data
+
+
+ class TraceEvaluationRun(EvaluationRun, TraceEvaluationRunJudgmentType):  # type: ignore
+     trace_and_span_ids: List[Tuple[str, str]]  # type: ignore
+
+     @field_validator("trace_and_span_ids")
+     def validate_trace_and_span_ids(cls, v):
+         if not v:
+             raise ValueError("Trace and span IDs are required for trace evaluations.")
+         return v
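
The rewritten constructor above routes a single scorers list into custom_scorers and judgment_scorers via isinstance checks, and validate_scorer_lists then requires exactly one of the two to be non-empty. A minimal sketch of that behavior, assuming FaithfulnessScorer (see api_scorers/faithfulness.py in the file list) is an APIScorerConfig subclass exported by judgeval.scorers:

# Sketch, not from the package: FaithfulnessScorer is assumed to be an
# APIScorerConfig subclass exported by judgeval.scorers.
from judgeval.data.evaluation_run import EvaluationRun
from judgeval.scorers import FaithfulnessScorer

run = EvaluationRun(scorers=[FaithfulnessScorer(threshold=0.8)])
assert run.judgment_scorers    # routed here by the isinstance() check
assert not run.custom_scorers  # left empty, satisfying validate_scorer_lists

# Mixing an APIScorerConfig with a custom ExampleScorer in the same list
# would populate both fields and make validate_scorer_lists raise ValueError.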
judgeval/data/example.py CHANGED
@@ -2,17 +2,15 @@
  Classes for representing examples in a dataset.
  """
 
- 
- from typing import TypeVar, Optional, Any, Dict, List
- from pydantic import BaseModel
  from enum import Enum
  from datetime import datetime
+ from typing import Dict, Any, Optional
+ from judgeval.data.judgment_types import Example as JudgmentExample
+ from uuid import uuid4
+ from pydantic import Field
 
 
- Input = TypeVar('Input')
- Output = TypeVar('Output')
- 
- class ExampleParams(Enum):
+ class ExampleParams(str, Enum):
      INPUT = "input"
      ACTUAL_OUTPUT = "actual_output"
      EXPECTED_OUTPUT = "expected_output"
@@ -20,57 +18,18 @@ class ExampleParams(Enum):
      RETRIEVAL_CONTEXT = "retrieval_context"
      TOOLS_CALLED = "tools_called"
      EXPECTED_TOOLS = "expected_tools"
-     REASONING = "reasoning"
+     ADDITIONAL_METADATA = "additional_metadata"
 
 
- class Example(BaseModel):
-     input: Input
-     actual_output: Output
-     expected_output: Optional[str] = None
-     context: Optional[List[str]] = None
-     retrieval_context: Optional[List[str]] = None
-     additional_metadata: Optional[Dict[str, Any]] = None
-     tools_called: Optional[List[str]] = None
-     expected_tools: Optional[List[str]] = None
+ class Example(JudgmentExample):
+     example_id: str = Field(default_factory=lambda: str(uuid4()))
+     created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
      name: Optional[str] = None
-     example_id: Optional[str] = None
-     timestamp: Optional[str] = None
-     trace_id: Optional[str] = None
-
-     def __init__(self, **data):
-         super().__init__(**data)
-         # Set timestamp if not provided
-         if self.timestamp is None:
-             self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
-     def to_dict(self):
-         return {
-             "input": self.input,
-             "actual_output": self.actual_output,
-             "expected_output": self.expected_output,
-             "context": self.context,
-             "retrieval_context": self.retrieval_context,
-             "additional_metadata": self.additional_metadata,
-             "tools_called": self.tools_called,
-             "expected_tools": self.expected_tools,
-             "name": self.name,
-             "example_id": self.example_id,
-             "timestamp": self.timestamp,
-             "trace_id": self.trace_id
-         }
+     def to_dict(self) -> Dict[str, Any]:
+         data = super().model_dump(warnings=False)
+         return data
 
-     def __str__(self):
-         return (
-             f"Example(input={self.input}, "
-             f"actual_output={self.actual_output}, "
-             f"expected_output={self.expected_output}, "
-             f"context={self.context}, "
-             f"retrieval_context={self.retrieval_context}, "
-             f"additional_metadata={self.additional_metadata}, "
-             f"tools_called={self.tools_called}, "
-             f"expected_tools={self.expected_tools}, "
-             f"name={self.name}, "
-             f"example_id={self.example_id}, "
-             f"timestamp={self.timestamp}, "
-             f"trace_id={self.trace_id})"
-         )
+     def get_fields(self):
+         excluded = {"example_id", "name", "created_at"}
+         return self.model_dump(exclude=excluded)
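
Example now subclasses the generated JudgmentExample (defined in judgment_types.py below), whose extra="allow" config means evaluation fields such as input and actual_output ride along as free-form extras rather than declared attributes. A hypothetical construction under that assumption:

# Hypothetical usage; only example_id, created_at, and name are declared
# fields, so the remaining keyword arguments are kept via extra="allow".
from judgeval.data import Example

ex = Example(
    input="What is the capital of France?",
    actual_output="Paris",
    expected_output="Paris",
)
print(ex.example_id)    # auto-generated uuid4 string
print(ex.get_fields())  # model_dump() minus example_id, name, created_at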
judgeval/data/judgment_types.py ADDED
@@ -0,0 +1,450 @@
+ # generated by datamodel-codegen:
+ #   filename: .openapi.json
+ #   timestamp: 2025-10-25T22:30:19+00:00
+
+ from __future__ import annotations
+ from typing import Annotated, Any, Dict, List, Optional, Union
+ from pydantic import AwareDatetime, BaseModel, ConfigDict, Field, RootModel
+ from enum import Enum
+
+
+ class TraceAndSpanId(RootModel[List]):
+     root: Annotated[List, Field(max_length=2, min_length=2)]
+
+
+ class EvalResultsFetch(BaseModel):
+     experiment_run_id: Annotated[str, Field(title="Experiment Run Id")]
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class DatasetFetch(BaseModel):
+     dataset_name: Annotated[str, Field(title="Dataset Name")]
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class DatasetsFetch(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class ProjectAdd(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class ProjectAddResponse(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+
+
+ class ProjectDeleteFromJudgevalResponse(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class ProjectDeleteResponse(BaseModel):
+     message: Annotated[str, Field(title="Message")]
+
+
+ class ScorerExistsRequest(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+
+
+ class ScorerExistsResponse(BaseModel):
+     exists: Annotated[bool, Field(title="Exists")]
+
+
+ class SavePromptScorerRequest(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     prompt: Annotated[str, Field(title="Prompt")]
+     threshold: Annotated[float, Field(title="Threshold")]
+     model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
+     options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+     description: Annotated[Optional[str], Field(title="Description")] = None
+
+
+ class FetchPromptScorersRequest(BaseModel):
+     names: Annotated[Optional[List[str]], Field(title="Names")] = None
+     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = None
+
+
+ class CustomScorerUploadPayload(BaseModel):
+     scorer_name: Annotated[str, Field(title="Scorer Name")]
+     scorer_code: Annotated[str, Field(title="Scorer Code")]
+     requirements_text: Annotated[str, Field(title="Requirements Text")]
+     overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
+
+
+ class CustomScorerTemplateResponse(BaseModel):
+     scorer_name: Annotated[str, Field(title="Scorer Name")]
+     status: Annotated[str, Field(title="Status")]
+     message: Annotated[str, Field(title="Message")]
+
+
+ class PromptInsertRequest(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+     name: Annotated[str, Field(title="Name")]
+     prompt: Annotated[str, Field(title="Prompt")]
+     tags: Annotated[List[str], Field(title="Tags")]
+
+
+ class PromptInsertResponse(BaseModel):
+     commit_id: Annotated[str, Field(title="Commit Id")]
+     parent_commit_id: Annotated[Optional[str], Field(title="Parent Commit Id")] = None
+     created_at: Annotated[str, Field(title="Created At")]
+
+
+ class PromptTagRequest(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+     name: Annotated[str, Field(title="Name")]
+     commit_id: Annotated[str, Field(title="Commit Id")]
+     tags: Annotated[List[str], Field(title="Tags")]
+
+
+ class PromptTagResponse(BaseModel):
+     commit_id: Annotated[str, Field(title="Commit Id")]
+
+
+ class PromptUntagRequest(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+     name: Annotated[str, Field(title="Name")]
+     tags: Annotated[List[str], Field(title="Tags")]
+
+
+ class PromptUntagResponse(BaseModel):
+     commit_ids: Annotated[List[str], Field(title="Commit Ids")]
+
+
+ class ResolveProjectNameRequest(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class ResolveProjectNameResponse(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+
+
+ class TraceIdRequest(BaseModel):
+     trace_id: Annotated[str, Field(title="Trace Id")]
+
+
+ class SpanScoreRequest(BaseModel):
+     span_id: Annotated[str, Field(title="Span Id")]
+     trace_id: Annotated[str, Field(title="Trace Id")]
+
+
+ class BaseScorer(BaseModel):
+     score_type: Annotated[str, Field(title="Score Type")]
+     threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+     name: Annotated[Optional[str], Field(title="Name")] = None
+     class_name: Annotated[Optional[str], Field(title="Class Name")] = None
+     score: Annotated[Optional[float], Field(title="Score")] = None
+     score_breakdown: Annotated[
+         Optional[Dict[str, Any]], Field(title="Score Breakdown")
+     ] = None
+     reason: Annotated[Optional[str], Field(title="Reason")] = ""
+     using_native_model: Annotated[Optional[bool], Field(title="Using Native Model")] = (
+         None
+     )
+     success: Annotated[Optional[bool], Field(title="Success")] = None
+     model: Annotated[Optional[str], Field(title="Model")] = None
+     model_client: Annotated[Any, Field(title="Model Client")] = None
+     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+     error: Annotated[Optional[str], Field(title="Error")] = None
+     additional_metadata: Annotated[
+         Optional[Dict[str, Any]], Field(title="Additional Metadata")
+     ] = None
+     user: Annotated[Optional[str], Field(title="User")] = None
+     server_hosted: Annotated[Optional[bool], Field(title="Server Hosted")] = False
+
+
+ class ScorerConfig(BaseModel):
+     score_type: Annotated[str, Field(title="Score Type")]
+     name: Annotated[Optional[str], Field(title="Name")] = None
+     threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+     model: Annotated[Optional[str], Field(title="Model")] = None
+     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+     required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = []
+     kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
+
+
+ class Example(BaseModel):
+     model_config = ConfigDict(
+         extra="allow",
+     )
+     example_id: Annotated[Optional[str], Field(title="Example Id")] = None
+     created_at: Annotated[Optional[str], Field(title="Created At")] = None
+     name: Annotated[Optional[str], Field(title="Name")] = None
+
+
+ class ValidationError(BaseModel):
+     loc: Annotated[List[Union[str, int]], Field(title="Location")]
+     msg: Annotated[str, Field(title="Message")]
+     type: Annotated[str, Field(title="Error Type")]
+
+
+ class UsageInfo(BaseModel):
+     total_judgees: Annotated[int, Field(title="Total Judgees")]
+     regular_use: Annotated[int, Field(title="Regular Use")]
+     pay_as_you_go_use: Annotated[int, Field(title="Pay As You Go Use")]
+     remaining_regular: Annotated[int, Field(title="Remaining Regular")]
+     remaining_after: Annotated[int, Field(title="Remaining After")]
+
+
+ class DatasetKind(Enum):
+     trace = "trace"
+     example = "example"
+
+
+ class PromptScorer(BaseModel):
+     id: Annotated[str, Field(title="Id")]
+     user_id: Annotated[str, Field(title="User Id")]
+     organization_id: Annotated[str, Field(title="Organization Id")]
+     name: Annotated[str, Field(title="Name")]
+     prompt: Annotated[str, Field(title="Prompt")]
+     threshold: Annotated[float, Field(title="Threshold")]
+     model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+     options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+     description: Annotated[Optional[str], Field(title="Description")] = None
+     created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
+     updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
+     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
+     is_bucket_rubric: Annotated[Optional[bool], Field(title="Is Bucket Rubric")] = None
+
+
+ class PromptCommitInfo(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     prompt: Annotated[str, Field(title="Prompt")]
+     tags: Annotated[List[str], Field(title="Tags")]
+     commit_id: Annotated[str, Field(title="Commit Id")]
+     parent_commit_id: Annotated[Optional[str], Field(title="Parent Commit Id")] = None
+     created_at: Annotated[str, Field(title="Created At")]
+     first_name: Annotated[str, Field(title="First Name")]
+     last_name: Annotated[str, Field(title="Last Name")]
+     user_email: Annotated[str, Field(title="User Email")]
+
+
+ class ScorerData(BaseModel):
+     id: Annotated[Optional[str], Field(title="Id")] = None
+     name: Annotated[str, Field(title="Name")]
+     threshold: Annotated[float, Field(title="Threshold")]
+     success: Annotated[bool, Field(title="Success")]
+     score: Annotated[Optional[float], Field(title="Score")] = None
+     reason: Annotated[Optional[str], Field(title="Reason")] = None
+     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
+     evaluation_model: Annotated[Optional[str], Field(title="Evaluation Model")] = None
+     error: Annotated[Optional[str], Field(title="Error")] = None
+     additional_metadata: Annotated[
+         Optional[Dict[str, Any]], Field(title="Additional Metadata")
+     ] = None
+
+
+ class OtelTraceSpan(BaseModel):
+     organization_id: Annotated[str, Field(title="Organization Id")]
+     project_id: Annotated[Optional[str], Field(title="Project Id")] = None
+     user_id: Annotated[str, Field(title="User Id")]
+     timestamp: Annotated[str, Field(title="Timestamp")]
+     trace_id: Annotated[str, Field(title="Trace Id")]
+     span_id: Annotated[str, Field(title="Span Id")]
+     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+     trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+     span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+     service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+     resource_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Resource Attributes")
+     ] = None
+     span_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Span Attributes")
+     ] = None
+     duration: Annotated[Optional[int], Field(title="Duration")] = None
+     status_code: Annotated[Optional[int], Field(title="Status Code")] = None
+     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
+     links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
+
+
+ class OtelSpanListItemScores(BaseModel):
+     success: Annotated[bool, Field(title="Success")]
+     score: Annotated[float, Field(title="Score")]
+     reason: Annotated[Optional[str], Field(title="Reason")] = None
+     name: Annotated[str, Field(title="Name")]
+
+
+ class OtelSpanDetailScores(BaseModel):
+     success: Annotated[bool, Field(title="Success")]
+     score: Annotated[float, Field(title="Score")]
+     reason: Annotated[Optional[str], Field(title="Reason")] = None
+     name: Annotated[str, Field(title="Name")]
+     example_id: Annotated[Optional[str], Field(title="Example Id")] = None
+
+
+ class ExampleEvaluationRun(BaseModel):
+     id: Annotated[Optional[str], Field(title="Id")] = None
+     project_name: Annotated[str, Field(title="Project Name")]
+     eval_name: Annotated[str, Field(title="Eval Name")]
+     custom_scorers: Annotated[
+         Optional[List[BaseScorer]], Field(title="Custom Scorers")
+     ] = []
+     judgment_scorers: Annotated[
+         Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
+     ] = []
+     model: Annotated[Optional[str], Field(title="Model")] = None
+     created_at: Annotated[Optional[str], Field(title="Created At")] = None
+     examples: Annotated[List[Example], Field(title="Examples")]
+     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+
+
+ class HTTPValidationError(BaseModel):
+     detail: Annotated[Optional[List[ValidationError]], Field(title="Detail")] = None
+
+
+ class TraceEvaluationRun(BaseModel):
+     id: Annotated[Optional[str], Field(title="Id")] = None
+     project_name: Annotated[str, Field(title="Project Name")]
+     eval_name: Annotated[str, Field(title="Eval Name")]
+     custom_scorers: Annotated[
+         Optional[List[BaseScorer]], Field(title="Custom Scorers")
+     ] = []
+     judgment_scorers: Annotated[
+         Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
+     ] = []
+     model: Annotated[Optional[str], Field(title="Model")] = None
+     created_at: Annotated[Optional[str], Field(title="Created At")] = None
+     trace_and_span_ids: Annotated[
+         List[TraceAndSpanId], Field(title="Trace And Span Ids")
+     ]
+     is_offline: Annotated[Optional[bool], Field(title="Is Offline")] = False
+     is_bucket_run: Annotated[Optional[bool], Field(title="Is Bucket Run")] = False
+
+
+ class DatasetInsertExamples(BaseModel):
+     dataset_name: Annotated[str, Field(title="Dataset Name")]
+     examples: Annotated[List[Example], Field(title="Examples")]
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class DatasetInfo(BaseModel):
+     dataset_id: Annotated[str, Field(title="Dataset Id")]
+     name: Annotated[str, Field(title="Name")]
+     created_at: Annotated[str, Field(title="Created At")]
+     kind: DatasetKind
+     entries: Annotated[int, Field(title="Entries")]
+     creator: Annotated[str, Field(title="Creator")]
+
+
+ class DatasetCreate(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     dataset_kind: DatasetKind
+     project_name: Annotated[str, Field(title="Project Name")]
+     examples: Annotated[List[Example], Field(title="Examples")]
+     overwrite: Annotated[bool, Field(title="Overwrite")]
+
+
+ class SavePromptScorerResponse(BaseModel):
+     scorer_response: PromptScorer
+
+
+ class FetchPromptScorersResponse(BaseModel):
+     scorers: Annotated[List[PromptScorer], Field(title="Scorers")]
+
+
+ class PromptFetchResponse(BaseModel):
+     commit: Optional[PromptCommitInfo] = None
+
+
+ class PromptVersionsResponse(BaseModel):
+     versions: Annotated[List[PromptCommitInfo], Field(title="Versions")]
+
+
+ class ScoringResult(BaseModel):
+     success: Annotated[bool, Field(title="Success")]
+     scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]
+     name: Annotated[Optional[str], Field(title="Name")] = None
+     data_object: Annotated[
+         Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
+     ] = None
+     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+     run_duration: Annotated[Optional[float], Field(title="Run Duration")] = None
+     evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None
+
+
+ class OtelTraceListItem(BaseModel):
+     organization_id: Annotated[str, Field(title="Organization Id")]
+     project_id: Annotated[str, Field(title="Project Id")]
+     trace_id: Annotated[str, Field(title="Trace Id")]
+     created_at: Annotated[AwareDatetime, Field(title="Created At")]
+     duration: Annotated[Optional[int], Field(title="Duration")] = None
+     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
+     experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
+     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+     llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+     error: Annotated[Optional[str], Field(title="Error")] = ""
+     scores: Annotated[
+         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
+     ] = []
+     rules_invoked: Annotated[Optional[List[str]], Field(title="Rules Invoked")] = []
+     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+     input: Annotated[Optional[str], Field(title="Input")] = None
+     output: Annotated[Optional[str], Field(title="Output")] = None
+     input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
+     output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
+     annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
+     span_id: Annotated[str, Field(title="Span Id")]
+     rule_id: Annotated[Optional[str], Field(title="Rule Id")] = None
+
+
+ class OtelSpanDetail(BaseModel):
+     organization_id: Annotated[str, Field(title="Organization Id")]
+     project_id: Annotated[str, Field(title="Project Id")]
+     timestamp: Annotated[AwareDatetime, Field(title="Timestamp")]
+     trace_id: Annotated[str, Field(title="Trace Id")]
+     span_id: Annotated[str, Field(title="Span Id")]
+     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+     trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+     span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+     service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+     resource_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Resource Attributes")
+     ] = None
+     span_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Span Attributes")
+     ] = None
+     duration: Annotated[Optional[int], Field(title="Duration")] = None
+     status_code: Annotated[Optional[int], Field(title="Status Code")] = None
+     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
+     links: Annotated[
+         Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
+     ] = None
+     llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+     prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
+     completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
+     scores: Annotated[Optional[List[OtelSpanDetailScores]], Field(title="Scores")] = (
+         None
+     )
+
+
+ class EvaluateResponse(BaseModel):
+     status: Annotated[str, Field(title="Status")]
+     results: Annotated[List[ScoringResult], Field(title="Results")]
+     resource_usage: Optional[UsageInfo] = None
+
+
+ class EvalResults(BaseModel):
+     results: Annotated[List[ScoringResult], Field(title="Results")]
+     run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
+
+
+ class DatasetTraceWithSpans(BaseModel):
+     dataset_id: Annotated[str, Field(title="Dataset Id")]
+     trace_detail: OtelTraceListItem
+     spans: Annotated[List[OtelSpanDetail], Field(title="Spans")]
+
+
+ class DatasetReturn(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     project_name: Annotated[str, Field(title="Project Name")]
+     dataset_kind: DatasetKind
+     examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
+     traces: Annotated[Optional[List[DatasetTraceWithSpans]], Field(title="Traces")] = (
+         None
+     )
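
These generated classes are ordinary pydantic v2 models, so API payloads can be validated straight into them. A small sketch with an illustrative payload; the field names match the ScorerData and ScoringResult models defined above:

# Sketch: the payload values are made up, but the shape follows the
# ScorerData / ScoringResult definitions in judgment_types.py.
from judgeval.data.judgment_types import ScoringResult

result = ScoringResult.model_validate(
    {
        "success": True,
        "scorers_data": [
            {"name": "faithfulness", "threshold": 0.7, "success": True, "score": 0.92}
        ],
    }
)
print(result.scorers_data[0].score)  # 0.92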