judgeval 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. judgeval/__init__.py +5 -5
  2. judgeval/api/api_types.py +81 -12
  3. judgeval/cli.py +2 -1
  4. judgeval/constants.py +0 -6
  5. judgeval/data/evaluation_run.py +7 -8
  6. judgeval/data/judgment_types.py +97 -12
  7. judgeval/data/trace.py +108 -1
  8. judgeval/dataset/__init__.py +72 -23
  9. judgeval/env.py +5 -20
  10. judgeval/integrations/langgraph/__init__.py +9 -785
  11. judgeval/scorers/__init__.py +6 -0
  12. judgeval/scorers/api_scorer.py +15 -12
  13. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  14. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  15. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  16. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  17. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +26 -35
  18. judgeval/scorers/score.py +1 -1
  19. judgeval/scorers/utils.py +1 -4
  20. judgeval/tracer/__init__.py +181 -162
  21. judgeval/tracer/exporters/__init__.py +4 -1
  22. judgeval/tracer/keys.py +15 -25
  23. judgeval/tracer/llm/__init__.py +0 -1
  24. judgeval/tracer/llm/anthropic/__init__.py +20 -0
  25. judgeval/tracer/llm/google/__init__.py +21 -0
  26. judgeval/tracer/llm/groq/__init__.py +20 -0
  27. judgeval/tracer/llm/openai/__init__.py +32 -0
  28. judgeval/tracer/llm/providers.py +28 -79
  29. judgeval/tracer/llm/together/__init__.py +20 -0
  30. judgeval/tracer/managers.py +23 -48
  31. judgeval/tracer/processors/__init__.py +36 -75
  32. judgeval/tracer/utils.py +3 -4
  33. judgeval/trainer/trainer.py +4 -4
  34. judgeval/utils/file_utils.py +0 -2
  35. judgeval/utils/meta.py +18 -5
  36. judgeval/utils/testing.py +0 -14
  37. judgeval/utils/version_check.py +2 -0
  38. judgeval/version.py +1 -1
  39. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
  40. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +43 -38
  41. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
  42. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
  43. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -5,12 +5,12 @@ from judgeval.evaluation import run_eval
  from judgeval.data.evaluation_run import ExampleEvaluationRun


- from typing import List, Optional, Union
- from judgeval.scorers import APIScorerConfig
+ from typing import List, Optional, Union, Sequence
+ from judgeval.scorers import ExampleAPIScorerConfig
  from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.data.example import Example
  from judgeval.logger import judgeval_logger
- from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
  from judgeval.utils.meta import SingletonMeta
  from judgeval.exceptions import JudgmentRuntimeError, JudgmentTestError
  from judgeval.api import JudgmentSyncClient
@@ -39,10 +39,10 @@ class JudgmentClient(metaclass=SingletonMeta):
  def run_evaluation(
  self,
  examples: List[Example],
- scorers: List[Union[APIScorerConfig, ExampleScorer]],
+ scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
  project_name: str = "default_project",
  eval_run_name: str = "default_eval_run",
- model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+ model: Optional[str] = None,
  assert_test: bool = False,
  ) -> List[ScoringResult]:
  try:
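Note: with `model` now optional and `scorers` typed as a `Sequence`, an evaluation can be kicked off without naming a model. A minimal sketch, assuming `JUDGMENT_API_KEY`/`JUDGMENT_ORG_ID` are set in the environment and that `FaithfulnessScorer` is exported from `judgeval.scorers`; the `Example` fields are illustrative:

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # assumed export; any ExampleAPIScorerConfig works

client = JudgmentClient()  # singleton; reads credentials from the environment

example = Example(
    input="What is the capital of France?",           # illustrative fields
    actual_output="Paris is the capital of France.",
)

# `model` may now be omitted (it defaults to None), and `scorers`
# accepts any Sequence, not just a List.
results = client.run_evaluation(
    examples=[example],
    scorers=(FaithfulnessScorer(threshold=0.8),),
    project_name="default_project",
    eval_run_name="faithfulness_check",
)
```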
judgeval/api/api_types.py CHANGED
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: .openapi.json
- # timestamp: 2025-09-12T16:54:35+00:00
+ # timestamp: 2025-09-24T18:25:18+00:00

  from __future__ import annotations
  from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -52,8 +52,8 @@ class SavePromptScorerRequest(TypedDict):
  name: str
  prompt: str
  threshold: float
- options: NotRequired[Optional[Dict[str, float]]]
- is_trace: NotRequired[Optional[bool]]
+ model: NotRequired[str]
+ is_trace: NotRequired[bool]


  class SavePromptScorerResponse(TypedDict):
@@ -117,6 +117,7 @@ class ScorerConfig(TypedDict):
  score_type: str
  name: NotRequired[Optional[str]]
  threshold: NotRequired[float]
+ model: NotRequired[Optional[str]]
  strict_mode: NotRequired[bool]
  required_params: NotRequired[List[str]]
  kwargs: NotRequired[Optional[Dict[str, Any]]]
@@ -141,7 +142,7 @@ class PromptScorer(TypedDict):
  name: str
  prompt: str
  threshold: float
- options: NotRequired[Optional[Dict[str, float]]]
+ model: NotRequired[str]
  created_at: NotRequired[Optional[str]]
  updated_at: NotRequired[Optional[str]]
  is_trace: NotRequired[Optional[bool]]
@@ -189,13 +190,28 @@ class OtelTraceSpan(TypedDict):
  state_before: NotRequired[Optional[Dict[str, Any]]]


+ class OtelSpanListItemScores(TypedDict):
+ success: bool
+ score: float
+ reason: NotRequired[Optional[str]]
+ name: str
+
+
+ class OtelSpanDetailScores(TypedDict):
+ success: bool
+ score: float
+ reason: NotRequired[Optional[str]]
+ name: str
+ data: NotRequired[Optional[Dict[str, Any]]]
+
+
  class ExampleEvaluationRun(TypedDict):
  id: NotRequired[str]
  project_name: str
  eval_name: str
  custom_scorers: NotRequired[List[BaseScorer]]
  judgment_scorers: NotRequired[List[ScorerConfig]]
- model: str
+ model: NotRequired[Optional[str]]
  created_at: NotRequired[str]
  examples: List[Example]
  trace_span_id: NotRequired[Optional[str]]
@@ -212,7 +228,7 @@ class TraceEvaluationRun(TypedDict):
  eval_name: str
  custom_scorers: NotRequired[List[BaseScorer]]
  judgment_scorers: NotRequired[List[ScorerConfig]]
- model: str
+ model: NotRequired[Optional[str]]
  created_at: NotRequired[str]
  trace_and_span_ids: List[TraceAndSpanId]
  is_offline: NotRequired[bool]
@@ -224,12 +240,6 @@ class DatasetInsertExamples(TypedDict):
  project_name: str


- class DatasetReturn(TypedDict):
- name: str
- project_name: str
- examples: NotRequired[Optional[List[Example]]]
-
-
  class DatasetInfo(TypedDict):
  dataset_id: str
  name: str
@@ -261,6 +271,65 @@ class ScoringResult(TypedDict):
  evaluation_cost: NotRequired[Optional[float]]


+ class OtelTraceListItem(TypedDict):
+ organization_id: str
+ project_id: str
+ trace_id: str
+ timestamp: str
+ duration: NotRequired[Optional[int]]
+ has_notification: NotRequired[Optional[bool]]
+ tags: NotRequired[Optional[List[str]]]
+ experiment_run_id: NotRequired[Optional[str]]
+ span_name: NotRequired[Optional[str]]
+ cumulative_llm_cost: NotRequired[Optional[float]]
+ error: NotRequired[Optional[Dict[str, Any]]]
+ scores: NotRequired[List[OtelSpanListItemScores]]
+ customer_id: NotRequired[Optional[str]]
+ input_preview: NotRequired[Optional[str]]
+ output_preview: NotRequired[Optional[str]]
+ annotation_count: NotRequired[int]
+ span_id: str
+ rule_id: NotRequired[Optional[str]]
+
+
+ class OtelSpanDetail(TypedDict):
+ organization_id: str
+ project_id: str
+ timestamp: str
+ trace_id: str
+ span_id: str
+ parent_span_id: NotRequired[Optional[str]]
+ trace_state: NotRequired[Optional[str]]
+ span_name: NotRequired[Optional[str]]
+ span_kind: NotRequired[Optional[str]]
+ service_name: NotRequired[Optional[str]]
+ resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+ span_attributes: NotRequired[Optional[Dict[str, Any]]]
+ duration: NotRequired[Optional[int]]
+ status_code: NotRequired[Optional[str]]
+ status_message: NotRequired[Optional[str]]
+ events: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+ links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+ llm_cost: NotRequired[Optional[float]]
+ prompt_tokens: NotRequired[Optional[int]]
+ completion_tokens: NotRequired[Optional[int]]
+ scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
+
+
  class EvalResults(TypedDict):
  results: List[ScoringResult]
  run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+ class DatasetTraceWithSpans(TypedDict):
+ dataset_id: str
+ trace_detail: OtelTraceListItem
+ spans: List[OtelSpanDetail]
+
+
+ class DatasetReturn(TypedDict):
+ name: str
+ project_name: str
+ dataset_kind: DatasetKind
+ examples: NotRequired[Optional[List[Example]]]
+ traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
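The reshaped `DatasetReturn` gains a `dataset_kind` discriminator and an optional `traces` list of `DatasetTraceWithSpans`. A rough sketch of a payload that fits the new TypedDicts, with placeholder values and assuming `DatasetKind` admits the string values `"example"` and `"trace"`:

```python
from judgeval.api.api_types import DatasetReturn, OtelTraceListItem

# Placeholder values; real payloads come back from the Judgment API.
trace_item: OtelTraceListItem = {
    "organization_id": "org_123",
    "project_id": "proj_456",
    "trace_id": "trace_789",
    "span_id": "span_001",
    "timestamp": "2025-09-24T18:25:18+00:00",
}

dataset: DatasetReturn = {
    "name": "prod-traces",
    "project_name": "default_project",
    "dataset_kind": "trace",  # example datasets populate "examples" instead
    "traces": [
        {"dataset_id": "ds_001", "trace_detail": trace_item, "spans": []},
    ],
}
```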
judgeval/cli.py CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
  from dotenv import load_dotenv
  from judgeval.logger import judgeval_logger
  from judgeval import JudgmentClient
+ from judgeval.version import get_version

  load_dotenv()

@@ -56,7 +57,7 @@ def upload_scorer(
  @app.command()
  def version():
  """Show version info"""
- judgeval_logger.info("JudgEval CLI v0.0.0")
+ judgeval_logger.info(f"Judgeval CLI v{get_version()}")


  if __name__ == "__main__":
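The CLI now reports the installed package version instead of the hard-coded `v0.0.0`. The same helper can be called directly; a small sketch:

```python
from judgeval.version import get_version

# The CLI's `version` command now logs this value via judgeval_logger.
print(f"Judgeval CLI v{get_version()}")  # presumably "Judgeval CLI v0.13.0" for this release
```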
judgeval/constants.py CHANGED
@@ -24,7 +24,6 @@ class APIScorerType(str, Enum):

  @classmethod
  def __missing__(cls, value: str) -> APIScorerType:
- # Handle case-insensitive lookup
  for member in cls:
  if member.value == value.lower():
  return member
@@ -32,11 +31,6 @@ class APIScorerType(str, Enum):
  raise ValueError(f"Invalid scorer type: {value}")


- UNBOUNDED_SCORERS: Set[APIScorerType] = (
- set()
- ) # scorers whose scores are not bounded between 0-1
-
-
  LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)

judgeval/data/evaluation_run.py CHANGED
@@ -1,4 +1,4 @@
- from typing import List, Optional, Union, Tuple
+ from typing import List, Optional, Union, Tuple, Sequence
  from pydantic import field_validator, model_validator, Field, BaseModel
  from datetime import datetime, timezone
  import uuid
@@ -19,9 +19,11 @@ class EvaluationRun(BaseModel):
  default_factory=lambda: datetime.now(timezone.utc).isoformat()
  )
  custom_scorers: List[ExampleScorer] = Field(default_factory=list)
- judgment_scorers: List[APIScorerConfig] = Field(default_factory=list)
- scorers: List[Union[ExampleScorer, APIScorerConfig]] = Field(default_factory=list)
- model: str
+ judgment_scorers: Sequence[APIScorerConfig] = Field(default_factory=list)
+ scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
+ default_factory=list
+ )
+ model: Optional[str] = None

  def __init__(
  self,
@@ -75,11 +77,8 @@ class EvaluationRun(BaseModel):

  @field_validator("model")
  def validate_model(cls, v, values):
- if not v:
- raise ValueError("Model cannot be empty.")
-
  # Check if model is string or list of strings
- if isinstance(v, str):
+ if v is not None and isinstance(v, str):
  if v not in ACCEPTABLE_MODELS:
  raise ValueError(
  f"Model name {v} not recognized. Please select a valid model name.)"
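The validator no longer rejects a missing model; only a name that is actually provided gets checked against the accepted list. A standalone sketch of the new behavior, using a stand-in set rather than judgeval's real `ACCEPTABLE_MODELS`:

```python
from typing import Optional

ACCEPTABLE_MODELS = {"gpt-4.1", "gpt-5"}  # stand-in for judgeval's real ACCEPTABLE_MODELS

def validate_model(v: Optional[str]) -> Optional[str]:
    # Mirrors the updated field_validator: None passes through untouched,
    # while a provided string must still be a recognized model name.
    if v is not None and isinstance(v, str):
        if v not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model name {v} not recognized.")
    return v

assert validate_model(None) is None
assert validate_model("gpt-5") == "gpt-5"
```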
judgeval/data/judgment_types.py CHANGED
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: .openapi.json
- # timestamp: 2025-09-12T16:54:34+00:00
+ # timestamp: 2025-09-24T18:25:17+00:00

  from __future__ import annotations
  from typing import Annotated, Any, Dict, List, Optional, Union
@@ -54,8 +54,8 @@ class SavePromptScorerRequest(BaseModel):
  name: Annotated[str, Field(title="Name")]
  prompt: Annotated[str, Field(title="Prompt")]
  threshold: Annotated[float, Field(title="Threshold")]
- options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
- is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = None
+ model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+ is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False


  class SavePromptScorerResponse(BaseModel):
@@ -125,6 +125,7 @@ class ScorerConfig(BaseModel):
  score_type: Annotated[str, Field(title="Score Type")]
  name: Annotated[Optional[str], Field(title="Name")] = None
  threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+ model: Annotated[Optional[str], Field(title="Model")] = None
  strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
  required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = []
  kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
@@ -154,7 +155,7 @@ class PromptScorer(BaseModel):
  name: Annotated[str, Field(title="Name")]
  prompt: Annotated[str, Field(title="Prompt")]
  threshold: Annotated[float, Field(title="Threshold")]
- options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+ model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
  created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
  updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
  is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
@@ -212,6 +213,21 @@ class OtelTraceSpan(BaseModel):
  )


+ class OtelSpanListItemScores(BaseModel):
+ success: Annotated[bool, Field(title="Success")]
+ score: Annotated[float, Field(title="Score")]
+ reason: Annotated[Optional[str], Field(title="Reason")] = None
+ name: Annotated[str, Field(title="Name")]
+
+
+ class OtelSpanDetailScores(BaseModel):
+ success: Annotated[bool, Field(title="Success")]
+ score: Annotated[float, Field(title="Score")]
+ reason: Annotated[Optional[str], Field(title="Reason")] = None
+ name: Annotated[str, Field(title="Name")]
+ data: Annotated[Optional[Dict[str, Any]], Field(title="Data")] = None
+
+
  class ExampleEvaluationRun(BaseModel):
  id: Annotated[Optional[str], Field(title="Id")] = None
  project_name: Annotated[str, Field(title="Project Name")]
@@ -222,7 +238,7 @@ class ExampleEvaluationRun(BaseModel):
  judgment_scorers: Annotated[
  Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
  ] = []
- model: Annotated[str, Field(title="Model")]
+ model: Annotated[Optional[str], Field(title="Model")] = None
  created_at: Annotated[Optional[str], Field(title="Created At")] = None
  examples: Annotated[List[Example], Field(title="Examples")]
  trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
@@ -243,7 +259,7 @@ class TraceEvaluationRun(BaseModel):
  judgment_scorers: Annotated[
  Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
  ] = []
- model: Annotated[str, Field(title="Model")]
+ model: Annotated[Optional[str], Field(title="Model")] = None
  created_at: Annotated[Optional[str], Field(title="Created At")] = None
  trace_and_span_ids: Annotated[
  List[TraceAndSpanId], Field(title="Trace And Span Ids")
@@ -257,12 +273,6 @@ class DatasetInsertExamples(BaseModel):
  project_name: Annotated[str, Field(title="Project Name")]


- class DatasetReturn(BaseModel):
- name: Annotated[str, Field(title="Name")]
- project_name: Annotated[str, Field(title="Project Name")]
- examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
-
-
  class DatasetInfo(BaseModel):
  dataset_id: Annotated[str, Field(title="Dataset Id")]
  name: Annotated[str, Field(title="Name")]
@@ -296,6 +306,81 @@ class ScoringResult(BaseModel):
  evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None


+ class OtelTraceListItem(BaseModel):
+ organization_id: Annotated[str, Field(title="Organization Id")]
+ project_id: Annotated[str, Field(title="Project Id")]
+ trace_id: Annotated[str, Field(title="Trace Id")]
+ timestamp: Annotated[str, Field(title="Timestamp")]
+ duration: Annotated[Optional[int], Field(title="Duration")] = None
+ has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
+ tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
+ experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
+ span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+ cumulative_llm_cost: Annotated[
+ Optional[float], Field(title="Cumulative Llm Cost")
+ ] = None
+ error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+ scores: Annotated[
+ Optional[List[OtelSpanListItemScores]], Field(title="Scores")
+ ] = []
+ customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+ input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
+ output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
+ annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
+ span_id: Annotated[str, Field(title="Span Id")]
+ rule_id: Annotated[Optional[str], Field(title="Rule Id")] = None
+
+
+ class OtelSpanDetail(BaseModel):
+ organization_id: Annotated[str, Field(title="Organization Id")]
+ project_id: Annotated[str, Field(title="Project Id")]
+ timestamp: Annotated[str, Field(title="Timestamp")]
+ trace_id: Annotated[str, Field(title="Trace Id")]
+ span_id: Annotated[str, Field(title="Span Id")]
+ parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+ trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+ span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+ span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+ service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+ resource_attributes: Annotated[
+ Optional[Dict[str, Any]], Field(title="Resource Attributes")
+ ] = None
+ span_attributes: Annotated[
+ Optional[Dict[str, Any]], Field(title="Span Attributes")
+ ] = None
+ duration: Annotated[Optional[int], Field(title="Duration")] = None
+ status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+ status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+ events: Annotated[
+ Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
+ ] = None
+ links: Annotated[
+ Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
+ ] = None
+ llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+ prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
+ completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
+ scores: Annotated[Optional[List[OtelSpanDetailScores]], Field(title="Scores")] = (
+ None
+ )
+
+
  class EvalResults(BaseModel):
  results: Annotated[List[ScoringResult], Field(title="Results")]
  run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
+
+
+ class DatasetTraceWithSpans(BaseModel):
+ dataset_id: Annotated[str, Field(title="Dataset Id")]
+ trace_detail: OtelTraceListItem
+ spans: Annotated[List[OtelSpanDetail], Field(title="Spans")]
+
+
+ class DatasetReturn(BaseModel):
+ name: Annotated[str, Field(title="Name")]
+ project_name: Annotated[str, Field(title="Project Name")]
+ dataset_kind: DatasetKind
+ examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
+ traces: Annotated[Optional[List[DatasetTraceWithSpans]], Field(title="Traces")] = (
+ None
+ )
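The generated Pydantic counterparts mirror the TypedDicts above; on `OtelSpanDetail` only the five identifier fields are required and everything else defaults to `None`. A brief sketch with placeholder values:

```python
from judgeval.data.judgment_types import OtelSpanDetail, OtelSpanDetailScores

span = OtelSpanDetail(
    organization_id="org_123",
    project_id="proj_456",
    timestamp="2025-09-24T18:25:17+00:00",
    trace_id="trace_789",
    span_id="span_001",
    scores=[OtelSpanDetailScores(success=True, score=0.92, name="faithfulness")],
)
print(span.model_dump(exclude_none=True))
```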
judgeval/data/trace.py CHANGED
@@ -1,5 +1,10 @@
- from typing import Optional
+ from typing import Optional, List, Dict, Any
  from pydantic import BaseModel
+ from .judgment_types import (
+ OtelSpanDetailScores,
+ OtelSpanDetail,
+ OtelTraceListItem,
+ )


  class TraceUsage(BaseModel):
@@ -12,3 +17,105 @@ class TraceUsage(BaseModel):
  completion_tokens_cost_usd: Optional[float] = None
  total_cost_usd: Optional[float] = None
  model_name: Optional[str] = None
+
+
+ class TraceScore(OtelSpanDetailScores):
+ """Score information for a trace or span."""
+
+ pass
+
+
+ class TraceRule(BaseModel):
+ """Rule that was triggered for a trace."""
+
+ rule_id: str
+ rule_name: str
+
+
+ class TraceSpan(OtelSpanDetail):
+ """Individual span within a trace with complete telemetry data."""
+
+ @classmethod
+ def from_otel_span_detail(cls, span_detail: OtelSpanDetail) -> "TraceSpan":
+ """Create TraceSpan from OtelSpanDetail, converting scores to TraceScore."""
+ data = span_detail.model_dump()
+
+ if "scores" in data and data["scores"]:
+ data["scores"] = [TraceScore(**score) for score in data["scores"]]
+
+ return cls(**data)
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert TraceSpan to dictionary."""
+ return self.model_dump(exclude_none=True)
+
+
+ class Trace(OtelTraceListItem):
+ """Complete trace with metadata and all associated spans."""
+
+ spans: List[TraceSpan] = []
+ rules: Optional[List[TraceRule]] = []
+
+ @classmethod
+ def from_dataset_trace_with_spans(cls, dataset_trace: Any) -> "Trace":
+ """Create Trace from DatasetTraceWithSpans (handles both API and judgment types)."""
+
+ if hasattr(dataset_trace, "trace_detail"):
+ trace_detail = dataset_trace.trace_detail
+ spans_data = dataset_trace.spans
+ else:
+ trace_detail = dataset_trace.get("trace_detail", {})
+ spans_data = dataset_trace.get("spans", [])
+
+ if hasattr(trace_detail, "model_dump"):
+ trace_data = trace_detail.model_dump()
+ elif isinstance(trace_detail, dict):
+ trace_data = trace_detail.copy()
+ else:
+ trace_data = dict(trace_detail)
+
+ spans = []
+ for span in spans_data:
+ if hasattr(span, "model_dump"):
+ spans.append(TraceSpan.from_otel_span_detail(span))
+ else:
+ # Handle dict spans
+ span_data = dict(span) if not isinstance(span, dict) else span.copy()
+ if "scores" in span_data and span_data["scores"]:
+ span_data["scores"] = [
+ TraceScore(**score)
+ if isinstance(score, dict)
+ else TraceScore(**score.model_dump())
+ for score in span_data["scores"]
+ ]
+ spans.append(TraceSpan(**span_data))
+
+ rules = []
+ if "rule_id" in trace_data and trace_data["rule_id"]:
+ rules = [
+ TraceRule(
+ rule_id=trace_data["rule_id"],
+ rule_name=f"Rule {trace_data['rule_id']}",
+ )
+ ]
+
+ trace_data.pop("scores", [])
+ trace_data.pop("rule_id", None)
+ trace = cls(**trace_data)
+
+ trace.spans = spans
+ trace.rules = rules
+
+ return trace
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert Trace to dictionary."""
+ return self.model_dump(exclude_none=True)
+
+ def __len__(self) -> int:
+ """Return the number of spans in the trace."""
+ return len(self.spans)
+
+ def __iter__(self):
+ """Iterate over spans in the trace."""
+ return iter(self.spans)
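The new `Trace`/`TraceSpan` wrappers accept either the generated Pydantic objects or plain dicts. A sketch that feeds `from_dataset_trace_with_spans` a dict shaped like `DatasetTraceWithSpans`; all values are placeholders:

```python
from judgeval.data.trace import Trace

payload = {
    "trace_detail": {
        "organization_id": "org_123",
        "project_id": "proj_456",
        "trace_id": "trace_789",
        "span_id": "span_001",
        "timestamp": "2025-09-24T18:25:18+00:00",
        "rule_id": "rule_42",  # becomes a TraceRule on the resulting Trace
    },
    "spans": [
        {
            "organization_id": "org_123",
            "project_id": "proj_456",
            "timestamp": "2025-09-24T18:25:18+00:00",
            "trace_id": "trace_789",
            "span_id": "span_001",
            "scores": [{"success": True, "score": 0.9, "name": "relevancy"}],
        }
    ],
}

trace = Trace.from_dataset_trace_with_spans(payload)
print(len(trace))            # 1, via Trace.__len__
for span in trace:           # Trace.__iter__ yields TraceSpan objects
    print(span.to_dict()["span_id"])
```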
judgeval/dataset/__init__.py CHANGED
@@ -3,15 +3,16 @@ import orjson
  import os
  import yaml
  from dataclasses import dataclass
- from typing import List, Literal
+ from typing import List, Literal, Optional

  from judgeval.data import Example
+ from judgeval.data.trace import Trace
  from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
  from judgeval.api import JudgmentSyncClient
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

- from judgeval.api.api_types import DatasetKind
+ from judgeval.data.judgment_types import DatasetKind


  @dataclass
@@ -26,9 +27,11 @@ class DatasetInfo:

  @dataclass
  class Dataset:
- examples: List[Example]
  name: str
  project_name: str
+ dataset_kind: DatasetKind = DatasetKind.example
+ examples: Optional[List[Example]] = None
+ traces: Optional[List[Trace]] = None
  judgment_api_key: str = JUDGMENT_API_KEY or ""
  organization_id: str = JUDGMENT_ORG_ID or ""

@@ -47,22 +50,49 @@ class Dataset:
  )
  if not dataset:
  raise ValueError(f"Dataset {name} not found in project {project_name}")
- examples = dataset.get("examples", [])
- if examples is None:
- examples = []

- for e in examples:
- if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
- e.update(e.pop("data")) # type: ignore
- e.pop(
- "example_id"
- ) # TODO: remove once scorer data migraiton is complete
- judgeval_logger.info(f"Successfully retrieved dataset {name}!")
- return cls(
- name=name,
- project_name=project_name,
- examples=[Example(**e) for e in examples],
- )
+ dataset_kind = DatasetKind(dataset.get("dataset_kind", "example"))
+
+ if dataset_kind == DatasetKind.example:
+ examples = dataset.get("examples", [])
+ if examples is None:
+ examples = []
+
+ for e in examples:
+ if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+ e.update(e.pop("data")) # type: ignore
+ e.pop(
+ "example_id"
+ ) # TODO: remove once scorer data migration is complete
+ judgeval_logger.info(f"Successfully retrieved example dataset {name}!")
+ return cls(
+ name=name,
+ project_name=project_name,
+ dataset_kind=dataset_kind,
+ examples=[Example(**e) for e in examples],
+ )
+
+ elif dataset_kind == DatasetKind.trace:
+ trace_data = dataset.get("traces", [])
+ if trace_data is None:
+ trace_data = []
+
+ traces = []
+ for trace_item in trace_data:
+ if isinstance(trace_item, dict):
+ trace = Trace.from_dataset_trace_with_spans(trace_item)
+ traces.append(trace)
+
+ judgeval_logger.info(f"Successfully retrieved trace dataset {name}!")
+ return cls(
+ name=name,
+ project_name=project_name,
+ dataset_kind=dataset_kind,
+ traces=traces,
+ )
+
+ else:
+ raise ValueError(f"Unsupported dataset kind: {dataset_kind}")

  @classmethod
  def create(
@@ -179,7 +209,9 @@ class Dataset:
  file.write(
  orjson.dumps(
  {
- "examples": [e.to_dict() for e in self.examples],
+ "examples": [e.to_dict() for e in self.examples]
+ if self.examples
+ else [],
  },
  option=orjson.OPT_INDENT_2,
  )
@@ -187,7 +219,9 @@ class Dataset:
  elif file_type == "yaml":
  with open(complete_path, "w") as file:
  yaml_data = {
- "examples": [e.to_dict() for e in self.examples],
+ "examples": [e.to_dict() for e in self.examples]
+ if self.examples
+ else [],
  }
  yaml.dump(yaml_data, file, default_flow_style=False)
  else:
@@ -197,10 +231,25 @@ class Dataset:
  )

  def __iter__(self):
- return iter(self.examples)
+ if self.dataset_kind == DatasetKind.example and self.examples:
+ return iter(self.examples)
+ elif self.dataset_kind == DatasetKind.trace and self.traces:
+ return iter(self.traces)
+ else:
+ return iter([])

  def __len__(self):
- return len(self.examples)
+ if self.dataset_kind == DatasetKind.example and self.examples:
+ return len(self.examples)
+ elif self.dataset_kind == DatasetKind.trace and self.traces:
+ return len(self.traces)
+ else:
+ return 0

  def __str__(self):
- return f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
+ if self.dataset_kind == DatasetKind.example:
+ return (
+ f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
+ )
+ else:
+ return f"{self.__class__.__name__}(traces={self.traces}, name={self.name})"
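Taken together, a retrieved dataset now branches on `dataset_kind`: example datasets still populate `examples`, while trace datasets populate `traces` and iterate over `Trace` objects. A sketch, assuming the retrieval classmethod in the hunk above is exposed as `Dataset.get` and that a trace-kind dataset with this name exists in the project:

```python
from judgeval.dataset import Dataset
from judgeval.data.judgment_types import DatasetKind

dataset = Dataset.get(name="prod-traces", project_name="default_project")  # assumed method name

if dataset.dataset_kind == DatasetKind.trace:
    print(f"{len(dataset)} traces")   # __len__ counts traces for trace datasets
    for trace in dataset:             # __iter__ yields Trace objects here
        print(trace.trace_id, len(trace.spans))
else:
    for example in dataset:           # example datasets behave as before
        print(example)
```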