judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -0
- judgeval/cli.py +65 -0
- judgeval/clients.py +2 -1
- judgeval/common/api/api.py +46 -54
- judgeval/common/api/constants.py +18 -5
- judgeval/common/api/json_encoder.py +241 -0
- judgeval/common/tracer/core.py +772 -467
- judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval/common/tracer/providers.py +119 -0
- judgeval/common/tracer/span_processor.py +1 -1
- judgeval/common/tracer/span_transformer.py +16 -26
- judgeval/constants.py +1 -0
- judgeval/data/evaluation_run.py +104 -0
- judgeval/data/judgment_types.py +38 -8
- judgeval/data/trace.py +6 -122
- judgeval/data/trace_run.py +2 -3
- judgeval/dataset.py +2 -0
- judgeval/integrations/langgraph.py +2 -1
- judgeval/judges/litellm_judge.py +2 -1
- judgeval/judges/mixture_of_judges.py +2 -1
- judgeval/judges/utils.py +2 -1
- judgeval/judgment_client.py +113 -53
- judgeval/local_eval_queue.py +190 -0
- judgeval/run_evaluation.py +43 -197
- judgeval/scorers/base_scorer.py +9 -10
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- judgeval/scorers/score.py +33 -11
- judgeval/utils/async_utils.py +36 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/METADATA +11 -12
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/RECORD +33 -27
- judgeval-0.6.0.dist-info/entry_points.txt +2 -0
- judgeval/evaluation_run.py +0 -76
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/WHEEL +0 -0
- {judgeval-0.4.0.dist-info → judgeval-0.6.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer/otel_span_processor.py
CHANGED
@@ -21,7 +21,7 @@ from judgeval.common.tracer.otel_exporter import JudgmentAPISpanExporter
 from judgeval.common.tracer.span_processor import SpanProcessorBase
 from judgeval.common.tracer.span_transformer import SpanTransformer
 from judgeval.data import TraceSpan
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun


 class SimpleReadableSpan(ReadableSpan):
judgeval/common/tracer/providers.py
ADDED
@@ -0,0 +1,119 @@
+from __future__ import annotations
+import logging
+from typing import Any, TypeAlias
+
+
+logger = logging.getLogger(__name__)
+# TODO: Have functions that assert and return the relevant exports when the client is installed.
+# The method should raise if the user tries to access client information that doesnt exist.
+
+HAS_OPENAI = False
+openai_OpenAI = None
+openai_AsyncOpenAI = None
+openai_ChatCompletion = None
+openai_Response = None
+openai_ParsedChatCompletion = None
+
+try:
+    from openai import OpenAI, AsyncOpenAI
+    from openai.types.chat.chat_completion import ChatCompletion
+    from openai.types.responses.response import Response
+    from openai.types.chat import ParsedChatCompletion
+
+    openai_OpenAI = OpenAI
+    openai_AsyncOpenAI = AsyncOpenAI
+    openai_ChatCompletion = ChatCompletion
+    openai_Response = Response
+    openai_ParsedChatCompletion = ParsedChatCompletion
+    HAS_OPENAI = True
+except ImportError:
+    pass
+
+
+HAS_TOGETHER = False
+together_Together = None
+together_AsyncTogether = None
+
+try:
+    from together import Together, AsyncTogether
+
+    together_Together = Together
+    together_AsyncTogether = AsyncTogether
+    HAS_TOGETHER = True
+except ImportError:
+    pass
+
+
+HAS_ANTHROPIC = False
+anthropic_Anthropic = None
+anthropic_AsyncAnthropic = None
+
+try:
+    from anthropic import Anthropic, AsyncAnthropic
+
+    anthropic_Anthropic = Anthropic
+    anthropic_AsyncAnthropic = AsyncAnthropic
+    HAS_ANTHROPIC = True
+except ImportError:
+    pass
+
+
+HAS_GOOGLE_GENAI = False
+google_genai_Client = None
+google_genai_cleint_AsyncClient = None
+
+try:
+    from google.genai import Client
+    from google.genai.client import AsyncClient
+
+    google_genai_Client = Client
+    google_genai_AsyncClient = AsyncClient
+    HAS_GOOGLE_GENAI = True
+except ImportError:
+    pass
+
+
+HAS_GROQ = False
+groq_Groq = None
+groq_AsyncGroq = None
+
+try:
+    from groq import Groq, AsyncGroq
+
+    groq_Groq = Groq
+    groq_AsyncGroq = AsyncGroq
+    HAS_GROQ = True
+except ImportError:
+    pass
+
+
+# TODO: if we support dependency groups we can have this better type, but during runtime, we do
+# not know which clients an end user might have installed.
+ApiClient: TypeAlias = Any
+
+__all__ = [
+    "ApiClient",
+    # OpenAI
+    "HAS_OPENAI",
+    "openai_OpenAI",
+    "openai_AsyncOpenAI",
+    "openai_ChatCompletion",
+    "openai_Response",
+    "openai_ParsedChatCompletion",
+    # Together
+    "HAS_TOGETHER",
+    "together_Together",
+    "together_AsyncTogether",
+    # Anthropic
+    "HAS_ANTHROPIC",
+    "anthropic_Anthropic",
+    "anthropic_AsyncAnthropic",
+    # Google GenAI
+    "HAS_GOOGLE_GENAI",
+    "google_genai_Client",
+    "google_genai_AsyncClient",
+    # Groq
+    "HAS_GROQ",
+    "groq_Groq",
+    "groq_AsyncGroq",
+]
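The new providers module only records which optional SDKs could be imported; call sites are expected to branch on the `HAS_*` flags. A minimal sketch of that pattern follows; the helper function is hypothetical and not part of judgeval (the real call sites live in `judgeval/common/tracer/core.py`, which is not shown in this diff).

```python
# Hypothetical helper illustrating how the HAS_* flags might be consumed.
from judgeval.common.tracer.providers import HAS_OPENAI, openai_OpenAI, ApiClient


def make_openai_client(**client_kwargs) -> ApiClient:
    """Return an OpenAI client only when the optional dependency is installed."""
    if not HAS_OPENAI or openai_OpenAI is None:
        # Mirrors the module's TODO: fail loudly when the client library is missing.
        raise ImportError("openai is not installed; install it to trace OpenAI calls")
    return openai_OpenAI(**client_kwargs)
```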
judgeval/common/tracer/span_processor.py
CHANGED
@@ -7,7 +7,7 @@ When monitoring is enabled, we use JudgmentSpanProcessor which overrides the met
 """

 from judgeval.data import TraceSpan
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun


 class SpanProcessorBase:
judgeval/common/tracer/span_transformer.py
CHANGED
@@ -9,8 +9,9 @@ from typing import Any, Dict, Mapping, Optional, Union
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel

+from judgeval.common.api.json_encoder import json_encoder
 from judgeval.data import TraceSpan
-from judgeval.evaluation_run import EvaluationRun
+from judgeval.data.evaluation_run import EvaluationRun


 class SpanTransformer:
@@ -38,21 +39,13 @@ class SpanTransformer:
         return True

     @staticmethod
-    def
-        if
-
-
-
-
-            return orjson.dumps(str(obj)).decode("utf-8")
-        else:
-            if not isinstance(obj, str):
-                return obj
-            try:
-                return orjson.loads(obj)
-            except (orjson.JSONDecodeError, TypeError, ValueError):
-                return obj
+    def _safe_deserialize(obj: Any) -> Any:
+        if not isinstance(obj, str):
+            return obj
+        try:
+            return orjson.loads(obj)
+        except (orjson.JSONDecodeError, TypeError):
+            return obj

     @staticmethod
     def _format_timestamp(timestamp: Optional[Union[float, int, str]]) -> str:
@@ -84,15 +77,13 @@ class SpanTransformer:
             if field_name == "created_at":
                 attributes[attr_name] = SpanTransformer._format_timestamp(value)
             elif field_name == "expected_tools" and value:
-                attributes[attr_name] =
+                attributes[attr_name] = json_encoder(
                     [tool.model_dump() for tool in trace_span.expected_tools]
                 )
             elif field_name == "usage" and value:
-                attributes[attr_name] =
-                    trace_span.usage.model_dump()
-                )
+                attributes[attr_name] = json_encoder(trace_span.usage)
             elif SpanTransformer._needs_json_serialization(value):
-                attributes[attr_name] =
+                attributes[attr_name] = json_encoder(value)
             else:
                 attributes[attr_name] = value

@@ -115,7 +106,7 @@ class SpanTransformer:
             field_name = key[9:]

             if isinstance(value, str):
-                deserialized = SpanTransformer.
+                deserialized = SpanTransformer._safe_deserialize(value)
                 judgment_data[field_name] = deserialized
             else:
                 judgment_data[field_name] = value
@@ -159,6 +150,7 @@ class SpanTransformer:
             "additional_metadata": judgment_data.get("additional_metadata"),
             "has_evaluation": judgment_data.get("has_evaluation", False),
             "agent_name": judgment_data.get("agent_name"),
+            "class_name": judgment_data.get("class_name"),
             "state_before": judgment_data.get("state_before"),
             "state_after": judgment_data.get("state_after"),
             "update_id": judgment_data.get("update_id", 1),
@@ -174,9 +166,7 @@ class SpanTransformer:
         attributes = {
             "judgment.evaluation_run": True,
             "judgment.associated_span_id": span_id,
-            "judgment.span_data":
-                span_data.model_dump()
-            ),
+            "judgment.span_data": json_encoder(span_data),
         }

         eval_data = evaluation_run.model_dump()
@@ -186,7 +176,7 @@ class SpanTransformer:

             attr_name = f"judgment.{key}"
             if SpanTransformer._needs_json_serialization(value):
-                attributes[attr_name] =
+                attributes[attr_name] = json_encoder(value)
             else:
                 attributes[attr_name] = value

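The transformer now delegates serialization to `json_encoder` from the new `judgeval/common/api/json_encoder.py` (+241 lines, not included in this diff). As a rough, hypothetical sketch of what such a fallback encoder does, assuming it behaves like the per-span helpers it replaces:

```python
# Hypothetical stand-in for judgeval.common.api.json_encoder.json_encoder;
# the real implementation is not shown in this diff.
from typing import Any

import orjson
from pydantic import BaseModel


def json_encoder(value: Any) -> Any:
    """Best-effort conversion of an arbitrary value into JSON-serializable data."""
    if isinstance(value, BaseModel):
        return json_encoder(value.model_dump())
    if isinstance(value, dict):
        return {k: json_encoder(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [json_encoder(v) for v in value]
    try:
        orjson.dumps(value)  # cheap probe: is it already serializable?
        return value
    except (TypeError, ValueError):
        return str(value)  # last resort, mirroring the removed safe_stringify fallback
```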
judgeval/constants.py
CHANGED
judgeval/data/evaluation_run.py
ADDED
@@ -0,0 +1,104 @@
+from typing import List, Optional, Union
+from pydantic import field_validator, model_validator, Field
+from datetime import datetime, timezone
+import uuid
+
+from judgeval.data import Example
+from judgeval.scorers import BaseScorer, APIScorerConfig
+from judgeval.constants import ACCEPTABLE_MODELS
+from judgeval.data.judgment_types import EvaluationRunJudgmentType
+
+
+class EvaluationRun(EvaluationRunJudgmentType):
+    """
+    Stores example and evaluation scorers together for running an eval task
+
+    Args:
+        project_name (str): The name of the project the evaluation results belong to
+        eval_name (str): A name for this evaluation run
+        examples (List[Example]): The examples to evaluate
+        scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
+        model (str): The model used as a judge when using LLM as a Judge
+        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+    """
+
+    id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
+    created_at: Optional[str] = Field(
+        default_factory=lambda: datetime.now(timezone.utc).isoformat()
+    )
+    custom_scorers: Optional[List[BaseScorer]] = None
+    judgment_scorers: Optional[List[APIScorerConfig]] = None
+    organization_id: Optional[str] = None
+
+    def __init__(
+        self,
+        scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None,
+        **kwargs,
+    ):
+        """
+        Initialize EvaluationRun with automatic scorer classification.
+
+        Args:
+            scorers: List of scorers that will be automatically sorted into custom_scorers or judgment_scorers
+            **kwargs: Other initialization arguments
+        """
+        if scorers is not None:
+            # Automatically sort scorers into appropriate fields
+            custom_scorers = [s for s in scorers if isinstance(s, BaseScorer)]
+            judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
+
+            # Always set both fields as lists (even if empty) to satisfy validation
+            kwargs["custom_scorers"] = custom_scorers
+            kwargs["judgment_scorers"] = judgment_scorers
+
+        super().__init__(**kwargs)
+
+    def model_dump(self, **kwargs):
+        data = super().model_dump(**kwargs)
+        data["custom_scorers"] = [s.model_dump() for s in self.custom_scorers]
+        data["judgment_scorers"] = [s.model_dump() for s in self.judgment_scorers]
+        data["examples"] = [example.model_dump() for example in self.examples]
+
+        return data
+
+    @field_validator("examples")
+    def validate_examples(cls, v):
+        if not v:
+            raise ValueError("Examples cannot be empty.")
+        for item in v:
+            if not isinstance(item, Example):
+                raise ValueError(f"Item of type {type(item)} is not a Example")
+        return v
+
+    @model_validator(mode="after")
+    @classmethod
+    def validate_scorer_lists(cls, values):
+        custom_scorers = values.custom_scorers
+        judgment_scorers = values.judgment_scorers
+
+        # Check that both lists are not empty
+        if not custom_scorers and not judgment_scorers:
+            raise ValueError(
+                "At least one of custom_scorers or judgment_scorers must be provided."
+            )
+
+        # Check that only one list is filled
+        if custom_scorers and judgment_scorers:
+            raise ValueError(
+                "Only one of custom_scorers or judgment_scorers can be provided, not both."
+            )
+
+        return values
+
+    @field_validator("model")
+    def validate_model(cls, v, values):
+        if not v:
+            raise ValueError("Model cannot be empty.")
+
+        # Check if model is string or list of strings
+        if isinstance(v, str):
+            if v not in ACCEPTABLE_MODELS:
+                raise ValueError(
+                    f"Model name {v} not recognized. Please select a valid model name.)"
+                )
+        return v
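With the new constructor, scorers passed to `EvaluationRun(scorers=...)` are partitioned automatically into `custom_scorers` and `judgment_scorers`. A usage sketch follows; the `Example` field names, the `APIScorerConfig` arguments, and the model string are illustrative assumptions, not taken from this diff.

```python
# Illustrative only: adjust field names and the model to what your judgeval version accepts.
from judgeval.data import Example
from judgeval.data.evaluation_run import EvaluationRun
from judgeval.scorers import APIScorerConfig

run = EvaluationRun(
    project_name="demo-project",
    eval_name="nightly-regression",
    examples=[Example(input="What is 2 + 2?", actual_output="4")],
    scorers=[APIScorerConfig(score_type="faithfulness")],  # sorted into judgment_scorers
    model="gpt-4.1",  # must be present in ACCEPTABLE_MODELS
)
assert run.judgment_scorers and not run.custom_scorers
```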
judgeval/data/judgment_types.py
CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: openapi_new.json
-# timestamp: 2025-
+# timestamp: 2025-08-08T18:50:51+00:00

 from __future__ import annotations

@@ -51,6 +51,31 @@ class ScorerConfigJudgmentType(BaseModel):
     kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None


+class BaseScorerJudgmentType(BaseModel):
+    score_type: Annotated[str, Field(title="Score Type")]
+    threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+    name: Annotated[Optional[str], Field(title="Name")] = None
+    class_name: Annotated[Optional[str], Field(title="Class Name")] = None
+    score: Annotated[Optional[float], Field(title="Score")] = None
+    score_breakdown: Annotated[
+        Optional[Dict[str, Any]], Field(title="Score Breakdown")
+    ] = None
+    reason: Annotated[Optional[str], Field(title="Reason")] = ""
+    using_native_model: Annotated[Optional[bool], Field(title="Using Native Model")] = (
+        None
+    )
+    success: Annotated[Optional[bool], Field(title="Success")] = None
+    model: Annotated[Optional[str], Field(title="Model")] = None
+    model_client: Annotated[Any, Field(title="Model Client")] = None
+    strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+    error: Annotated[Optional[str], Field(title="Error")] = None
+    additional_metadata: Annotated[
+        Optional[Dict[str, Any]], Field(title="Additional Metadata")
+    ] = None
+    user: Annotated[Optional[str], Field(title="User")] = None
+    server_hosted: Annotated[Optional[bool], Field(title="Server Hosted")] = False
+
+
 class TraceUsageJudgmentType(BaseModel):
     prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
     completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
@@ -90,15 +115,21 @@ class HTTPValidationErrorJudgmentType(BaseModel):
     ] = None


-class
+class EvaluationRunJudgmentType(BaseModel):
+    id: Annotated[Optional[str], Field(title="Id")] = None
     project_name: Annotated[Optional[str], Field(title="Project Name")] = None
     eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
     examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
-
+    custom_scorers: Annotated[
+        Optional[List[BaseScorerJudgmentType]], Field(title="Custom Scorers")
+    ] = Field(default_factory=list)
+    judgment_scorers: Annotated[
+        Optional[List[ScorerConfigJudgmentType]], Field(title="Judgment Scorers")
+    ] = Field(default_factory=list)
     model: Annotated[str, Field(title="Model")]
-    append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+    trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+    created_at: Annotated[Optional[str], Field(title="Created At")] = None


 class TraceSpanJudgmentType(BaseModel):
@@ -122,6 +153,7 @@ class TraceSpanJudgmentType(BaseModel):
     ] = None
     has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
     agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
+    class_name: Annotated[Optional[str], Field(title="Class Name")] = None
     state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
         None
     )
@@ -171,8 +203,6 @@ class TraceRunJudgmentType(BaseModel):
     traces: Annotated[List[TraceJudgmentType], Field(title="Traces")]
     scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
     model: Annotated[str, Field(title="Model")]
-    append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
     tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
@@ -180,5 +210,5 @@ class TraceRunJudgmentType(BaseModel):

 class EvalResultsJudgmentType(BaseModel):
     results: Annotated[List[ScoringResultJudgmentType], Field(title="Results")]
     run: Annotated[
-        Union[TraceRunJudgmentType,
+        Union[TraceRunJudgmentType, EvaluationRunJudgmentType], Field(title="Run")
     ]
judgeval/data/trace.py
CHANGED
@@ -1,7 +1,4 @@
-from typing import Any
-import sys
 import threading
-import orjson
 from datetime import datetime, timezone
 from judgeval.data.judgment_types import (
     TraceUsageJudgmentType,
@@ -9,7 +6,7 @@ from judgeval.data.judgment_types import (
     TraceJudgmentType,
 )
 from judgeval.constants import SPAN_LIFECYCLE_END_UPDATE_ID
-from
+from judgeval.common.api.json_encoder import json_encoder


 class TraceUsage(TraceUsageJudgmentType):
@@ -25,9 +22,9 @@ class TraceSpan(TraceSpanJudgmentType):
             "created_at": datetime.fromtimestamp(
                 self.created_at, tz=timezone.utc
             ).isoformat(),
-            "inputs":
-            "output":
-            "error":
+            "inputs": json_encoder(self.inputs),
+            "output": json_encoder(self.output),
+            "error": json_encoder(self.error),
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
@@ -35,9 +32,10 @@ class TraceSpan(TraceSpanJudgmentType):
             "usage": self.usage.model_dump() if self.usage else None,
             "has_evaluation": self.has_evaluation,
             "agent_name": self.agent_name,
+            "class_name": self.class_name,
             "state_before": self.state_before,
             "state_after": self.state_after,
-            "additional_metadata":
+            "additional_metadata": json_encoder(self.additional_metadata),
             "update_id": self.update_id,
         }

@@ -80,120 +78,6 @@ class TraceSpan(TraceSpanJudgmentType):
         )
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")

-    def _is_json_serializable(self, obj: Any) -> bool:
-        """Helper method to check if an object is JSON serializable."""
-        try:
-            orjson.dumps(obj)
-            return True
-        except (TypeError, OverflowError, ValueError):
-            return False
-
-    def safe_stringify(self, output, function_name):
-        """
-        Safely converts an object to a JSON-serializable structure, handling common object types intelligently.
-        """
-        # Handle Pydantic models
-        if hasattr(output, "model_dump"):
-            try:
-                return output.model_dump()
-            except Exception:
-                pass
-
-        # Handle LangChain messages and similar objects with content/type
-        if hasattr(output, "content") and hasattr(output, "type"):
-            try:
-                result = {"type": output.type, "content": output.content}
-                # Add additional fields if they exist
-                if hasattr(output, "additional_kwargs"):
-                    result["additional_kwargs"] = output.additional_kwargs
-                if hasattr(output, "response_metadata"):
-                    result["response_metadata"] = output.response_metadata
-                if hasattr(output, "name"):
-                    result["name"] = output.name
-                return result
-            except Exception:
-                pass
-
-        if hasattr(output, "dict"):
-            try:
-                return output.dict()
-            except Exception:
-                pass
-
-        if hasattr(output, "to_dict"):
-            try:
-                return output.to_dict()
-            except Exception:
-                pass
-
-        if hasattr(output, "__dataclass_fields__"):
-            try:
-                import dataclasses
-
-                return dataclasses.asdict(output)
-            except Exception:
-                pass
-
-        if hasattr(output, "__dict__"):
-            try:
-                return output.__dict__
-            except Exception:
-                pass
-
-        try:
-            return str(output)
-        except (TypeError, OverflowError, ValueError):
-            pass
-
-        try:
-            return repr(output)
-        except (TypeError, OverflowError, ValueError):
-            pass
-
-        return None
-
-    def _serialize_value(self, value: Any) -> Any:
-        """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
-        if value is None:
-            return None
-
-        recursion_limit = sys.getrecursionlimit()
-        recursion_limit = int(recursion_limit * 0.75)
-
-        def serialize_value(value, current_depth=0):
-            try:
-                if current_depth > recursion_limit:
-                    return {"error": "max_depth_reached: " + type(value).__name__}
-
-                if isinstance(value, BaseModel):
-                    return value.model_dump()
-                elif isinstance(value, dict):
-                    # Recursively serialize dictionary values
-                    return {
-                        k: serialize_value(v, current_depth + 1)
-                        for k, v in value.items()
-                    }
-                elif isinstance(value, (list, tuple)):
-                    # Recursively serialize list/tuple items
-                    return [serialize_value(item, current_depth + 1) for item in value]
-                else:
-                    try:
-                        orjson.dumps(value)
-                        return value
-                    except (TypeError, OverflowError, ValueError):
-                        # Fallback to safe stringification
-                        return self.safe_stringify(value, self.function)
-                    except Exception:
-                        return {"error": "Unable to serialize"}
-            except Exception:
-                return {"error": "Unable to serialize"}
-
-        # Start serialization with the top-level value
-        try:
-            return serialize_value(value, current_depth=0)
-        except Exception:
-            return {"error": "Unable to serialize"}
-

 class Trace(TraceJudgmentType):
     pass
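With `_is_json_serializable`, `safe_stringify`, and `_serialize_value` removed, span payloads are expected to go through the shared encoder instead. A small sketch of that flow; the payload class below is a stand-in for illustration, not a judgeval type, and the exact shape of the encoder's return value is assumed.

```python
from pydantic import BaseModel

from judgeval.common.api.json_encoder import json_encoder


class ToolCall(BaseModel):  # stand-in payload used only for illustration
    name: str
    arguments: dict


payload = {"call": ToolCall(name="search", arguments={"q": "judgeval"}), "attempt": 1}
encoded = json_encoder(payload)  # presumably a JSON-safe structure for span attributes
print(encoded)
```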
judgeval/data/trace_run.py
CHANGED
@@ -3,6 +3,7 @@ from typing import List, Optional, Dict, Any, Union
 from judgeval.data import Trace
 from judgeval.scorers import APIScorerConfig, BaseScorer
 from judgeval.rules import Rule
+from judgeval.constants import DEFAULT_GPT_MODEL


 class TraceRun(BaseModel):
@@ -26,10 +27,8 @@ class TraceRun(BaseModel):
     eval_name: Optional[str] = None
     traces: Optional[List[Trace]] = None
     scorers: List[Union[APIScorerConfig, BaseScorer]]
-    model: Optional[str] =
+    model: Optional[str] = DEFAULT_GPT_MODEL
     trace_span_id: Optional[str] = None
-    append: Optional[bool] = False
-    override: Optional[bool] = False
     rules: Optional[List[Rule]] = None
     tools: Optional[List[Dict[str, Any]]] = None

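`TraceRun` now falls back to the new `DEFAULT_GPT_MODEL` constant and no longer carries `append`/`override`. A quick check of that behavior; the scorer argument is a placeholder whose constructor fields are assumed.

```python
# The APIScorerConfig fields below are assumptions; any valid scorer instance works.
from judgeval.constants import DEFAULT_GPT_MODEL
from judgeval.data.trace_run import TraceRun
from judgeval.scorers import APIScorerConfig

run = TraceRun(scorers=[APIScorerConfig(score_type="faithfulness")])
assert run.model == DEFAULT_GPT_MODEL   # default judge model now comes from constants
assert not hasattr(run, "append")       # append/override no longer live on the run
```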
judgeval/dataset.py
CHANGED
@@ -35,6 +35,7 @@ class Dataset:
         for e in examples:
             if isinstance(e, dict) and isinstance(e.get("data"), dict):
                 e.update(e.pop("data"))
+        judgeval_logger.info(f"Succesfully retrieved dataset {name}!")
         return cls(
             name=name,
             project_name=project_name,
@@ -68,6 +69,7 @@
             traces=[t.model_dump() for t in traces],
             overwrite=overwrite,
         )
+        judgeval_logger.info(f"Succesfull created dataset {name}!")
         return cls(
             name=name,
             project_name=project_name,
judgeval/integrations/langgraph.py
CHANGED
@@ -133,7 +133,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         inputs: Optional[Dict[str, Any]] = None,
     ) -> None:
         """Start tracking a span, ensuring trace client exists"""
-
+        if name.startswith("__") and name.endswith("__"):
+            return
         start_time = time.time()
         span_id = str(uuid.uuid4())
         parent_span_id: Optional[str] = None
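The LangGraph handler now ignores dunder-style span names such as LangGraph's internal `__start__` node. The guard is shown below as a standalone predicate for clarity; the helper name is ours, not part of the library.

```python
def _should_skip_span(name: str) -> bool:
    # Same check the handler now performs before starting a span.
    return name.startswith("__") and name.endswith("__")


assert _should_skip_span("__start__") is True    # LangGraph-internal node: skipped
assert _should_skip_span("call_model") is False  # ordinary node: still traced
```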
judgeval/judges/litellm_judge.py
CHANGED
@@ -7,6 +7,7 @@ from judgeval.common.utils import (
     fetch_litellm_api_response,
 )
 from judgeval.common.logger import judgeval_logger
+from judgeval.constants import DEFAULT_GPT_MODEL

 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -14,7 +15,7 @@ BASE_CONVERSATION = [


 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str =
+    def __init__(self, model: str = DEFAULT_GPT_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
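Both judges now default to the shared `DEFAULT_GPT_MODEL` constant instead of a hard-coded model string. A minimal check, assuming the judge is constructed without an explicit model:

```python
# Sketch: the judge picks up DEFAULT_GPT_MODEL when no model is supplied.
from judgeval.constants import DEFAULT_GPT_MODEL
from judgeval.judges.litellm_judge import LiteLLMJudge

judge = LiteLLMJudge()
assert judge.model == DEFAULT_GPT_MODEL
```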
judgeval/judges/mixture_of_judges.py
CHANGED
@@ -14,6 +14,7 @@ from judgeval.common.utils import (
     aget_chat_completion,
 )
 from judgeval.common.logger import judgeval_logger
+from judgeval.constants import DEFAULT_GPT_MODEL


 def build_dynamic_mixture_prompt(
@@ -161,7 +162,7 @@ class MixtureOfJudges(JudgevalJudge):
             "LLAMA3_70B_INSTRUCT_TURBO",
             "MISTRAL_8x22B_INSTRUCT",
         ],
-        aggregator: str =
+        aggregator: str = DEFAULT_GPT_MODEL,
         **kwargs,
     ):
         """
|