judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,7 +21,7 @@ from judgeval.common.tracer.otel_exporter import JudgmentAPISpanExporter
  from judgeval.common.tracer.span_processor import SpanProcessorBase
  from judgeval.common.tracer.span_transformer import SpanTransformer
  from judgeval.data import TraceSpan
- from judgeval.evaluation_run import EvaluationRun
+ from judgeval.data.evaluation_run import EvaluationRun


  class SimpleReadableSpan(ReadableSpan):
@@ -0,0 +1,119 @@
+ from __future__ import annotations
+ import logging
+ from typing import Any, TypeAlias
+
+
+ logger = logging.getLogger(__name__)
+ # TODO: Have functions that assert and return the relevant exports when the client is installed.
+ # The method should raise if the user tries to access client information that doesn't exist.
+
+ HAS_OPENAI = False
+ openai_OpenAI = None
+ openai_AsyncOpenAI = None
+ openai_ChatCompletion = None
+ openai_Response = None
+ openai_ParsedChatCompletion = None
+
+ try:
+     from openai import OpenAI, AsyncOpenAI
+     from openai.types.chat.chat_completion import ChatCompletion
+     from openai.types.responses.response import Response
+     from openai.types.chat import ParsedChatCompletion
+
+     openai_OpenAI = OpenAI
+     openai_AsyncOpenAI = AsyncOpenAI
+     openai_ChatCompletion = ChatCompletion
+     openai_Response = Response
+     openai_ParsedChatCompletion = ParsedChatCompletion
+     HAS_OPENAI = True
+ except ImportError:
+     pass
+
+
+ HAS_TOGETHER = False
+ together_Together = None
+ together_AsyncTogether = None
+
+ try:
+     from together import Together, AsyncTogether
+
+     together_Together = Together
+     together_AsyncTogether = AsyncTogether
+     HAS_TOGETHER = True
+ except ImportError:
+     pass
+
+
+ HAS_ANTHROPIC = False
+ anthropic_Anthropic = None
+ anthropic_AsyncAnthropic = None
+
+ try:
+     from anthropic import Anthropic, AsyncAnthropic
+
+     anthropic_Anthropic = Anthropic
+     anthropic_AsyncAnthropic = AsyncAnthropic
+     HAS_ANTHROPIC = True
+ except ImportError:
+     pass
+
+
+ HAS_GOOGLE_GENAI = False
+ google_genai_Client = None
+ google_genai_AsyncClient = None
+
+ try:
+     from google.genai import Client
+     from google.genai.client import AsyncClient
+
+     google_genai_Client = Client
+     google_genai_AsyncClient = AsyncClient
+     HAS_GOOGLE_GENAI = True
+ except ImportError:
+     pass
+
+
+ HAS_GROQ = False
+ groq_Groq = None
+ groq_AsyncGroq = None
+
+ try:
+     from groq import Groq, AsyncGroq
+
+     groq_Groq = Groq
+     groq_AsyncGroq = AsyncGroq
+     HAS_GROQ = True
+ except ImportError:
+     pass
+
+
+ # TODO: if we support dependency groups we can have this better type, but during runtime, we do
+ # not know which clients an end user might have installed.
+ ApiClient: TypeAlias = Any
+
+ __all__ = [
+     "ApiClient",
+     # OpenAI
+     "HAS_OPENAI",
+     "openai_OpenAI",
+     "openai_AsyncOpenAI",
+     "openai_ChatCompletion",
+     "openai_Response",
+     "openai_ParsedChatCompletion",
+     # Together
+     "HAS_TOGETHER",
+     "together_Together",
+     "together_AsyncTogether",
+     # Anthropic
+     "HAS_ANTHROPIC",
+     "anthropic_Anthropic",
+     "anthropic_AsyncAnthropic",
+     # Google GenAI
+     "HAS_GOOGLE_GENAI",
+     "google_genai_Client",
+     "google_genai_AsyncClient",
+     # Groq
+     "HAS_GROQ",
+     "groq_Groq",
+     "groq_AsyncGroq",
+ ]
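
Editor's note (illustration, not part of the diff): downstream tracer code is expected to gate on the HAS_* flags before using the lazily imported client classes. A minimal sketch, assuming these names are imported from wherever this new module lives (its path is not shown in the diff):

    def is_openai_client(client: ApiClient) -> bool:
        # Safe even when the openai package is absent: the flag short-circuits
        # before the None placeholders reach isinstance().
        return HAS_OPENAI and isinstance(client, (openai_OpenAI, openai_AsyncOpenAI))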
@@ -7,7 +7,7 @@ When monitoring is enabled, we use JudgmentSpanProcessor which overrides the met
  """

  from judgeval.data import TraceSpan
- from judgeval.evaluation_run import EvaluationRun
+ from judgeval.data.evaluation_run import EvaluationRun


  class SpanProcessorBase:
@@ -9,8 +9,9 @@ from typing import Any, Dict, Mapping, Optional, Union
  from opentelemetry.sdk.trace import ReadableSpan
  from pydantic import BaseModel

+ from judgeval.common.api.json_encoder import json_encoder
  from judgeval.data import TraceSpan
- from judgeval.evaluation_run import EvaluationRun
+ from judgeval.data.evaluation_run import EvaluationRun


  class SpanTransformer:
@@ -38,21 +39,13 @@ class SpanTransformer:
          return True

      @staticmethod
-     def _safe_json_handle(obj: Any, serialize: bool = True) -> Any:
-         if serialize:
-             if obj is None:
-                 return None
-             try:
-                 return orjson.dumps(obj, default=str).decode("utf-8")
-             except Exception:
-                 return orjson.dumps(str(obj)).decode("utf-8")
-         else:
-             if not isinstance(obj, str):
-                 return obj
-             try:
-                 return orjson.loads(obj)
-             except (orjson.JSONDecodeError, TypeError, ValueError):
-                 return obj
+     def _safe_deserialize(obj: Any) -> Any:
+         if not isinstance(obj, str):
+             return obj
+         try:
+             return orjson.loads(obj)
+         except (orjson.JSONDecodeError, TypeError):
+             return obj

      @staticmethod
      def _format_timestamp(timestamp: Optional[Union[float, int, str]]) -> str:
@@ -84,15 +77,13 @@ class SpanTransformer:
              if field_name == "created_at":
                  attributes[attr_name] = SpanTransformer._format_timestamp(value)
              elif field_name == "expected_tools" and value:
-                 attributes[attr_name] = SpanTransformer._safe_json_handle(
+                 attributes[attr_name] = json_encoder(
                      [tool.model_dump() for tool in trace_span.expected_tools]
                  )
              elif field_name == "usage" and value:
-                 attributes[attr_name] = SpanTransformer._safe_json_handle(
-                     trace_span.usage.model_dump()
-                 )
+                 attributes[attr_name] = json_encoder(trace_span.usage)
              elif SpanTransformer._needs_json_serialization(value):
-                 attributes[attr_name] = SpanTransformer._safe_json_handle(value)
+                 attributes[attr_name] = json_encoder(value)
              else:
                  attributes[attr_name] = value

@@ -115,7 +106,7 @@ class SpanTransformer:
              field_name = key[9:]

              if isinstance(value, str):
-                 deserialized = SpanTransformer._safe_json_handle(value, serialize=False)
+                 deserialized = SpanTransformer._safe_deserialize(value)
                  judgment_data[field_name] = deserialized
              else:
                  judgment_data[field_name] = value
@@ -159,6 +150,7 @@ class SpanTransformer:
              "additional_metadata": judgment_data.get("additional_metadata"),
              "has_evaluation": judgment_data.get("has_evaluation", False),
              "agent_name": judgment_data.get("agent_name"),
+             "class_name": judgment_data.get("class_name"),
              "state_before": judgment_data.get("state_before"),
              "state_after": judgment_data.get("state_after"),
              "update_id": judgment_data.get("update_id", 1),
@@ -174,9 +166,7 @@ class SpanTransformer:
          attributes = {
              "judgment.evaluation_run": True,
              "judgment.associated_span_id": span_id,
-             "judgment.span_data": SpanTransformer._safe_json_handle(
-                 span_data.model_dump()
-             ),
+             "judgment.span_data": json_encoder(span_data),
          }

          eval_data = evaluation_run.model_dump()
@@ -186,7 +176,7 @@ class SpanTransformer:

              attr_name = f"judgment.{key}"
              if SpanTransformer._needs_json_serialization(value):
-                 attributes[attr_name] = SpanTransformer._safe_json_handle(value)
+                 attributes[attr_name] = json_encoder(value)
              else:
                  attributes[attr_name] = value

judgeval/constants.py CHANGED
@@ -105,6 +105,7 @@ TOGETHER_SUPPORTED_MODELS = [
  ]

  DEFAULT_TOGETHER_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
+ DEFAULT_GPT_MODEL = "gpt-4.1"

  JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}

@@ -0,0 +1,104 @@
+ from typing import List, Optional, Union
+ from pydantic import field_validator, model_validator, Field
+ from datetime import datetime, timezone
+ import uuid
+
+ from judgeval.data import Example
+ from judgeval.scorers import BaseScorer, APIScorerConfig
+ from judgeval.constants import ACCEPTABLE_MODELS
+ from judgeval.data.judgment_types import EvaluationRunJudgmentType
+
+
+ class EvaluationRun(EvaluationRunJudgmentType):
+     """
+     Stores examples and evaluation scorers together for running an eval task
+
+     Args:
+         project_name (str): The name of the project the evaluation results belong to
+         eval_name (str): A name for this evaluation run
+         examples (List[Example]): The examples to evaluate
+         scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
+         model (str): The model used as a judge when using LLM as a Judge
+         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
+     """
+
+     id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
+     created_at: Optional[str] = Field(
+         default_factory=lambda: datetime.now(timezone.utc).isoformat()
+     )
+     custom_scorers: Optional[List[BaseScorer]] = None
+     judgment_scorers: Optional[List[APIScorerConfig]] = None
+     organization_id: Optional[str] = None
+
+     def __init__(
+         self,
+         scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None,
+         **kwargs,
+     ):
+         """
+         Initialize EvaluationRun with automatic scorer classification.
+
+         Args:
+             scorers: List of scorers that will be automatically sorted into custom_scorers or judgment_scorers
+             **kwargs: Other initialization arguments
+         """
+         if scorers is not None:
+             # Automatically sort scorers into appropriate fields
+             custom_scorers = [s for s in scorers if isinstance(s, BaseScorer)]
+             judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
+
+             # Always set both fields as lists (even if empty) to satisfy validation
+             kwargs["custom_scorers"] = custom_scorers
+             kwargs["judgment_scorers"] = judgment_scorers
+
+         super().__init__(**kwargs)
+
+     def model_dump(self, **kwargs):
+         data = super().model_dump(**kwargs)
+         data["custom_scorers"] = [s.model_dump() for s in self.custom_scorers]
+         data["judgment_scorers"] = [s.model_dump() for s in self.judgment_scorers]
+         data["examples"] = [example.model_dump() for example in self.examples]
+
+         return data
+
+     @field_validator("examples")
+     def validate_examples(cls, v):
+         if not v:
+             raise ValueError("Examples cannot be empty.")
+         for item in v:
+             if not isinstance(item, Example):
+                 raise ValueError(f"Item of type {type(item)} is not an Example")
+         return v
+
+     @model_validator(mode="after")
+     @classmethod
+     def validate_scorer_lists(cls, values):
+         custom_scorers = values.custom_scorers
+         judgment_scorers = values.judgment_scorers
+
+         # Check that at least one list is non-empty
+         if not custom_scorers and not judgment_scorers:
+             raise ValueError(
+                 "At least one of custom_scorers or judgment_scorers must be provided."
+             )
+
+         # Check that only one list is filled
+         if custom_scorers and judgment_scorers:
+             raise ValueError(
+                 "Only one of custom_scorers or judgment_scorers can be provided, not both."
+             )
+
+         return values
+
+     @field_validator("model")
+     def validate_model(cls, v, values):
+         if not v:
+             raise ValueError("Model cannot be empty.")
+
+         # Check if model is string or list of strings
+         if isinstance(v, str):
+             if v not in ACCEPTABLE_MODELS:
+                 raise ValueError(
+                     f"Model name {v} not recognized. Please select a valid model name."
+                 )
+             return v
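
Usage sketch (editor's illustration, not part of the diff): scorers passed through the `scorers` argument are split automatically, and the model validator then enforces that only one of the two lists is populated. The `Example` and `APIScorerConfig` field names below are assumptions for illustration:

    run = EvaluationRun(
        project_name="my-project",
        eval_name="smoke-test",
        examples=[Example(input="hi", actual_output="hello")],   # assumed Example fields
        scorers=[APIScorerConfig(score_type="faithfulness")],    # assumed config fields
        model="gpt-4.1",
    )
    assert run.judgment_scorers and not run.custom_scorers       # auto-classified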
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: openapi_new.json
- # timestamp: 2025-07-29T18:13:07+00:00
+ # timestamp: 2025-08-08T18:50:51+00:00

  from __future__ import annotations

@@ -51,6 +51,31 @@ class ScorerConfigJudgmentType(BaseModel):
      kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None


+ class BaseScorerJudgmentType(BaseModel):
+     score_type: Annotated[str, Field(title="Score Type")]
+     threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+     name: Annotated[Optional[str], Field(title="Name")] = None
+     class_name: Annotated[Optional[str], Field(title="Class Name")] = None
+     score: Annotated[Optional[float], Field(title="Score")] = None
+     score_breakdown: Annotated[
+         Optional[Dict[str, Any]], Field(title="Score Breakdown")
+     ] = None
+     reason: Annotated[Optional[str], Field(title="Reason")] = ""
+     using_native_model: Annotated[Optional[bool], Field(title="Using Native Model")] = (
+         None
+     )
+     success: Annotated[Optional[bool], Field(title="Success")] = None
+     model: Annotated[Optional[str], Field(title="Model")] = None
+     model_client: Annotated[Any, Field(title="Model Client")] = None
+     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+     error: Annotated[Optional[str], Field(title="Error")] = None
+     additional_metadata: Annotated[
+         Optional[Dict[str, Any]], Field(title="Additional Metadata")
+     ] = None
+     user: Annotated[Optional[str], Field(title="User")] = None
+     server_hosted: Annotated[Optional[bool], Field(title="Server Hosted")] = False
+
+
  class TraceUsageJudgmentType(BaseModel):
      prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
      completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
@@ -90,15 +115,21 @@ class HTTPValidationErrorJudgmentType(BaseModel):
      ] = None


- class JudgmentEvalJudgmentType(BaseModel):
+ class EvaluationRunJudgmentType(BaseModel):
+     id: Annotated[Optional[str], Field(title="Id")] = None
      project_name: Annotated[Optional[str], Field(title="Project Name")] = None
      eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
      examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
-     scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
+     custom_scorers: Annotated[
+         Optional[List[BaseScorerJudgmentType]], Field(title="Custom Scorers")
+     ] = Field(default_factory=list)
+     judgment_scorers: Annotated[
+         Optional[List[ScorerConfigJudgmentType]], Field(title="Judgment Scorers")
+     ] = Field(default_factory=list)
      model: Annotated[str, Field(title="Model")]
-     append: Annotated[Optional[bool], Field(title="Append")] = False
-     override: Annotated[Optional[bool], Field(title="Override")] = False
      trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+     created_at: Annotated[Optional[str], Field(title="Created At")] = None


  class TraceSpanJudgmentType(BaseModel):
@@ -122,6 +153,7 @@ class TraceSpanJudgmentType(BaseModel):
      ] = None
      has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
      agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
+     class_name: Annotated[Optional[str], Field(title="Class Name")] = None
      state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
          None
      )
@@ -171,8 +203,6 @@ class TraceRunJudgmentType(BaseModel):
      traces: Annotated[List[TraceJudgmentType], Field(title="Traces")]
      scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
      model: Annotated[str, Field(title="Model")]
-     append: Annotated[Optional[bool], Field(title="Append")] = False
-     override: Annotated[Optional[bool], Field(title="Override")] = False
      trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
      tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None

@@ -180,5 +210,5 @@ class TraceRunJudgmentType(BaseModel):
  class EvalResultsJudgmentType(BaseModel):
      results: Annotated[List[ScoringResultJudgmentType], Field(title="Results")]
      run: Annotated[
-         Union[TraceRunJudgmentType, JudgmentEvalJudgmentType], Field(title="Run")
+         Union[TraceRunJudgmentType, EvaluationRunJudgmentType], Field(title="Run")
      ]
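
For clarity (editor's illustration, not part of the diff): the renamed generated model drops `scorers`, `append`, and `override` in favor of the two scorer lists plus the new `id`/`trace_id`/`created_at` fields, so a payload is now built roughly like this (empty lists stand in for real items):

    payload = EvaluationRunJudgmentType(
        examples=[],           # ExampleJudgmentType items in real use
        model="gpt-4.1",
        custom_scorers=[],     # BaseScorerJudgmentType items
        judgment_scorers=[],   # ScorerConfigJudgmentType items
    )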
judgeval/data/trace.py CHANGED
@@ -1,7 +1,4 @@
- from typing import Any
- import sys
  import threading
- import orjson
  from datetime import datetime, timezone
  from judgeval.data.judgment_types import (
      TraceUsageJudgmentType,
@@ -9,7 +6,7 @@ from judgeval.data.judgment_types import (
      TraceJudgmentType,
  )
  from judgeval.constants import SPAN_LIFECYCLE_END_UPDATE_ID
- from pydantic import BaseModel
+ from judgeval.common.api.json_encoder import json_encoder


  class TraceUsage(TraceUsageJudgmentType):
@@ -25,9 +22,9 @@ class TraceSpan(TraceSpanJudgmentType):
              "created_at": datetime.fromtimestamp(
                  self.created_at, tz=timezone.utc
              ).isoformat(),
-             "inputs": self._serialize_value(self.inputs),
-             "output": self._serialize_value(self.output),
-             "error": self._serialize_value(self.error),
+             "inputs": json_encoder(self.inputs),
+             "output": json_encoder(self.output),
+             "error": json_encoder(self.error),
              "parent_span_id": self.parent_span_id,
              "function": self.function,
              "duration": self.duration,
@@ -35,9 +32,10 @@ class TraceSpan(TraceSpanJudgmentType):
              "usage": self.usage.model_dump() if self.usage else None,
              "has_evaluation": self.has_evaluation,
              "agent_name": self.agent_name,
+             "class_name": self.class_name,
              "state_before": self.state_before,
              "state_after": self.state_after,
-             "additional_metadata": self._serialize_value(self.additional_metadata),
+             "additional_metadata": json_encoder(self.additional_metadata),
              "update_id": self.update_id,
          }

@@ -80,120 +78,6 @@ class TraceSpan(TraceSpanJudgmentType):
          )
          print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")

-     def _is_json_serializable(self, obj: Any) -> bool:
-         """Helper method to check if an object is JSON serializable."""
-         try:
-             orjson.dumps(obj)
-             return True
-         except (TypeError, OverflowError, ValueError):
-             return False
-
-     def safe_stringify(self, output, function_name):
-         """
-         Safely converts an object to a JSON-serializable structure, handling common object types intelligently.
-         """
-         # Handle Pydantic models
-         if hasattr(output, "model_dump"):
-             try:
-                 return output.model_dump()
-             except Exception:
-                 pass
-
-         # Handle LangChain messages and similar objects with content/type
-         if hasattr(output, "content") and hasattr(output, "type"):
-             try:
-                 result = {"type": output.type, "content": output.content}
-                 # Add additional fields if they exist
-                 if hasattr(output, "additional_kwargs"):
-                     result["additional_kwargs"] = output.additional_kwargs
-                 if hasattr(output, "response_metadata"):
-                     result["response_metadata"] = output.response_metadata
-                 if hasattr(output, "name"):
-                     result["name"] = output.name
-                 return result
-             except Exception:
-                 pass
-
-         if hasattr(output, "dict"):
-             try:
-                 return output.dict()
-             except Exception:
-                 pass
-
-         if hasattr(output, "to_dict"):
-             try:
-                 return output.to_dict()
-             except Exception:
-                 pass
-
-         if hasattr(output, "__dataclass_fields__"):
-             try:
-                 import dataclasses
-
-                 return dataclasses.asdict(output)
-             except Exception:
-                 pass
-
-         if hasattr(output, "__dict__"):
-             try:
-                 return output.__dict__
-             except Exception:
-                 pass
-
-         try:
-             return str(output)
-         except (TypeError, OverflowError, ValueError):
-             pass
-
-         try:
-             return repr(output)
-         except (TypeError, OverflowError, ValueError):
-             pass
-
-         return None
-
-     def _serialize_value(self, value: Any) -> Any:
-         """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
-         if value is None:
-             return None
-
-         recursion_limit = sys.getrecursionlimit()
-         recursion_limit = int(recursion_limit * 0.75)
-
-         def serialize_value(value, current_depth=0):
-             try:
-                 if current_depth > recursion_limit:
-                     return {"error": "max_depth_reached: " + type(value).__name__}
-
-                 if isinstance(value, BaseModel):
-                     return value.model_dump()
-                 elif isinstance(value, dict):
-                     # Recursively serialize dictionary values
-                     return {
-                         k: serialize_value(v, current_depth + 1)
-                         for k, v in value.items()
-                     }
-                 elif isinstance(value, (list, tuple)):
-                     # Recursively serialize list/tuple items
-                     return [serialize_value(item, current_depth + 1) for item in value]
-                 else:
-                     try:
-                         orjson.dumps(value)
-                         return value
-                     except (TypeError, OverflowError, ValueError):
-                         # Fallback to safe stringification
-                         return self.safe_stringify(value, self.function)
-                     except Exception:
-                         return {"error": "Unable to serialize"}
-             except Exception:
-                 return {"error": "Unable to serialize"}
-
-         # Start serialization with the top-level value
-         try:
-             return serialize_value(value, current_depth=0)
-         except Exception:
-             return {"error": "Unable to serialize"}
-

  class Trace(TraceJudgmentType):
      pass
@@ -3,6 +3,7 @@ from typing import List, Optional, Dict, Any, Union
  from judgeval.data import Trace
  from judgeval.scorers import APIScorerConfig, BaseScorer
  from judgeval.rules import Rule
+ from judgeval.constants import DEFAULT_GPT_MODEL


  class TraceRun(BaseModel):
@@ -26,10 +27,8 @@ class TraceRun(BaseModel):
      eval_name: Optional[str] = None
      traces: Optional[List[Trace]] = None
      scorers: List[Union[APIScorerConfig, BaseScorer]]
-     model: Optional[str] = "gpt-4.1"
+     model: Optional[str] = DEFAULT_GPT_MODEL
      trace_span_id: Optional[str] = None
-     append: Optional[bool] = False
-     override: Optional[bool] = False
      rules: Optional[List[Rule]] = None
      tools: Optional[List[Dict[str, Any]]] = None

judgeval/dataset.py CHANGED
@@ -35,6 +35,7 @@ class Dataset:
          for e in examples:
              if isinstance(e, dict) and isinstance(e.get("data"), dict):
                  e.update(e.pop("data"))
+         judgeval_logger.info(f"Successfully retrieved dataset {name}!")
          return cls(
              name=name,
              project_name=project_name,
@@ -68,6 +69,7 @@ class Dataset:
              traces=[t.model_dump() for t in traces],
              overwrite=overwrite,
          )
+         judgeval_logger.info(f"Successfully created dataset {name}!")
          return cls(
              name=name,
              project_name=project_name,
@@ -133,7 +133,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          inputs: Optional[Dict[str, Any]] = None,
      ) -> None:
          """Start tracking a span, ensuring trace client exists"""
-
+         if name.startswith("__") and name.endswith("__"):
+             return
          start_time = time.time()
          span_id = str(uuid.uuid4())
          parent_span_id: Optional[str] = None
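
The guard added above returns early for dunder-style run names (e.g. LangGraph's internal "__start__"/"__end__" nodes), so they never become spans. A quick illustration of the predicate (editor's example, not from the diff):

    for name in ("__start__", "ChatOpenAI", "__end__"):
        skipped = name.startswith("__") and name.endswith("__")
        print(name, "skipped" if skipped else "tracked")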
@@ -7,6 +7,7 @@ from judgeval.common.utils import (
      fetch_litellm_api_response,
  )
  from judgeval.common.logger import judgeval_logger
+ from judgeval.constants import DEFAULT_GPT_MODEL

  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
@@ -14,7 +15,7 @@ BASE_CONVERSATION = [


  class LiteLLMJudge(JudgevalJudge):
-     def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
+     def __init__(self, model: str = DEFAULT_GPT_MODEL, **kwargs):
          self.model = model
          self.kwargs = kwargs
          super().__init__(model_name=model)
@@ -14,6 +14,7 @@ from judgeval.common.utils import (
      aget_chat_completion,
  )
  from judgeval.common.logger import judgeval_logger
+ from judgeval.constants import DEFAULT_GPT_MODEL


  def build_dynamic_mixture_prompt(
@@ -161,7 +162,7 @@ class MixtureOfJudges(JudgevalJudge):
              "LLAMA3_70B_INSTRUCT_TURBO",
              "MISTRAL_8x22B_INSTRUCT",
          ],
-         aggregator: str = "gpt-4.1",
+         aggregator: str = DEFAULT_GPT_MODEL,
          **kwargs,
      ):
          """