judgeval 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,119 @@
+ from __future__ import annotations
+ import logging
+ from typing import Any, TypeAlias
+
+
+ logger = logging.getLogger(__name__)
+ # TODO: Have functions that assert and return the relevant exports when the client is installed.
+ # The method should raise if the user tries to access client information that doesn't exist.
+
+ HAS_OPENAI = False
+ openai_OpenAI = None
+ openai_AsyncOpenAI = None
+ openai_ChatCompletion = None
+ openai_Response = None
+ openai_ParsedChatCompletion = None
+
+ try:
+     from openai import OpenAI, AsyncOpenAI
+     from openai.types.chat.chat_completion import ChatCompletion
+     from openai.types.responses.response import Response
+     from openai.types.chat import ParsedChatCompletion
+
+     openai_OpenAI = OpenAI
+     openai_AsyncOpenAI = AsyncOpenAI
+     openai_ChatCompletion = ChatCompletion
+     openai_Response = Response
+     openai_ParsedChatCompletion = ParsedChatCompletion
+     HAS_OPENAI = True
+ except ImportError:
+     pass
+
+
+ HAS_TOGETHER = False
+ together_Together = None
+ together_AsyncTogether = None
+
+ try:
+     from together import Together, AsyncTogether
+
+     together_Together = Together
+     together_AsyncTogether = AsyncTogether
+     HAS_TOGETHER = True
+ except ImportError:
+     pass
+
+
+ HAS_ANTHROPIC = False
+ anthropic_Anthropic = None
+ anthropic_AsyncAnthropic = None
+
+ try:
+     from anthropic import Anthropic, AsyncAnthropic
+
+     anthropic_Anthropic = Anthropic
+     anthropic_AsyncAnthropic = AsyncAnthropic
+     HAS_ANTHROPIC = True
+ except ImportError:
+     pass
+
+
+ HAS_GOOGLE_GENAI = False
+ google_genai_Client = None
+ google_genai_AsyncClient = None
+
+ try:
+     from google.genai import Client
+     from google.genai.client import AsyncClient
+
+     google_genai_Client = Client
+     google_genai_AsyncClient = AsyncClient
+     HAS_GOOGLE_GENAI = True
+ except ImportError:
+     pass
+
+
+ HAS_GROQ = False
+ groq_Groq = None
+ groq_AsyncGroq = None
+
+ try:
+     from groq import Groq, AsyncGroq
+
+     groq_Groq = Groq
+     groq_AsyncGroq = AsyncGroq
+     HAS_GROQ = True
+ except ImportError:
+     pass
+
+
+ # TODO: if we support dependency groups we can give this a better type, but at runtime we do
+ # not know which clients an end user might have installed.
+ ApiClient: TypeAlias = Any
+
+ __all__ = [
+     "ApiClient",
+     # OpenAI
+     "HAS_OPENAI",
+     "openai_OpenAI",
+     "openai_AsyncOpenAI",
+     "openai_ChatCompletion",
+     "openai_Response",
+     "openai_ParsedChatCompletion",
+     # Together
+     "HAS_TOGETHER",
+     "together_Together",
+     "together_AsyncTogether",
+     # Anthropic
+     "HAS_ANTHROPIC",
+     "anthropic_Anthropic",
+     "anthropic_AsyncAnthropic",
+     # Google GenAI
+     "HAS_GOOGLE_GENAI",
+     "google_genai_Client",
+     "google_genai_AsyncClient",
+     # Groq
+     "HAS_GROQ",
+     "groq_Groq",
+     "groq_AsyncGroq",
+ ]
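The new module re-exports each provider's client classes behind HAS_* flags so the rest of the package can reference them without hard dependencies. A minimal sketch of how a caller might consume these exports; the module's import path is not named in this diff, so the path below is hypothetical:

from judgeval.clients import HAS_OPENAI, openai_OpenAI  # hypothetical import path

def make_openai_client():
    # Guard on the flag (and the re-exported name) before constructing the client,
    # since both stay None when the 'openai' package is not installed.
    if not HAS_OPENAI or openai_OpenAI is None:
        raise RuntimeError("The 'openai' package is not installed.")
    return openai_OpenAI()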
@@ -9,6 +9,7 @@ from typing import Any, Dict, Mapping, Optional, Union
  from opentelemetry.sdk.trace import ReadableSpan
  from pydantic import BaseModel
 
+ from judgeval.common.api.json_encoder import json_encoder
  from judgeval.data import TraceSpan
  from judgeval.evaluation_run import EvaluationRun
 
@@ -38,21 +39,13 @@ class SpanTransformer:
          return True
 
      @staticmethod
-     def _safe_json_handle(obj: Any, serialize: bool = True) -> Any:
-         if serialize:
-             if obj is None:
-                 return None
-             try:
-                 return orjson.dumps(obj, default=str).decode("utf-8")
-             except Exception:
-                 return orjson.dumps(str(obj)).decode("utf-8")
-         else:
-             if not isinstance(obj, str):
-                 return obj
-             try:
-                 return orjson.loads(obj)
-             except (orjson.JSONDecodeError, TypeError, ValueError):
-                 return obj
+     def _safe_deserialize(obj: Any) -> Any:
+         if not isinstance(obj, str):
+             return obj
+         try:
+             return orjson.loads(obj)
+         except (orjson.JSONDecodeError, TypeError):
+             return obj
 
      @staticmethod
      def _format_timestamp(timestamp: Optional[Union[float, int, str]]) -> str:
@@ -84,15 +77,13 @@ class SpanTransformer:
              if field_name == "created_at":
                  attributes[attr_name] = SpanTransformer._format_timestamp(value)
              elif field_name == "expected_tools" and value:
-                 attributes[attr_name] = SpanTransformer._safe_json_handle(
+                 attributes[attr_name] = json_encoder(
                      [tool.model_dump() for tool in trace_span.expected_tools]
                  )
              elif field_name == "usage" and value:
-                 attributes[attr_name] = SpanTransformer._safe_json_handle(
-                     trace_span.usage.model_dump()
-                 )
+                 attributes[attr_name] = json_encoder(trace_span.usage)
              elif SpanTransformer._needs_json_serialization(value):
-                 attributes[attr_name] = SpanTransformer._safe_json_handle(value)
+                 attributes[attr_name] = json_encoder(value)
              else:
                  attributes[attr_name] = value
 
@@ -115,7 +106,7 @@ class SpanTransformer:
              field_name = key[9:]
 
              if isinstance(value, str):
-                 deserialized = SpanTransformer._safe_json_handle(value, serialize=False)
+                 deserialized = SpanTransformer._safe_deserialize(value)
                  judgment_data[field_name] = deserialized
              else:
                  judgment_data[field_name] = value
@@ -174,9 +165,7 @@ class SpanTransformer:
          attributes = {
              "judgment.evaluation_run": True,
              "judgment.associated_span_id": span_id,
-             "judgment.span_data": SpanTransformer._safe_json_handle(
-                 span_data.model_dump()
-             ),
+             "judgment.span_data": json_encoder(span_data),
          }
 
          eval_data = evaluation_run.model_dump()
@@ -186,7 +175,7 @@ class SpanTransformer:
 
              attr_name = f"judgment.{key}"
              if SpanTransformer._needs_json_serialization(value):
-                 attributes[attr_name] = SpanTransformer._safe_json_handle(value)
+                 attributes[attr_name] = json_encoder(value)
              else:
                  attributes[attr_name] = value
 
judgeval/constants.py CHANGED
@@ -105,6 +105,7 @@ TOGETHER_SUPPORTED_MODELS = [
  ]
 
  DEFAULT_TOGETHER_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
+ DEFAULT_GPT_MODEL = "gpt-4.1"
 
  JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: openapi_new.json
- # timestamp: 2025-07-29T18:13:07+00:00
+ # timestamp: 2025-08-01T22:19:19+00:00
 
  from __future__ import annotations
 
@@ -99,6 +99,7 @@ class JudgmentEvalJudgmentType(BaseModel):
      append: Annotated[Optional[bool], Field(title="Append")] = False
      override: Annotated[Optional[bool], Field(title="Override")] = False
      trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
 
 
  class TraceSpanJudgmentType(BaseModel):
judgeval/data/trace.py CHANGED
@@ -1,7 +1,4 @@
- from typing import Any
- import sys
  import threading
- import orjson
  from datetime import datetime, timezone
  from judgeval.data.judgment_types import (
      TraceUsageJudgmentType,
@@ -9,7 +6,7 @@ from judgeval.data.judgment_types import (
      TraceJudgmentType,
  )
  from judgeval.constants import SPAN_LIFECYCLE_END_UPDATE_ID
- from pydantic import BaseModel
+ from judgeval.common.api.json_encoder import json_encoder
 
 
  class TraceUsage(TraceUsageJudgmentType):
@@ -25,9 +22,9 @@ class TraceSpan(TraceSpanJudgmentType):
              "created_at": datetime.fromtimestamp(
                  self.created_at, tz=timezone.utc
              ).isoformat(),
-             "inputs": self._serialize_value(self.inputs),
-             "output": self._serialize_value(self.output),
-             "error": self._serialize_value(self.error),
+             "inputs": json_encoder(self.inputs),
+             "output": json_encoder(self.output),
+             "error": json_encoder(self.error),
              "parent_span_id": self.parent_span_id,
              "function": self.function,
              "duration": self.duration,
@@ -37,7 +34,7 @@ class TraceSpan(TraceSpanJudgmentType):
              "agent_name": self.agent_name,
              "state_before": self.state_before,
              "state_after": self.state_after,
-             "additional_metadata": self._serialize_value(self.additional_metadata),
+             "additional_metadata": json_encoder(self.additional_metadata),
              "update_id": self.update_id,
          }
 
@@ -80,120 +77,6 @@ class TraceSpan(TraceSpanJudgmentType):
          )
          print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
 
-     def _is_json_serializable(self, obj: Any) -> bool:
-         """Helper method to check if an object is JSON serializable."""
-         try:
-             orjson.dumps(obj)
-             return True
-         except (TypeError, OverflowError, ValueError):
-             return False
-
-     def safe_stringify(self, output, function_name):
-         """
-         Safely converts an object to a JSON-serializable structure, handling common object types intelligently.
-         """
-         # Handle Pydantic models
-         if hasattr(output, "model_dump"):
-             try:
-                 return output.model_dump()
-             except Exception:
-                 pass
-
-         # Handle LangChain messages and similar objects with content/type
-         if hasattr(output, "content") and hasattr(output, "type"):
-             try:
-                 result = {"type": output.type, "content": output.content}
-                 # Add additional fields if they exist
-                 if hasattr(output, "additional_kwargs"):
-                     result["additional_kwargs"] = output.additional_kwargs
-                 if hasattr(output, "response_metadata"):
-                     result["response_metadata"] = output.response_metadata
-                 if hasattr(output, "name"):
-                     result["name"] = output.name
-                 return result
-             except Exception:
-                 pass
-
-         if hasattr(output, "dict"):
-             try:
-                 return output.dict()
-             except Exception:
-                 pass
-
-         if hasattr(output, "to_dict"):
-             try:
-                 return output.to_dict()
-             except Exception:
-                 pass
-
-         if hasattr(output, "__dataclass_fields__"):
-             try:
-                 import dataclasses
-
-                 return dataclasses.asdict(output)
-             except Exception:
-                 pass
-
-         if hasattr(output, "__dict__"):
-             try:
-                 return output.__dict__
-             except Exception:
-                 pass
-
-         try:
-             return str(output)
-         except (TypeError, OverflowError, ValueError):
-             pass
-
-         try:
-             return repr(output)
-         except (TypeError, OverflowError, ValueError):
-             pass
-
-         return None
-
-     def _serialize_value(self, value: Any) -> Any:
-         """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
-         if value is None:
-             return None
-
-         recursion_limit = sys.getrecursionlimit()
-         recursion_limit = int(recursion_limit * 0.75)
-
-         def serialize_value(value, current_depth=0):
-             try:
-                 if current_depth > recursion_limit:
-                     return {"error": "max_depth_reached: " + type(value).__name__}
-
-                 if isinstance(value, BaseModel):
-                     return value.model_dump()
-                 elif isinstance(value, dict):
-                     # Recursively serialize dictionary values
-                     return {
-                         k: serialize_value(v, current_depth + 1)
-                         for k, v in value.items()
-                     }
-                 elif isinstance(value, (list, tuple)):
-                     # Recursively serialize list/tuple items
-                     return [serialize_value(item, current_depth + 1) for item in value]
-                 else:
-                     try:
-                         orjson.dumps(value)
-                         return value
-                     except (TypeError, OverflowError, ValueError):
-                         # Fallback to safe stringification
-                         return self.safe_stringify(value, self.function)
-                     except Exception:
-                         return {"error": "Unable to serialize"}
-             except Exception:
-                 return {"error": "Unable to serialize"}
-
-         # Start serialization with the top-level value
-         try:
-             return serialize_value(value, current_depth=0)
-         except Exception:
-             return {"error": "Unable to serialize"}
-
 
  class Trace(TraceJudgmentType):
      pass
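Both files above now delegate serialization to judgeval.common.api.json_encoder.json_encoder, whose implementation is not part of this diff. A rough sketch of the behavior its call sites imply (Pydantic-aware encoding with a string fallback, mirroring the removed _safe_json_handle serialize path); the real helper may well differ:

from typing import Any

import orjson
from pydantic import BaseModel


def json_encoder(obj: Any) -> str:
    # Sketch only: assumed behavior, not the actual judgeval implementation.
    if isinstance(obj, BaseModel):
        obj = obj.model_dump()
    try:
        return orjson.dumps(obj, default=str).decode("utf-8")
    except Exception:
        return orjson.dumps(str(obj)).decode("utf-8")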
@@ -3,6 +3,7 @@ from typing import List, Optional, Dict, Any, Union
  from judgeval.data import Trace
  from judgeval.scorers import APIScorerConfig, BaseScorer
  from judgeval.rules import Rule
+ from judgeval.constants import DEFAULT_GPT_MODEL
 
 
  class TraceRun(BaseModel):
@@ -26,7 +27,7 @@ class TraceRun(BaseModel):
      eval_name: Optional[str] = None
      traces: Optional[List[Trace]] = None
      scorers: List[Union[APIScorerConfig, BaseScorer]]
-     model: Optional[str] = "gpt-4.1"
+     model: Optional[str] = DEFAULT_GPT_MODEL
      trace_span_id: Optional[str] = None
      append: Optional[bool] = False
      override: Optional[bool] = False
judgeval/dataset.py CHANGED
@@ -35,6 +35,7 @@ class Dataset:
          for e in examples:
              if isinstance(e, dict) and isinstance(e.get("data"), dict):
                  e.update(e.pop("data"))
+         judgeval_logger.info(f"Successfully retrieved dataset {name}!")
          return cls(
              name=name,
              project_name=project_name,
@@ -68,6 +69,7 @@ class Dataset:
              traces=[t.model_dump() for t in traces],
              overwrite=overwrite,
          )
+         judgeval_logger.info(f"Successfully created dataset {name}!")
          return cls(
              name=name,
              project_name=project_name,
@@ -3,7 +3,7 @@ from pydantic import BaseModel, field_validator, Field
 
  from judgeval.data import Example
  from judgeval.scorers import BaseScorer, APIScorerConfig
- from judgeval.constants import ACCEPTABLE_MODELS
+ from judgeval.constants import ACCEPTABLE_MODELS, DEFAULT_GPT_MODEL
 
 
  class EvaluationRun(BaseModel):
@@ -24,8 +24,9 @@ class EvaluationRun(BaseModel):
      eval_name: Optional[str] = Field(default=None, validate_default=True)
      examples: List[Example]
      scorers: List[Union[APIScorerConfig, BaseScorer]]
-     model: Optional[str] = "gpt-4.1"
+     model: Optional[str] = DEFAULT_GPT_MODEL
      trace_span_id: Optional[str] = None
+     trace_id: Optional[str] = None
      # API Key will be "" until user calls client.run_eval(), then API Key will be set
      override: Optional[bool] = False
      append: Optional[bool] = False
@@ -44,6 +45,9 @@ class EvaluationRun(BaseModel):
      def validate_examples(cls, v):
          if not v:
              raise ValueError("Examples cannot be empty.")
+         for item in v:
+             if not isinstance(item, Example):
+                 raise ValueError(f"Item of type {type(item)} is not an Example")
          return v
 
      @field_validator("scorers", mode="before")
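The stricter examples validator now rejects anything that is not an Example instance. A standalone illustration of the pattern, using simplified stand-in classes rather than the judgeval ones:

from typing import Any, List

from pydantic import BaseModel, field_validator


class Example(BaseModel):
    input: str = ""


class Run(BaseModel):
    examples: List[Any]

    @field_validator("examples")
    @classmethod
    def validate_examples(cls, v):
        if not v:
            raise ValueError("Examples cannot be empty.")
        for item in v:
            if not isinstance(item, Example):
                raise ValueError(f"Item of type {type(item)} is not an Example")
        return v


Run(examples=[Example(input="hi")])   # passes
# Run(examples=[{"input": "hi"}])     # raises: Item of type <class 'dict'> is not an Example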
@@ -7,6 +7,7 @@ from judgeval.common.utils import (
      fetch_litellm_api_response,
  )
  from judgeval.common.logger import judgeval_logger
+ from judgeval.constants import DEFAULT_GPT_MODEL
 
  BASE_CONVERSATION = [
      {"role": "system", "content": "You are a helpful assistant."},
@@ -14,7 +15,7 @@ BASE_CONVERSATION = [
 
 
  class LiteLLMJudge(JudgevalJudge):
-     def __init__(self, model: str = "gpt-4.1-mini", **kwargs):
+     def __init__(self, model: str = DEFAULT_GPT_MODEL, **kwargs):
          self.model = model
          self.kwargs = kwargs
          super().__init__(model_name=model)
@@ -14,6 +14,7 @@ from judgeval.common.utils import (
      aget_chat_completion,
  )
  from judgeval.common.logger import judgeval_logger
+ from judgeval.constants import DEFAULT_GPT_MODEL
 
 
  def build_dynamic_mixture_prompt(
@@ -161,7 +162,7 @@ class MixtureOfJudges(JudgevalJudge):
              "LLAMA3_70B_INSTRUCT_TURBO",
              "MISTRAL_8x22B_INSTRUCT",
          ],
-         aggregator: str = "gpt-4.1",
+         aggregator: str = DEFAULT_GPT_MODEL,
          **kwargs,
      ):
          """
judgeval/judges/utils.py CHANGED
@@ -7,6 +7,7 @@ from typing import Optional, Union, Tuple, List
 
  from judgeval.common.exceptions import InvalidJudgeModelError
  from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+ from judgeval.constants import DEFAULT_GPT_MODEL
  from judgeval.constants import (
      TOGETHER_SUPPORTED_MODELS,
      JUDGMENT_SUPPORTED_MODELS,
@@ -30,7 +31,7 @@ def create_judge(
      If no model is provided, uses GPT4o as the default judge.
      """
      if model is None:  # default option
-         return LiteLLMJudge(model="gpt-4.1"), True
+         return LiteLLMJudge(model=DEFAULT_GPT_MODEL), True
      if not isinstance(model, (str, list, JudgevalJudge)):
          raise InvalidJudgeModelError(
              f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
@@ -2,9 +2,10 @@
  Implements the JudgmentClient to interact with the Judgment API.
  """
 
+ from __future__ import annotations
  import os
  from uuid import uuid4
- from typing import Optional, List, Dict, Any, Union, Callable
+ from typing import Optional, List, Dict, Any, Union, Callable, TYPE_CHECKING
 
  from judgeval.data import (
      ScoringResult,
@@ -28,7 +29,11 @@ from judgeval.common.tracer import Tracer
  from judgeval.common.utils import validate_api_key
  from pydantic import BaseModel
  from judgeval.common.logger import judgeval_logger
- from judgeval.integrations.langgraph import JudgevalCallbackHandler
+
+
+ if TYPE_CHECKING:
+     from judgeval.integrations.langgraph import JudgevalCallbackHandler
+ from judgeval.constants import DEFAULT_GPT_MODEL
 
 
  class EvalRunRequestBody(BaseModel):
@@ -89,7 +94,7 @@ class JudgmentClient(metaclass=SingletonMeta):
          tools: Optional[List[Dict[str, Any]]] = None,
          project_name: str = "default_project",
          eval_run_name: str = "default_eval_trace",
-         model: Optional[str] = "gpt-4.1",
+         model: Optional[str] = DEFAULT_GPT_MODEL,
          append: bool = False,
          override: bool = False,
      ) -> List[ScoringResult]:
@@ -127,7 +132,7 @@ class JudgmentClient(metaclass=SingletonMeta):
          self,
          examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-         model: Optional[str] = "gpt-4.1",
+         model: Optional[str] = DEFAULT_GPT_MODEL,
          project_name: str = "default_project",
          eval_run_name: str = "default_eval_run",
          override: bool = False,
@@ -214,7 +219,7 @@ class JudgmentClient(metaclass=SingletonMeta):
          self,
          examples: List[Example],
          scorers: List[Union[APIScorerConfig, BaseScorer]],
-         model: Optional[str] = "gpt-4.1",
+         model: Optional[str] = DEFAULT_GPT_MODEL,
          project_name: str = "default_test",
          eval_run_name: str = str(uuid4()),
          override: bool = False,
@@ -255,7 +260,7 @@ class JudgmentClient(metaclass=SingletonMeta):
          tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
          traces: Optional[List[Trace]] = None,
          tools: Optional[List[Dict[str, Any]]] = None,
-         model: Optional[str] = "gpt-4.1",
+         model: Optional[str] = DEFAULT_GPT_MODEL,
          project_name: str = "default_test",
          eval_run_name: str = str(uuid4()),
          override: bool = False,
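Moving the JudgevalCallbackHandler import under TYPE_CHECKING, together with the new `from __future__ import annotations`, presumably keeps annotations that reference JudgevalCallbackHandler valid without importing the langgraph integration at runtime. The same pattern in isolation, as a generic sketch rather than the package's exact code:

from __future__ import annotations

from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Only evaluated by type checkers; never executed at runtime, so the optional
    # integration does not have to be installed just to import this module.
    from judgeval.integrations.langgraph import JudgevalCallbackHandler


def run(tracer: Optional[JudgevalCallbackHandler] = None) -> None:
    # With postponed annotation evaluation, the name above is never resolved at runtime.
    ...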