judgeval 0.3.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -0
- judgeval/clients.py +2 -1
- judgeval/common/api/api.py +4 -18
- judgeval/common/api/constants.py +1 -1
- judgeval/common/api/json_encoder.py +242 -0
- judgeval/common/tracer/core.py +498 -215
- judgeval/common/tracer/providers.py +119 -0
- judgeval/common/tracer/span_transformer.py +14 -25
- judgeval/constants.py +1 -0
- judgeval/data/judgment_types.py +2 -1
- judgeval/data/trace.py +5 -122
- judgeval/data/trace_run.py +2 -1
- judgeval/dataset.py +2 -0
- judgeval/evaluation_run.py +6 -2
- judgeval/judges/litellm_judge.py +2 -1
- judgeval/judges/mixture_of_judges.py +2 -1
- judgeval/judges/utils.py +2 -1
- judgeval/judgment_client.py +11 -6
- judgeval/local_eval_queue.py +192 -0
- judgeval/run_evaluation.py +11 -6
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +18 -19
- judgeval/scorers/score.py +34 -11
- judgeval/utils/async_utils.py +36 -0
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/METADATA +9 -12
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/RECORD +27 -23
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/WHEEL +0 -0
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer/providers.py ADDED
@@ -0,0 +1,119 @@
+from __future__ import annotations
+import logging
+from typing import Any, TypeAlias
+
+
+logger = logging.getLogger(__name__)
+# TODO: Have functions that assert and return the relevant exports when the client is installed.
+# The method should raise if the user tries to access client information that doesnt exist.
+
+HAS_OPENAI = False
+openai_OpenAI = None
+openai_AsyncOpenAI = None
+openai_ChatCompletion = None
+openai_Response = None
+openai_ParsedChatCompletion = None
+
+try:
+    from openai import OpenAI, AsyncOpenAI
+    from openai.types.chat.chat_completion import ChatCompletion
+    from openai.types.responses.response import Response
+    from openai.types.chat import ParsedChatCompletion
+
+    openai_OpenAI = OpenAI
+    openai_AsyncOpenAI = AsyncOpenAI
+    openai_ChatCompletion = ChatCompletion
+    openai_Response = Response
+    openai_ParsedChatCompletion = ParsedChatCompletion
+    HAS_OPENAI = True
+except ImportError:
+    pass
+
+
+HAS_TOGETHER = False
+together_Together = None
+together_AsyncTogether = None
+
+try:
+    from together import Together, AsyncTogether
+
+    together_Together = Together
+    together_AsyncTogether = AsyncTogether
+    HAS_TOGETHER = True
+except ImportError:
+    pass
+
+
+HAS_ANTHROPIC = False
+anthropic_Anthropic = None
+anthropic_AsyncAnthropic = None
+
+try:
+    from anthropic import Anthropic, AsyncAnthropic
+
+    anthropic_Anthropic = Anthropic
+    anthropic_AsyncAnthropic = AsyncAnthropic
+    HAS_ANTHROPIC = True
+except ImportError:
+    pass
+
+
+HAS_GOOGLE_GENAI = False
+google_genai_Client = None
+google_genai_cleint_AsyncClient = None
+
+try:
+    from google.genai import Client
+    from google.genai.client import AsyncClient
+
+    google_genai_Client = Client
+    google_genai_AsyncClient = AsyncClient
+    HAS_GOOGLE_GENAI = True
+except ImportError:
+    pass
+
+
+HAS_GROQ = False
+groq_Groq = None
+groq_AsyncGroq = None
+
+try:
+    from groq import Groq, AsyncGroq
+
+    groq_Groq = Groq
+    groq_AsyncGroq = AsyncGroq
+    HAS_GROQ = True
+except ImportError:
+    pass
+
+
+# TODO: if we support dependency groups we can have this better type, but during runtime, we do
+# not know which clients an end user might have installed.
+ApiClient: TypeAlias = Any
+
+__all__ = [
+    "ApiClient",
+    # OpenAI
+    "HAS_OPENAI",
+    "openai_OpenAI",
+    "openai_AsyncOpenAI",
+    "openai_ChatCompletion",
+    "openai_Response",
+    "openai_ParsedChatCompletion",
+    # Together
+    "HAS_TOGETHER",
+    "together_Together",
+    "together_AsyncTogether",
+    # Anthropic
+    "HAS_ANTHROPIC",
+    "anthropic_Anthropic",
+    "anthropic_AsyncAnthropic",
+    # Google GenAI
+    "HAS_GOOGLE_GENAI",
+    "google_genai_Client",
+    "google_genai_AsyncClient",
+    # Groq
+    "HAS_GROQ",
+    "groq_Groq",
+    "groq_AsyncGroq",
+]
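The new providers module centralizes optional third-party SDK imports behind HAS_* flags, with the re-exported handles left as None when an SDK is missing. A minimal consumer sketch (the wrapper function below is hypothetical; only the flag and handle names come from the hunk above):

# Hypothetical consumer of judgeval.common.tracer.providers (sketch only).
from judgeval.common.tracer import providers

def wrap_openai_client():
    # Guard on the capability flag before touching the re-exported handle,
    # since openai_OpenAI stays None when the openai package is absent.
    if not providers.HAS_OPENAI:
        raise RuntimeError("openai is not installed; install it to trace OpenAI calls")
    return providers.openai_OpenAI()  # same constructor as openai.OpenAI()

Note that the None placeholder for the Google async client is spelled google_genai_cleint_AsyncClient in the hunk above, so the google_genai_AsyncClient name listed in __all__ is only bound once the google.genai import succeeds.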
judgeval/common/tracer/span_transformer.py CHANGED
@@ -9,6 +9,7 @@ from typing import Any, Dict, Mapping, Optional, Union
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel
 
+from judgeval.common.api.json_encoder import json_encoder
 from judgeval.data import TraceSpan
 from judgeval.evaluation_run import EvaluationRun
 
@@ -38,21 +39,13 @@ class SpanTransformer:
         return True
 
     @staticmethod
-    def
-        if
-
-
-
-
-
-            return orjson.dumps(str(obj)).decode("utf-8")
-        else:
-            if not isinstance(obj, str):
-                return obj
-            try:
-                return orjson.loads(obj)
-            except (orjson.JSONDecodeError, TypeError, ValueError):
-                return obj
+    def _safe_deserialize(obj: Any) -> Any:
+        if not isinstance(obj, str):
+            return obj
+        try:
+            return orjson.loads(obj)
+        except (orjson.JSONDecodeError, TypeError):
+            return obj
 
     @staticmethod
     def _format_timestamp(timestamp: Optional[Union[float, int, str]]) -> str:
@@ -84,15 +77,13 @@ class SpanTransformer:
             if field_name == "created_at":
                 attributes[attr_name] = SpanTransformer._format_timestamp(value)
             elif field_name == "expected_tools" and value:
-                attributes[attr_name] =
+                attributes[attr_name] = json_encoder(
                     [tool.model_dump() for tool in trace_span.expected_tools]
                 )
             elif field_name == "usage" and value:
-                attributes[attr_name] =
-                    trace_span.usage.model_dump()
-                )
+                attributes[attr_name] = json_encoder(trace_span.usage)
             elif SpanTransformer._needs_json_serialization(value):
-                attributes[attr_name] =
+                attributes[attr_name] = json_encoder(value)
             else:
                 attributes[attr_name] = value
 
@@ -115,7 +106,7 @@ class SpanTransformer:
             field_name = key[9:]
 
             if isinstance(value, str):
-                deserialized = SpanTransformer.
+                deserialized = SpanTransformer._safe_deserialize(value)
                 judgment_data[field_name] = deserialized
             else:
                 judgment_data[field_name] = value
@@ -174,9 +165,7 @@ class SpanTransformer:
         attributes = {
            "judgment.evaluation_run": True,
            "judgment.associated_span_id": span_id,
-           "judgment.span_data":
-               span_data.model_dump()
-           ),
+           "judgment.span_data": json_encoder(span_data),
         }
 
         eval_data = evaluation_run.model_dump()
@@ -186,7 +175,7 @@ class SpanTransformer:
 
             attr_name = f"judgment.{key}"
             if SpanTransformer._needs_json_serialization(value):
-                attributes[attr_name] =
+                attributes[attr_name] = json_encoder(value)
             else:
                 attributes[attr_name] = value
 
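These changes route all attribute serialization in SpanTransformer through the new json_encoder helper and collapse the old serialize-or-deserialize method into a deserialize-only _safe_deserialize. A quick illustration of the new method's behavior (a standalone copy for demonstration; the real method is the staticmethod added above):

# Illustrative behavior of the _safe_deserialize logic added above.
import orjson
from typing import Any

def _safe_deserialize(obj: Any) -> Any:
    if not isinstance(obj, str):
        return obj                      # non-strings pass through untouched
    try:
        return orjson.loads(obj)        # JSON strings round-trip back to Python objects
    except (orjson.JSONDecodeError, TypeError):
        return obj                      # strings that are not valid JSON stay as strings

assert _safe_deserialize('{"tokens": 42}') == {"tokens": 42}
assert _safe_deserialize("not json") == "not json"
assert _safe_deserialize(123) == 123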
judgeval/constants.py CHANGED
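The constants.py hunk is collapsed in this view (+1 line per the summary above), but the DEFAULT_GPT_MODEL constant it introduces is imported throughout the hunks below. Its concrete value is not visible in this diff; the pattern is simply a single shared default, roughly:

# Assumed shape of the new constant in judgeval/constants.py; the value shown is a guess
# based on create_judge()'s "uses GPT4o as the default judge" docstring, not the diff itself.
DEFAULT_GPT_MODEL = "gpt-4o"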
judgeval/data/judgment_types.py CHANGED
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  openapi_new.json
-#   timestamp: 2025-
+#   timestamp: 2025-08-01T22:19:19+00:00
 
 from __future__ import annotations
 
@@ -99,6 +99,7 @@ class JudgmentEvalJudgmentType(BaseModel):
     append: Annotated[Optional[bool], Field(title="Append")] = False
     override: Annotated[Optional[bool], Field(title="Override")] = False
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+    trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
 
 
 class TraceSpanJudgmentType(BaseModel):
judgeval/data/trace.py CHANGED
@@ -1,7 +1,4 @@
-from typing import Any
-import sys
 import threading
-import orjson
 from datetime import datetime, timezone
 from judgeval.data.judgment_types import (
     TraceUsageJudgmentType,
@@ -9,7 +6,7 @@ from judgeval.data.judgment_types import (
     TraceJudgmentType,
 )
 from judgeval.constants import SPAN_LIFECYCLE_END_UPDATE_ID
-from
+from judgeval.common.api.json_encoder import json_encoder
 
 
 class TraceUsage(TraceUsageJudgmentType):
@@ -25,9 +22,9 @@ class TraceSpan(TraceSpanJudgmentType):
             "created_at": datetime.fromtimestamp(
                 self.created_at, tz=timezone.utc
             ).isoformat(),
-            "inputs":
-            "output":
-            "error":
+            "inputs": json_encoder(self.inputs),
+            "output": json_encoder(self.output),
+            "error": json_encoder(self.error),
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
@@ -37,7 +34,7 @@ class TraceSpan(TraceSpanJudgmentType):
             "agent_name": self.agent_name,
             "state_before": self.state_before,
             "state_after": self.state_after,
-            "additional_metadata":
+            "additional_metadata": json_encoder(self.additional_metadata),
             "update_id": self.update_id,
         }
 
@@ -80,120 +77,6 @@ class TraceSpan(TraceSpanJudgmentType):
         )
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
 
-    def _is_json_serializable(self, obj: Any) -> bool:
-        """Helper method to check if an object is JSON serializable."""
-        try:
-            orjson.dumps(obj)
-            return True
-        except (TypeError, OverflowError, ValueError):
-            return False
-
-    def safe_stringify(self, output, function_name):
-        """
-        Safely converts an object to a JSON-serializable structure, handling common object types intelligently.
-        """
-        # Handle Pydantic models
-        if hasattr(output, "model_dump"):
-            try:
-                return output.model_dump()
-            except Exception:
-                pass
-
-        # Handle LangChain messages and similar objects with content/type
-        if hasattr(output, "content") and hasattr(output, "type"):
-            try:
-                result = {"type": output.type, "content": output.content}
-                # Add additional fields if they exist
-                if hasattr(output, "additional_kwargs"):
-                    result["additional_kwargs"] = output.additional_kwargs
-                if hasattr(output, "response_metadata"):
-                    result["response_metadata"] = output.response_metadata
-                if hasattr(output, "name"):
-                    result["name"] = output.name
-                return result
-            except Exception:
-                pass
-
-        if hasattr(output, "dict"):
-            try:
-                return output.dict()
-            except Exception:
-                pass
-
-        if hasattr(output, "to_dict"):
-            try:
-                return output.to_dict()
-            except Exception:
-                pass
-
-        if hasattr(output, "__dataclass_fields__"):
-            try:
-                import dataclasses
-
-                return dataclasses.asdict(output)
-            except Exception:
-                pass
-
-        if hasattr(output, "__dict__"):
-            try:
-                return output.__dict__
-            except Exception:
-                pass
-
-        try:
-            return str(output)
-        except (TypeError, OverflowError, ValueError):
-            pass
-
-        try:
-            return repr(output)
-        except (TypeError, OverflowError, ValueError):
-            pass
-
-        return None
-
-    def _serialize_value(self, value: Any) -> Any:
-        """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
-        if value is None:
-            return None
-
-        recursion_limit = sys.getrecursionlimit()
-        recursion_limit = int(recursion_limit * 0.75)
-
-        def serialize_value(value, current_depth=0):
-            try:
-                if current_depth > recursion_limit:
-                    return {"error": "max_depth_reached: " + type(value).__name__}
-
-                if isinstance(value, BaseModel):
-                    return value.model_dump()
-                elif isinstance(value, dict):
-                    # Recursively serialize dictionary values
-                    return {
-                        k: serialize_value(v, current_depth + 1)
-                        for k, v in value.items()
-                    }
-                elif isinstance(value, (list, tuple)):
-                    # Recursively serialize list/tuple items
-                    return [serialize_value(item, current_depth + 1) for item in value]
-                else:
-                    try:
-                        orjson.dumps(value)
-                        return value
-                    except (TypeError, OverflowError, ValueError):
-                        # Fallback to safe stringification
-                        return self.safe_stringify(value, self.function)
-                    except Exception:
-                        return {"error": "Unable to serialize"}
-            except Exception:
-                return {"error": "Unable to serialize"}
-
-        # Start serialization with the top-level value
-        try:
-            return serialize_value(value, current_depth=0)
-        except Exception:
-            return {"error": "Unable to serialize"}
-
 
 class Trace(TraceJudgmentType):
     pass
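With these removals, TraceSpan no longer carries its own serialization fallbacks (pydantic model_dump, LangChain-style content/type objects, dataclasses, __dict__, str/repr); all of that is delegated to the shared json_encoder. The helper itself is not shown in this diff; a minimal sketch of the kind of fallback chain it presumably replaces, based on the deleted safe_stringify, could look like:

# Minimal sketch of a json_encoder-style fallback chain (assumed; the real helper
# lives in judgeval/common/api/json_encoder.py and is not shown in this diff).
import dataclasses
from typing import Any

def sketch_json_encoder(value: Any) -> Any:
    if hasattr(value, "model_dump"):                  # pydantic models
        return value.model_dump()
    if dataclasses.is_dataclass(value) and not isinstance(value, type):
        return dataclasses.asdict(value)              # dataclasses
    if isinstance(value, dict):
        return {k: sketch_json_encoder(v) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [sketch_json_encoder(v) for v in value]
    if isinstance(value, (str, int, float, bool)) or value is None:
        return value
    return str(value)                                 # last-resort stringification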
judgeval/data/trace_run.py CHANGED
@@ -3,6 +3,7 @@ from typing import List, Optional, Dict, Any, Union
 from judgeval.data import Trace
 from judgeval.scorers import APIScorerConfig, BaseScorer
 from judgeval.rules import Rule
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 
 class TraceRun(BaseModel):
@@ -26,7 +27,7 @@ class TraceRun(BaseModel):
     eval_name: Optional[str] = None
     traces: Optional[List[Trace]] = None
     scorers: List[Union[APIScorerConfig, BaseScorer]]
-    model: Optional[str] =
+    model: Optional[str] = DEFAULT_GPT_MODEL
     trace_span_id: Optional[str] = None
     append: Optional[bool] = False
     override: Optional[bool] = False
judgeval/dataset.py CHANGED
@@ -35,6 +35,7 @@ class Dataset:
         for e in examples:
             if isinstance(e, dict) and isinstance(e.get("data"), dict):
                 e.update(e.pop("data"))
+        judgeval_logger.info(f"Succesfully retrieved dataset {name}!")
         return cls(
             name=name,
             project_name=project_name,
@@ -68,6 +69,7 @@ class Dataset:
             traces=[t.model_dump() for t in traces],
             overwrite=overwrite,
         )
+        judgeval_logger.info(f"Succesfull created dataset {name}!")
         return cls(
             name=name,
             project_name=project_name,
judgeval/evaluation_run.py CHANGED
@@ -3,7 +3,7 @@ from pydantic import BaseModel, field_validator, Field
 
 from judgeval.data import Example
 from judgeval.scorers import BaseScorer, APIScorerConfig
-from judgeval.constants import ACCEPTABLE_MODELS
+from judgeval.constants import ACCEPTABLE_MODELS, DEFAULT_GPT_MODEL
 
 
 class EvaluationRun(BaseModel):
@@ -24,8 +24,9 @@ class EvaluationRun(BaseModel):
     eval_name: Optional[str] = Field(default=None, validate_default=True)
     examples: List[Example]
     scorers: List[Union[APIScorerConfig, BaseScorer]]
-    model: Optional[str] =
+    model: Optional[str] = DEFAULT_GPT_MODEL
     trace_span_id: Optional[str] = None
+    trace_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     override: Optional[bool] = False
     append: Optional[bool] = False
@@ -44,6 +45,9 @@ class EvaluationRun(BaseModel):
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
+        for item in v:
+            if not isinstance(item, Example):
+                raise ValueError(f"Item of type {type(item)} is not a Example")
         return v
 
     @field_validator("scorers", mode="before")
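The added loop makes EvaluationRun reject anything in examples that is not an Example instance at construction time, instead of failing later in the scoring pipeline. A small sketch of what the new validator enforces (not the library's own test code):

# Sketch of the check the new examples validator performs.
from judgeval.data import Example

def validate_examples(v):
    if not v:
        raise ValueError("Examples cannot be empty.")
    for item in v:
        if not isinstance(item, Example):
            raise ValueError(f"Item of type {type(item)} is not a Example")
    return v

# Passing a raw dict where an Example is expected now fails fast:
try:
    validate_examples([{"input": "not an Example instance"}])
except ValueError as err:
    print(err)  # -> Item of type <class 'dict'> is not a Example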
judgeval/judges/litellm_judge.py CHANGED
@@ -7,6 +7,7 @@ from judgeval.common.utils import (
     fetch_litellm_api_response,
 )
 from judgeval.common.logger import judgeval_logger
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
@@ -14,7 +15,7 @@ BASE_CONVERSATION = [
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str =
+    def __init__(self, model: str = DEFAULT_GPT_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
judgeval/judges/mixture_of_judges.py CHANGED
@@ -14,6 +14,7 @@ from judgeval.common.utils import (
     aget_chat_completion,
 )
 from judgeval.common.logger import judgeval_logger
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 
 def build_dynamic_mixture_prompt(
@@ -161,7 +162,7 @@ class MixtureOfJudges(JudgevalJudge):
             "LLAMA3_70B_INSTRUCT_TURBO",
             "MISTRAL_8x22B_INSTRUCT",
         ],
-        aggregator: str =
+        aggregator: str = DEFAULT_GPT_MODEL,
         **kwargs,
     ):
         """
judgeval/judges/utils.py CHANGED
@@ -7,6 +7,7 @@ from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
 from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
+from judgeval.constants import DEFAULT_GPT_MODEL
 from judgeval.constants import (
     TOGETHER_SUPPORTED_MODELS,
     JUDGMENT_SUPPORTED_MODELS,
@@ -30,7 +31,7 @@ def create_judge(
     If no model is provided, uses GPT4o as the default judge.
     """
     if model is None:  # default option
-        return LiteLLMJudge(model=
+        return LiteLLMJudge(model=DEFAULT_GPT_MODEL), True
     if not isinstance(model, (str, list, JudgevalJudge)):
         raise InvalidJudgeModelError(
             f"Model must be a string, list of strings, or a judgeval judge object. Got: {type(model)} instead."
judgeval/judgment_client.py CHANGED
@@ -2,9 +2,10 @@
 Implements the JudgmentClient to interact with the Judgment API.
 """
 
+from __future__ import annotations
 import os
 from uuid import uuid4
-from typing import Optional, List, Dict, Any, Union, Callable
+from typing import Optional, List, Dict, Any, Union, Callable, TYPE_CHECKING
 
 from judgeval.data import (
     ScoringResult,
@@ -28,7 +29,11 @@ from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
-
+
+
+if TYPE_CHECKING:
+    from judgeval.integrations.langgraph import JudgevalCallbackHandler
+from judgeval.constants import DEFAULT_GPT_MODEL
 
 
 class EvalRunRequestBody(BaseModel):
@@ -89,7 +94,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         tools: Optional[List[Dict[str, Any]]] = None,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_trace",
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         append: bool = False,
         override: bool = False,
     ) -> List[ScoringResult]:
@@ -127,7 +132,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         override: bool = False,
@@ -214,7 +219,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         self,
         examples: List[Example],
         scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
         override: bool = False,
@@ -255,7 +260,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
         traces: Optional[List[Trace]] = None,
         tools: Optional[List[Dict[str, Any]]] = None,
-        model: Optional[str] =
+        model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_test",
         eval_run_name: str = str(uuid4()),
         override: bool = False,
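Moving the JudgevalCallbackHandler import under TYPE_CHECKING, together with the new `from __future__ import annotations`, keeps the langgraph integration out of the client's runtime import graph while annotations like `Optional[Union[Tracer, JudgevalCallbackHandler]]` still type-check. A generic sketch of the same pattern (module and class names below are placeholders, not judgeval's):

# Generic sketch of the TYPE_CHECKING import pattern used above (placeholder names).
from __future__ import annotations  # annotations stay strings, so the name need not exist at runtime
from typing import TYPE_CHECKING, Optional

if TYPE_CHECKING:
    # Only seen by static type checkers; never executed at runtime, so the
    # optional dependency is not required just to import this module.
    from heavy_optional_dependency import CallbackHandler  # hypothetical module

def run(handler: Optional[CallbackHandler] = None) -> None:
    if handler is not None:
        # Real code would import lazily or duck-type here before using the handler.
        ...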