judgeval 0.0.52__py3-none-any.whl → 0.0.53__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.52.dist-info → judgeval-0.0.53.dist-info}/METADATA +2 -1
- judgeval-0.0.53.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.52.dist-info/RECORD +0 -69
- {judgeval-0.0.52.dist-info → judgeval-0.0.53.dist-info}/WHEEL +0 -0
- {judgeval-0.0.52.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/example.py
CHANGED
@@ -2,15 +2,12 @@
 Classes for representing examples in a dataset.
 """
 
-from typing import Optional, Any, Dict, List, Union
-from uuid import uuid4
-from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
-from judgeval.data.
+from judgeval.data.judgment_types import ExampleJudgmentType
 
 
-class ExampleParams(Enum):
+class ExampleParams(str, Enum):
     INPUT = "input"
     ACTUAL_OUTPUT = "actual_output"
     EXPECTED_OUTPUT = "expected_output"
@@ -22,144 +19,14 @@ class ExampleParams(Enum):
     ADDITIONAL_METADATA = "additional_metadata"
 
 
-class Example(
-
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[List[str]] = None
-    retrieval_context: Optional[List[str]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[Tool]] = None
-    name: Optional[str] = None
-    example_id: str = Field(default_factory=lambda: str(uuid4()))
-    example_index: Optional[int] = None
-    created_at: Optional[str] = None
-    trace_id: Optional[str] = None
+class Example(ExampleJudgmentType):
+    example_id: str = ""
 
     def __init__(self, **data):
-        if "example_id" not in data:
-            data["example_id"] = str(uuid4())
-        # Set timestamp if not provided
         if "created_at" not in data:
            data["created_at"] = datetime.now().isoformat()
         super().__init__(**data)
-
-    @field_validator("input", mode="before")
-    @classmethod
-    def validate_input(cls, v):
-        if v is not None:
-            if not isinstance(v, (str, dict)):
-                raise ValueError(
-                    f"Input must be a string or dictionary but got {v} of type {type(v)}"
-                )
-
-            # If it's a string, check that it's not empty
-            if isinstance(v, str) and not v:
-                raise ValueError(f"Input string must be non-empty but got '{v}'")
-
-            # If it's a dictionary, check that it's not empty
-            if isinstance(v, dict) and not v:
-                raise ValueError(f"Input dictionary must be non-empty but got {v}")
-
-        return v
-
-    @field_validator("actual_output", mode="before")
-    @classmethod
-    def validate_actual_output(cls, v):
-        if v is not None:
-            if not isinstance(v, (str, list)):
-                raise ValueError(
-                    f"Actual output must be a string or a list of strings but got {v} of type {type(v)}"
-                )
-            if isinstance(v, list) and not all(isinstance(item, str) for item in v):
-                raise ValueError(
-                    f"All items in actual_output must be strings but got {v}"
-                )
-        return v
-
-    @field_validator("expected_output", mode="before")
-    @classmethod
-    def validate_expected_output(cls, v):
-        if v is not None and not isinstance(v, (str, list)):
-            raise ValueError(
-                f"Expected output must be a string, a list of strings, or None but got {v} of type {type(v)}"
-            )
-        if isinstance(v, list) and not all(isinstance(item, str) for item in v):
-            raise ValueError(
-                f"All items in expected_output must be strings but got {v}"
-            )
-        return v
-
-    @field_validator("expected_tools")
-    @classmethod
-    def validate_expected_tools(cls, v):
-        if v is not None:
-            if not isinstance(v, list):
-                raise ValueError(
-                    f"Expected tools must be a list of Tools or None but got {v} of type {type(v)}"
-                )
-
-            # Check that each item in the list is a Tool
-            for i, item in enumerate(v):
-                if not isinstance(item, Tool):
-                    raise ValueError(
-                        f"Expected tools must be a list of Tools, but item at index {i} is {item} of type {type(item)}"
-                    )
-
-        return v
-
-    @field_validator("context", "retrieval_context", "tools_called", mode="before")
-    @classmethod
-    def validate_string_lists(cls, v, info):
-        field_name = info.field_name
-        if v is not None:
-            if not isinstance(v, list):
-                raise ValueError(
-                    f"{field_name} must be a list of strings or None but got {v} of type {type(v)}"
-                )
-            for i, item in enumerate(v):
-                if not isinstance(item, str):
-                    raise ValueError(
-                        f"All items in {field_name} must be strings but item at index {i} is {item} of type {type(item)}"
-                    )
-        return v
-
-    @field_validator("additional_metadata", mode="before")
-    @classmethod
-    def validate_additional_metadata(cls, v):
-        if v is not None and not isinstance(v, dict):
-            raise ValueError(
-                f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}"
-            )
-        return v
-
-    @field_validator("example_index", mode="before")
-    @classmethod
-    def validate_example_index(cls, v):
-        if v is not None and not isinstance(v, int):
-            raise ValueError(
-                f"Example index must be an integer or None but got {v} of type {type(v)}"
-            )
-        return v
-
-    @field_validator("created_at", mode="before")
-    @classmethod
-    def validate_created_at(cls, v):
-        if v is not None and not isinstance(v, str):
-            raise ValueError(
-                f"Timestamp must be a string or None but got {v} of type {type(v)}"
-            )
-        return v
-
-    @field_validator("trace_id", mode="before")
-    @classmethod
-    def validate_trace_id(cls, v):
-        if v is not None and not isinstance(v, str):
-            raise ValueError(
-                f"Trace ID must be a string or None but got {v} of type {type(v)}"
-            )
-        return v
+        self.example_id = None
 
     def to_dict(self):
         return {
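For orientation, a minimal sketch of constructing the reworked Example class, assuming judgeval 0.0.53 is installed; the field names come from the generated ExampleJudgmentType shown in the next file, and the values are illustrative only:

from judgeval.data.example import Example

# Fields such as input, actual_output, and expected_tools are now inherited from
# the generated ExampleJudgmentType rather than declared and validated on Example itself.
example = Example(
    input="What is the capital of France?",
    actual_output="Paris",
    expected_output="Paris",
)
print(example.created_at)  # filled in by __init__ when not supplied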
judgeval/data/judgment_types.py
ADDED
@@ -0,0 +1,214 @@
+# generated by datamodel-codegen:
+# filename: openapi_new.json
+# timestamp: 2025-07-12T17:11:33+00:00
+
+from __future__ import annotations
+
+from typing import Annotated, Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel, Field
+
+
+class ValidationErrorJudgmentType(BaseModel):
+    loc: Annotated[List[Union[str, int]], Field(title="Location")]
+    msg: Annotated[str, Field(title="Message")]
+    type: Annotated[str, Field(title="Error Type")]
+
+
+class ScorerDataJudgmentType(BaseModel):
+    name: Annotated[str, Field(title="Name")]
+    threshold: Annotated[float, Field(title="Threshold")]
+    success: Annotated[bool, Field(title="Success")]
+    score: Annotated[Optional[float], Field(title="Score")] = None
+    reason: Annotated[Optional[str], Field(title="Reason")] = None
+    strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
+    evaluation_model: Annotated[
+        Optional[Union[List[str], str]], Field(title="Evaluation Model")
+    ] = None
+    error: Annotated[Optional[str], Field(title="Error")] = None
+    additional_metadata: Annotated[
+        Optional[Dict[str, Any]], Field(title="Additional Metadata")
+    ] = None
+
+
+class ScorerConfigJudgmentType(BaseModel):
+    score_type: Annotated[str, Field(title="Score Type")]
+    name: Annotated[Optional[str], Field(title="Name")] = None
+    threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+    strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+    required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = (
+        Field(default_factory=list)
+    )
+    kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
+
+
+class TraceUsageJudgmentType(BaseModel):
+    prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
+    completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
+    cache_creation_input_tokens: Annotated[
+        Optional[int], Field(title="Cache Creation Input Tokens")
+    ] = None
+    cache_read_input_tokens: Annotated[
+        Optional[int], Field(title="Cache Read Input Tokens")
+    ] = None
+    total_tokens: Annotated[Optional[int], Field(title="Total Tokens")] = None
+    prompt_tokens_cost_usd: Annotated[
+        Optional[float], Field(title="Prompt Tokens Cost Usd")
+    ] = None
+    completion_tokens_cost_usd: Annotated[
+        Optional[float], Field(title="Completion Tokens Cost Usd")
+    ] = None
+    total_cost_usd: Annotated[Optional[float], Field(title="Total Cost Usd")] = None
+    model_name: Annotated[Optional[str], Field(title="Model Name")] = None
+
+
+class ToolJudgmentType(BaseModel):
+    tool_name: Annotated[str, Field(title="Tool Name")]
+    parameters: Annotated[Optional[Dict[str, Any]], Field(title="Parameters")] = None
+    agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
+    result_dependencies: Annotated[
+        Optional[List[Dict[str, Any]]], Field(title="Result Dependencies")
+    ] = None
+    action_dependencies: Annotated[
+        Optional[List[Dict[str, Any]]], Field(title="Action Dependencies")
+    ] = None
+    require_all: Annotated[Optional[bool], Field(title="Require All")] = None
+
+
+class HTTPValidationErrorJudgmentType(BaseModel):
+    detail: Annotated[
+        Optional[List[ValidationErrorJudgmentType]], Field(title="Detail")
+    ] = None
+
+
+class TraceSpanJudgmentType(BaseModel):
+    span_id: Annotated[str, Field(title="Span Id")]
+    trace_id: Annotated[str, Field(title="Trace Id")]
+    function: Annotated[str, Field(title="Function")]
+    depth: Annotated[int, Field(title="Depth")]
+    created_at: Annotated[Any, Field(title="Created At")] = None
+    parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+    span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
+    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
+    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    output: Annotated[Any, Field(title="Output")] = None
+    usage: Optional[TraceUsageJudgmentType] = None
+    duration: Annotated[Optional[float], Field(title="Duration")] = None
+    annotation: Annotated[Optional[List[Dict[str, Any]]], Field(title="Annotation")] = (
+        None
+    )
+    expected_tools: Annotated[
+        Optional[List[ToolJudgmentType]], Field(title="Expected Tools")
+    ] = None
+    additional_metadata: Annotated[
+        Optional[Dict[str, Any]], Field(title="Additional Metadata")
+    ] = None
+    has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
+    agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
+    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
+        None
+    )
+    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
+    update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
+
+
+class ExampleJudgmentType(BaseModel):
+    input: Annotated[Optional[Union[str, Dict[str, Any]]], Field(title="Input")] = None
+    actual_output: Annotated[
+        Optional[Union[str, List[str]]], Field(title="Actual Output")
+    ] = None
+    expected_output: Annotated[
+        Optional[Union[str, List[str]]], Field(title="Expected Output")
+    ] = None
+    context: Annotated[Optional[List[str]], Field(title="Context")] = None
+    retrieval_context: Annotated[
+        Optional[List[str]], Field(title="Retrieval Context")
+    ] = None
+    additional_metadata: Annotated[
+        Optional[Dict[str, Any]], Field(title="Additional Metadata")
+    ] = None
+    tools_called: Annotated[Optional[List[str]], Field(title="Tools Called")] = Field(
+        default_factory=list
+    )
+    expected_tools: Annotated[
+        Optional[List[ToolJudgmentType]], Field(title="Expected Tools")
+    ] = Field(default_factory=list)
+    name: Annotated[Optional[str], Field(title="Name")] = None
+    example_id: Annotated[str, Field(title="Example Id")]
+    example_index: Annotated[Optional[int], Field(title="Example Index")] = None
+    created_at: Annotated[Optional[str], Field(title="Created At")] = None
+    trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+    dataset_id: Annotated[Optional[str], Field(title="Dataset Id")] = None
+
+
+class TraceJudgmentType(BaseModel):
+    trace_id: Annotated[str, Field(title="Trace Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    duration: Annotated[float, Field(title="Duration")]
+    trace_spans: Annotated[List[TraceSpanJudgmentType], Field(title="Trace Spans")]
+    overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
+    offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
+    rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] = Field(
+        default_factory=dict
+    )
+    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
+    customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+    tags: Annotated[Optional[List[str]], Field(title="Tags")] = Field(
+        default_factory=list
+    )
+    metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = Field(
+        default_factory=dict
+    )
+    update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
+
+
+class ScoringResultJudgmentType(BaseModel):
+    success: Annotated[bool, Field(title="Success")]
+    scorers_data: Annotated[
+        Optional[List[ScorerDataJudgmentType]], Field(title="Scorers Data")
+    ]
+    name: Annotated[Optional[str], Field(title="Name")] = None
+    data_object: Annotated[
+        Optional[Union[TraceSpanJudgmentType, ExampleJudgmentType]],
+        Field(title="Data Object"),
+    ] = None
+    trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+    run_duration: Annotated[Optional[float], Field(title="Run Duration")] = None
+
+
+class TraceRunJudgmentType(BaseModel):
+    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
+    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
+    traces: Annotated[List[TraceJudgmentType], Field(title="Traces")]
+    scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
+    model: Annotated[str, Field(title="Model")]
+    judgment_api_key: Annotated[Optional[str], Field(title="Judgment Api Key")] = None
+    append: Annotated[Optional[bool], Field(title="Append")] = False
+    override_existing_eval_run_name: Annotated[
+        Optional[bool], Field(title="Override Existing Eval Run Name")
+    ] = False
+    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+    tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
+
+
+class JudgmentEvalJudgmentType(BaseModel):
+    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
+    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
+    examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
+    scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
+    model: Annotated[str, Field(title="Model")]
+    judgment_api_key: Annotated[Optional[str], Field(title="Judgment Api Key")] = None
+    append: Annotated[Optional[bool], Field(title="Append")] = False
+    override_existing_eval_run_name: Annotated[
+        Optional[bool], Field(title="Override Existing Eval Run Name")
+    ] = False
+    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+
+
+class EvalResultsJudgmentType(BaseModel):
+    results: Annotated[List[ScoringResultJudgmentType], Field(title="Results")]
+    run: Annotated[
+        Union[TraceRunJudgmentType, JudgmentEvalJudgmentType], Field(title="Run")
+    ]
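The generated classes are plain Pydantic models, so they can be instantiated and serialized directly. A minimal sketch with illustrative values, assuming judgeval 0.0.53 and Pydantic v2:

from judgeval.data.judgment_types import ScorerConfigJudgmentType

# threshold and strict_mode fall back to the generated defaults (0.5 and False),
# and required_params defaults to an empty list via default_factory.
config = ScorerConfigJudgmentType(score_type="faithfulness")
print(config.model_dump())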
judgeval/data/result.py
CHANGED
@@ -1,11 +1,10 @@
-from typing import List,
-from judgeval.
-from pydantic import BaseModel
-from judgeval.data import ScorerData, Example, CustomExample
+from typing import List, Union
+from judgeval.data import ScorerData, Example
 from judgeval.data.trace import TraceSpan
+from judgeval.data.judgment_types import ScoringResultJudgmentType
 
 
-class ScoringResult(
+class ScoringResult(ScoringResultJudgmentType):
     """
     A ScoringResult contains the output of one or more scorers applied to a single example.
     Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -14,23 +13,10 @@ class ScoringResult(BaseModel):
     success (bool): Whether the evaluation was successful.
         This means that all scorers applied to this example returned a success.
     scorer_data (List[ScorerData]): The scorers data for the evaluated example
-    data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example,
+    data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, WorkflowRun (future)
 
     """
 
-    # Fields for scoring outputs
-    success: bool  # used for unit testing
-    scorers_data: Union[List[ScorerData], None]
-    name: Optional[str] = None
-
-    # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
-    trace_id: Optional[str] = None
-
-    # Additional fields for internal use
-    run_duration: Optional[float] = None
-    evaluation_cost: Optional[float] = None
-
     def to_dict(self) -> dict:
         """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
         return {
@@ -46,8 +32,7 @@ class ScoringResult(BaseModel):
             success={self.success}, \
             scorer_data={self.scorers_data}, \
             data_object={self.data_object}, \
-            run_duration={self.run_duration}
-            evaluation_cost={self.evaluation_cost})"
+            run_duration={self.run_duration})"
 
 
 def generate_scoring_result(
@@ -62,18 +47,15 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-    if data_object.name is not None:
+    if hasattr(data_object, "name") and data_object.name is not None:
         name = data_object.name
     else:
         name = "Test Case Placeholder"
-        debug(f"No name provided for example, using default name: {name}")
-    debug(f"Creating ScoringResult for: {name}")
     scoring_result = ScoringResult(
         name=name,
         data_object=data_object,
         success=success,
         scorers_data=scorers_data,
        run_duration=run_duration,
-        evaluation_cost=None,
     )
     return scoring_result
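A minimal sketch of the slimmed-down result model, assuming judgeval 0.0.53; the values are illustrative, and evaluation_cost is intentionally absent because it is no longer a field:

from judgeval.data import ScorerData
from judgeval.data.result import ScoringResult

scorer_data = ScorerData(name="Faithfulness", threshold=0.7, success=True, score=0.9)
result = ScoringResult(
    name="demo example",
    success=True,
    scorers_data=[scorer_data],
    run_duration=1.2,
)
print(result.to_dict())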
judgeval/data/scorer_data.py
CHANGED
@@ -4,13 +4,12 @@ Implementation of the ScorerData class.
 ScorerData holds the information related to a single, completed Scorer evaluation run.
 """
 
-from
-from
+from judgeval.data.judgment_types import ScorerDataJudgmentType
+from judgeval.scorers import BaseScorer
+from typing import List
 
-from judgeval.scorers import JudgevalScorer
 
-
-class ScorerData(BaseModel):
+class ScorerData(ScorerDataJudgmentType):
     """
     ScorerData holds the information related to a single, completed Scorer evaluation run.
 
@@ -20,18 +19,6 @@ class ScorerData(BaseModel):
     judge model(s).
     """
 
-    name: str
-    threshold: float
-    success: bool
-    score: Optional[float] = None
-    reason: Optional[str] = None
-    strict_mode: Optional[bool] = None
-    evaluation_model: Union[List[str], str] | None = None
-    error: Optional[str] = None
-    evaluation_cost: Union[float, None] = None
-    verbose_logs: Optional[str] = None
-    additional_metadata: Optional[Dict] = None
-
     def to_dict(self) -> dict:
         """Convert the ScorerData instance to a JSON-serializable dictionary."""
         return {
@@ -43,13 +30,11 @@ class ScorerData(BaseModel):
             "strict_mode": self.strict_mode,
             "evaluation_model": self.evaluation_model,
             "error": self.error,
-            "evaluation_cost": self.evaluation_cost,
-            "verbose_logs": self.verbose_logs,
             "additional_metadata": self.additional_metadata,
         }
 
 
-def create_scorer_data(scorer:
+def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
     """
     After a `scorer` is run, it contains information about the example that was evaluated
     using the scorer. For example, after computing Faithfulness, the `scorer` object will contain
@@ -59,30 +44,33 @@ def create_scorer_data(scorer: JudgevalScorer) -> ScorerData:
     contains the output of the scorer run that can be exported to be logged as a part of
     the ScorerResult.
     """
-
-
-
+    scorers_result = list()
+
+    scorers_result.append(
+        ScorerData(
+            name=scorer.name,
             threshold=scorer.threshold,
-            score=None,
-            reason=None,
-            success=False,
-            strict_mode=scorer.strict_mode,
-            evaluation_model=scorer.evaluation_model,
-            error=scorer.error,
-            evaluation_cost=scorer.evaluation_cost,
-            verbose_logs=scorer.verbose_logs,
-        )
-    else:  # standard execution, no error
-        return ScorerData(
-            name=scorer.__name__,
             score=scorer.score,
-            threshold=scorer.threshold,
             reason=scorer.reason,
-            success=scorer.
+            success=scorer.success,
             strict_mode=scorer.strict_mode,
             evaluation_model=scorer.evaluation_model,
-            error=
-            evaluation_cost=scorer.evaluation_cost,
-            verbose_logs=scorer.verbose_logs,
+            error=scorer.error,
             additional_metadata=scorer.additional_metadata,
         )
+    )
+    if hasattr(scorer, "internal_scorer") and scorer.internal_scorer is not None:
+        scorers_result.append(
+            ScorerData(
+                name=scorer.internal_scorer.name,
+                score=scorer.internal_scorer.score,
+                threshold=scorer.internal_scorer.threshold,
+                reason=scorer.internal_scorer.reason,
+                success=scorer.internal_scorer.success,
+                strict_mode=scorer.internal_scorer.strict_mode,
+                evaluation_model=scorer.internal_scorer.evaluation_model,
+                error=scorer.internal_scorer.error,
+                additional_metadata=scorer.internal_scorer.additional_metadata,
+            )
+        )
+    return scorers_result
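create_scorer_data now returns a list so that a wrapper scorer and, when present, its internal scorer are both reported. A rough sketch of the call pattern, using a stand-in object in place of an already-run BaseScorer (attribute values are illustrative):

from types import SimpleNamespace

from judgeval.data.scorer_data import create_scorer_data

# Stand-in for an already-run scorer; a real BaseScorer instance would normally be passed.
scorer = SimpleNamespace(
    name="Faithfulness",
    threshold=0.7,
    score=0.92,
    reason="claims are grounded in the retrieval context",
    success=True,
    strict_mode=False,
    evaluation_model="example-judge-model",
    error=None,
    additional_metadata=None,
    internal_scorer=None,  # a populated internal scorer would yield a second entry
)

for scorer_data in create_scorer_data(scorer):
    print(scorer_data.to_dict())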
judgeval/data/scripts/fix_default_factory.py
ADDED
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+"""
+Post-process generated Pydantic models with default_factory defaults.
+"""
+
+import sys
+
+
+def fix_mutable_defaults(file_path: str) -> None:
+    """Fix mutable defaults in generated Pydantic models."""
+
+    with open(file_path, "r") as f:
+        content = f.read()
+
+    content = content.replace(" = {}", " = Field(default_factory=dict)")
+    content = content.replace(" = []", " = Field(default_factory=list)")
+    with open(file_path, "w") as f:
+        f.write(content)
+
+
+if __name__ == "__main__":
+    file_path = sys.argv[1]
+    fix_mutable_defaults(file_path)
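A minimal usage sketch for the post-processing script, assuming it is pointed at the generated models file; the path is illustrative and the script rewrites the target file in place:

import subprocess

# Equivalent to the script's intended CLI use: the target file is passed as the first argument.
subprocess.run(
    [
        "python",
        "judgeval/data/scripts/fix_default_factory.py",
        "judgeval/data/judgment_types.py",
    ],
    check=True,
)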