judgeval 0.0.44__py3-none-any.whl → 0.0.46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +5 -4
- judgeval/clients.py +6 -6
- judgeval/common/__init__.py +7 -2
- judgeval/common/exceptions.py +2 -3
- judgeval/common/logger.py +74 -49
- judgeval/common/s3_storage.py +30 -23
- judgeval/common/tracer.py +1273 -939
- judgeval/common/utils.py +416 -244
- judgeval/constants.py +73 -61
- judgeval/data/__init__.py +1 -1
- judgeval/data/custom_example.py +3 -2
- judgeval/data/datasets/dataset.py +80 -54
- judgeval/data/datasets/eval_dataset_client.py +131 -181
- judgeval/data/example.py +67 -43
- judgeval/data/result.py +11 -9
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +25 -16
- judgeval/data/trace.py +57 -29
- judgeval/data/trace_run.py +5 -11
- judgeval/evaluation_run.py +22 -82
- judgeval/integrations/langgraph.py +546 -184
- judgeval/judges/base_judge.py +1 -2
- judgeval/judges/litellm_judge.py +33 -11
- judgeval/judges/mixture_of_judges.py +128 -78
- judgeval/judges/together_judge.py +22 -9
- judgeval/judges/utils.py +14 -5
- judgeval/judgment_client.py +259 -271
- judgeval/rules.py +169 -142
- judgeval/run_evaluation.py +462 -305
- judgeval/scorers/api_scorer.py +20 -11
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorer.py +77 -58
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +46 -15
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +12 -11
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +7 -5
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +5 -2
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +2 -1
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +17 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +3 -2
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +8 -9
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +5 -5
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +5 -2
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +9 -10
- judgeval/scorers/prompt_scorer.py +48 -37
- judgeval/scorers/score.py +86 -53
- judgeval/scorers/utils.py +11 -7
- judgeval/tracer/__init__.py +1 -1
- judgeval/utils/alerts.py +23 -12
- judgeval/utils/{data_utils.py → file_utils.py} +5 -9
- judgeval/utils/requests.py +29 -0
- judgeval/version_check.py +5 -2
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/METADATA +79 -135
- judgeval-0.0.46.dist-info/RECORD +69 -0
- judgeval-0.0.44.dist-info/RECORD +0 -68
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/WHEEL +0 -0
- {judgeval-0.0.44.dist-info → judgeval-0.0.46.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/result.py
CHANGED
@@ -1,6 +1,5 @@
-from
-from
-from judgeval.common.logger import debug, error
+from typing import List, Optional, Union
+from judgeval.common.logger import debug
 from pydantic import BaseModel
 from judgeval.data import ScorerData, Example, CustomExample
 from judgeval.data.trace import TraceSpan
@@ -12,13 +11,14 @@ class ScoringResult(BaseModel):
     Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)

     Args:
-        success (bool): Whether the evaluation was successful.
+        success (bool): Whether the evaluation was successful.
             This means that all scorers applied to this example returned a success.
         scorer_data (List[ScorerData]): The scorers data for the evaluated example
         data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)
-
+
     """
-
+
+    # Fields for scoring outputs
     success: bool # used for unit testing
     scorers_data: Union[List[ScorerData], None]
     name: Optional[str] = None
@@ -26,16 +26,18 @@ class ScoringResult(BaseModel):
     # The original example object that was used to create the ScoringResult
     data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
     trace_id: Optional[str] = None
-
+
     # Additional fields for internal use
     run_duration: Optional[float] = None
     evaluation_cost: Optional[float] = None
-
+
     def to_dict(self) -> dict:
         """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
         return {
             "success": self.success,
-            "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data]
+            "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data]
+            if self.scorers_data
+            else None,
             "data_object": self.data_object.to_dict() if self.data_object else None,
         }

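
Note on the to_dict change above: serializing scorers_data is now guarded, so a result with no scorer data serializes to None rather than failing when the list is iterated. A minimal, self-contained sketch of the same guard pattern (the ResultSketch/ItemSketch models are illustrative, not the judgeval classes):

from typing import List, Optional
from pydantic import BaseModel


class ItemSketch(BaseModel):
    name: str
    score: float

    def to_dict(self) -> dict:
        return {"name": self.name, "score": self.score}


class ResultSketch(BaseModel):
    success: bool
    items: Optional[List[ItemSketch]] = None

    def to_dict(self) -> dict:
        # Serialize the optional list only when it is present, mirroring the
        # None-guard added to ScoringResult.to_dict in this release.
        return {
            "success": self.success,
            "items": [item.to_dict() for item in self.items] if self.items else None,
        }


print(ResultSketch(success=True).to_dict())
# {'success': True, 'items': None}
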
judgeval/data/scorer_data.py
CHANGED
@@ -9,6 +9,7 @@ from pydantic import BaseModel

 from judgeval.scorers import JudgevalScorer

+
 class ScorerData(BaseModel):
     """
     ScorerData holds the information related to a single, completed Scorer evaluation run.
@@ -18,13 +19,14 @@ class ScorerData(BaseModel):
     information surrounding the evaluation run such as the claims and verdicts generated by the
     judge model(s).
     """
+
     name: str
     threshold: float
     success: bool
     score: Optional[float] = None
     reason: Optional[str] = None
     strict_mode: Optional[bool] = None
-    evaluation_model: Union[List[str], str] = None
+    evaluation_model: Union[List[str], str] | None = None
     error: Optional[str] = None
     evaluation_cost: Union[float, None] = None
     verbose_logs: Optional[str] = None
@@ -43,7 +45,7 @@ class ScorerData(BaseModel):
             "error": self.error,
             "evaluation_cost": self.evaluation_cost,
             "verbose_logs": self.verbose_logs,
-            "additional_metadata": self.additional_metadata
+            "additional_metadata": self.additional_metadata,
         }

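
The new evaluation_model annotation above mixes typing.Union with the PEP 604 `|` operator, which only works on Python 3.10+ (older interpreters raise TypeError when `|` is applied to a typing.Union object). A small sketch of the equivalent Optional[...] spelling (illustrative model, not judgeval code):

from typing import List, Optional, Union
from pydantic import BaseModel


class ScorerDataSketch(BaseModel):
    # `Union[List[str], str] | None` (as written in the diff) requires Python 3.10+.
    # This Optional[...] spelling is equivalent and also works on older interpreters.
    evaluation_model: Optional[Union[List[str], str]] = None


print(ScorerDataSketch(evaluation_model=["model-a", "model-b"]).evaluation_model)
print(ScorerDataSketch().evaluation_model)  # None
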
judgeval/data/tool.py
CHANGED
@@ -2,6 +2,7 @@ from pydantic import BaseModel, field_validator
 from typing import Dict, Any, Optional, List
 import warnings

+
 class Tool(BaseModel):
     tool_name: str
     parameters: Optional[Dict[str, Any]] = None
@@ -9,39 +10,47 @@ class Tool(BaseModel):
     result_dependencies: Optional[List[Dict[str, Any]]] = None
     action_dependencies: Optional[List[Dict[str, Any]]] = None
     require_all: Optional[bool] = None
-
-    @field_validator(
+
+    @field_validator("tool_name")
     def validate_tool_name(cls, v):
         if not v:
             warnings.warn("Tool name is empty or None", UserWarning)
         return v
-
-    @field_validator(
+
+    @field_validator("parameters")
     def validate_parameters(cls, v):
         if v is not None and not isinstance(v, dict):
-            warnings.warn(
+            warnings.warn(
+                f"Parameters should be a dictionary, got {type(v)}", UserWarning
+            )
         return v
-
-    @field_validator(
+
+    @field_validator("agent_name")
     def validate_agent_name(cls, v):
         if v is not None and not isinstance(v, str):
             warnings.warn(f"Agent name should be a string, got {type(v)}", UserWarning)
         return v
-
-    @field_validator(
+
+    @field_validator("result_dependencies")
     def validate_result_dependencies(cls, v):
         if v is not None and not isinstance(v, list):
-            warnings.warn(
+            warnings.warn(
+                f"Result dependencies should be a list, got {type(v)}", UserWarning
+            )
        return v
-
-    @field_validator(
+
+    @field_validator("action_dependencies")
     def validate_action_dependencies(cls, v):
         if v is not None and not isinstance(v, list):
-            warnings.warn(
+            warnings.warn(
+                f"Action dependencies should be a list, got {type(v)}", UserWarning
+            )
         return v

-    @field_validator(
+    @field_validator("require_all")
     def validate_require_all(cls, v):
         if v is not None and not isinstance(v, bool):
-            warnings.warn(
-
+            warnings.warn(
+                f"Require all should be a boolean, got {type(v)}", UserWarning
+            )
+        return v
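
The Tool validators above warn instead of raising, so malformed tool metadata still constructs the model. A minimal sketch of that warn-don't-raise field_validator pattern with pydantic v2 (hypothetical ToolSketch model, not the shipped class):

import warnings
from typing import Any, Dict, Optional
from pydantic import BaseModel, field_validator


class ToolSketch(BaseModel):
    tool_name: str
    parameters: Optional[Dict[str, Any]] = None

    @field_validator("tool_name")
    def validate_tool_name(cls, v):
        # Warn rather than raise, so an empty name still constructs the model.
        if not v:
            warnings.warn("Tool name is empty or None", UserWarning)
        return v


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    tool = ToolSketch(tool_name="")  # emits a UserWarning instead of failing validation
    print(tool.tool_name == "", [str(w.message) for w in caught])
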
judgeval/data/trace.py
CHANGED
@@ -1,10 +1,12 @@
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from typing import Optional, Dict, Any, List
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.data.tool import Tool
 import json
+import sys
 from datetime import datetime, timezone

+
 class TraceUsage(BaseModel):
     prompt_tokens: Optional[int] = None
     completion_tokens: Optional[int] = None
@@ -14,6 +16,7 @@ class TraceUsage(BaseModel):
     total_cost_usd: Optional[float] = None
     model_name: Optional[str] = None

+
 class TraceSpan(BaseModel):
     span_id: str
     trace_id: str
@@ -41,11 +44,15 @@ class TraceSpan(BaseModel):
             "span_id": self.span_id,
             "trace_id": self.trace_id,
             "depth": self.depth,
-            "created_at": datetime.fromtimestamp(
+            "created_at": datetime.fromtimestamp(
+                self.created_at, tz=timezone.utc
+            ).isoformat(),
             "inputs": self._serialize_value(self.inputs),
             "output": self._serialize_value(self.output),
             "error": self._serialize_value(self.error),
-            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs]
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs]
+            if self.evaluation_runs
+            else [],
             "parent_span_id": self.parent_span_id,
             "function": self.function,
             "duration": self.duration,
@@ -55,13 +62,15 @@ class TraceSpan(BaseModel):
             "agent_name": self.agent_name,
             "state_before": self.state_before,
             "state_after": self.state_after,
-            "additional_metadata": self._serialize_value(self.additional_metadata)
+            "additional_metadata": self._serialize_value(self.additional_metadata),
         }
-
+
     def print_span(self):
         """Print the span with proper formatting and parent relationship information."""
         indent = " " * self.depth
-        parent_info =
+        parent_info = (
+            f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        )
         print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")

     def _is_json_serializable(self, obj: Any) -> bool:
@@ -80,38 +89,56 @@ class TraceSpan(BaseModel):
             return str(output)
         except (TypeError, OverflowError, ValueError):
             pass
-
+
         try:
             return repr(output)
         except (TypeError, OverflowError, ValueError):
             pass
         return None
-
+
     def _serialize_value(self, value: Any) -> Any:
         """Helper method to deep serialize a value safely supporting Pydantic Models / regular PyObjects."""
         if value is None:
             return None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+        recursion_limit = sys.getrecursionlimit()
+        recursion_limit = int(recursion_limit * 0.75)
+
+        def serialize_value(value, current_depth=0):
+            try:
+                if current_depth > recursion_limit:
+                    return {"error": "max_depth_reached: " + type(value).__name__}
+
+                if isinstance(value, BaseModel):
+                    return value.model_dump()
+                elif isinstance(value, dict):
+                    # Recursively serialize dictionary values
+                    return {
+                        k: serialize_value(v, current_depth + 1)
+                        for k, v in value.items()
+                    }
+                elif isinstance(value, (list, tuple)):
+                    # Recursively serialize list/tuple items
+                    return [serialize_value(item, current_depth + 1) for item in value]
+                else:
+                    # Try direct JSON serialization first
+                    try:
+                        json.dumps(value)
+                        return value
+                    except (TypeError, OverflowError, ValueError):
+                        # Fallback to safe stringification
+                        return self.safe_stringify(value, self.function)
+                    except Exception:
+                        return {"error": "Unable to serialize"}
+            except Exception:
+                return {"error": "Unable to serialize"}

         # Start serialization with the top-level value
-
+        try:
+            return serialize_value(value, current_depth=0)
+        except Exception:
+            return {"error": "Unable to serialize"}
+

 class Trace(BaseModel):
     trace_id: str
@@ -121,6 +148,7 @@ class Trace(BaseModel):
     trace_spans: List[TraceSpan]
     overwrite: bool = False
     offline_mode: bool = False
-    rules:
+    rules: Dict[str, Any] = Field(default_factory=dict)
     has_notification: Optional[bool] = False
-
+    customer_id: Optional[str] = None
+    tags: List[str] = Field(default_factory=list)
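
The new TraceSpan._serialize_value above replaces the previous implementation with a depth-limited recursive walk: the depth cap is 75% of sys.getrecursionlimit(), pydantic models are dumped, dicts/lists/tuples are recursed, and anything that is not JSON-serializable falls back to stringification. A standalone sketch of the same pattern (function and model names here are illustrative, not the judgeval API):

import json
import sys
from typing import Any

from pydantic import BaseModel

# Cap traversal depth below the interpreter's recursion limit, as in the new
# TraceSpan._serialize_value above (75% of sys.getrecursionlimit()).
MAX_DEPTH = int(sys.getrecursionlimit() * 0.75)


def serialize(value: Any, depth: int = 0) -> Any:
    """Best-effort JSON-safe serialization of nested pydantic models and containers."""
    if depth > MAX_DEPTH:
        return {"error": "max_depth_reached: " + type(value).__name__}
    if isinstance(value, BaseModel):
        return value.model_dump()
    if isinstance(value, dict):
        return {k: serialize(v, depth + 1) for k, v in value.items()}
    if isinstance(value, (list, tuple)):
        return [serialize(item, depth + 1) for item in value]
    try:
        json.dumps(value)          # already JSON-serializable?
        return value
    except (TypeError, OverflowError, ValueError):
        return repr(value)         # fallback stringification


class Usage(BaseModel):
    prompt_tokens: int = 0


print(serialize({"usage": Usage(prompt_tokens=3), "raw": {1, 2}}))
# {'usage': {'prompt_tokens': 3}, 'raw': '{1, 2}'}
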
judgeval/data/trace_run.py
CHANGED
@@ -1,22 +1,20 @@
 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Dict, Any, Union
 from judgeval.data import Trace
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
-from judgeval.judges import JudgevalJudge
 from judgeval.rules import Rule


 class TraceRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
-
-    Args:
+
+    Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
         traces (List[Trace]): The traces to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
-        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
         rules (Optional[List[Rule]]): Rules to evaluate against scoring results
@@ -24,16 +22,12 @@ class TraceRun(BaseModel):
         tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
     """

-    # The user will specify whether they want log_results when they call run_eval
-    log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
     traces: Optional[List[Trace]] = None
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Optional[
-    aggregator: Optional[str] = None
-    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = "gpt-4.1"
     trace_span_id: Optional[str] = None
     append: Optional[bool] = False
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
@@ -43,4 +37,4 @@ class TraceRun(BaseModel):
     tools: Optional[List[Dict[str, Any]]] = None

     class Config:
-        arbitrary_types_allowed = True
+        arbitrary_types_allowed = True
judgeval/evaluation_run.py
CHANGED
@@ -1,144 +1,84 @@
-from typing import List, Optional,
+from typing import List, Optional, Union
 from pydantic import BaseModel, field_validator, Field

 from judgeval.data import Example, CustomExample
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from judgeval.constants import ACCEPTABLE_MODELS
-
-from judgeval.judges import JudgevalJudge
-from judgeval.rules import Rule
+

 class EvaluationRun(BaseModel):
     """
     Stores example and evaluation scorers together for running an eval task
-
-    Args:
+
+    Args:
         project_name (str): The name of the project the evaluation results belong to
         eval_name (str): A name for this evaluation run
         examples (Union[List[Example], List[CustomExample]]): The examples to evaluate
         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
         model (str): The model used as a judge when using LLM as a Judge
-        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
         metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
         judgment_api_key (Optional[str]): The API key for running evaluations on the Judgment API
-        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
     """

-    # The user will specify whether they want log_results when they call run_eval
-    log_results: bool = False # NOTE: log_results has to be set first because it is used to validate project_name and eval_name
     organization_id: Optional[str] = None
     project_name: Optional[str] = Field(default=None, validate_default=True)
     eval_name: Optional[str] = Field(default=None, validate_default=True)
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Optional[
-    aggregator: Optional[str] = Field(default=None, validate_default=True)
-    metadata: Optional[Dict[str, Any]] = None
+    model: Optional[str] = "gpt-4.1"
     trace_span_id: Optional[str] = None
     # API Key will be "" until user calls client.run_eval(), then API Key will be set
     judgment_api_key: Optional[str] = ""
     override: Optional[bool] = False
     append: Optional[bool] = False
-
-
+
     def model_dump(self, **kwargs):
         data = super().model_dump(**kwargs)

         data["scorers"] = [
-            scorer.to_dict()
-
+            scorer.to_dict()
+            if hasattr(scorer, "to_dict")
+            else scorer.model_dump()
+            if hasattr(scorer, "model_dump")
             else {"score_type": scorer.score_type, "threshold": scorer.threshold}
             for scorer in self.scorers
         ]

-        if self.rules:
-            # Process rules to ensure proper serialization
-            data["rules"] = [rule.model_dump() for rule in self.rules]
-
         return data

-    @field_validator(
-    def validate_log_results(cls, v):
-        if not isinstance(v, bool):
-            raise ValueError(f"log_results must be a boolean. Received {v} of type {type(v)}")
-        return v
-
-    @field_validator('project_name')
-    def validate_project_name(cls, v, values):
-        if values.data.get('log_results', False) and not v:
-            debug("No project name provided when log_results is True")
-            error("Validation failed: Project name required when logging results")
-            raise ValueError("Project name is required when log_results is True. Please include the project_name argument.")
-        return v
-
-    @field_validator('eval_name')
-    def validate_eval_name(cls, v, values):
-        if values.data.get('log_results', False) and not v:
-            debug("No eval name provided when log_results is True")
-            error("Validation failed: Eval name required when logging results")
-            raise ValueError("Eval name is required when log_results is True. Please include the eval_run_name argument.")
-        return v
-
-    @field_validator('examples')
+    @field_validator("examples")
     def validate_examples(cls, v):
         if not v:
             raise ValueError("Examples cannot be empty.")
-
+
         first_type = type(v[0])
         if first_type not in (Example, CustomExample):
             raise ValueError(f"Invalid type for Example/CustomExample: {first_type}")
         if not all(isinstance(ex, first_type) for ex in v):
-            raise ValueError(
-
+            raise ValueError(
+                "All examples must be of the same type, either all Example or all CustomExample."
+            )
+
         return v

-    @field_validator(
+    @field_validator("scorers")
     def validate_scorers(cls, v):
         if not v:
             raise ValueError("Scorers cannot be empty.")
         return v

-    @field_validator(
+    @field_validator("model")
     def validate_model(cls, v, values):
         if not v:
             raise ValueError("Model cannot be empty.")
-
-        # Check if model is a judgevalJudge
-        if isinstance(v, JudgevalJudge):
-            # Verify all scorers are JudgevalScorer when using judgevalJudge
-            scorers = values.data.get('scorers', [])
-            if not all(isinstance(s, JudgevalScorer) for s in scorers):
-                raise ValueError("When using a judgevalJudge model, all scorers must be JudgevalScorer type")
-            return v
-
+
         # Check if model is string or list of strings
         if isinstance(v, str):
             if v not in ACCEPTABLE_MODELS:
-                raise ValueError(
-
-
-        if isinstance(v, list):
-            if not all(isinstance(m, str) for m in v):
-                raise ValueError("When providing a list of models, all elements must be strings")
-            for m in v:
-                if m not in ACCEPTABLE_MODELS:
-                    raise ValueError(f"Model name {m} not recognized. Please select a valid model name.")
+                raise ValueError(
+                    f"Model name {v} not recognized. Please select a valid model name.)"
+                )
         return v
-        raise ValueError(f"Model must be one of: string, list of strings, or JudgevalJudge instance. Received type {type(v)}.")

-    @field_validator('aggregator', mode='before')
-    def validate_aggregator(cls, v, values):
-        model = values.data.get('model')
-        if isinstance(model, list) and v is None:
-            raise ValueError("Aggregator cannot be empty.")
-
-        if isinstance(model, list) and not isinstance(v, str):
-            raise ValueError("Aggregator must be a string if provided.")
-
-        if v is not None and v not in ACCEPTABLE_MODELS:
-            raise ValueError(f"Model name {v} not recognized.")
-
-        return v
-
     class Config:
         arbitrary_types_allowed = True
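
Across EvaluationRun and TraceRun, the judge model field is now a single string defaulting to "gpt-4.1" and checked against ACCEPTABLE_MODELS; the list-of-models branch, the aggregator field and its validator, and the JudgevalJudge path are removed. A minimal sketch of that simplified validation (the ACCEPTABLE_MODELS set below is an illustrative stand-in, not the real constant):

from typing import Optional
from pydantic import BaseModel, field_validator

# Stand-in for judgeval.constants.ACCEPTABLE_MODELS (illustrative values only).
ACCEPTABLE_MODELS = {"gpt-4.1", "gpt-4o-mini"}


class RunConfigSketch(BaseModel):
    # Same shape as the simplified EvaluationRun/TraceRun field: one judge-model
    # string with a default; no aggregator, no list of models, no judge objects.
    model: Optional[str] = "gpt-4.1"

    @field_validator("model")
    def validate_model(cls, v):
        if not v:
            raise ValueError("Model cannot be empty.")
        if v not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model name {v} not recognized.")
        return v


print(RunConfigSketch().model)  # gpt-4.1
try:
    RunConfigSketch(model="not-a-model")
except Exception as exc:
    print(type(exc).__name__)  # ValidationError
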