judgeval 0.0.52__py3-none-any.whl → 0.0.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. judgeval/common/logger.py +46 -199
  2. judgeval/common/s3_storage.py +2 -6
  3. judgeval/common/tracer.py +182 -262
  4. judgeval/common/utils.py +16 -36
  5. judgeval/constants.py +14 -20
  6. judgeval/data/__init__.py +0 -2
  7. judgeval/data/datasets/dataset.py +6 -10
  8. judgeval/data/datasets/eval_dataset_client.py +25 -27
  9. judgeval/data/example.py +5 -138
  10. judgeval/data/judgment_types.py +214 -0
  11. judgeval/data/result.py +7 -25
  12. judgeval/data/scorer_data.py +28 -40
  13. judgeval/data/scripts/fix_default_factory.py +23 -0
  14. judgeval/data/scripts/openapi_transform.py +123 -0
  15. judgeval/data/tool.py +3 -54
  16. judgeval/data/trace.py +31 -50
  17. judgeval/data/trace_run.py +3 -3
  18. judgeval/evaluation_run.py +16 -23
  19. judgeval/integrations/langgraph.py +11 -12
  20. judgeval/judges/litellm_judge.py +3 -6
  21. judgeval/judges/mixture_of_judges.py +8 -25
  22. judgeval/judges/together_judge.py +3 -6
  23. judgeval/judgment_client.py +22 -24
  24. judgeval/rules.py +7 -19
  25. judgeval/run_evaluation.py +79 -242
  26. judgeval/scorers/__init__.py +4 -20
  27. judgeval/scorers/agent_scorer.py +21 -0
  28. judgeval/scorers/api_scorer.py +28 -38
  29. judgeval/scorers/base_scorer.py +98 -0
  30. judgeval/scorers/example_scorer.py +19 -0
  31. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
  32. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
  34. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
  35. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
  36. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
  37. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
  38. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
  40. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
  41. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
  42. judgeval/scorers/score.py +45 -330
  43. judgeval/scorers/utils.py +6 -88
  44. judgeval/utils/file_utils.py +4 -6
  45. judgeval/version_check.py +3 -2
  46. {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/METADATA +6 -5
  47. judgeval-0.0.54.dist-info/RECORD +65 -0
  48. judgeval/data/custom_example.py +0 -19
  49. judgeval/scorers/judgeval_scorer.py +0 -177
  50. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
  51. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
  52. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
  53. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
  54. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
  55. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
  56. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
  57. judgeval/scorers/prompt_scorer.py +0 -296
  58. judgeval-0.0.52.dist-info/RECORD +0 -69
  59. {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/WHEEL +0 -0
  60. {judgeval-0.0.52.dist-info → judgeval-0.0.54.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/example.py CHANGED
@@ -2,15 +2,12 @@
 Classes for representing examples in a dataset.
 """

-from typing import Optional, Any, Dict, List, Union
-from uuid import uuid4
-from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
-from judgeval.data.tool import Tool
+from judgeval.data.judgment_types import ExampleJudgmentType


-class ExampleParams(Enum):
+class ExampleParams(str, Enum):
     INPUT = "input"
     ACTUAL_OUTPUT = "actual_output"
     EXPECTED_OUTPUT = "expected_output"
@@ -22,144 +19,14 @@ class ExampleParams(Enum):
     ADDITIONAL_METADATA = "additional_metadata"


-class Example(BaseModel):
-    input: Optional[Union[str, Dict[str, Any]]] = None
-    actual_output: Optional[Union[str, List[str]]] = None
-    expected_output: Optional[Union[str, List[str]]] = None
-    context: Optional[List[str]] = None
-    retrieval_context: Optional[List[str]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[Tool]] = None
-    name: Optional[str] = None
-    example_id: str = Field(default_factory=lambda: str(uuid4()))
-    example_index: Optional[int] = None
-    created_at: Optional[str] = None
-    trace_id: Optional[str] = None
+class Example(ExampleJudgmentType):
+    example_id: str = ""

     def __init__(self, **data):
-        if "example_id" not in data:
-            data["example_id"] = str(uuid4())
-        # Set timestamp if not provided
         if "created_at" not in data:
             data["created_at"] = datetime.now().isoformat()
         super().__init__(**data)
-
-    @field_validator("input", mode="before")
-    @classmethod
-    def validate_input(cls, v):
-        if v is not None:
-            if not isinstance(v, (str, dict)):
-                raise ValueError(
-                    f"Input must be a string or dictionary but got {v} of type {type(v)}"
-                )
-
-            # If it's a string, check that it's not empty
-            if isinstance(v, str) and not v:
-                raise ValueError(f"Input string must be non-empty but got '{v}'")
-
-            # If it's a dictionary, check that it's not empty
-            if isinstance(v, dict) and not v:
-                raise ValueError(f"Input dictionary must be non-empty but got {v}")
-
-        return v
-
-    @field_validator("actual_output", mode="before")
-    @classmethod
-    def validate_actual_output(cls, v):
-        if v is not None:
-            if not isinstance(v, (str, list)):
-                raise ValueError(
-                    f"Actual output must be a string or a list of strings but got {v} of type {type(v)}"
-                )
-            if isinstance(v, list) and not all(isinstance(item, str) for item in v):
-                raise ValueError(
-                    f"All items in actual_output must be strings but got {v}"
-                )
-        return v
-
-    @field_validator("expected_output", mode="before")
-    @classmethod
-    def validate_expected_output(cls, v):
-        if v is not None and not isinstance(v, (str, list)):
-            raise ValueError(
-                f"Expected output must be a string, a list of strings, or None but got {v} of type {type(v)}"
-            )
-        if isinstance(v, list) and not all(isinstance(item, str) for item in v):
-            raise ValueError(
-                f"All items in expected_output must be strings but got {v}"
-            )
-        return v
-
-    @field_validator("expected_tools")
-    @classmethod
-    def validate_expected_tools(cls, v):
-        if v is not None:
-            if not isinstance(v, list):
-                raise ValueError(
-                    f"Expected tools must be a list of Tools or None but got {v} of type {type(v)}"
-                )
-
-            # Check that each item in the list is a Tool
-            for i, item in enumerate(v):
-                if not isinstance(item, Tool):
-                    raise ValueError(
-                        f"Expected tools must be a list of Tools, but item at index {i} is {item} of type {type(item)}"
-                    )
-
-        return v
-
-    @field_validator("context", "retrieval_context", "tools_called", mode="before")
-    @classmethod
-    def validate_string_lists(cls, v, info):
-        field_name = info.field_name
-        if v is not None:
-            if not isinstance(v, list):
-                raise ValueError(
-                    f"{field_name} must be a list of strings or None but got {v} of type {type(v)}"
-                )
-            for i, item in enumerate(v):
-                if not isinstance(item, str):
-                    raise ValueError(
-                        f"All items in {field_name} must be strings but item at index {i} is {item} of type {type(item)}"
-                    )
-        return v
-
-    @field_validator("additional_metadata", mode="before")
-    @classmethod
-    def validate_additional_metadata(cls, v):
-        if v is not None and not isinstance(v, dict):
-            raise ValueError(
-                f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}"
-            )
-        return v
-
-    @field_validator("example_index", mode="before")
-    @classmethod
-    def validate_example_index(cls, v):
-        if v is not None and not isinstance(v, int):
-            raise ValueError(
-                f"Example index must be an integer or None but got {v} of type {type(v)}"
-            )
-        return v
-
-    @field_validator("created_at", mode="before")
-    @classmethod
-    def validate_created_at(cls, v):
-        if v is not None and not isinstance(v, str):
-            raise ValueError(
-                f"Timestamp must be a string or None but got {v} of type {type(v)}"
-            )
-        return v
-
-    @field_validator("trace_id", mode="before")
-    @classmethod
-    def validate_trace_id(cls, v):
-        if v is not None and not isinstance(v, str):
-            raise ValueError(
-                f"Trace ID must be a string or None but got {v} of type {type(v)}"
-            )
-        return v
+        self.example_id = None

     def to_dict(self):
         return {
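Illustration only, not part of the diff: after this change, Example keeps its constructor behavior (auto-filled created_at) but inherits its field definitions from the generated ExampleJudgmentType base instead of hand-written field validators. A minimal sketch of user-side construction, with made-up example values:

# Hypothetical usage sketch (values are invented); construction from user code is unchanged.
from judgeval.data import Example

example = Example(
    input="What is the capital of France?",
    actual_output="Paris",
    retrieval_context=["France's capital city is Paris."],
)
print(example.created_at)  # auto-filled by __init__ when not supplied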
judgeval/data/judgment_types.py ADDED
@@ -0,0 +1,214 @@
+# generated by datamodel-codegen:
+#   filename: openapi_new.json
+#   timestamp: 2025-07-12T17:11:33+00:00
+
+from __future__ import annotations
+
+from typing import Annotated, Any, Dict, List, Optional, Union
+
+from pydantic import BaseModel, Field
+
+
+class ValidationErrorJudgmentType(BaseModel):
+    loc: Annotated[List[Union[str, int]], Field(title="Location")]
+    msg: Annotated[str, Field(title="Message")]
+    type: Annotated[str, Field(title="Error Type")]
+
+
+class ScorerDataJudgmentType(BaseModel):
+    name: Annotated[str, Field(title="Name")]
+    threshold: Annotated[float, Field(title="Threshold")]
+    success: Annotated[bool, Field(title="Success")]
+    score: Annotated[Optional[float], Field(title="Score")] = None
+    reason: Annotated[Optional[str], Field(title="Reason")] = None
+    strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
+    evaluation_model: Annotated[
+        Optional[Union[List[str], str]], Field(title="Evaluation Model")
+    ] = None
+    error: Annotated[Optional[str], Field(title="Error")] = None
+    additional_metadata: Annotated[
+        Optional[Dict[str, Any]], Field(title="Additional Metadata")
+    ] = None
+
+
+class ScorerConfigJudgmentType(BaseModel):
+    score_type: Annotated[str, Field(title="Score Type")]
+    name: Annotated[Optional[str], Field(title="Name")] = None
+    threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+    strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+    required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = (
+        Field(default_factory=list)
+    )
+    kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
+
+
+class TraceUsageJudgmentType(BaseModel):
+    prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
+    completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
+    cache_creation_input_tokens: Annotated[
+        Optional[int], Field(title="Cache Creation Input Tokens")
+    ] = None
+    cache_read_input_tokens: Annotated[
+        Optional[int], Field(title="Cache Read Input Tokens")
+    ] = None
+    total_tokens: Annotated[Optional[int], Field(title="Total Tokens")] = None
+    prompt_tokens_cost_usd: Annotated[
+        Optional[float], Field(title="Prompt Tokens Cost Usd")
+    ] = None
+    completion_tokens_cost_usd: Annotated[
+        Optional[float], Field(title="Completion Tokens Cost Usd")
+    ] = None
+    total_cost_usd: Annotated[Optional[float], Field(title="Total Cost Usd")] = None
+    model_name: Annotated[Optional[str], Field(title="Model Name")] = None
+
+
+class ToolJudgmentType(BaseModel):
+    tool_name: Annotated[str, Field(title="Tool Name")]
+    parameters: Annotated[Optional[Dict[str, Any]], Field(title="Parameters")] = None
+    agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
+    result_dependencies: Annotated[
+        Optional[List[Dict[str, Any]]], Field(title="Result Dependencies")
+    ] = None
+    action_dependencies: Annotated[
+        Optional[List[Dict[str, Any]]], Field(title="Action Dependencies")
+    ] = None
+    require_all: Annotated[Optional[bool], Field(title="Require All")] = None
+
+
+class HTTPValidationErrorJudgmentType(BaseModel):
+    detail: Annotated[
+        Optional[List[ValidationErrorJudgmentType]], Field(title="Detail")
+    ] = None
+
+
+class TraceSpanJudgmentType(BaseModel):
+    span_id: Annotated[str, Field(title="Span Id")]
+    trace_id: Annotated[str, Field(title="Trace Id")]
+    function: Annotated[str, Field(title="Function")]
+    depth: Annotated[int, Field(title="Depth")]
+    created_at: Annotated[Any, Field(title="Created At")] = None
+    parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+    span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
+    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
+    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    output: Annotated[Any, Field(title="Output")] = None
+    usage: Optional[TraceUsageJudgmentType] = None
+    duration: Annotated[Optional[float], Field(title="Duration")] = None
+    annotation: Annotated[Optional[List[Dict[str, Any]]], Field(title="Annotation")] = (
+        None
+    )
+    expected_tools: Annotated[
+        Optional[List[ToolJudgmentType]], Field(title="Expected Tools")
+    ] = None
+    additional_metadata: Annotated[
+        Optional[Dict[str, Any]], Field(title="Additional Metadata")
+    ] = None
+    has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
+    agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
+    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
+        None
+    )
+    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
+    update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
+
+
+class ExampleJudgmentType(BaseModel):
+    input: Annotated[Optional[Union[str, Dict[str, Any]]], Field(title="Input")] = None
+    actual_output: Annotated[
+        Optional[Union[str, List[str]]], Field(title="Actual Output")
+    ] = None
+    expected_output: Annotated[
+        Optional[Union[str, List[str]]], Field(title="Expected Output")
+    ] = None
+    context: Annotated[Optional[List[str]], Field(title="Context")] = None
+    retrieval_context: Annotated[
+        Optional[List[str]], Field(title="Retrieval Context")
+    ] = None
+    additional_metadata: Annotated[
+        Optional[Dict[str, Any]], Field(title="Additional Metadata")
+    ] = None
+    tools_called: Annotated[Optional[List[str]], Field(title="Tools Called")] = Field(
+        default_factory=list
+    )
+    expected_tools: Annotated[
+        Optional[List[ToolJudgmentType]], Field(title="Expected Tools")
+    ] = Field(default_factory=list)
+    name: Annotated[Optional[str], Field(title="Name")] = None
+    example_id: Annotated[str, Field(title="Example Id")]
+    example_index: Annotated[Optional[int], Field(title="Example Index")] = None
+    created_at: Annotated[Optional[str], Field(title="Created At")] = None
+    trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+    dataset_id: Annotated[Optional[str], Field(title="Dataset Id")] = None
+
+
+class TraceJudgmentType(BaseModel):
+    trace_id: Annotated[str, Field(title="Trace Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    duration: Annotated[float, Field(title="Duration")]
+    trace_spans: Annotated[List[TraceSpanJudgmentType], Field(title="Trace Spans")]
+    overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
+    offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
+    rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] = Field(
+        default_factory=dict
+    )
+    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
+    customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+    tags: Annotated[Optional[List[str]], Field(title="Tags")] = Field(
+        default_factory=list
+    )
+    metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = Field(
+        default_factory=dict
+    )
+    update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
+
+
+class ScoringResultJudgmentType(BaseModel):
+    success: Annotated[bool, Field(title="Success")]
+    scorers_data: Annotated[
+        Optional[List[ScorerDataJudgmentType]], Field(title="Scorers Data")
+    ]
+    name: Annotated[Optional[str], Field(title="Name")] = None
+    data_object: Annotated[
+        Optional[Union[TraceSpanJudgmentType, ExampleJudgmentType]],
+        Field(title="Data Object"),
+    ] = None
+    trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+    run_duration: Annotated[Optional[float], Field(title="Run Duration")] = None
+
+
+class TraceRunJudgmentType(BaseModel):
+    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
+    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
+    traces: Annotated[List[TraceJudgmentType], Field(title="Traces")]
+    scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
+    model: Annotated[str, Field(title="Model")]
+    judgment_api_key: Annotated[Optional[str], Field(title="Judgment Api Key")] = None
+    append: Annotated[Optional[bool], Field(title="Append")] = False
+    override_existing_eval_run_name: Annotated[
+        Optional[bool], Field(title="Override Existing Eval Run Name")
+    ] = False
+    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+    tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
+
+
+class JudgmentEvalJudgmentType(BaseModel):
+    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
+    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
+    examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
+    scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
+    model: Annotated[str, Field(title="Model")]
+    judgment_api_key: Annotated[Optional[str], Field(title="Judgment Api Key")] = None
+    append: Annotated[Optional[bool], Field(title="Append")] = False
+    override_existing_eval_run_name: Annotated[
+        Optional[bool], Field(title="Override Existing Eval Run Name")
+    ] = False
+    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+
+
+class EvalResultsJudgmentType(BaseModel):
+    results: Annotated[List[ScoringResultJudgmentType], Field(title="Results")]
+    run: Annotated[
+        Union[TraceRunJudgmentType, JudgmentEvalJudgmentType], Field(title="Run")
+    ]
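A minimal sketch, not part of the diff: assuming Pydantic v2 (which the Annotated/Field style suggests), these generated models can validate and re-serialize payloads that mirror the OpenAPI schema they were generated from. The payload values below are invented.

# Hypothetical usage of a generated model (example values are invented).
from judgeval.data.judgment_types import ScorerDataJudgmentType

payload = {"name": "Faithfulness", "threshold": 0.7, "success": True, "score": 0.92}
scorer_data = ScorerDataJudgmentType.model_validate(payload)  # Pydantic v2 API
print(scorer_data.model_dump(exclude_none=True))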
judgeval/data/result.py CHANGED
@@ -1,11 +1,10 @@
-from typing import List, Optional, Union
-from judgeval.common.logger import debug
-from pydantic import BaseModel
-from judgeval.data import ScorerData, Example, CustomExample
+from typing import List, Union
+from judgeval.data import ScorerData, Example
 from judgeval.data.trace import TraceSpan
+from judgeval.data.judgment_types import ScoringResultJudgmentType


-class ScoringResult(BaseModel):
+class ScoringResult(ScoringResultJudgmentType):
     """
     A ScoringResult contains the output of one or more scorers applied to a single example.
     Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -14,23 +13,10 @@ class ScoringResult(BaseModel):
     success (bool): Whether the evaluation was successful.
         This means that all scorers applied to this example returned a success.
     scorer_data (List[ScorerData]): The scorers data for the evaluated example
-    data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, CustomExample (future), WorkflowRun (future)
+    data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, WorkflowRun (future)

     """

-    # Fields for scoring outputs
-    success: bool  # used for unit testing
-    scorers_data: Union[List[ScorerData], None]
-    name: Optional[str] = None
-
-    # The original example object that was used to create the ScoringResult
-    data_object: Optional[Union[TraceSpan, CustomExample, Example]] = None
-    trace_id: Optional[str] = None
-
-    # Additional fields for internal use
-    run_duration: Optional[float] = None
-    evaluation_cost: Optional[float] = None
-
     def to_dict(self) -> dict:
         """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
         return {
@@ -46,8 +32,7 @@ class ScoringResult(BaseModel):
             success={self.success}, \
             scorer_data={self.scorers_data}, \
             data_object={self.data_object}, \
-            run_duration={self.run_duration}, \
-            evaluation_cost={self.evaluation_cost})"
+            run_duration={self.run_duration})"


 def generate_scoring_result(
@@ -62,18 +47,15 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-    if data_object.name is not None:
+    if hasattr(data_object, "name") and data_object.name is not None:
         name = data_object.name
     else:
         name = "Test Case Placeholder"
-        debug(f"No name provided for example, using default name: {name}")
-    debug(f"Creating ScoringResult for: {name}")
     scoring_result = ScoringResult(
         name=name,
         data_object=data_object,
         success=success,
         scorers_data=scorers_data,
         run_duration=run_duration,
-        evaluation_cost=None,
     )
     return scoring_result
judgeval/data/scorer_data.py CHANGED
@@ -4,13 +4,12 @@ Implementation of the ScorerData class.
 ScorerData holds the information related to a single, completed Scorer evaluation run.
 """

-from typing import List, Union, Optional, Dict
-from pydantic import BaseModel
+from judgeval.data.judgment_types import ScorerDataJudgmentType
+from judgeval.scorers import BaseScorer
+from typing import List

-from judgeval.scorers import JudgevalScorer

-
-class ScorerData(BaseModel):
+class ScorerData(ScorerDataJudgmentType):
     """
     ScorerData holds the information related to a single, completed Scorer evaluation run.

@@ -20,18 +19,6 @@ class ScorerData(BaseModel):
     judge model(s).
     """

-    name: str
-    threshold: float
-    success: bool
-    score: Optional[float] = None
-    reason: Optional[str] = None
-    strict_mode: Optional[bool] = None
-    evaluation_model: Union[List[str], str] | None = None
-    error: Optional[str] = None
-    evaluation_cost: Union[float, None] = None
-    verbose_logs: Optional[str] = None
-    additional_metadata: Optional[Dict] = None
-
     def to_dict(self) -> dict:
         """Convert the ScorerData instance to a JSON-serializable dictionary."""
         return {
@@ -43,13 +30,11 @@ class ScorerData(BaseModel):
             "strict_mode": self.strict_mode,
             "evaluation_model": self.evaluation_model,
             "error": self.error,
-            "evaluation_cost": self.evaluation_cost,
-            "verbose_logs": self.verbose_logs,
             "additional_metadata": self.additional_metadata,
         }


-def create_scorer_data(scorer: JudgevalScorer) -> ScorerData:
+def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
     """
     After a `scorer` is run, it contains information about the example that was evaluated
     using the scorer. For example, after computing Faithfulness, the `scorer` object will contain
@@ -59,30 +44,33 @@ def create_scorer_data(scorer: JudgevalScorer) -> ScorerData:
     contains the output of the scorer run that can be exported to be logged as a part of
     the ScorerResult.
     """
-    if scorer.error is not None:  # error occurred during eval run
-        return ScorerData(
-            name=scorer.__name__,
+    scorers_result = list()
+
+    scorers_result.append(
+        ScorerData(
+            name=scorer.name,
             threshold=scorer.threshold,
-            score=None,
-            reason=None,
-            success=False,
-            strict_mode=scorer.strict_mode,
-            evaluation_model=scorer.evaluation_model,
-            error=scorer.error,
-            evaluation_cost=scorer.evaluation_cost,
-            verbose_logs=scorer.verbose_logs,
-        )
-    else:  # standard execution, no error
-        return ScorerData(
-            name=scorer.__name__,
             score=scorer.score,
-            threshold=scorer.threshold,
             reason=scorer.reason,
-            success=scorer._success_check(),
+            success=scorer.success,
             strict_mode=scorer.strict_mode,
             evaluation_model=scorer.evaluation_model,
-            error=None,
-            evaluation_cost=scorer.evaluation_cost,
-            verbose_logs=scorer.verbose_logs,
+            error=scorer.error,
             additional_metadata=scorer.additional_metadata,
         )
+    )
+    if hasattr(scorer, "internal_scorer") and scorer.internal_scorer is not None:
+        scorers_result.append(
+            ScorerData(
+                name=scorer.internal_scorer.name,
+                score=scorer.internal_scorer.score,
+                threshold=scorer.internal_scorer.threshold,
+                reason=scorer.internal_scorer.reason,
+                success=scorer.internal_scorer.success,
+                strict_mode=scorer.internal_scorer.strict_mode,
+                evaluation_model=scorer.internal_scorer.evaluation_model,
+                error=scorer.internal_scorer.error,
+                additional_metadata=scorer.internal_scorer.additional_metadata,
+            )
+        )
+    return scorers_result
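A hedged sketch of the caller side, with hypothetical variable names: create_scorer_data now returns a list (the scorer's own result plus, when present, its internal_scorer's result), so callers would collect results with extend rather than append.

# Hypothetical caller (all_scorer_data and completed_scorers are invented names).
all_scorer_data: List[ScorerData] = []
for scorer in completed_scorers:
    all_scorer_data.extend(create_scorer_data(scorer))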
judgeval/data/scripts/fix_default_factory.py ADDED
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+"""
+Post-process generated Pydantic models with default_factory defaults.
+"""
+
+import sys
+
+
+def fix_mutable_defaults(file_path: str) -> None:
+    """Fix mutable defaults in generated Pydantic models."""
+
+    with open(file_path, "r") as f:
+        content = f.read()
+
+    content = content.replace(" = {}", " = Field(default_factory=dict)")
+    content = content.replace(" = []", " = Field(default_factory=list)")
+    with open(file_path, "w") as f:
+        f.write(content)
+
+
+if __name__ == "__main__":
+    file_path = sys.argv[1]
+    fix_mutable_defaults(file_path)
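The script takes the path of a generated module as its only argument and rewrites it in place. An assumed usage sketch (the invocation and target path below are not taken from the package):

# Hypothetical invocation of the post-processing step after datamodel-codegen runs;
# both paths are assumptions for illustration only.
import subprocess

subprocess.run(
    ["python", "judgeval/data/scripts/fix_default_factory.py", "judgeval/data/judgment_types.py"],
    check=True,
)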