judgeval 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +139 -12
- judgeval/api/__init__.py +501 -0
- judgeval/api/api_types.py +344 -0
- judgeval/cli.py +2 -4
- judgeval/constants.py +10 -26
- judgeval/data/evaluation_run.py +49 -26
- judgeval/data/example.py +2 -2
- judgeval/data/judgment_types.py +266 -82
- judgeval/data/result.py +4 -5
- judgeval/data/scorer_data.py +4 -2
- judgeval/data/tool.py +2 -2
- judgeval/data/trace.py +7 -50
- judgeval/data/trace_run.py +7 -4
- judgeval/{dataset.py → dataset/__init__.py} +43 -28
- judgeval/env.py +67 -0
- judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +788 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +75 -15
- judgeval/judges/together_judge.py +86 -18
- judgeval/judges/utils.py +7 -21
- judgeval/{common/logger.py → logger.py} +8 -6
- judgeval/scorers/__init__.py +0 -4
- judgeval/scorers/agent_scorer.py +3 -7
- judgeval/scorers/api_scorer.py +8 -13
- judgeval/scorers/base_scorer.py +52 -32
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
- judgeval/scorers/score.py +21 -31
- judgeval/scorers/trace_api_scorer.py +5 -0
- judgeval/scorers/utils.py +1 -103
- judgeval/tracer/__init__.py +1075 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +37 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +43 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +67 -0
- judgeval/tracer/llm/__init__.py +1233 -0
- judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
- judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
- judgeval/tracer/managers.py +188 -0
- judgeval/tracer/processors/__init__.py +181 -0
- judgeval/tracer/utils.py +20 -0
- judgeval/trainer/__init__.py +5 -0
- judgeval/{common/trainer → trainer}/config.py +12 -9
- judgeval/{common/trainer → trainer}/console.py +2 -9
- judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
- judgeval/{common/trainer → trainer}/trainer.py +119 -17
- judgeval/utils/async_utils.py +2 -3
- judgeval/utils/decorators.py +24 -0
- judgeval/utils/file_utils.py +37 -4
- judgeval/utils/guards.py +32 -0
- judgeval/utils/meta.py +14 -0
- judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
- judgeval/utils/testing.py +88 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +3 -3
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
- judgeval-0.9.0.dist-info/RECORD +80 -0
- judgeval/clients.py +0 -35
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -375
- judgeval/common/api/constants.py +0 -186
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -97
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -2427
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -188
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -207
- judgeval/common/tracer/trace_manager.py +0 -101
- judgeval/common/trainer/__init__.py +0 -5
- judgeval/common/utils.py +0 -948
- judgeval/integrations/langgraph.py +0 -844
- judgeval/judges/mixture_of_judges.py +0 -287
- judgeval/judgment_client.py +0 -267
- judgeval/rules.py +0 -521
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.8.0.dist-info/RECORD +0 -82
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/judgment_types.py
CHANGED
@@ -1,57 +1,129 @@
|
|
1
1
|
# generated by datamodel-codegen:
|
2
|
-
# filename:
|
3
|
-
# timestamp: 2025-08-
|
2
|
+
# filename: .openapi.json
|
3
|
+
# timestamp: 2025-08-29T04:49:38+00:00
|
4
4
|
|
5
5
|
from __future__ import annotations
|
6
|
-
|
7
6
|
from typing import Annotated, Any, Dict, List, Optional, Union
|
7
|
+
from pydantic import AwareDatetime, BaseModel, ConfigDict, Field, RootModel
|
8
8
|
|
9
|
-
from pydantic import BaseModel, ConfigDict, Field
|
10
9
|
|
10
|
+
class TraceAndSpanId(RootModel[List]):
|
11
|
+
root: Annotated[List, Field(max_length=2, min_length=2)]
|
12
|
+
|
13
|
+
|
14
|
+
class EvalResultsFetch(BaseModel):
|
15
|
+
experiment_run_id: Annotated[str, Field(title="Experiment Run Id")]
|
16
|
+
project_name: Annotated[str, Field(title="Project Name")]
|
11
17
|
|
12
|
-
class ValidationErrorJudgmentType(BaseModel):
|
13
|
-
loc: Annotated[List[Union[str, int]], Field(title="Location")]
|
14
|
-
msg: Annotated[str, Field(title="Message")]
|
15
|
-
type: Annotated[str, Field(title="Error Type")]
|
16
18
|
|
19
|
+
class DatasetFetch(BaseModel):
|
20
|
+
dataset_alias: Annotated[str, Field(title="Dataset Alias")]
|
21
|
+
project_name: Annotated[str, Field(title="Project Name")]
|
17
22
|
|
18
|
-
|
23
|
+
|
24
|
+
class TraceSave(BaseModel):
|
25
|
+
project_name: Annotated[str, Field(title="Project Name")]
|
26
|
+
trace_id: Annotated[str, Field(title="Trace Id")]
|
19
27
|
name: Annotated[str, Field(title="Name")]
|
28
|
+
created_at: Annotated[str, Field(title="Created At")]
|
29
|
+
duration: Annotated[float, Field(title="Duration")]
|
30
|
+
offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
|
31
|
+
has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
|
32
|
+
customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
|
33
|
+
tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
|
34
|
+
metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = None
|
35
|
+
update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
|
36
|
+
|
37
|
+
|
38
|
+
class TraceFetch(BaseModel):
|
39
|
+
trace_id: Annotated[str, Field(title="Trace Id")]
|
40
|
+
|
41
|
+
|
42
|
+
class TraceAddToDataset(BaseModel):
|
43
|
+
trace_id: Annotated[str, Field(title="Trace Id")]
|
44
|
+
trace_span_id: Annotated[str, Field(title="Trace Span Id")]
|
45
|
+
dataset_alias: Annotated[str, Field(title="Dataset Alias")]
|
46
|
+
project_name: Annotated[str, Field(title="Project Name")]
|
47
|
+
|
48
|
+
|
49
|
+
class EvaluationRunsBatchRequest(BaseModel):
|
50
|
+
organization_id: Annotated[str, Field(title="Organization Id")]
|
51
|
+
evaluation_entries: Annotated[
|
52
|
+
List[Dict[str, Any]], Field(title="Evaluation Entries")
|
53
|
+
]
|
54
|
+
|
55
|
+
|
56
|
+
class ProjectAdd(BaseModel):
|
57
|
+
project_name: Annotated[str, Field(title="Project Name")]
|
58
|
+
|
59
|
+
|
60
|
+
class ProjectAddResponse(BaseModel):
|
61
|
+
project_id: Annotated[str, Field(title="Project Id")]
|
62
|
+
|
63
|
+
|
64
|
+
class ProjectDeleteFromJudgevalResponse(BaseModel):
|
65
|
+
project_name: Annotated[str, Field(title="Project Name")]
|
66
|
+
|
67
|
+
|
68
|
+
class ProjectDeleteResponse(BaseModel):
|
69
|
+
message: Annotated[str, Field(title="Message")]
|
70
|
+
|
71
|
+
|
72
|
+
class ScorerExistsRequest(BaseModel):
|
73
|
+
name: Annotated[str, Field(title="Name")]
|
74
|
+
|
75
|
+
|
76
|
+
class ScorerExistsResponse(BaseModel):
|
77
|
+
exists: Annotated[bool, Field(title="Exists")]
|
78
|
+
|
79
|
+
|
80
|
+
class SavePromptScorerRequest(BaseModel):
|
81
|
+
name: Annotated[str, Field(title="Name")]
|
82
|
+
prompt: Annotated[str, Field(title="Prompt")]
|
20
83
|
threshold: Annotated[float, Field(title="Threshold")]
|
21
|
-
|
22
|
-
|
23
|
-
reason: Annotated[Optional[str], Field(title="Reason")] = None
|
24
|
-
strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
|
25
|
-
evaluation_model: Annotated[
|
26
|
-
Optional[Union[List[str], str]], Field(title="Evaluation Model")
|
27
|
-
] = None
|
28
|
-
error: Annotated[Optional[str], Field(title="Error")] = None
|
29
|
-
additional_metadata: Annotated[
|
30
|
-
Optional[Dict[str, Any]], Field(title="Additional Metadata")
|
31
|
-
] = None
|
84
|
+
options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
|
85
|
+
is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
|
32
86
|
|
33
87
|
|
34
|
-
class
|
35
|
-
|
36
|
-
|
37
|
-
)
|
38
|
-
example_id: Annotated[str, Field(title="Example Id")]
|
39
|
-
created_at: Annotated[str, Field(title="Created At")]
|
40
|
-
name: Annotated[Optional[str], Field(title="Name")] = None
|
88
|
+
class SavePromptScorerResponse(BaseModel):
|
89
|
+
message: Annotated[str, Field(title="Message")]
|
90
|
+
name: Annotated[str, Field(title="Name")]
|
41
91
|
|
42
92
|
|
43
|
-
class
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
)
|
51
|
-
|
93
|
+
class FetchPromptScorerRequest(BaseModel):
|
94
|
+
name: Annotated[str, Field(title="Name")]
|
95
|
+
|
96
|
+
|
97
|
+
class CustomScorerUploadPayload(BaseModel):
|
98
|
+
scorer_name: Annotated[str, Field(title="Scorer Name")]
|
99
|
+
scorer_code: Annotated[str, Field(title="Scorer Code")]
|
100
|
+
requirements_text: Annotated[str, Field(title="Requirements Text")]
|
101
|
+
|
52
102
|
|
103
|
+
class CustomScorerTemplateResponse(BaseModel):
|
104
|
+
scorer_name: Annotated[str, Field(title="Scorer Name")]
|
105
|
+
status: Annotated[str, Field(title="Status")]
|
106
|
+
message: Annotated[str, Field(title="Message")]
|
53
107
|
|
54
|
-
|
108
|
+
|
109
|
+
class ResolveProjectNameRequest(BaseModel):
|
110
|
+
project_name: Annotated[str, Field(title="Project Name")]
|
111
|
+
|
112
|
+
|
113
|
+
class ResolveProjectNameResponse(BaseModel):
|
114
|
+
project_id: Annotated[str, Field(title="Project Id")]
|
115
|
+
|
116
|
+
|
117
|
+
class TraceIdRequest(BaseModel):
|
118
|
+
trace_id: Annotated[str, Field(title="Trace Id")]
|
119
|
+
|
120
|
+
|
121
|
+
class SpanScoreRequest(BaseModel):
|
122
|
+
span_id: Annotated[str, Field(title="Span Id")]
|
123
|
+
trace_id: Annotated[str, Field(title="Trace Id")]
|
124
|
+
|
125
|
+
|
126
|
+
class BaseScorer(BaseModel):
|
55
127
|
score_type: Annotated[str, Field(title="Score Type")]
|
56
128
|
threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
|
57
129
|
name: Annotated[Optional[str], Field(title="Name")] = None
|
@@ -76,7 +148,87 @@ class BaseScorerJudgmentType(BaseModel):
|
|
76
148
|
server_hosted: Annotated[Optional[bool], Field(title="Server Hosted")] = False
|
77
149
|
|
78
150
|
|
79
|
-
class
|
151
|
+
class ScorerConfig(BaseModel):
|
152
|
+
score_type: Annotated[str, Field(title="Score Type")]
|
153
|
+
name: Annotated[Optional[str], Field(title="Name")] = None
|
154
|
+
threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
|
155
|
+
strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
|
156
|
+
required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = []
|
157
|
+
kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
|
158
|
+
|
159
|
+
|
160
|
+
class Example(BaseModel):
|
161
|
+
model_config = ConfigDict(
|
162
|
+
extra="allow",
|
163
|
+
)
|
164
|
+
example_id: Annotated[str, Field(title="Example Id")]
|
165
|
+
created_at: Annotated[str, Field(title="Created At")]
|
166
|
+
name: Annotated[Optional[str], Field(title="Name")] = None
|
167
|
+
|
168
|
+
|
169
|
+
class ValidationError(BaseModel):
|
170
|
+
loc: Annotated[List[Union[str, int]], Field(title="Location")]
|
171
|
+
msg: Annotated[str, Field(title="Message")]
|
172
|
+
type: Annotated[str, Field(title="Error Type")]
|
173
|
+
|
174
|
+
|
175
|
+
class SpanBatchItem(BaseModel):
|
176
|
+
span_id: Annotated[str, Field(title="Span Id")]
|
177
|
+
trace_id: Annotated[str, Field(title="Trace Id")]
|
178
|
+
function: Annotated[str, Field(title="Function")]
|
179
|
+
created_at: Annotated[Any, Field(title="Created At")] = None
|
180
|
+
parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
|
181
|
+
span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
|
182
|
+
inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
|
183
|
+
output: Annotated[Any, Field(title="Output")] = None
|
184
|
+
error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
|
185
|
+
usage: Annotated[Optional[Dict[str, Any]], Field(title="Usage")] = None
|
186
|
+
duration: Annotated[Optional[float], Field(title="Duration")] = None
|
187
|
+
expected_tools: Annotated[
|
188
|
+
Optional[List[Dict[str, Any]]], Field(title="Expected Tools")
|
189
|
+
] = None
|
190
|
+
additional_metadata: Annotated[
|
191
|
+
Optional[Dict[str, Any]], Field(title="Additional Metadata")
|
192
|
+
] = None
|
193
|
+
has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
|
194
|
+
agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
|
195
|
+
class_name: Annotated[Optional[str], Field(title="Class Name")] = None
|
196
|
+
state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
|
197
|
+
None
|
198
|
+
)
|
199
|
+
state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
|
200
|
+
span_state: Annotated[str, Field(title="Span State")]
|
201
|
+
update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
|
202
|
+
queued_at: Annotated[float, Field(title="Queued At")]
|
203
|
+
|
204
|
+
|
205
|
+
class PromptScorer(BaseModel):
|
206
|
+
name: Annotated[str, Field(title="Name")]
|
207
|
+
prompt: Annotated[str, Field(title="Prompt")]
|
208
|
+
threshold: Annotated[float, Field(title="Threshold")]
|
209
|
+
options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
|
210
|
+
created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
|
211
|
+
updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
|
212
|
+
is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
|
213
|
+
|
214
|
+
|
215
|
+
class ScorerData(BaseModel):
|
216
|
+
name: Annotated[str, Field(title="Name")]
|
217
|
+
threshold: Annotated[float, Field(title="Threshold")]
|
218
|
+
success: Annotated[bool, Field(title="Success")]
|
219
|
+
score: Annotated[Optional[float], Field(title="Score")] = None
|
220
|
+
reason: Annotated[Optional[str], Field(title="Reason")] = None
|
221
|
+
strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
|
222
|
+
evaluation_model: Annotated[
|
223
|
+
Optional[Union[List[str], str]], Field(title="Evaluation Model")
|
224
|
+
] = None
|
225
|
+
error: Annotated[Optional[str], Field(title="Error")] = None
|
226
|
+
additional_metadata: Annotated[
|
227
|
+
Optional[Dict[str, Any]], Field(title="Additional Metadata")
|
228
|
+
] = None
|
229
|
+
|
230
|
+
|
231
|
+
class TraceUsage(BaseModel):
|
80
232
|
prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
|
81
233
|
completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
|
82
234
|
cache_creation_input_tokens: Annotated[
|
@@ -96,7 +248,7 @@ class TraceUsageJudgmentType(BaseModel):
|
|
96
248
|
model_name: Annotated[Optional[str], Field(title="Model Name")] = None
|
97
249
|
|
98
250
|
|
99
|
-
class
|
251
|
+
class Tool(BaseModel):
|
100
252
|
tool_name: Annotated[str, Field(title="Tool Name")]
|
101
253
|
parameters: Annotated[Optional[Dict[str, Any]], Field(title="Parameters")] = None
|
102
254
|
agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
|
@@ -109,45 +261,75 @@ class ToolJudgmentType(BaseModel):
|
|
109
261
|
require_all: Annotated[Optional[bool], Field(title="Require All")] = None
|
110
262
|
|
111
263
|
|
112
|
-
class
|
113
|
-
detail: Annotated[
|
114
|
-
Optional[List[ValidationErrorJudgmentType]], Field(title="Detail")
|
115
|
-
] = None
|
116
|
-
|
117
|
-
|
118
|
-
class EvaluationRunJudgmentType(BaseModel):
|
264
|
+
class ExampleEvaluationRun(BaseModel):
|
119
265
|
id: Annotated[Optional[str], Field(title="Id")] = None
|
120
266
|
project_name: Annotated[Optional[str], Field(title="Project Name")] = None
|
121
267
|
eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
|
122
|
-
examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
|
123
268
|
custom_scorers: Annotated[
|
124
|
-
Optional[List[
|
125
|
-
] =
|
269
|
+
Optional[List[BaseScorer]], Field(title="Custom Scorers")
|
270
|
+
] = []
|
126
271
|
judgment_scorers: Annotated[
|
127
|
-
Optional[List[
|
128
|
-
] =
|
272
|
+
Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
|
273
|
+
] = []
|
129
274
|
model: Annotated[str, Field(title="Model")]
|
275
|
+
created_at: Annotated[Optional[str], Field(title="Created At")] = None
|
276
|
+
examples: Annotated[List[Example], Field(title="Examples")]
|
130
277
|
trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
|
131
278
|
trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
|
279
|
+
|
280
|
+
|
281
|
+
class HTTPValidationError(BaseModel):
|
282
|
+
detail: Annotated[Optional[List[ValidationError]], Field(title="Detail")] = None
|
283
|
+
|
284
|
+
|
285
|
+
class TraceEvaluationRun(BaseModel):
|
286
|
+
id: Annotated[Optional[str], Field(title="Id")] = None
|
287
|
+
project_name: Annotated[Optional[str], Field(title="Project Name")] = None
|
288
|
+
eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
|
289
|
+
custom_scorers: Annotated[
|
290
|
+
Optional[List[BaseScorer]], Field(title="Custom Scorers")
|
291
|
+
] = []
|
292
|
+
judgment_scorers: Annotated[
|
293
|
+
Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
|
294
|
+
] = []
|
295
|
+
model: Annotated[str, Field(title="Model")]
|
132
296
|
created_at: Annotated[Optional[str], Field(title="Created At")] = None
|
297
|
+
trace_and_span_ids: Annotated[
|
298
|
+
List[TraceAndSpanId], Field(title="Trace And Span Ids")
|
299
|
+
]
|
300
|
+
is_offline: Annotated[Optional[bool], Field(title="Is Offline")] = False
|
301
|
+
|
302
|
+
|
303
|
+
class DatasetInsertExamples(BaseModel):
|
304
|
+
dataset_alias: Annotated[str, Field(title="Dataset Alias")]
|
305
|
+
examples: Annotated[List[Example], Field(title="Examples")]
|
306
|
+
project_name: Annotated[str, Field(title="Project Name")]
|
307
|
+
|
133
308
|
|
309
|
+
class SpansBatchRequest(BaseModel):
|
310
|
+
spans: Annotated[List[SpanBatchItem], Field(title="Spans")]
|
311
|
+
organization_id: Annotated[str, Field(title="Organization Id")]
|
134
312
|
|
135
|
-
|
313
|
+
|
314
|
+
class FetchPromptScorerResponse(BaseModel):
|
315
|
+
scorer: PromptScorer
|
316
|
+
|
317
|
+
|
318
|
+
class TraceSpan(BaseModel):
|
136
319
|
span_id: Annotated[str, Field(title="Span Id")]
|
137
320
|
trace_id: Annotated[str, Field(title="Trace Id")]
|
138
321
|
function: Annotated[str, Field(title="Function")]
|
139
|
-
depth: Annotated[int, Field(title="Depth")]
|
140
322
|
created_at: Annotated[Any, Field(title="Created At")] = None
|
141
323
|
parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
|
142
324
|
span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
|
143
325
|
inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
|
144
326
|
error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
|
145
327
|
output: Annotated[Any, Field(title="Output")] = None
|
146
|
-
usage: Optional[
|
328
|
+
usage: Optional[TraceUsage] = None
|
147
329
|
duration: Annotated[Optional[float], Field(title="Duration")] = None
|
148
|
-
expected_tools: Annotated[
|
149
|
-
|
150
|
-
|
330
|
+
expected_tools: Annotated[Optional[List[Tool]], Field(title="Expected Tools")] = (
|
331
|
+
None
|
332
|
+
)
|
151
333
|
additional_metadata: Annotated[
|
152
334
|
Optional[Dict[str, Any]], Field(title="Additional Metadata")
|
153
335
|
] = None
|
@@ -161,54 +343,56 @@ class TraceSpanJudgmentType(BaseModel):
|
|
161
343
|
update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
|
162
344
|
|
163
345
|
|
164
|
-
class
|
346
|
+
class Trace(BaseModel):
|
165
347
|
trace_id: Annotated[str, Field(title="Trace Id")]
|
166
348
|
name: Annotated[str, Field(title="Name")]
|
167
349
|
created_at: Annotated[str, Field(title="Created At")]
|
168
350
|
duration: Annotated[float, Field(title="Duration")]
|
169
|
-
trace_spans: Annotated[List[
|
351
|
+
trace_spans: Annotated[List[TraceSpan], Field(title="Trace Spans")]
|
170
352
|
offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
|
171
|
-
rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] =
|
172
|
-
default_factory=dict
|
173
|
-
)
|
353
|
+
rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] = {}
|
174
354
|
has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
|
175
355
|
customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
|
176
|
-
tags: Annotated[Optional[List[str]], Field(title="Tags")] =
|
177
|
-
|
178
|
-
)
|
179
|
-
metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = Field(
|
180
|
-
default_factory=dict
|
181
|
-
)
|
356
|
+
tags: Annotated[Optional[List[str]], Field(title="Tags")] = []
|
357
|
+
metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = {}
|
182
358
|
update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
|
183
359
|
|
184
360
|
|
185
|
-
class
|
361
|
+
class ScoringResult(BaseModel):
|
186
362
|
success: Annotated[bool, Field(title="Success")]
|
187
|
-
scorers_data: Annotated[
|
188
|
-
|
189
|
-
|
363
|
+
scorers_data: Annotated[Optional[List[ScorerData]], Field(title="Scorers Data")] = (
|
364
|
+
None
|
365
|
+
)
|
190
366
|
name: Annotated[Optional[str], Field(title="Name")] = None
|
191
367
|
data_object: Annotated[
|
192
|
-
Optional[Union[
|
193
|
-
Field(title="Data Object"),
|
368
|
+
Optional[Union[TraceSpan, Example]], Field(title="Data Object")
|
194
369
|
] = None
|
195
370
|
trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
|
196
371
|
run_duration: Annotated[Optional[float], Field(title="Run Duration")] = None
|
197
372
|
evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None
|
198
373
|
|
199
374
|
|
200
|
-
class
|
375
|
+
class TraceRun(BaseModel):
|
201
376
|
project_name: Annotated[Optional[str], Field(title="Project Name")] = None
|
202
377
|
eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
|
203
|
-
traces: Annotated[List[
|
204
|
-
scorers: Annotated[List[
|
378
|
+
traces: Annotated[List[Trace], Field(title="Traces")]
|
379
|
+
scorers: Annotated[List[ScorerConfig], Field(title="Scorers")]
|
205
380
|
model: Annotated[str, Field(title="Model")]
|
206
381
|
trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
|
207
382
|
tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
|
208
383
|
|
209
384
|
|
210
|
-
class
|
211
|
-
results: Annotated[List[
|
212
|
-
run: Annotated[
|
213
|
-
|
214
|
-
|
385
|
+
class EvalResults(BaseModel):
|
386
|
+
results: Annotated[List[ScoringResult], Field(title="Results")]
|
387
|
+
run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
|
388
|
+
|
389
|
+
|
390
|
+
class DatasetPush(BaseModel):
|
391
|
+
dataset_alias: Annotated[str, Field(title="Dataset Alias")]
|
392
|
+
comments: Annotated[Optional[str], Field(title="Comments")] = None
|
393
|
+
source_file: Annotated[Optional[str], Field(title="Source File")] = None
|
394
|
+
examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
|
395
|
+
traces: Annotated[Optional[List[Trace]], Field(title="Traces")] = None
|
396
|
+
is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
|
397
|
+
project_name: Annotated[str, Field(title="Project Name")]
|
398
|
+
overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
|
judgeval/data/result.py
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
from typing import List, Union
|
2
2
|
from judgeval.data import ScorerData, Example
|
3
3
|
from judgeval.data.trace import TraceSpan
|
4
|
-
from judgeval.data.judgment_types import
|
4
|
+
from judgeval.data.judgment_types import ScoringResult as JudgmentScoringResult
|
5
5
|
|
6
6
|
|
7
|
-
class ScoringResult(
|
7
|
+
class ScoringResult(JudgmentScoringResult):
|
8
8
|
"""
|
9
9
|
A ScoringResult contains the output of one or more scorers applied to a single example.
|
10
10
|
Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
|
@@ -17,9 +17,8 @@ class ScoringResult(ScoringResultJudgmentType):
|
|
17
17
|
|
18
18
|
"""
|
19
19
|
|
20
|
-
|
21
|
-
|
22
|
-
)
|
20
|
+
# Need to override this so that it uses this repo's Example class
|
21
|
+
data_object: Example
|
23
22
|
|
24
23
|
def model_dump(self, **kwargs):
|
25
24
|
data = super().model_dump(**kwargs)
|
judgeval/data/scorer_data.py
CHANGED
@@ -4,12 +4,14 @@ Implementation of the ScorerData class.
|
|
4
4
|
ScorerData holds the information related to a single, completed Scorer evaluation run.
|
5
5
|
"""
|
6
6
|
|
7
|
-
from
|
7
|
+
from __future__ import annotations
|
8
|
+
|
9
|
+
from judgeval.data.judgment_types import ScorerData as JudgmentScorerData
|
8
10
|
from judgeval.scorers import BaseScorer
|
9
11
|
from typing import List
|
10
12
|
|
11
13
|
|
12
|
-
class ScorerData(
|
14
|
+
class ScorerData(JudgmentScorerData):
|
13
15
|
"""
|
14
16
|
ScorerData holds the information related to a single, completed Scorer evaluation run.
|
15
17
|
|
judgeval/data/tool.py
CHANGED
judgeval/data/trace.py
CHANGED
@@ -1,24 +1,21 @@
|
|
1
|
-
import threading
|
2
1
|
from datetime import datetime, timezone
|
3
2
|
from judgeval.data.judgment_types import (
|
4
|
-
|
5
|
-
|
6
|
-
|
3
|
+
TraceUsage as JudgmentTraceUsage,
|
4
|
+
TraceSpan as JudgmentTraceSpan,
|
5
|
+
Trace as JudgmentTrace,
|
7
6
|
)
|
8
|
-
from judgeval.
|
9
|
-
from judgeval.common.api.json_encoder import json_encoder
|
7
|
+
from judgeval.utils.serialize import json_encoder
|
10
8
|
|
11
9
|
|
12
|
-
class TraceUsage(
|
10
|
+
class TraceUsage(JudgmentTraceUsage):
|
13
11
|
pass
|
14
12
|
|
15
13
|
|
16
|
-
class TraceSpan(
|
14
|
+
class TraceSpan(JudgmentTraceSpan):
|
17
15
|
def model_dump(self, **kwargs):
|
18
16
|
return {
|
19
17
|
"span_id": self.span_id,
|
20
18
|
"trace_id": self.trace_id,
|
21
|
-
"depth": self.depth,
|
22
19
|
"created_at": datetime.fromtimestamp(
|
23
20
|
self.created_at, tz=timezone.utc
|
24
21
|
).isoformat(),
|
@@ -32,52 +29,12 @@ class TraceSpan(TraceSpanJudgmentType):
|
|
32
29
|
"usage": self.usage.model_dump() if self.usage else None,
|
33
30
|
"has_evaluation": self.has_evaluation,
|
34
31
|
"agent_name": self.agent_name,
|
35
|
-
"class_name": self.class_name,
|
36
32
|
"state_before": self.state_before,
|
37
33
|
"state_after": self.state_after,
|
38
34
|
"additional_metadata": json_encoder(self.additional_metadata),
|
39
35
|
"update_id": self.update_id,
|
40
36
|
}
|
41
37
|
|
42
|
-
def __init__(self, **data):
|
43
|
-
super().__init__(**data)
|
44
|
-
# Initialize thread lock for thread-safe update_id increment
|
45
|
-
self._update_id_lock = threading.Lock()
|
46
38
|
|
47
|
-
|
48
|
-
"""
|
49
|
-
Thread-safe method to increment the update_id counter.
|
50
|
-
Returns:
|
51
|
-
int: The new update_id value after incrementing
|
52
|
-
"""
|
53
|
-
with self._update_id_lock:
|
54
|
-
self.update_id += 1
|
55
|
-
return self.update_id
|
56
|
-
|
57
|
-
def set_update_id_to_ending_number(
|
58
|
-
self, ending_number: int = SPAN_LIFECYCLE_END_UPDATE_ID
|
59
|
-
) -> int:
|
60
|
-
"""
|
61
|
-
Thread-safe method to set the update_id to a predetermined ending number.
|
62
|
-
|
63
|
-
Args:
|
64
|
-
ending_number (int): The number to set update_id to. Defaults to SPAN_LIFECYCLE_END_UPDATE_ID.
|
65
|
-
|
66
|
-
Returns:
|
67
|
-
int: The new update_id value after setting
|
68
|
-
"""
|
69
|
-
with self._update_id_lock:
|
70
|
-
self.update_id = ending_number
|
71
|
-
return self.update_id
|
72
|
-
|
73
|
-
def print_span(self):
|
74
|
-
"""Print the span with proper formatting and parent relationship information."""
|
75
|
-
indent = " " * self.depth
|
76
|
-
parent_info = (
|
77
|
-
f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
|
78
|
-
)
|
79
|
-
print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
|
80
|
-
|
81
|
-
|
82
|
-
class Trace(TraceJudgmentType):
|
39
|
+
class Trace(JudgmentTrace):
|
83
40
|
pass
|
judgeval/data/trace_run.py
CHANGED
@@ -2,8 +2,7 @@ from pydantic import BaseModel
|
|
2
2
|
from typing import List, Optional, Dict, Any, Union
|
3
3
|
from judgeval.data import Trace
|
4
4
|
from judgeval.scorers import APIScorerConfig, BaseScorer
|
5
|
-
from judgeval.
|
6
|
-
from judgeval.constants import DEFAULT_GPT_MODEL
|
5
|
+
from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
|
7
6
|
|
8
7
|
|
9
8
|
class TraceRun(BaseModel):
|
@@ -27,9 +26,13 @@ class TraceRun(BaseModel):
|
|
27
26
|
eval_name: Optional[str] = None
|
28
27
|
traces: Optional[List[Trace]] = None
|
29
28
|
scorers: List[Union[APIScorerConfig, BaseScorer]]
|
30
|
-
model: Optional[str] =
|
29
|
+
model: Optional[str] = JUDGMENT_DEFAULT_GPT_MODEL
|
31
30
|
trace_span_id: Optional[str] = None
|
32
|
-
|
31
|
+
append: Optional[bool] = False
|
32
|
+
override: Optional[bool] = False
|
33
|
+
|
34
|
+
# TODO: ?
|
35
|
+
rules: Any = None
|
33
36
|
tools: Optional[List[Dict[str, Any]]] = None
|
34
37
|
|
35
38
|
class Config:
|