judgeval 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -2
- judgeval/api/__init__.py +28 -96
- judgeval/api/api_types.py +49 -140
- judgeval/constants.py +1 -5
- judgeval/data/__init__.py +1 -3
- judgeval/data/example.py +4 -2
- judgeval/data/judgment_types.py +57 -165
- judgeval/data/result.py +1 -2
- judgeval/data/trace.py +14 -40
- judgeval/dataset/__init__.py +15 -42
- judgeval/evaluation/__init__.py +23 -34
- judgeval/scorers/__init__.py +9 -7
- judgeval/scorers/api_scorer.py +8 -0
- judgeval/scorers/base_scorer.py +0 -1
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
- judgeval/tracer/__init__.py +40 -93
- judgeval/tracer/local_eval_queue.py +2 -2
- judgeval/tracer/processors/__init__.py +84 -6
- judgeval/tracer/utils.py +1 -1
- judgeval/trainer/trainer.py +4 -4
- judgeval/utils/serialize.py +7 -1
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/METADATA +2 -2
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/RECORD +31 -36
- judgeval/data/trace_run.py +0 -39
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/scorers/trace_api_scorer.py +0 -5
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/WHEEL +0 -0
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.9.3.dist-info → judgeval-0.10.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/judgment_types.py
CHANGED
@@ -1,10 +1,11 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-
+# timestamp: 2025-09-10T17:42:11+00:00
 
 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
 from pydantic import AwareDatetime, BaseModel, ConfigDict, Field, RootModel
+from enum import Enum
 
 
 class TraceAndSpanId(RootModel[List]):
@@ -17,42 +18,10 @@ class EvalResultsFetch(BaseModel):
 
 
 class DatasetFetch(BaseModel):
-
+    dataset_name: Annotated[str, Field(title="Dataset Name")]
     project_name: Annotated[str, Field(title="Project Name")]
 
 
-class TraceSave(BaseModel):
-    project_name: Annotated[str, Field(title="Project Name")]
-    trace_id: Annotated[str, Field(title="Trace Id")]
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    duration: Annotated[float, Field(title="Duration")]
-    offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
-    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
-    customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
-    tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
-    metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = None
-    update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
-
-
-class TraceFetch(BaseModel):
-    trace_id: Annotated[str, Field(title="Trace Id")]
-
-
-class TraceAddToDataset(BaseModel):
-    trace_id: Annotated[str, Field(title="Trace Id")]
-    trace_span_id: Annotated[str, Field(title="Trace Span Id")]
-    dataset_alias: Annotated[str, Field(title="Dataset Alias")]
-    project_name: Annotated[str, Field(title="Project Name")]
-
-
-class EvaluationRunsBatchRequest(BaseModel):
-    organization_id: Annotated[str, Field(title="Organization Id")]
-    evaluation_entries: Annotated[
-        List[Dict[str, Any]], Field(title="Evaluation Entries")
-    ]
-
-
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
@@ -82,7 +51,7 @@ class SavePromptScorerRequest(BaseModel):
     prompt: Annotated[str, Field(title="Prompt")]
     threshold: Annotated[float, Field(title="Threshold")]
     options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
-    is_trace: Annotated[Optional[bool], Field(title="Is Trace")] =
+    is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = None
 
 
 class SavePromptScorerResponse(BaseModel):
@@ -161,8 +130,8 @@ class Example(BaseModel):
     model_config = ConfigDict(
         extra="allow",
     )
-    example_id: Annotated[str, Field(title="Example Id")]
-    created_at: Annotated[str, Field(title="Created At")]
+    example_id: Annotated[Optional[str], Field(title="Example Id")] = None
+    created_at: Annotated[Optional[str], Field(title="Created At")] = None
     name: Annotated[Optional[str], Field(title="Name")] = None
 
 
@@ -172,34 +141,9 @@ class ValidationError(BaseModel):
     type: Annotated[str, Field(title="Error Type")]
 
 
-class
-
-
-    function: Annotated[str, Field(title="Function")]
-    created_at: Annotated[Any, Field(title="Created At")] = None
-    parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
-    span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
-    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-    output: Annotated[Any, Field(title="Output")] = None
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-    usage: Annotated[Optional[Dict[str, Any]], Field(title="Usage")] = None
-    duration: Annotated[Optional[float], Field(title="Duration")] = None
-    expected_tools: Annotated[
-        Optional[List[Dict[str, Any]]], Field(title="Expected Tools")
-    ] = None
-    additional_metadata: Annotated[
-        Optional[Dict[str, Any]], Field(title="Additional Metadata")
-    ] = None
-    has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
-    agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
-    class_name: Annotated[Optional[str], Field(title="Class Name")] = None
-    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-        None
-    )
-    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-    span_state: Annotated[str, Field(title="Span State")]
-    update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
-    queued_at: Annotated[float, Field(title="Queued At")]
+class DatasetKind(Enum):
+    trace = "trace"
+    example = "example"
 
 
 class PromptScorer(BaseModel):
@@ -213,52 +157,55 @@ class PromptScorer(BaseModel):
 
 
 class ScorerData(BaseModel):
+    id: Annotated[Optional[str], Field(title="Id")] = None
     name: Annotated[str, Field(title="Name")]
     threshold: Annotated[float, Field(title="Threshold")]
     success: Annotated[bool, Field(title="Success")]
     score: Annotated[Optional[float], Field(title="Score")] = None
     reason: Annotated[Optional[str], Field(title="Reason")] = None
     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
-    evaluation_model: Annotated[
-        Optional[Union[List[str], str]], Field(title="Evaluation Model")
-    ] = None
+    evaluation_model: Annotated[Optional[str], Field(title="Evaluation Model")] = None
     error: Annotated[Optional[str], Field(title="Error")] = None
     additional_metadata: Annotated[
         Optional[Dict[str, Any]], Field(title="Additional Metadata")
     ] = None
 
 
-class
-
-
-
-
-
-
-
-    ] = None
-
-
-
+class OtelTraceSpan(BaseModel):
+    organization_id: Annotated[str, Field(title="Organization Id")]
+    project_id: Annotated[Optional[str], Field(title="Project Id")] = None
+    user_id: Annotated[str, Field(title="User Id")]
+    timestamp: Annotated[str, Field(title="Timestamp")]
+    trace_id: Annotated[str, Field(title="Trace Id")]
+    span_id: Annotated[str, Field(title="Span Id")]
+    parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+    trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+    span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+    span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+    service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+    resource_attributes: Annotated[
+        Optional[Dict[str, Any]], Field(title="Resource Attributes")
     ] = None
-
-        Optional[
+    span_attributes: Annotated[
+        Optional[Dict[str, Any]], Field(title="Span Attributes")
     ] = None
-
-
-
-
-
-
-
-
-
-
-
-
-        Optional[List[Dict[str, Any]]], Field(title="Action Dependencies")
+    duration: Annotated[Optional[int], Field(title="Duration")] = None
+    status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+    status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+    events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
+    links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
+    legacy_span_id: Annotated[Optional[str], Field(title="Legacy Span Id")] = None
+    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
+    output: Annotated[Any, Field(title="Output")]
+    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    agent_id: Annotated[Optional[str], Field(title="Agent Id")] = None
+    cumulative_llm_cost: Annotated[
+        Optional[float], Field(title="Cumulative Llm Cost")
     ] = None
-
+    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
+    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
+        None
+    )
 
 
 class ExampleEvaluationRun(BaseModel):
@@ -301,61 +248,27 @@ class TraceEvaluationRun(BaseModel):
 
 
 class DatasetInsertExamples(BaseModel):
-
+    dataset_name: Annotated[str, Field(title="Dataset Name")]
     examples: Annotated[List[Example], Field(title="Examples")]
     project_name: Annotated[str, Field(title="Project Name")]
 
 
-class
-
-
-
-
-class FetchPromptScorerResponse(BaseModel):
-    scorer: PromptScorer
+class DatasetReturn(BaseModel):
+    name: Annotated[str, Field(title="Name")]
+    project_name: Annotated[str, Field(title="Project Name")]
+    examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
 
 
-class
-
-
-
-
-
-    span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
-    inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
-    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
-    output: Annotated[Any, Field(title="Output")] = None
-    usage: Optional[TraceUsage] = None
-    duration: Annotated[Optional[float], Field(title="Duration")] = None
-    expected_tools: Annotated[Optional[List[Tool]], Field(title="Expected Tools")] = (
-        None
-    )
-    additional_metadata: Annotated[
-        Optional[Dict[str, Any]], Field(title="Additional Metadata")
-    ] = None
-    has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
-    agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
-    class_name: Annotated[Optional[str], Field(title="Class Name")] = None
-    state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
-        None
-    )
-    state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
-    update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
+class DatasetCreate(BaseModel):
+    name: Annotated[str, Field(title="Name")]
+    dataset_kind: DatasetKind
+    project_name: Annotated[str, Field(title="Project Name")]
+    examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
+    overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
 
 
-class
-
-    name: Annotated[str, Field(title="Name")]
-    created_at: Annotated[str, Field(title="Created At")]
-    duration: Annotated[float, Field(title="Duration")]
-    trace_spans: Annotated[List[TraceSpan], Field(title="Trace Spans")]
-    offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
-    rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] = {}
-    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
-    customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
-    tags: Annotated[Optional[List[str]], Field(title="Tags")] = []
-    metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = {}
-    update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
+class FetchPromptScorerResponse(BaseModel):
+    scorer: PromptScorer
 
 
 class ScoringResult(BaseModel):
@@ -365,34 +278,13 @@ class ScoringResult(BaseModel):
     )
     name: Annotated[Optional[str], Field(title="Name")] = None
     data_object: Annotated[
-        Optional[Union[
+        Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
     ] = None
     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
     run_duration: Annotated[Optional[float], Field(title="Run Duration")] = None
     evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None
 
 
-class TraceRun(BaseModel):
-    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
-    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
-    traces: Annotated[List[Trace], Field(title="Traces")]
-    scorers: Annotated[List[ScorerConfig], Field(title="Scorers")]
-    model: Annotated[str, Field(title="Model")]
-    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
-    tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
-
-
 class EvalResults(BaseModel):
     results: Annotated[List[ScoringResult], Field(title="Results")]
     run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
-
-
-class DatasetPush(BaseModel):
-    dataset_alias: Annotated[str, Field(title="Dataset Alias")]
-    comments: Annotated[Optional[str], Field(title="Comments")] = None
-    source_file: Annotated[Optional[str], Field(title="Source File")] = None
-    examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
-    traces: Annotated[Optional[List[Trace]], Field(title="Traces")] = None
-    is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
-    project_name: Annotated[str, Field(title="Project Name")]
-    overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
judgeval/data/result.py
CHANGED
@@ -1,6 +1,5 @@
 from typing import List, Union
 from judgeval.data import ScorerData, Example
-from judgeval.data.trace import TraceSpan
 from judgeval.data.judgment_types import ScoringResult as JudgmentScoringResult
 
 
@@ -34,7 +33,7 @@ class ScoringResult(JudgmentScoringResult):
 
 
 def generate_scoring_result(
-    data_object: Union[Example
+    data_object: Union[Example],
     scorers_data: List[ScorerData],
     run_duration: float,
    success: bool,
judgeval/data/trace.py
CHANGED
@@ -1,40 +1,14 @@
-from
-from
-
-
-
-
-
-
-
-
-
-
-
-
-    def model_dump(self, **kwargs):
-        return {
-            "span_id": self.span_id,
-            "trace_id": self.trace_id,
-            "created_at": datetime.fromtimestamp(
-                self.created_at, tz=timezone.utc
-            ).isoformat(),
-            "inputs": json_encoder(self.inputs),
-            "output": json_encoder(self.output),
-            "error": json_encoder(self.error),
-            "parent_span_id": self.parent_span_id,
-            "function": self.function,
-            "duration": self.duration,
-            "span_type": self.span_type,
-            "usage": self.usage.model_dump() if self.usage else None,
-            "has_evaluation": self.has_evaluation,
-            "agent_name": self.agent_name,
-            "state_before": self.state_before,
-            "state_after": self.state_after,
-            "additional_metadata": json_encoder(self.additional_metadata),
-            "update_id": self.update_id,
-        }
-
-
-class Trace(JudgmentTrace):
-    pass
+from typing import Optional
+from pydantic import BaseModel
+
+
+class TraceUsage(BaseModel):
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    cache_creation_input_tokens: Optional[int] = None
+    cache_read_input_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
+    prompt_tokens_cost_usd: Optional[float] = None
+    completion_tokens_cost_usd: Optional[float] = None
+    total_cost_usd: Optional[float] = None
+    model_name: Optional[str] = None
judgeval/dataset/__init__.py
CHANGED
@@ -5,7 +5,7 @@ import yaml
 from dataclasses import dataclass
 from typing import List, Literal, Optional
 
-from judgeval.data import Example
+from judgeval.data import Example
 from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
 from judgeval.api import JudgmentSyncClient
 from judgeval.logger import judgeval_logger
@@ -15,7 +15,6 @@ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
 @dataclass
 class Dataset:
     examples: List[Example]
-    traces: List[Trace]
     name: str
     project_name: str
     judgment_api_key: str = JUDGMENT_API_KEY or ""
@@ -30,7 +29,7 @@ class Dataset:
         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
         dataset = client.datasets_pull_for_judgeval(
             {
-                "
+                "dataset_name": name,
                 "project_name": project_name,
             },
         )
@@ -40,12 +39,14 @@ class Dataset:
         for e in examples:
             if isinstance(e, dict) and isinstance(e.get("data"), dict):
                 e.update(e.pop("data"))
+                e.pop(
+                    "example_id"
+                )  # TODO: remove once scorer data migraiton is complete
         judgeval_logger.info(f"Succesfully retrieved dataset {name}!")
         return cls(
             name=name,
             project_name=project_name,
             examples=[Example(**e) for e in examples],
-            traces=[Trace(**t) for t in dataset.get("traces", [])],
         )
 
     @classmethod
@@ -54,25 +55,18 @@ class Dataset:
         name: str,
         project_name: str,
         examples: Optional[List[Example]] = None,
-        traces: Optional[List[Trace]] = None,
         overwrite: bool = False,
     ):
-        if examples and traces:
-            raise ValueError("Only one of examples or traces must be provided")
-
         if not examples:
             examples = []
 
-        if not traces:
-            traces = []
-
         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
-        client.
+        client.datasets_create_for_judgeval(
             {
-                "
+                "name": name,
                 "project_name": project_name,
-                "examples": [e.model_dump() for e in examples],
-                "
+                "examples": [e.model_dump() for e in examples],
+                "dataset_kind": "example",
                 "overwrite": overwrite,
             }
         )
@@ -82,7 +76,6 @@ class Dataset:
             name=name,
             project_name=project_name,
             examples=examples,
-            traces=traces,
         )
 
     def add_from_json(self, file_path: str) -> None:
@@ -123,29 +116,15 @@ class Dataset:
         self.add_examples(examples)
 
     def add_examples(self, examples: List[Example]) -> None:
-
-
-            {
-                "dataset_alias": self.name,
-                "project_name": self.project_name,
-                "examples": [
-                    {
-                        "name": e.name,
-                        "created_at": e.created_at,
-                        "example_id": e.example_id,
-                    }
-                    for e in examples
-                ],
-            }
-        )
+        if not isinstance(examples, list):
+            raise TypeError("examples must be a list")
 
-    def add_traces(self, traces: List[Trace]) -> None:
         client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
-        client.
+        client.datasets_insert_examples_for_judgeval(
            {
-                "
+                "dataset_name": self.name,
                 "project_name": self.project_name,
-                "
+                "examples": [e.model_dump() for e in examples],
             }
         )
 
@@ -200,10 +179,4 @@ class Dataset:
         return len(self.examples)
 
     def __str__(self):
-        return (
-            f"{self.__class__.__name__}("
-            f"examples={self.examples}, "
-            f"traces={self.traces}, "
-            f"name={self.name}"
-            f")"
-        )
+        return f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
judgeval/evaluation/__init__.py
CHANGED
@@ -10,7 +10,7 @@ from typing import List, Dict, Union, Tuple, TYPE_CHECKING
 from rich import print as rprint
 
 from judgeval.data import ScorerData, ScoringResult, Example
-from judgeval.scorers import BaseScorer,
+from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.api import JudgmentSyncClient
 from judgeval.env import (
@@ -86,7 +86,7 @@ def log_evaluation_results(
 
 
 def check_examples(
-    examples: List[Example], scorers: List[Union[
+    examples: List[Example], scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]]
 ) -> None:
     """
     Checks if the example contains the necessary parameters for the scorer.
@@ -118,10 +118,8 @@ def check_examples(
 
 
 def _poll_evaluation_until_complete(
-
-    project_name: str,
+    evaluation_run: EvaluationRun,
     judgment_api_key: str,
-    organization_id: str,
     expected_scorer_data_count: int,
     poll_interval_seconds: float = 5,
     max_failures: int = 5,
@@ -142,6 +140,10 @@ def _poll_evaluation_until_complete(
     Returns:
         List[ScoringResult]: The evaluation results
     """
+    organization_id = evaluation_run.organization_id
+    project_name = evaluation_run.project_name
+    experiment_run_id = evaluation_run.id
+
     poll_count = 0
     exception_count = 0
     api_client = JudgmentSyncClient(judgment_api_key, organization_id)
@@ -157,6 +159,11 @@ def _poll_evaluation_until_complete(
                 time.sleep(poll_interval_seconds)
                 continue
 
+            example_scorer_pairings = status_response.get("results", [])
+            if len(example_scorer_pairings) != expected_scorer_data_count:
+                time.sleep(poll_interval_seconds)
+                continue
+
             results_response = api_client.fetch_experiment_run(
                 {
                     "experiment_run_id": experiment_run_id,
@@ -165,36 +172,20 @@ def _poll_evaluation_until_complete(
             )
             url = results_response.get("ui_results_url")
 
-
-
-
-
-            examples_data = results_response.get("examples", [])
-            scoring_results = []
-            scorer_data_count = 0
-
-            for example_data in examples_data:
-                scorer_data_list = []
-                for raw_scorer_data in example_data.get("scorer_data", []):
-                    scorer_data = ScorerData(**raw_scorer_data)
-                    scorer_data_list.append(scorer_data)
-                    scorer_data_count += 1
-
-                example = Example(**example_data)
-
-                success = all(scorer_data.success for scorer_data in scorer_data_list)
+            scoring_result_list = []
+            for res in results_response.get("results", []):
+                example = res.get("data", {}).copy()
+                example["example_id"] = res.get("example_id")
                 scoring_result = ScoringResult(
-
-
+                    scorers_data=res.get("scorers", []),
+                    success=all(
+                        t.get("success", False) for t in res.get("scorers", [])
+                    ),
                     data_object=example,
                 )
-
-
-            if scorer_data_count != expected_scorer_data_count:
-                time.sleep(poll_interval_seconds)
-                continue
+                scoring_result_list.append(scoring_result)
 
-            return
+            return scoring_result_list, url
         except Exception as e:
             exception_count += 1
             if isinstance(e, JudgmentAPIError):
@@ -294,10 +285,8 @@ def run_eval(
             else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
         )
         results, url = _poll_evaluation_until_complete(
-
-            project_name=evaluation_run.project_name,
+            evaluation_run=evaluation_run,
            judgment_api_key=judgment_api_key,
-            organization_id=evaluation_run.organization_id,
            expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
        )
    finally:
judgeval/scorers/__init__.py
CHANGED
@@ -1,25 +1,27 @@
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
 from judgeval.scorers.base_scorer import BaseScorer
 from judgeval.scorers.judgeval_scorers.api_scorers import (
     FaithfulnessScorer,
     AnswerRelevancyScorer,
     AnswerCorrectnessScorer,
     InstructionAdherenceScorer,
-
-    ToolOrderScorer,
+    TracePromptScorer,
     PromptScorer,
-    ToolDependencyScorer,
 )
 
 __all__ = [
     "APIScorerConfig",
+    "ExampleAPIScorerConfig",
+    "TraceAPIScorerConfig",
     "BaseScorer",
+    "TracePromptScorer",
     "PromptScorer",
     "FaithfulnessScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
     "InstructionAdherenceScorer",
-    "DerailmentScorer",
-    "ToolOrderScorer",
-    "ToolDependencyScorer",
 ]
judgeval/scorers/api_scorer.py
CHANGED
@@ -63,3 +63,11 @@ class APIScorerConfig(BaseModel):
 
     def __str__(self):
         return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
+
+
+class ExampleAPIScorerConfig(APIScorerConfig):
+    pass
+
+
+class TraceAPIScorerConfig(APIScorerConfig):
+    pass