judgeval 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -2
- judgeval/api/api_types.py +81 -12
- judgeval/cli.py +2 -1
- judgeval/constants.py +0 -6
- judgeval/data/evaluation_run.py +2 -5
- judgeval/data/judgment_types.py +97 -12
- judgeval/data/trace.py +108 -1
- judgeval/dataset/__init__.py +72 -23
- judgeval/env.py +5 -20
- judgeval/integrations/langgraph/__init__.py +9 -785
- judgeval/scorers/api_scorer.py +7 -12
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -8
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -12
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +22 -33
- judgeval/scorers/score.py +1 -1
- judgeval/scorers/utils.py +1 -4
- judgeval/tracer/__init__.py +175 -156
- judgeval/tracer/exporters/__init__.py +4 -1
- judgeval/tracer/keys.py +15 -25
- judgeval/tracer/llm/__init__.py +0 -1
- judgeval/tracer/llm/anthropic/__init__.py +20 -0
- judgeval/tracer/llm/google/__init__.py +21 -0
- judgeval/tracer/llm/groq/__init__.py +20 -0
- judgeval/tracer/llm/openai/__init__.py +32 -0
- judgeval/tracer/llm/providers.py +28 -79
- judgeval/tracer/llm/together/__init__.py +20 -0
- judgeval/tracer/managers.py +23 -48
- judgeval/tracer/processors/__init__.py +36 -75
- judgeval/tracer/utils.py +1 -2
- judgeval/utils/file_utils.py +0 -2
- judgeval/utils/meta.py +18 -5
- judgeval/utils/testing.py +0 -14
- judgeval/utils/version_check.py +2 -0
- judgeval/version.py +1 -1
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +40 -35
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED
```diff
@@ -10,7 +10,7 @@ from judgeval.scorers import ExampleAPIScorerConfig
 from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
-from judgeval.env import JUDGMENT_API_KEY,
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
 from judgeval.utils.meta import SingletonMeta
 from judgeval.exceptions import JudgmentRuntimeError, JudgmentTestError
 from judgeval.api import JudgmentSyncClient
@@ -42,7 +42,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-        model: str =
+        model: Optional[str] = None,
         assert_test: bool = False,
     ) -> List[ScoringResult]:
         try:
```
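The net effect of this hunk is that the client's evaluation call no longer requires a model. A minimal, hedged sketch follows; the method name (`run_evaluation`) and the `examples` parameter are assumptions, since this hunk only shows the `scorers`, `project_name`, `eval_run_name`, `model`, and `assert_test` parameters and the `List[ScoringResult]` return type.

```python
# Hedged sketch; only the parameters visible in the hunk above are confirmed by this diff.
from judgeval import JudgmentClient

client = JudgmentClient()  # singleton (SingletonMeta); reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID

examples = [...]  # placeholder: judgeval Example objects go here
scorers = [...]   # placeholder: ExampleAPIScorerConfig and/or ExampleScorer instances

results = client.run_evaluation(   # assumed method name; not visible in the hunk
    examples=examples,             # assumed parameter
    scorers=scorers,
    project_name="default_project",
    eval_run_name="default_eval_run",
    assert_test=False,
    # model="gpt-5" may still be passed explicitly; omitting it now defaults to None
)
```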
judgeval/api/api_types.py
CHANGED
```diff
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-24T18:25:18+00:00

 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -52,8 +52,8 @@ class SavePromptScorerRequest(TypedDict):
     name: str
     prompt: str
     threshold: float
-
-    is_trace: NotRequired[
+    model: NotRequired[str]
+    is_trace: NotRequired[bool]


 class SavePromptScorerResponse(TypedDict):
@@ -117,6 +117,7 @@ class ScorerConfig(TypedDict):
     score_type: str
     name: NotRequired[Optional[str]]
     threshold: NotRequired[float]
+    model: NotRequired[Optional[str]]
     strict_mode: NotRequired[bool]
     required_params: NotRequired[List[str]]
     kwargs: NotRequired[Optional[Dict[str, Any]]]
@@ -141,7 +142,7 @@ class PromptScorer(TypedDict):
     name: str
     prompt: str
     threshold: float
-
+    model: NotRequired[str]
     created_at: NotRequired[Optional[str]]
     updated_at: NotRequired[Optional[str]]
     is_trace: NotRequired[Optional[bool]]
@@ -189,13 +190,28 @@ class OtelTraceSpan(TypedDict):
     state_before: NotRequired[Optional[Dict[str, Any]]]


+class OtelSpanListItemScores(TypedDict):
+    success: bool
+    score: float
+    reason: NotRequired[Optional[str]]
+    name: str
+
+
+class OtelSpanDetailScores(TypedDict):
+    success: bool
+    score: float
+    reason: NotRequired[Optional[str]]
+    name: str
+    data: NotRequired[Optional[Dict[str, Any]]]
+
+
 class ExampleEvaluationRun(TypedDict):
     id: NotRequired[str]
     project_name: str
     eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
-    model: str
+    model: NotRequired[Optional[str]]
     created_at: NotRequired[str]
     examples: List[Example]
     trace_span_id: NotRequired[Optional[str]]
@@ -212,7 +228,7 @@ class TraceEvaluationRun(TypedDict):
     eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
-    model: str
+    model: NotRequired[Optional[str]]
     created_at: NotRequired[str]
     trace_and_span_ids: List[TraceAndSpanId]
     is_offline: NotRequired[bool]
@@ -224,12 +240,6 @@ class DatasetInsertExamples(TypedDict):
     project_name: str


-class DatasetReturn(TypedDict):
-    name: str
-    project_name: str
-    examples: NotRequired[Optional[List[Example]]]
-
-
 class DatasetInfo(TypedDict):
     dataset_id: str
     name: str
@@ -261,6 +271,65 @@ class ScoringResult(TypedDict):
     evaluation_cost: NotRequired[Optional[float]]


+class OtelTraceListItem(TypedDict):
+    organization_id: str
+    project_id: str
+    trace_id: str
+    timestamp: str
+    duration: NotRequired[Optional[int]]
+    has_notification: NotRequired[Optional[bool]]
+    tags: NotRequired[Optional[List[str]]]
+    experiment_run_id: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    cumulative_llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[Optional[Dict[str, Any]]]
+    scores: NotRequired[List[OtelSpanListItemScores]]
+    customer_id: NotRequired[Optional[str]]
+    input_preview: NotRequired[Optional[str]]
+    output_preview: NotRequired[Optional[str]]
+    annotation_count: NotRequired[int]
+    span_id: str
+    rule_id: NotRequired[Optional[str]]
+
+
+class OtelSpanDetail(TypedDict):
+    organization_id: str
+    project_id: str
+    timestamp: str
+    trace_id: str
+    span_id: str
+    parent_span_id: NotRequired[Optional[str]]
+    trace_state: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    span_kind: NotRequired[Optional[str]]
+    service_name: NotRequired[Optional[str]]
+    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+    span_attributes: NotRequired[Optional[Dict[str, Any]]]
+    duration: NotRequired[Optional[int]]
+    status_code: NotRequired[Optional[str]]
+    status_message: NotRequired[Optional[str]]
+    events: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+    links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+    llm_cost: NotRequired[Optional[float]]
+    prompt_tokens: NotRequired[Optional[int]]
+    completion_tokens: NotRequired[Optional[int]]
+    scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
+
+
 class EvalResults(TypedDict):
     results: List[ScoringResult]
     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+class DatasetTraceWithSpans(TypedDict):
+    dataset_id: str
+    trace_detail: OtelTraceListItem
+    spans: List[OtelSpanDetail]
+
+
+class DatasetReturn(TypedDict):
+    name: str
+    project_name: str
+    dataset_kind: DatasetKind
+    examples: NotRequired[Optional[List[Example]]]
+    traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
```
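A small sketch of how the new trace-dataset TypedDicts nest. Field values are illustrative; only keys defined in the diff above are used, and TypedDicts are plain dicts at runtime.

```python
from judgeval.api.api_types import (
    DatasetTraceWithSpans,
    OtelSpanDetail,
    OtelTraceListItem,
)

# Illustrative values only.
trace_detail: OtelTraceListItem = {
    "organization_id": "org_123",
    "project_id": "proj_456",
    "trace_id": "trace_789",
    "span_id": "span_001",
    "timestamp": "2025-09-24T18:25:18+00:00",
}
span: OtelSpanDetail = {
    "organization_id": "org_123",
    "project_id": "proj_456",
    "timestamp": "2025-09-24T18:25:18+00:00",
    "trace_id": "trace_789",
    "span_id": "span_001",
    "span_name": "llm_call",
    "llm_cost": 0.0021,
}
entry: DatasetTraceWithSpans = {
    "dataset_id": "ds_001",
    "trace_detail": trace_detail,
    "spans": [span],
}
```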
judgeval/cli.py
CHANGED
```diff
@@ -5,6 +5,7 @@ from pathlib import Path
 from dotenv import load_dotenv
 from judgeval.logger import judgeval_logger
 from judgeval import JudgmentClient
+from judgeval.version import get_version

 load_dotenv()

@@ -56,7 +57,7 @@ def upload_scorer(
 @app.command()
 def version():
     """Show version info"""
-    judgeval_logger.info("
+    judgeval_logger.info(f"Judgeval CLI v{get_version()}")


 if __name__ == "__main__":
```
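The `version` command now reports the installed package version; the same helper can be called directly (assuming `get_version()` returns a version string, as the log line above implies).

```python
from judgeval.version import get_version

# Mirrors the message logged by the CLI's version command in 0.13.0.
print(f"Judgeval CLI v{get_version()}")
```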
judgeval/constants.py
CHANGED
```diff
@@ -24,7 +24,6 @@ class APIScorerType(str, Enum):

     @classmethod
     def __missing__(cls, value: str) -> APIScorerType:
-        # Handle case-insensitive lookup
         for member in cls:
             if member.value == value.lower():
                 return member
@@ -32,11 +31,6 @@ class APIScorerType(str, Enum):
         raise ValueError(f"Invalid scorer type: {value}")


-UNBOUNDED_SCORERS: Set[APIScorerType] = (
-    set()
-)  # scorers whose scores are not bounded between 0-1
-
-
 LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)


```
judgeval/data/evaluation_run.py
CHANGED
```diff
@@ -23,7 +23,7 @@ class EvaluationRun(BaseModel):
     scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
         default_factory=list
     )
-    model: str
+    model: Optional[str] = None

     def __init__(
         self,
@@ -77,11 +77,8 @@ class EvaluationRun(BaseModel):

     @field_validator("model")
     def validate_model(cls, v, values):
-        if not v:
-            raise ValueError("Model cannot be empty.")
-
         # Check if model is string or list of strings
-        if isinstance(v, str):
+        if v is not None and isinstance(v, str):
             if v not in ACCEPTABLE_MODELS:
                 raise ValueError(
                     f"Model name {v} not recognized. Please select a valid model name.)"
```
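A standalone mimic of the relaxed check (not the library's code): `None` now passes, while unknown model strings are still rejected. The real `ACCEPTABLE_MODELS` set is derived from litellm; a small stand-in is used here.

```python
ACCEPTABLE_MODELS = {"gpt-4.1", "gpt-5"}  # stand-in for the litellm-derived set


def validate_model(v):
    # Mirrors the 0.13.0 branch above: only validate when a string is actually given.
    if v is not None and isinstance(v, str):
        if v not in ACCEPTABLE_MODELS:
            raise ValueError(
                f"Model name {v} not recognized. Please select a valid model name.)"
            )
    return v


validate_model(None)     # accepted in 0.13.0; 0.12.0 raised "Model cannot be empty."
validate_model("gpt-5")  # accepted; unrecognized names still raise ValueError
```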
judgeval/data/judgment_types.py
CHANGED
```diff
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-09-
+#   timestamp: 2025-09-24T18:25:17+00:00

 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -54,8 +54,8 @@ class SavePromptScorerRequest(BaseModel):
     name: Annotated[str, Field(title="Name")]
     prompt: Annotated[str, Field(title="Prompt")]
     threshold: Annotated[float, Field(title="Threshold")]
-
-    is_trace: Annotated[Optional[bool], Field(title="Is Trace")] =
+    model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+    is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False


 class SavePromptScorerResponse(BaseModel):
@@ -125,6 +125,7 @@ class ScorerConfig(BaseModel):
     score_type: Annotated[str, Field(title="Score Type")]
     name: Annotated[Optional[str], Field(title="Name")] = None
     threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+    model: Annotated[Optional[str], Field(title="Model")] = None
     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
     required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = []
     kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
@@ -154,7 +155,7 @@ class PromptScorer(BaseModel):
     name: Annotated[str, Field(title="Name")]
     prompt: Annotated[str, Field(title="Prompt")]
     threshold: Annotated[float, Field(title="Threshold")]
-
+    model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
     created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
     updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
@@ -212,6 +213,21 @@ class OtelTraceSpan(BaseModel):
     )


+class OtelSpanListItemScores(BaseModel):
+    success: Annotated[bool, Field(title="Success")]
+    score: Annotated[float, Field(title="Score")]
+    reason: Annotated[Optional[str], Field(title="Reason")] = None
+    name: Annotated[str, Field(title="Name")]
+
+
+class OtelSpanDetailScores(BaseModel):
+    success: Annotated[bool, Field(title="Success")]
+    score: Annotated[float, Field(title="Score")]
+    reason: Annotated[Optional[str], Field(title="Reason")] = None
+    name: Annotated[str, Field(title="Name")]
+    data: Annotated[Optional[Dict[str, Any]], Field(title="Data")] = None
+
+
 class ExampleEvaluationRun(BaseModel):
     id: Annotated[Optional[str], Field(title="Id")] = None
     project_name: Annotated[str, Field(title="Project Name")]
@@ -222,7 +238,7 @@ class ExampleEvaluationRun(BaseModel):
     judgment_scorers: Annotated[
         Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
     ] = []
-    model: Annotated[str, Field(title="Model")]
+    model: Annotated[Optional[str], Field(title="Model")] = None
     created_at: Annotated[Optional[str], Field(title="Created At")] = None
     examples: Annotated[List[Example], Field(title="Examples")]
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
@@ -243,7 +259,7 @@ class TraceEvaluationRun(BaseModel):
     judgment_scorers: Annotated[
         Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
     ] = []
-    model: Annotated[str, Field(title="Model")]
+    model: Annotated[Optional[str], Field(title="Model")] = None
     created_at: Annotated[Optional[str], Field(title="Created At")] = None
     trace_and_span_ids: Annotated[
         List[TraceAndSpanId], Field(title="Trace And Span Ids")
@@ -257,12 +273,6 @@ class DatasetInsertExamples(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


-class DatasetReturn(BaseModel):
-    name: Annotated[str, Field(title="Name")]
-    project_name: Annotated[str, Field(title="Project Name")]
-    examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
-
-
 class DatasetInfo(BaseModel):
     dataset_id: Annotated[str, Field(title="Dataset Id")]
     name: Annotated[str, Field(title="Name")]
@@ -296,6 +306,81 @@ class ScoringResult(BaseModel):
     evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None


+class OtelTraceListItem(BaseModel):
+    organization_id: Annotated[str, Field(title="Organization Id")]
+    project_id: Annotated[str, Field(title="Project Id")]
+    trace_id: Annotated[str, Field(title="Trace Id")]
+    timestamp: Annotated[str, Field(title="Timestamp")]
+    duration: Annotated[Optional[int], Field(title="Duration")] = None
+    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
+    tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
+    experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
+    span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+    cumulative_llm_cost: Annotated[
+        Optional[float], Field(title="Cumulative Llm Cost")
+    ] = None
+    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    scores: Annotated[
+        Optional[List[OtelSpanListItemScores]], Field(title="Scores")
+    ] = []
+    customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+    input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
+    output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
+    annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
+    span_id: Annotated[str, Field(title="Span Id")]
+    rule_id: Annotated[Optional[str], Field(title="Rule Id")] = None
+
+
+class OtelSpanDetail(BaseModel):
+    organization_id: Annotated[str, Field(title="Organization Id")]
+    project_id: Annotated[str, Field(title="Project Id")]
+    timestamp: Annotated[str, Field(title="Timestamp")]
+    trace_id: Annotated[str, Field(title="Trace Id")]
+    span_id: Annotated[str, Field(title="Span Id")]
+    parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+    trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+    span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+    span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+    service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+    resource_attributes: Annotated[
+        Optional[Dict[str, Any]], Field(title="Resource Attributes")
+    ] = None
+    span_attributes: Annotated[
+        Optional[Dict[str, Any]], Field(title="Span Attributes")
+    ] = None
+    duration: Annotated[Optional[int], Field(title="Duration")] = None
+    status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+    status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+    events: Annotated[
+        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
+    ] = None
+    links: Annotated[
+        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
+    ] = None
+    llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+    prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
+    completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
+    scores: Annotated[Optional[List[OtelSpanDetailScores]], Field(title="Scores")] = (
+        None
+    )
+
+
 class EvalResults(BaseModel):
     results: Annotated[List[ScoringResult], Field(title="Results")]
     run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
+
+
+class DatasetTraceWithSpans(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    trace_detail: OtelTraceListItem
+    spans: Annotated[List[OtelSpanDetail], Field(title="Spans")]
+
+
+class DatasetReturn(BaseModel):
+    name: Annotated[str, Field(title="Name")]
+    project_name: Annotated[str, Field(title="Project Name")]
+    dataset_kind: DatasetKind
+    examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
+    traces: Annotated[Optional[List[DatasetTraceWithSpans]], Field(title="Traces")] = (
+        None
+    )
```
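A short sketch instantiating the new Pydantic models; values are illustrative, and only required fields plus a score list are supplied.

```python
from judgeval.data.judgment_types import (
    OtelSpanDetail,
    OtelSpanDetailScores,
    OtelTraceListItem,
)

score = OtelSpanDetailScores(success=True, score=1.0, name="faithfulness")

span = OtelSpanDetail(
    organization_id="org_123",
    project_id="proj_456",
    timestamp="2025-09-24T18:25:17+00:00",
    trace_id="trace_789",
    span_id="span_001",
    scores=[score],
)

trace_item = OtelTraceListItem(
    organization_id="org_123",
    project_id="proj_456",
    trace_id="trace_789",
    timestamp="2025-09-24T18:25:17+00:00",
    span_id="span_001",
)

print(span.model_dump(exclude_none=True))
```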
judgeval/data/trace.py
CHANGED
```diff
@@ -1,5 +1,10 @@
-from typing import Optional
+from typing import Optional, List, Dict, Any
 from pydantic import BaseModel
+from .judgment_types import (
+    OtelSpanDetailScores,
+    OtelSpanDetail,
+    OtelTraceListItem,
+)


 class TraceUsage(BaseModel):
@@ -12,3 +17,105 @@ class TraceUsage(BaseModel):
     completion_tokens_cost_usd: Optional[float] = None
     total_cost_usd: Optional[float] = None
     model_name: Optional[str] = None
+
+
+class TraceScore(OtelSpanDetailScores):
+    """Score information for a trace or span."""
+
+    pass
+
+
+class TraceRule(BaseModel):
+    """Rule that was triggered for a trace."""
+
+    rule_id: str
+    rule_name: str
+
+
+class TraceSpan(OtelSpanDetail):
+    """Individual span within a trace with complete telemetry data."""
+
+    @classmethod
+    def from_otel_span_detail(cls, span_detail: OtelSpanDetail) -> "TraceSpan":
+        """Create TraceSpan from OtelSpanDetail, converting scores to TraceScore."""
+        data = span_detail.model_dump()
+
+        if "scores" in data and data["scores"]:
+            data["scores"] = [TraceScore(**score) for score in data["scores"]]
+
+        return cls(**data)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert TraceSpan to dictionary."""
+        return self.model_dump(exclude_none=True)
+
+
+class Trace(OtelTraceListItem):
+    """Complete trace with metadata and all associated spans."""
+
+    spans: List[TraceSpan] = []
+    rules: Optional[List[TraceRule]] = []
+
+    @classmethod
+    def from_dataset_trace_with_spans(cls, dataset_trace: Any) -> "Trace":
+        """Create Trace from DatasetTraceWithSpans (handles both API and judgment types)."""
+
+        if hasattr(dataset_trace, "trace_detail"):
+            trace_detail = dataset_trace.trace_detail
+            spans_data = dataset_trace.spans
+        else:
+            trace_detail = dataset_trace.get("trace_detail", {})
+            spans_data = dataset_trace.get("spans", [])
+
+        if hasattr(trace_detail, "model_dump"):
+            trace_data = trace_detail.model_dump()
+        elif isinstance(trace_detail, dict):
+            trace_data = trace_detail.copy()
+        else:
+            trace_data = dict(trace_detail)
+
+        spans = []
+        for span in spans_data:
+            if hasattr(span, "model_dump"):
+                spans.append(TraceSpan.from_otel_span_detail(span))
+            else:
+                # Handle dict spans
+                span_data = dict(span) if not isinstance(span, dict) else span.copy()
+                if "scores" in span_data and span_data["scores"]:
+                    span_data["scores"] = [
+                        TraceScore(**score)
+                        if isinstance(score, dict)
+                        else TraceScore(**score.model_dump())
+                        for score in span_data["scores"]
+                    ]
+                spans.append(TraceSpan(**span_data))
+
+        rules = []
+        if "rule_id" in trace_data and trace_data["rule_id"]:
+            rules = [
+                TraceRule(
+                    rule_id=trace_data["rule_id"],
+                    rule_name=f"Rule {trace_data['rule_id']}",
+                )
+            ]
+
+        trace_data.pop("scores", [])
+        trace_data.pop("rule_id", None)
+        trace = cls(**trace_data)
+
+        trace.spans = spans
+        trace.rules = rules
+
+        return trace
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert Trace to dictionary."""
+        return self.model_dump(exclude_none=True)
+
+    def __len__(self) -> int:
+        """Return the number of spans in the trace."""
+        return len(self.spans)
+
+    def __iter__(self):
+        """Iterate over spans in the trace."""
+        return iter(self.spans)
```
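A hedged sketch of loading one trace-dataset entry, in the plain-dict shape the code above handles, into the new `Trace`/`TraceSpan` models; field values are illustrative.

```python
from judgeval.data.trace import Trace

payload = {
    "trace_detail": {
        "organization_id": "org_123",
        "project_id": "proj_456",
        "trace_id": "trace_789",
        "span_id": "span_001",
        "timestamp": "2025-09-24T18:25:18+00:00",
    },
    "spans": [
        {
            "organization_id": "org_123",
            "project_id": "proj_456",
            "trace_id": "trace_789",
            "span_id": "span_001",
            "timestamp": "2025-09-24T18:25:18+00:00",
            "span_name": "llm_call",
            "scores": [{"success": True, "score": 1.0, "name": "faithfulness"}],
        }
    ],
}

trace = Trace.from_dataset_trace_with_spans(payload)
print(len(trace))      # 1, via the new __len__
for span in trace:     # __iter__ yields TraceSpan objects
    print(span.span_name, span.scores)
```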
judgeval/dataset/__init__.py
CHANGED
```diff
@@ -3,15 +3,16 @@ import orjson
 import os
 import yaml
 from dataclasses import dataclass
-from typing import List, Literal
+from typing import List, Literal, Optional

 from judgeval.data import Example
+from judgeval.data.trace import Trace
 from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
 from judgeval.api import JudgmentSyncClient
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

-from judgeval.
+from judgeval.data.judgment_types import DatasetKind


 @dataclass
@@ -26,9 +27,11 @@ class DatasetInfo:

 @dataclass
 class Dataset:
-    examples: List[Example]
     name: str
     project_name: str
+    dataset_kind: DatasetKind = DatasetKind.example
+    examples: Optional[List[Example]] = None
+    traces: Optional[List[Trace]] = None
     judgment_api_key: str = JUDGMENT_API_KEY or ""
     organization_id: str = JUDGMENT_ORG_ID or ""

@@ -47,22 +50,49 @@ class Dataset:
         )
         if not dataset:
             raise ValueError(f"Dataset {name} not found in project {project_name}")
-        examples = dataset.get("examples", [])
-        if examples is None:
-            examples = []

-
-
-
-
-
-
-
-
-
-
-
-
+        dataset_kind = DatasetKind(dataset.get("dataset_kind", "example"))
+
+        if dataset_kind == DatasetKind.example:
+            examples = dataset.get("examples", [])
+            if examples is None:
+                examples = []
+
+            for e in examples:
+                if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                    e.update(e.pop("data"))  # type: ignore
+                e.pop(
+                    "example_id"
+                )  # TODO: remove once scorer data migration is complete
+            judgeval_logger.info(f"Successfully retrieved example dataset {name}!")
+            return cls(
+                name=name,
+                project_name=project_name,
+                dataset_kind=dataset_kind,
+                examples=[Example(**e) for e in examples],
+            )
+
+        elif dataset_kind == DatasetKind.trace:
+            trace_data = dataset.get("traces", [])
+            if trace_data is None:
+                trace_data = []
+
+            traces = []
+            for trace_item in trace_data:
+                if isinstance(trace_item, dict):
+                    trace = Trace.from_dataset_trace_with_spans(trace_item)
+                    traces.append(trace)
+
+            judgeval_logger.info(f"Successfully retrieved trace dataset {name}!")
+            return cls(
+                name=name,
+                project_name=project_name,
+                dataset_kind=dataset_kind,
+                traces=traces,
+            )
+
+        else:
+            raise ValueError(f"Unsupported dataset kind: {dataset_kind}")

     @classmethod
     def create(
@@ -179,7 +209,9 @@ class Dataset:
             file.write(
                 orjson.dumps(
                     {
-                        "examples": [e.to_dict() for e in self.examples]
+                        "examples": [e.to_dict() for e in self.examples]
+                        if self.examples
+                        else [],
                     },
                     option=orjson.OPT_INDENT_2,
                 )
@@ -187,7 +219,9 @@ class Dataset:
         elif file_type == "yaml":
             with open(complete_path, "w") as file:
                 yaml_data = {
-                    "examples": [e.to_dict() for e in self.examples]
+                    "examples": [e.to_dict() for e in self.examples]
+                    if self.examples
+                    else [],
                 }
                 yaml.dump(yaml_data, file, default_flow_style=False)
         else:
@@ -197,10 +231,25 @@ class Dataset:
             )

     def __iter__(self):
-
+        if self.dataset_kind == DatasetKind.example and self.examples:
+            return iter(self.examples)
+        elif self.dataset_kind == DatasetKind.trace and self.traces:
+            return iter(self.traces)
+        else:
+            return iter([])

     def __len__(self):
-
+        if self.dataset_kind == DatasetKind.example and self.examples:
+            return len(self.examples)
+        elif self.dataset_kind == DatasetKind.trace and self.traces:
+            return len(self.traces)
+        else:
+            return 0

     def __str__(self):
-
+        if self.dataset_kind == DatasetKind.example:
+            return (
+                f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
+            )
+        else:
+            return f"{self.__class__.__name__}(traces={self.traces}, name={self.name})"
```