judgeval 0.11.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. judgeval/__init__.py +5 -5
  2. judgeval/api/api_types.py +81 -12
  3. judgeval/cli.py +2 -1
  4. judgeval/constants.py +0 -6
  5. judgeval/data/evaluation_run.py +7 -8
  6. judgeval/data/judgment_types.py +97 -12
  7. judgeval/data/trace.py +108 -1
  8. judgeval/dataset/__init__.py +72 -23
  9. judgeval/env.py +5 -20
  10. judgeval/integrations/langgraph/__init__.py +9 -785
  11. judgeval/scorers/__init__.py +6 -0
  12. judgeval/scorers/api_scorer.py +15 -12
  13. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  14. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  15. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  16. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  17. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +26 -35
  18. judgeval/scorers/score.py +1 -1
  19. judgeval/scorers/utils.py +1 -4
  20. judgeval/tracer/__init__.py +181 -162
  21. judgeval/tracer/exporters/__init__.py +4 -1
  22. judgeval/tracer/keys.py +15 -25
  23. judgeval/tracer/llm/__init__.py +0 -1
  24. judgeval/tracer/llm/anthropic/__init__.py +20 -0
  25. judgeval/tracer/llm/google/__init__.py +21 -0
  26. judgeval/tracer/llm/groq/__init__.py +20 -0
  27. judgeval/tracer/llm/openai/__init__.py +32 -0
  28. judgeval/tracer/llm/providers.py +28 -79
  29. judgeval/tracer/llm/together/__init__.py +20 -0
  30. judgeval/tracer/managers.py +23 -48
  31. judgeval/tracer/processors/__init__.py +36 -75
  32. judgeval/tracer/utils.py +3 -4
  33. judgeval/trainer/trainer.py +4 -4
  34. judgeval/utils/file_utils.py +0 -2
  35. judgeval/utils/meta.py +18 -5
  36. judgeval/utils/testing.py +0 -14
  37. judgeval/utils/version_check.py +2 -0
  38. judgeval/version.py +1 -1
  39. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
  40. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +43 -38
  41. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
  42. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
  43. {judgeval-0.11.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -5,12 +5,12 @@ from judgeval.evaluation import run_eval
  from judgeval.data.evaluation_run import ExampleEvaluationRun


- from typing import List, Optional, Union
- from judgeval.scorers import APIScorerConfig
+ from typing import List, Optional, Union, Sequence
+ from judgeval.scorers import ExampleAPIScorerConfig
  from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.data.example import Example
  from judgeval.logger import judgeval_logger
- from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
  from judgeval.utils.meta import SingletonMeta
  from judgeval.exceptions import JudgmentRuntimeError, JudgmentTestError
  from judgeval.api import JudgmentSyncClient
@@ -39,10 +39,10 @@ class JudgmentClient(metaclass=SingletonMeta):
  def run_evaluation(
  self,
  examples: List[Example],
- scorers: List[Union[APIScorerConfig, ExampleScorer]],
+ scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
  project_name: str = "default_project",
  eval_run_name: str = "default_eval_run",
- model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+ model: Optional[str] = None,
  assert_test: bool = False,
  ) -> List[ScoringResult]:
  try:
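Note: with `model` now optional and `scorers` typed as a `Sequence`, an evaluation can be kicked off without naming a model. A minimal sketch, assuming `JUDGMENT_API_KEY`/`JUDGMENT_ORG_ID` are set in the environment and that `FaithfulnessScorer` is exported from `judgeval.scorers`; the `Example` fields are illustrative:

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # assumed export; any ExampleAPIScorerConfig works

client = JudgmentClient()  # singleton; reads credentials from the environment

example = Example(
    input="What is the capital of France?",           # illustrative fields
    actual_output="Paris is the capital of France.",
)

# `model` may now be omitted (it defaults to None), and `scorers`
# accepts any Sequence, not just a List.
results = client.run_evaluation(
    examples=[example],
    scorers=(FaithfulnessScorer(threshold=0.8),),
    project_name="default_project",
    eval_run_name="faithfulness_check",
)
```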
judgeval/api/api_types.py CHANGED
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: .openapi.json
- # timestamp: 2025-09-12T16:54:35+00:00
+ # timestamp: 2025-09-24T18:25:18+00:00

  from __future__ import annotations
  from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -52,8 +52,8 @@ class SavePromptScorerRequest(TypedDict):
  name: str
  prompt: str
  threshold: float
- options: NotRequired[Optional[Dict[str, float]]]
- is_trace: NotRequired[Optional[bool]]
+ model: NotRequired[str]
+ is_trace: NotRequired[bool]


  class SavePromptScorerResponse(TypedDict):
@@ -117,6 +117,7 @@ class ScorerConfig(TypedDict):
  score_type: str
  name: NotRequired[Optional[str]]
  threshold: NotRequired[float]
+ model: NotRequired[Optional[str]]
  strict_mode: NotRequired[bool]
  required_params: NotRequired[List[str]]
  kwargs: NotRequired[Optional[Dict[str, Any]]]
@@ -141,7 +142,7 @@ class PromptScorer(TypedDict):
  name: str
  prompt: str
  threshold: float
- options: NotRequired[Optional[Dict[str, float]]]
+ model: NotRequired[str]
  created_at: NotRequired[Optional[str]]
  updated_at: NotRequired[Optional[str]]
  is_trace: NotRequired[Optional[bool]]
@@ -189,13 +190,28 @@ class OtelTraceSpan(TypedDict):
  state_before: NotRequired[Optional[Dict[str, Any]]]


+ class OtelSpanListItemScores(TypedDict):
+ success: bool
+ score: float
+ reason: NotRequired[Optional[str]]
+ name: str
+
+
+ class OtelSpanDetailScores(TypedDict):
+ success: bool
+ score: float
+ reason: NotRequired[Optional[str]]
+ name: str
+ data: NotRequired[Optional[Dict[str, Any]]]
+
+
  class ExampleEvaluationRun(TypedDict):
  id: NotRequired[str]
  project_name: str
  eval_name: str
  custom_scorers: NotRequired[List[BaseScorer]]
  judgment_scorers: NotRequired[List[ScorerConfig]]
- model: str
+ model: NotRequired[Optional[str]]
  created_at: NotRequired[str]
  examples: List[Example]
  trace_span_id: NotRequired[Optional[str]]
@@ -212,7 +228,7 @@ class TraceEvaluationRun(TypedDict):
  eval_name: str
  custom_scorers: NotRequired[List[BaseScorer]]
  judgment_scorers: NotRequired[List[ScorerConfig]]
- model: str
+ model: NotRequired[Optional[str]]
  created_at: NotRequired[str]
  trace_and_span_ids: List[TraceAndSpanId]
  is_offline: NotRequired[bool]
@@ -224,12 +240,6 @@ class DatasetInsertExamples(TypedDict):
  project_name: str


- class DatasetReturn(TypedDict):
- name: str
- project_name: str
- examples: NotRequired[Optional[List[Example]]]
-
-
  class DatasetInfo(TypedDict):
  dataset_id: str
  name: str
@@ -261,6 +271,65 @@ class ScoringResult(TypedDict):
  evaluation_cost: NotRequired[Optional[float]]


+ class OtelTraceListItem(TypedDict):
+ organization_id: str
+ project_id: str
+ trace_id: str
+ timestamp: str
+ duration: NotRequired[Optional[int]]
+ has_notification: NotRequired[Optional[bool]]
+ tags: NotRequired[Optional[List[str]]]
+ experiment_run_id: NotRequired[Optional[str]]
+ span_name: NotRequired[Optional[str]]
+ cumulative_llm_cost: NotRequired[Optional[float]]
+ error: NotRequired[Optional[Dict[str, Any]]]
+ scores: NotRequired[List[OtelSpanListItemScores]]
+ customer_id: NotRequired[Optional[str]]
+ input_preview: NotRequired[Optional[str]]
+ output_preview: NotRequired[Optional[str]]
+ annotation_count: NotRequired[int]
+ span_id: str
+ rule_id: NotRequired[Optional[str]]
+
+
+ class OtelSpanDetail(TypedDict):
+ organization_id: str
+ project_id: str
+ timestamp: str
+ trace_id: str
+ span_id: str
+ parent_span_id: NotRequired[Optional[str]]
+ trace_state: NotRequired[Optional[str]]
+ span_name: NotRequired[Optional[str]]
+ span_kind: NotRequired[Optional[str]]
+ service_name: NotRequired[Optional[str]]
+ resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+ span_attributes: NotRequired[Optional[Dict[str, Any]]]
+ duration: NotRequired[Optional[int]]
+ status_code: NotRequired[Optional[str]]
+ status_message: NotRequired[Optional[str]]
+ events: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+ links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+ llm_cost: NotRequired[Optional[float]]
+ prompt_tokens: NotRequired[Optional[int]]
+ completion_tokens: NotRequired[Optional[int]]
+ scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
+
+
  class EvalResults(TypedDict):
  results: List[ScoringResult]
  run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+ class DatasetTraceWithSpans(TypedDict):
+ dataset_id: str
+ trace_detail: OtelTraceListItem
+ spans: List[OtelSpanDetail]
+
+
+ class DatasetReturn(TypedDict):
+ name: str
+ project_name: str
+ dataset_kind: DatasetKind
+ examples: NotRequired[Optional[List[Example]]]
+ traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
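The reshaped `DatasetReturn` gains a `dataset_kind` discriminator and an optional `traces` list of `DatasetTraceWithSpans`. A rough sketch of a payload that fits the new TypedDicts, with placeholder values and assuming `DatasetKind` admits the string values `"example"` and `"trace"`:

```python
from judgeval.api.api_types import DatasetReturn, OtelTraceListItem

# Placeholder values; real payloads come back from the Judgment API.
trace_item: OtelTraceListItem = {
    "organization_id": "org_123",
    "project_id": "proj_456",
    "trace_id": "trace_789",
    "span_id": "span_001",
    "timestamp": "2025-09-24T18:25:18+00:00",
}

dataset: DatasetReturn = {
    "name": "prod-traces",
    "project_name": "default_project",
    "dataset_kind": "trace",  # example datasets populate "examples" instead
    "traces": [
        {"dataset_id": "ds_001", "trace_detail": trace_item, "spans": []},
    ],
}
```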
judgeval/cli.py CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
  from dotenv import load_dotenv
  from judgeval.logger import judgeval_logger
  from judgeval import JudgmentClient
+ from judgeval.version import get_version

  load_dotenv()

@@ -56,7 +57,7 @@ def upload_scorer(
  @app.command()
  def version():
  """Show version info"""
- judgeval_logger.info("JudgEval CLI v0.0.0")
+ judgeval_logger.info(f"Judgeval CLI v{get_version()}")


  if __name__ == "__main__":
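The CLI now reports the installed package version instead of the hard-coded `v0.0.0`. The same helper can be called directly; a small sketch:

```python
from judgeval.version import get_version

# The CLI's `version` command now logs this value via judgeval_logger.
print(f"Judgeval CLI v{get_version()}")  # presumably "Judgeval CLI v0.13.0" for this release
```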
judgeval/constants.py CHANGED
@@ -24,7 +24,6 @@ class APIScorerType(str, Enum):

  @classmethod
  def __missing__(cls, value: str) -> APIScorerType:
- # Handle case-insensitive lookup
  for member in cls:
  if member.value == value.lower():
  return member
@@ -32,11 +31,6 @@ class APIScorerType(str, Enum):
  raise ValueError(f"Invalid scorer type: {value}")


- UNBOUNDED_SCORERS: Set[APIScorerType] = (
- set()
- ) # scorers whose scores are not bounded between 0-1
-
-
  LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)

judgeval/data/evaluation_run.py CHANGED
@@ -1,4 +1,4 @@
- from typing import List, Optional, Union, Tuple
+ from typing import List, Optional, Union, Tuple, Sequence
  from pydantic import field_validator, model_validator, Field, BaseModel
  from datetime import datetime, timezone
  import uuid
@@ -19,9 +19,11 @@ class EvaluationRun(BaseModel):
  default_factory=lambda: datetime.now(timezone.utc).isoformat()
  )
  custom_scorers: List[ExampleScorer] = Field(default_factory=list)
- judgment_scorers: List[APIScorerConfig] = Field(default_factory=list)
- scorers: List[Union[ExampleScorer, APIScorerConfig]] = Field(default_factory=list)
- model: str
+ judgment_scorers: Sequence[APIScorerConfig] = Field(default_factory=list)
+ scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
+ default_factory=list
+ )
+ model: Optional[str] = None

  def __init__(
  self,
@@ -75,11 +77,8 @@ class EvaluationRun(BaseModel):

  @field_validator("model")
  def validate_model(cls, v, values):
- if not v:
- raise ValueError("Model cannot be empty.")
-
  # Check if model is string or list of strings
- if isinstance(v, str):
+ if v is not None and isinstance(v, str):
  if v not in ACCEPTABLE_MODELS:
  raise ValueError(
  f"Model name {v} not recognized. Please select a valid model name.)"
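The validator no longer rejects a missing model; only a name that is actually provided gets checked against the accepted list. A standalone sketch of the new behavior, using a stand-in set rather than judgeval's real `ACCEPTABLE_MODELS`:

```python
from typing import Optional

ACCEPTABLE_MODELS = {"gpt-4.1", "gpt-5"}  # stand-in for judgeval's real ACCEPTABLE_MODELS

def validate_model(v: Optional[str]) -> Optional[str]:
    # Mirrors the updated field_validator: None passes through untouched,
    # while a provided string must still be a recognized model name.
    if v is not None and isinstance(v, str):
        if v not in ACCEPTABLE_MODELS:
            raise ValueError(f"Model name {v} not recognized.")
    return v

assert validate_model(None) is None
assert validate_model("gpt-5") == "gpt-5"
```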
judgeval/data/judgment_types.py CHANGED
@@ -1,6 +1,6 @@
  # generated by datamodel-codegen:
  # filename: .openapi.json
- # timestamp: 2025-09-12T16:54:34+00:00
+ # timestamp: 2025-09-24T18:25:17+00:00

  from __future__ import annotations
  from typing import Annotated, Any, Dict, List, Optional, Union
@@ -54,8 +54,8 @@ class SavePromptScorerRequest(BaseModel):
  name: Annotated[str, Field(title="Name")]
  prompt: Annotated[str, Field(title="Prompt")]
  threshold: Annotated[float, Field(title="Threshold")]
- options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
- is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = None
+ model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+ is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False


  class SavePromptScorerResponse(BaseModel):
@@ -125,6 +125,7 @@ class ScorerConfig(BaseModel):
  score_type: Annotated[str, Field(title="Score Type")]
  name: Annotated[Optional[str], Field(title="Name")] = None
  threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+ model: Annotated[Optional[str], Field(title="Model")] = None
  strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
  required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = []
  kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
@@ -154,7 +155,7 @@ class PromptScorer(BaseModel):
  name: Annotated[str, Field(title="Name")]
  prompt: Annotated[str, Field(title="Prompt")]
  threshold: Annotated[float, Field(title="Threshold")]
- options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+ model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
  created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
  updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
  is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
@@ -212,6 +213,21 @@ class OtelTraceSpan(BaseModel):
  )


+ class OtelSpanListItemScores(BaseModel):
+ success: Annotated[bool, Field(title="Success")]
+ score: Annotated[float, Field(title="Score")]
+ reason: Annotated[Optional[str], Field(title="Reason")] = None
+ name: Annotated[str, Field(title="Name")]
+
+
+ class OtelSpanDetailScores(BaseModel):
+ success: Annotated[bool, Field(title="Success")]
+ score: Annotated[float, Field(title="Score")]
+ reason: Annotated[Optional[str], Field(title="Reason")] = None
+ name: Annotated[str, Field(title="Name")]
+ data: Annotated[Optional[Dict[str, Any]], Field(title="Data")] = None
+
+
  class ExampleEvaluationRun(BaseModel):
  id: Annotated[Optional[str], Field(title="Id")] = None
  project_name: Annotated[str, Field(title="Project Name")]
@@ -222,7 +238,7 @@ class ExampleEvaluationRun(BaseModel):
  judgment_scorers: Annotated[
  Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
  ] = []
- model: Annotated[str, Field(title="Model")]
+ model: Annotated[Optional[str], Field(title="Model")] = None
  created_at: Annotated[Optional[str], Field(title="Created At")] = None
  examples: Annotated[List[Example], Field(title="Examples")]
  trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
@@ -243,7 +259,7 @@ class TraceEvaluationRun(BaseModel):
  judgment_scorers: Annotated[
  Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
  ] = []
- model: Annotated[str, Field(title="Model")]
+ model: Annotated[Optional[str], Field(title="Model")] = None
  created_at: Annotated[Optional[str], Field(title="Created At")] = None
  trace_and_span_ids: Annotated[
  List[TraceAndSpanId], Field(title="Trace And Span Ids")
@@ -257,12 +273,6 @@ class DatasetInsertExamples(BaseModel):
  project_name: Annotated[str, Field(title="Project Name")]


- class DatasetReturn(BaseModel):
- name: Annotated[str, Field(title="Name")]
- project_name: Annotated[str, Field(title="Project Name")]
- examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
-
-
  class DatasetInfo(BaseModel):
  dataset_id: Annotated[str, Field(title="Dataset Id")]
  name: Annotated[str, Field(title="Name")]
@@ -296,6 +306,81 @@ class ScoringResult(BaseModel):
  evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None


+ class OtelTraceListItem(BaseModel):
+ organization_id: Annotated[str, Field(title="Organization Id")]
+ project_id: Annotated[str, Field(title="Project Id")]
+ trace_id: Annotated[str, Field(title="Trace Id")]
+ timestamp: Annotated[str, Field(title="Timestamp")]
+ duration: Annotated[Optional[int], Field(title="Duration")] = None
+ has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
+ tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
+ experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
+ span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+ cumulative_llm_cost: Annotated[
+ Optional[float], Field(title="Cumulative Llm Cost")
+ ] = None
+ error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+ scores: Annotated[
+ Optional[List[OtelSpanListItemScores]], Field(title="Scores")
+ ] = []
+ customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+ input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
+ output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
+ annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
+ span_id: Annotated[str, Field(title="Span Id")]
+ rule_id: Annotated[Optional[str], Field(title="Rule Id")] = None
+
+
+ class OtelSpanDetail(BaseModel):
+ organization_id: Annotated[str, Field(title="Organization Id")]
+ project_id: Annotated[str, Field(title="Project Id")]
+ timestamp: Annotated[str, Field(title="Timestamp")]
+ trace_id: Annotated[str, Field(title="Trace Id")]
+ span_id: Annotated[str, Field(title="Span Id")]
+ parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+ trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+ span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+ span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+ service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+ resource_attributes: Annotated[
+ Optional[Dict[str, Any]], Field(title="Resource Attributes")
+ ] = None
+ span_attributes: Annotated[
+ Optional[Dict[str, Any]], Field(title="Span Attributes")
+ ] = None
+ duration: Annotated[Optional[int], Field(title="Duration")] = None
+ status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+ status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+ events: Annotated[
+ Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
+ ] = None
+ links: Annotated[
+ Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
+ ] = None
+ llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+ prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
+ completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
+ scores: Annotated[Optional[List[OtelSpanDetailScores]], Field(title="Scores")] = (
+ None
+ )
+
+
  class EvalResults(BaseModel):
  results: Annotated[List[ScoringResult], Field(title="Results")]
  run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
+
+
+ class DatasetTraceWithSpans(BaseModel):
+ dataset_id: Annotated[str, Field(title="Dataset Id")]
+ trace_detail: OtelTraceListItem
+ spans: Annotated[List[OtelSpanDetail], Field(title="Spans")]
+
+
+ class DatasetReturn(BaseModel):
+ name: Annotated[str, Field(title="Name")]
+ project_name: Annotated[str, Field(title="Project Name")]
+ dataset_kind: DatasetKind
+ examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
+ traces: Annotated[Optional[List[DatasetTraceWithSpans]], Field(title="Traces")] = (
+ None
+ )
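The generated Pydantic counterparts mirror the TypedDicts above; on `OtelSpanDetail` only the five identifier fields are required and everything else defaults to `None`. A brief sketch with placeholder values:

```python
from judgeval.data.judgment_types import OtelSpanDetail, OtelSpanDetailScores

span = OtelSpanDetail(
    organization_id="org_123",
    project_id="proj_456",
    timestamp="2025-09-24T18:25:17+00:00",
    trace_id="trace_789",
    span_id="span_001",
    scores=[OtelSpanDetailScores(success=True, score=0.92, name="faithfulness")],
)
print(span.model_dump(exclude_none=True))
```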
judgeval/data/trace.py CHANGED
@@ -1,5 +1,10 @@
- from typing import Optional
+ from typing import Optional, List, Dict, Any
  from pydantic import BaseModel
+ from .judgment_types import (
+ OtelSpanDetailScores,
+ OtelSpanDetail,
+ OtelTraceListItem,
+ )


  class TraceUsage(BaseModel):
@@ -12,3 +17,105 @@ class TraceUsage(BaseModel):
  completion_tokens_cost_usd: Optional[float] = None
  total_cost_usd: Optional[float] = None
  model_name: Optional[str] = None
+
+
+ class TraceScore(OtelSpanDetailScores):
+ """Score information for a trace or span."""
+
+ pass
+
+
+ class TraceRule(BaseModel):
+ """Rule that was triggered for a trace."""
+
+ rule_id: str
+ rule_name: str
+
+
+ class TraceSpan(OtelSpanDetail):
+ """Individual span within a trace with complete telemetry data."""
+
+ @classmethod
+ def from_otel_span_detail(cls, span_detail: OtelSpanDetail) -> "TraceSpan":
+ """Create TraceSpan from OtelSpanDetail, converting scores to TraceScore."""
+ data = span_detail.model_dump()
+
+ if "scores" in data and data["scores"]:
+ data["scores"] = [TraceScore(**score) for score in data["scores"]]
+
+ return cls(**data)
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert TraceSpan to dictionary."""
+ return self.model_dump(exclude_none=True)
+
+
+ class Trace(OtelTraceListItem):
+ """Complete trace with metadata and all associated spans."""
+
+ spans: List[TraceSpan] = []
+ rules: Optional[List[TraceRule]] = []
+
+ @classmethod
+ def from_dataset_trace_with_spans(cls, dataset_trace: Any) -> "Trace":
+ """Create Trace from DatasetTraceWithSpans (handles both API and judgment types)."""
+
+ if hasattr(dataset_trace, "trace_detail"):
+ trace_detail = dataset_trace.trace_detail
+ spans_data = dataset_trace.spans
+ else:
+ trace_detail = dataset_trace.get("trace_detail", {})
+ spans_data = dataset_trace.get("spans", [])
+
+ if hasattr(trace_detail, "model_dump"):
+ trace_data = trace_detail.model_dump()
+ elif isinstance(trace_detail, dict):
+ trace_data = trace_detail.copy()
+ else:
+ trace_data = dict(trace_detail)
+
+ spans = []
+ for span in spans_data:
+ if hasattr(span, "model_dump"):
+ spans.append(TraceSpan.from_otel_span_detail(span))
+ else:
+ # Handle dict spans
+ span_data = dict(span) if not isinstance(span, dict) else span.copy()
+ if "scores" in span_data and span_data["scores"]:
+ span_data["scores"] = [
+ TraceScore(**score)
+ if isinstance(score, dict)
+ else TraceScore(**score.model_dump())
+ for score in span_data["scores"]
+ ]
+ spans.append(TraceSpan(**span_data))
+
+ rules = []
+ if "rule_id" in trace_data and trace_data["rule_id"]:
+ rules = [
+ TraceRule(
+ rule_id=trace_data["rule_id"],
+ rule_name=f"Rule {trace_data['rule_id']}",
+ )
+ ]
+
+ trace_data.pop("scores", [])
+ trace_data.pop("rule_id", None)
+ trace = cls(**trace_data)
+
+ trace.spans = spans
+ trace.rules = rules
+
+ return trace
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert Trace to dictionary."""
+ return self.model_dump(exclude_none=True)
+
+ def __len__(self) -> int:
+ """Return the number of spans in the trace."""
+ return len(self.spans)
+
+ def __iter__(self):
+ """Iterate over spans in the trace."""
+ return iter(self.spans)
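The new `Trace`/`TraceSpan` wrappers accept either the generated Pydantic objects or plain dicts. A sketch that feeds `from_dataset_trace_with_spans` a dict shaped like `DatasetTraceWithSpans`; all values are placeholders:

```python
from judgeval.data.trace import Trace

payload = {
    "trace_detail": {
        "organization_id": "org_123",
        "project_id": "proj_456",
        "trace_id": "trace_789",
        "span_id": "span_001",
        "timestamp": "2025-09-24T18:25:18+00:00",
        "rule_id": "rule_42",  # becomes a TraceRule on the resulting Trace
    },
    "spans": [
        {
            "organization_id": "org_123",
            "project_id": "proj_456",
            "timestamp": "2025-09-24T18:25:18+00:00",
            "trace_id": "trace_789",
            "span_id": "span_001",
            "scores": [{"success": True, "score": 0.9, "name": "relevancy"}],
        }
    ],
}

trace = Trace.from_dataset_trace_with_spans(payload)
print(len(trace))            # 1, via Trace.__len__
for span in trace:           # Trace.__iter__ yields TraceSpan objects
    print(span.to_dict()["span_id"])
```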
judgeval/dataset/__init__.py CHANGED
@@ -3,15 +3,16 @@ import orjson
  import os
  import yaml
  from dataclasses import dataclass
- from typing import List, Literal
+ from typing import List, Literal, Optional

  from judgeval.data import Example
+ from judgeval.data.trace import Trace
  from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
  from judgeval.api import JudgmentSyncClient
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

- from judgeval.api.api_types import DatasetKind
+ from judgeval.data.judgment_types import DatasetKind


  @dataclass
@@ -26,9 +27,11 @@ class DatasetInfo:

  @dataclass
  class Dataset:
- examples: List[Example]
  name: str
  project_name: str
+ dataset_kind: DatasetKind = DatasetKind.example
+ examples: Optional[List[Example]] = None
+ traces: Optional[List[Trace]] = None
  judgment_api_key: str = JUDGMENT_API_KEY or ""
  organization_id: str = JUDGMENT_ORG_ID or ""

@@ -47,22 +50,49 @@ class Dataset:
  )
  if not dataset:
  raise ValueError(f"Dataset {name} not found in project {project_name}")
- examples = dataset.get("examples", [])
- if examples is None:
- examples = []

- for e in examples:
- if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
- e.update(e.pop("data")) # type: ignore
- e.pop(
- "example_id"
- ) # TODO: remove once scorer data migraiton is complete
- judgeval_logger.info(f"Successfully retrieved dataset {name}!")
- return cls(
- name=name,
- project_name=project_name,
- examples=[Example(**e) for e in examples],
- )
+ dataset_kind = DatasetKind(dataset.get("dataset_kind", "example"))
+
+ if dataset_kind == DatasetKind.example:
+ examples = dataset.get("examples", [])
+ if examples is None:
+ examples = []
+
+ for e in examples:
+ if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+ e.update(e.pop("data")) # type: ignore
+ e.pop(
+ "example_id"
+ ) # TODO: remove once scorer data migration is complete
+ judgeval_logger.info(f"Successfully retrieved example dataset {name}!")
+ return cls(
+ name=name,
+ project_name=project_name,
+ dataset_kind=dataset_kind,
+ examples=[Example(**e) for e in examples],
+ )
+
+ elif dataset_kind == DatasetKind.trace:
+ trace_data = dataset.get("traces", [])
+ if trace_data is None:
+ trace_data = []
+
+ traces = []
+ for trace_item in trace_data:
+ if isinstance(trace_item, dict):
+ trace = Trace.from_dataset_trace_with_spans(trace_item)
+ traces.append(trace)
+
+ judgeval_logger.info(f"Successfully retrieved trace dataset {name}!")
+ return cls(
+ name=name,
+ project_name=project_name,
+ dataset_kind=dataset_kind,
+ traces=traces,
+ )
+
+ else:
+ raise ValueError(f"Unsupported dataset kind: {dataset_kind}")

  @classmethod
  def create(
@@ -179,7 +209,9 @@ class Dataset:
  file.write(
  orjson.dumps(
  {
- "examples": [e.to_dict() for e in self.examples],
+ "examples": [e.to_dict() for e in self.examples]
+ if self.examples
+ else [],
  },
  option=orjson.OPT_INDENT_2,
  )
@@ -187,7 +219,9 @@ class Dataset:
  elif file_type == "yaml":
  with open(complete_path, "w") as file:
  yaml_data = {
- "examples": [e.to_dict() for e in self.examples],
+ "examples": [e.to_dict() for e in self.examples]
+ if self.examples
+ else [],
  }
  yaml.dump(yaml_data, file, default_flow_style=False)
  else:
@@ -197,10 +231,25 @@ class Dataset:
  )

  def __iter__(self):
- return iter(self.examples)
+ if self.dataset_kind == DatasetKind.example and self.examples:
+ return iter(self.examples)
+ elif self.dataset_kind == DatasetKind.trace and self.traces:
+ return iter(self.traces)
+ else:
+ return iter([])

  def __len__(self):
- return len(self.examples)
+ if self.dataset_kind == DatasetKind.example and self.examples:
+ return len(self.examples)
+ elif self.dataset_kind == DatasetKind.trace and self.traces:
+ return len(self.traces)
+ else:
+ return 0

  def __str__(self):
- return f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
+ if self.dataset_kind == DatasetKind.example:
+ return (
+ f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
+ )
+ else:
+ return f"{self.__class__.__name__}(traces={self.traces}, name={self.name})"
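Taken together, a retrieved dataset now branches on `dataset_kind`: example datasets still populate `examples`, while trace datasets populate `traces` and iterate over `Trace` objects. A sketch, assuming the retrieval classmethod in the hunk above is exposed as `Dataset.get` and that a trace-kind dataset with this name exists in the project:

```python
from judgeval.dataset import Dataset
from judgeval.data.judgment_types import DatasetKind

dataset = Dataset.get(name="prod-traces", project_name="default_project")  # assumed method name

if dataset.dataset_kind == DatasetKind.trace:
    print(f"{len(dataset)} traces")   # __len__ counts traces for trace datasets
    for trace in dataset:             # __iter__ yields Trace objects here
        print(trace.trace_id, len(trace.spans))
else:
    for example in dataset:           # example datasets behave as before
        print(example)
```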