judgeval 0.0.36__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/utils.py CHANGED
@@ -12,6 +12,7 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
 import asyncio
 import concurrent.futures
 import os
+import requests
 import pprint
 from typing import Any, Dict, List, Literal, Mapping, Optional, Union
 
@@ -96,6 +97,23 @@ def read_file(file_path: str) -> str:
     with open(file_path, "r", encoding='utf-8') as file:
         return file.read()
 
+def validate_api_key(judgment_api_key: str):
+    """
+    Validates that the user api key is valid
+    """
+    response = requests.post(
+        f"{ROOT_API}/validate_api_key/",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {judgment_api_key}",
+        },
+        json={}, # Empty body now
+        verify=True
+    )
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.json().get("detail", "Error validating API key")
 
 def fetch_together_api_response(model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None) -> str:
     """
judgeval/constants.py CHANGED
@@ -27,7 +27,7 @@ class APIScorer(str, Enum):
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
-
+    TOOL_ORDER = "tool_order"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -50,6 +50,7 @@ JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
+JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
@@ -58,6 +59,7 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
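
A small sketch of the new enum member (illustrative; it assumes the case-insensitive _missing_ hook shown above matches on lower-cased values):

# Sketch: the new APIScorer.TOOL_ORDER member.
from judgeval.constants import APIScorer

print(APIScorer.TOOL_ORDER.value)  # "tool_order"
print(APIScorer("TOOL_ORDER"))     # assumes _missing_ resolves values case-insensitively
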
judgeval/data/__init__.py CHANGED
@@ -3,6 +3,8 @@ from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 from judgeval.data.sequence import Sequence
+from judgeval.data.trace import Trace, TraceSpan
+
 
 __all__ = [
     "Example",
@@ -13,4 +15,6 @@ __all__ = [
     "ScoringResult",
     "generate_scoring_result",
     "Sequence",
+    "Trace",
+    "TraceSpan",
 ]
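
With these re-exports in place, the new models can be imported directly from the data package:

# The new trace models are now importable from judgeval.data:
from judgeval.data import Trace, TraceSpan
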
@@ -273,7 +273,6 @@ class EvalDataset:
                     None, # Example does not have comments
                     None, # Example does not have source file
                     True, # Adding an Example
-                    e.trace_id
                 ]
             )
 
@@ -295,7 +294,6 @@ class EvalDataset:
                     "comments": None, # Example does not have comments
                     "source_file": None, # Example does not have source file
                     "example": True, # Adding an Example
-                    "trace_id": e.trace_id
                 }
                 for e in self.examples
             ],
judgeval/data/example.py CHANGED
@@ -24,14 +24,14 @@ class ExampleParams(Enum):
 
 
 class Example(BaseModel):
-    input: Optional[str] = None
+    input: Optional[Union[str, Dict[str, Any]]] = None
     actual_output: Optional[Union[str, List[str]]] = None
     expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
@@ -50,8 +50,18 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if v is not None and (not v or not isinstance(v, str)):
-            raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
+        if v is not None:
+            if not isinstance(v, (str, dict)):
+                raise ValueError(f"Input must be a string or dictionary but got {v} of type {type(v)}")
+
+            # If it's a string, check that it's not empty
+            if isinstance(v, str) and not v:
+                raise ValueError(f"Input string must be non-empty but got '{v}'")
+
+            # If it's a dictionary, check that it's not empty
+            if isinstance(v, dict) and not v:
+                raise ValueError(f"Input dictionary must be non-empty but got {v}")
+
         return v
 
     @field_validator('actual_output', mode='before')
@@ -73,7 +83,21 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v
 
-    @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
+    @field_validator('expected_tools', mode='before')
+    @classmethod
+    def validate_expected_tools(cls, v):
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+
+            # Check that each item in the list is a dictionary
+            for i, item in enumerate(v):
+                if not isinstance(item, dict):
+                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', mode='before')
     @classmethod
     def validate_string_lists(cls, v, info):
         field_name = info.field_name
@@ -127,7 +151,6 @@ class Example(BaseModel):
             "example_id": self.example_id,
             "example_index": self.example_index,
             "timestamp": self.timestamp,
-            "trace_id": self.trace_id
         }
 
     def __str__(self):
@@ -144,5 +167,4 @@ class Example(BaseModel):
             f"example_id={self.example_id}, "
             f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
-            f"trace_id={self.trace_id})"
         )
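
A sketch of what the widened Example fields now accept (the field contents are illustrative placeholders; only the types and validation behaviour come from the hunks above):

# Sketch: Example now accepts a dict input and dict-shaped expected_tools.
# The values below are illustrative; only the types are taken from the diff.
from judgeval.data import Example

example = Example(
    input={"question": "What is the capital of France?", "locale": "en"},
    actual_output="Paris",
    expected_tools=[{"tool_name": "geo_lookup", "parameters": {"country": "France"}}],
)

# Per the validators above, these would now raise ValueError:
# Example(input={})                      -> empty dictionary
# Example(expected_tools=["geo_lookup"]) -> items must be dictionaries
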
judgeval/data/sequence.py CHANGED
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, Field, field_validator, model_validator
-from typing import List, Optional, Union, Any
+from typing import List, Optional, Union, Any, Dict
 from judgeval.data.example import Example
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from uuid import uuid4
@@ -12,13 +12,14 @@ class Sequence(BaseModel):
     sequence_id: str = Field(default_factory=lambda: str(uuid4()))
     name: Optional[str] = "Sequence"
     created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
-    items: List[Union["Sequence", Example]]
+    items: List[Union["Sequence", Example]] = []
     scorers: Optional[Any] = None
     parent_sequence_id: Optional[str] = None
     sequence_order: Optional[int] = 0
     root_sequence_id: Optional[str] = None
-    inputs: Optional[str] = None
-    output: Optional[str] = None
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
 
     @field_validator("scorers")
     def validate_scorer(cls, v):
@@ -1,6 +1,6 @@
 
 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Sequence
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
@@ -29,8 +29,9 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    sequences: List[Sequence]
-    model: Union[str, List[str], JudgevalJudge]
+    sequences: Optional[List[Sequence]] = None
+    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
    aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
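
A construction sketch using the updated Sequence fields (all values are placeholders; only the field names, types, and defaults come from the hunks above):

# Sketch: Sequence inputs is now a dict, output is Any, and expected_tools holds
# dicts. Every value below is an illustrative placeholder.
from judgeval.data import Example, Sequence

seq = Sequence(
    name="checkout_flow",
    inputs={"user_id": "u-123"},
    output={"status": "charged"},
    expected_tools=[{"tool_name": "charge_card"}],
    items=[Example(input="Charge the card", actual_output="Charged")],
)
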
judgeval/data/trace.py ADDED
@@ -0,0 +1,129 @@
+from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+from judgeval.evaluation_run import EvaluationRun
+import json
+from datetime import datetime, timezone
+
+class TraceSpan(BaseModel):
+    span_id: str
+    trace_id: str
+    function: Optional[str] = None
+    depth: int
+    created_at: Optional[float] = None
+    parent_span_id: Optional[str] = None
+    span_type: Optional[str] = "span"
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    duration: Optional[float] = None
+    annotation: Optional[List[Dict[str, Any]]] = None
+    evaluation_runs: Optional[List[EvaluationRun]] = []
+
+    def model_dump(self, **kwargs):
+        return {
+            "span_id": self.span_id,
+            "trace_id": self.trace_id,
+            "depth": self.depth,
+            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
+            "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
+            "inputs": self._serialize_inputs(),
+            "output": self._serialize_output(),
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
+            "parent_span_id": self.parent_span_id,
+            "function": self.function,
+            "duration": self.duration,
+            "span_type": self.span_type
+        }
+
+    def print_span(self):
+        """Print the span with proper formatting and parent relationship information."""
+        indent = " " * self.depth
+        parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
+
+    def _serialize_inputs(self) -> dict:
+        """Helper method to serialize input data safely."""
+        if self.inputs is None:
+            return {}
+
+        serialized_inputs = {}
+        for key, value in self.inputs.items():
+            if isinstance(value, BaseModel):
+                serialized_inputs[key] = value.model_dump()
+            elif isinstance(value, (list, tuple)):
+                # Handle lists/tuples of arguments
+                serialized_inputs[key] = [
+                    item.model_dump() if isinstance(item, BaseModel)
+                    else None if not self._is_json_serializable(item)
+                    else item
+                    for item in value
+                ]
+            else:
+                if self._is_json_serializable(value):
+                    serialized_inputs[key] = value
+                else:
+                    serialized_inputs[key] = self.safe_stringify(value, self.function)
+        return serialized_inputs
+
+    def _is_json_serializable(self, obj: Any) -> bool:
+        """Helper method to check if an object is JSON serializable."""
+        try:
+            json.dumps(obj)
+            return True
+        except (TypeError, OverflowError, ValueError):
+            return False
+
+    def safe_stringify(self, output, function_name):
+        """
+        Safely converts an object to a string or repr, handling serialization issues gracefully.
+        """
+        try:
+            return str(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        try:
+            return repr(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        warnings.warn(
+            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+        )
+        return None
+
+    def _serialize_output(self) -> Any:
+        """Helper method to serialize output data safely."""
+        if self.output is None:
+            return None
+
+        def serialize_value(value):
+            if isinstance(value, BaseModel):
+                return value.model_dump()
+            elif isinstance(value, dict):
+                # Recursively serialize dictionary values
+                return {k: serialize_value(v) for k, v in value.items()}
+            elif isinstance(value, (list, tuple)):
+                # Recursively serialize list/tuple items
+                return [serialize_value(item) for item in value]
+            else:
+                # Try direct JSON serialization first
+                try:
+                    json.dumps(value)
+                    return value
+                except (TypeError, OverflowError, ValueError):
+                    # Fallback to safe stringification
+                    return self.safe_stringify(value, self.function)
+
+        # Start serialization with the top-level output
+        return serialize_value(self.output)
+
+class Trace(BaseModel):
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    entries: List[TraceSpan]
+    overwrite: bool = False
+    rules: Optional[Dict[str, Any]] = None
+    has_notification: Optional[bool] = False
+
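
A short usage sketch of the new models (every value is a placeholder; note that safe_stringify calls warnings.warn even though the new module does not import warnings, so that fallback path would need the import to succeed):

# Sketch: building a TraceSpan and a Trace from the new module. All values
# below are placeholders; only the field names and types come from the file above.
import time
import uuid
from judgeval.data import Trace, TraceSpan

span = TraceSpan(
    span_id=str(uuid.uuid4()),
    trace_id=str(uuid.uuid4()),
    function="fetch_user",
    depth=1,
    created_at=time.time(),
    inputs={"user_id": "u-123"},
    output={"name": "Ada"},
    duration=0.42,
)
span.print_span()            # prints "→ fetch_user (id: ...)" indented by depth
payload = span.model_dump()  # inputs/output pass through the safe serializers

trace = Trace(
    trace_id=span.trace_id,
    name="example-trace",
    created_at="20250101_000000",
    duration=0.42,
    entries=[span],
)
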
@@ -31,7 +31,7 @@ class EvaluationRun(BaseModel):
     eval_name: Optional[str] = None
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Union[str, List[str], JudgevalJudge]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
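
A sketch of the relaxed model field (my_examples and my_scorers are hypothetical, already-built lists of Example and scorer objects whose construction is outside this diff):

# Sketch: "model" may now be omitted on EvaluationRun and defaults to "gpt-4.1".
# my_examples / my_scorers are hypothetical placeholders for prebuilt lists.
from judgeval.evaluation_run import EvaluationRun

run = EvaluationRun(examples=my_examples, scorers=my_scorers)
print(run.model)  # "gpt-4.1"
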
@@ -68,6 +68,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         self.executed_nodes: List[str] = []
         self.executed_tools: List[str] = []
         self.executed_node_tools: List[str] = []
+        self.traces: List[Dict[str, Any]] = []
         # --- END NEW __init__ ---
 
     # --- MODIFIED _ensure_trace_client ---
@@ -354,7 +355,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         if self._trace_client and not self._trace_saved: # Check if not already saved
             try:
                 # TODO: Check if trace_client.save needs await if TraceClient becomes async
-                trace_id, _ = self._trace_client.save(overwrite=self._trace_client.overwrite) # Use client's overwrite setting
+                trace_id, trace_data = self._trace_client.save(overwrite=self._trace_client.overwrite) # Use client's overwrite setting
+                self.traces.append(trace_data)
                 self._log(f"Trace {trace_id} successfully saved.")
                 self._trace_saved = True # Set flag only after successful save
                 trace_saved_successfully = True # Mark success
@@ -605,6 +607,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         # More robust root detection: Often the first chain event with parent_run_id=None *is* the root.
         is_potential_root_event = parent_run_id is None
 
+        if 'langsmith:hidden' in tags:
+            pass
+
         if node_name:
             name = node_name # Use node name if available
             self._log(f" LangGraph Node Start: '{name}', run_id={run_id}")
@@ -631,7 +636,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
 
         # --- Start Span Tracking ---
         combined_inputs = {'inputs': inputs, 'tags': tags, 'metadata': metadata, 'kwargs': kwargs, 'serialized': serialized}
-        self._start_span_tracking(trace_client, run_id, parent_run_id, name, span_type=span_type, inputs=combined_inputs)
+        self._start_span_tracking(trace_client, run_id, parent_run_id, name, span_type=span_type, inputs=inputs)
         # --- Store inputs for potential evaluation later ---
         self._run_id_to_start_inputs[run_id] = inputs # Store the raw inputs dict
         self._log(f" Stored inputs for run_id {run_id}")
@@ -651,6 +656,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         # --- Define instance_id for logging ---
         instance_id = handler_instance_id # Use the already obtained id
 
+        if 'langsmith:hidden' in tags:
+            pass
+
         try:
             # Pass parent_run_id
             trace_client = self._ensure_trace_client(run_id, parent_run_id, "ChainEnd") # Corrected call
@@ -744,7 +752,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             try:
                 # Save might need to be async if TraceClient methods become async
                 # Pass overwrite=True based on client's setting
-                trace_id_saved, _ = trace_client.save(overwrite=trace_client.overwrite)
+                trace_id_saved, trace_data = trace_client.save(overwrite=trace_client.overwrite)
+                self.traces.append(trace_data)
                 self._trace_saved = True
                 self._log(f"Trace {trace_id_saved} successfully saved.")
                 # Reset tracer's active client *after* successful save
@@ -812,11 +821,6 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         if eval_config and span_id:
             self._log(f"{log_prefix} Submitting evaluation for span_id={span_id}")
             try:
-                # Ensure example has trace_id set if not already present
-                if not hasattr(eval_config.example, 'trace_id') or not eval_config.example.trace_id:
-                    # Use the correct variable name 'trace_client' here
-                    eval_config.example.trace_id = trace_client.trace_id
-                    self._log(f"{log_prefix} Set trace_id={trace_client.trace_id} on evaluation example.")
 
                 # Call async_evaluate on the TraceClient instance ('trace_client')
                 # Use the correct variable name 'trace_client' here
@@ -888,7 +892,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             return
 
         combined_inputs = {'input_str': input_str, 'inputs': inputs, 'tags': tags, 'metadata': metadata, 'kwargs': kwargs, 'serialized': serialized}
-        self._start_span_tracking(trace_client, run_id, parent_run_id, name, span_type="tool", inputs=combined_inputs)
+        self._start_span_tracking(trace_client, run_id, parent_run_id, name, span_type="tool", inputs=inputs)
 
         # --- Track executed tools (remains the same) ---
         if name not in self.executed_tools: self.executed_tools.append(name)
@@ -962,7 +966,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
                 # print(f"{log_prefix} No trace client obtained in on_llm_start for {run_id}.")
                 return
             inputs = {'prompts': prompts, 'invocation_params': invocation_params or kwargs, 'options': options, 'tags': tags, 'metadata': metadata, 'serialized': serialized}
-            self._start_span_tracking(trace_client, run_id, parent_run_id, llm_name, span_type="llm", inputs=inputs)
+            self._start_span_tracking(trace_client, run_id, parent_run_id, llm_name, span_type="llm", inputs=prompts)
         except Exception as e:
             tc_id_on_error = id(self._trace_client) if self._trace_client else 'None'
             self._log(f"{log_prefix} UNCAUGHT EXCEPTION in on_llm_start for run_id={run_id} (TraceClient ID: {tc_id_on_error}): {e}")
@@ -1093,7 +1097,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             trace_client = self._ensure_trace_client(run_id, parent_run_id, chat_model_name) # Corrected call with parent_run_id
             if not trace_client: return
             inputs = {'messages': messages, 'invocation_params': invocation_params or kwargs, 'options': options, 'tags': tags, 'metadata': metadata, 'serialized': serialized}
-            self._start_span_tracking(trace_client, run_id, parent_run_id, chat_model_name, span_type="llm", inputs=inputs) # Use 'llm' span_type for consistency
+            self._start_span_tracking(trace_client, run_id, parent_run_id, chat_model_name, span_type="llm", inputs=messages) # Use 'llm' span_type for consistency
         except Exception as e:
             tc_id_on_error = id(self._trace_client) if self._trace_client else 'None'
             self._log(f"{log_prefix} UNCAUGHT EXCEPTION in on_chat_model_start for run_id={run_id} (TraceClient ID: {tc_id_on_error}): {e}")
@@ -1162,6 +1166,7 @@ class AsyncJudgevalCallbackHandler(AsyncCallbackHandler):
         self.executed_nodes: List[str] = []
         self.executed_tools: List[str] = []
         self.executed_node_tools: List[str] = []
+        self.traces: List[Dict[str, Any]] = []
 
     # NOTE: _ensure_trace_client remains synchronous as it doesn't involve async I/O
     def _ensure_trace_client(self, run_id: UUID, event_name: str) -> Optional[TraceClient]:
@@ -1378,7 +1383,8 @@ class AsyncJudgevalCallbackHandler(AsyncCallbackHandler):
         if self._trace_client and not self._trace_saved: # Check if not already saved
             try:
                 # TODO: Check if trace_client.save needs await if TraceClient becomes async
-                trace_id, _ = self._trace_client.save(overwrite=self._trace_client.overwrite) # Use client's overwrite setting
+                trace_id, trace_data = self._trace_client.save(overwrite=self._trace_client.overwrite) # Use client's overwrite setting
+                self.traces.append(trace_data)
                 self._log(f"Trace {trace_id} successfully saved.")
                 self._trace_saved = True # Set flag only after successful save
                 trace_saved_successfully = True # Mark success
@@ -1710,11 +1716,6 @@ class AsyncJudgevalCallbackHandler(AsyncCallbackHandler):
         if eval_config and span_id: # Check eval_config *and* span_id again
             self._log(f"{log_prefix} Submitting evaluation for span_id={span_id}")
             try:
-                # Ensure example has trace_id set if not already present
-                if not hasattr(eval_config.example, 'trace_id') or not eval_config.example.trace_id:
-                    # Use the correct variable name 'client' here for the async handler
-                    eval_config.example.trace_id = client.trace_id
-                    self._log(f"{log_prefix} Set trace_id={client.trace_id} on evaluation example.")
 
                 # Call async_evaluate on the TraceClient instance ('client')
                 # Use the correct variable name 'client' here for the async handler
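
A sketch of reading the new traces attribute after a run (the handler's import path and the chain/graph object are not shown in this diff and are assumed here):

# Sketch: both callback handlers now accumulate the payload returned by
# TraceClient.save() in self.traces. "handler" stands in for an already-
# constructed JudgevalCallbackHandler (or the async variant) that was
# registered with the chain/graph via the usual callbacks mechanism.
for trace_data in handler.traces:  # one entry appended per successfully saved trace
    print(trace_data)
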