judgeval 0.0.36__py3-none-any.whl → 0.0.37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +565 -858
- judgeval/common/utils.py +18 -0
- judgeval/constants.py +3 -1
- judgeval/data/__init__.py +4 -0
- judgeval/data/datasets/dataset.py +0 -2
- judgeval/data/example.py +29 -7
- judgeval/data/sequence.py +5 -4
- judgeval/data/sequence_run.py +4 -3
- judgeval/data/trace.py +129 -0
- judgeval/evaluation_run.py +1 -1
- judgeval/integrations/langgraph.py +18 -17
- judgeval/judgment_client.py +77 -64
- judgeval/run_evaluation.py +126 -29
- judgeval/scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -0
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -0
- judgeval/scorers/score.py +1 -1
- judgeval/utils/data_utils.py +57 -0
- judgeval-0.0.37.dist-info/METADATA +214 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.37.dist-info}/RECORD +22 -19
- judgeval-0.0.36.dist-info/METADATA +0 -169
- {judgeval-0.0.36.dist-info → judgeval-0.0.37.dist-info}/WHEEL +0 -0
- {judgeval-0.0.36.dist-info → judgeval-0.0.37.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py
CHANGED
@@ -12,6 +12,7 @@ NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is a
 import asyncio
 import concurrent.futures
 import os
+import requests
 import pprint
 from typing import Any, Dict, List, Literal, Mapping, Optional, Union

@@ -96,6 +97,23 @@ def read_file(file_path: str) -> str:
     with open(file_path, "r", encoding='utf-8') as file:
         return file.read()

+def validate_api_key(judgment_api_key: str):
+    """
+    Validates that the user api key is valid
+    """
+    response = requests.post(
+        f"{ROOT_API}/validate_api_key/",
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {judgment_api_key}",
+        },
+        json={},  # Empty body now
+        verify=True
+    )
+    if response.status_code == 200:
+        return True, response.json()
+    else:
+        return False, response.json().get("detail", "Error validating API key")

 def fetch_together_api_response(model: str, messages: List[Mapping], response_format: pydantic.BaseModel = None) -> str:
     """
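For reference, a minimal sketch of calling the new helper, assuming judgeval 0.0.37 is installed, a Judgment API key is available in the environment, and the backend is reachable:

import os
from judgeval.common.utils import validate_api_key

# validate_api_key returns (True, payload) on HTTP 200, else (False, detail)
is_valid, detail = validate_api_key(os.getenv("JUDGMENT_API_KEY", ""))
if is_valid:
    print("API key accepted:", detail)
else:
    print("API key rejected:", detail)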
judgeval/constants.py
CHANGED
@@ -27,7 +27,7 @@ class APIScorer(str, Enum):
     COMPARISON = "comparison"
     GROUNDEDNESS = "groundedness"
     DERAILMENT = "derailment"
-
+    TOOL_ORDER = "tool_order"
     @classmethod
     def _missing_(cls, value):
         # Handle case-insensitive lookup
@@ -50,6 +50,7 @@ JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
 JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
+JUDGMENT_RETRIEVE_SEQUENCE_FROM_TRACE_API_URL = f"{ROOT_API}/traces/convert_trace_to_sequence/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
@@ -58,6 +59,7 @@ JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+JUDGMENT_TRACES_ADD_ANNOTATION_API_URL = f"{ROOT_API}/traces/add_annotation/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
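A small sketch exercising the additions above; it only uses standard Enum value lookup and the new URL constant, so it should hold with judgeval 0.0.37 installed:

from judgeval.constants import APIScorer, JUDGMENT_TRACES_ADD_ANNOTATION_API_URL

assert APIScorer.TOOL_ORDER.value == "tool_order"
# _missing_ additionally provides case-insensitive lookup per the comment above
assert APIScorer("tool_order") is APIScorer.TOOL_ORDER
print(JUDGMENT_TRACES_ADD_ANNOTATION_API_URL)  # "<ROOT_API>/traces/add_annotation/"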
judgeval/data/__init__.py
CHANGED
@@ -3,6 +3,8 @@ from judgeval.data.custom_example import CustomExample
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
 from judgeval.data.sequence import Sequence
+from judgeval.data.trace import Trace, TraceSpan
+

 __all__ = [
     "Example",
@@ -13,4 +15,6 @@ __all__ = [
     "ScoringResult",
     "generate_scoring_result",
     "Sequence",
+    "Trace",
+    "TraceSpan",
 ]
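With the re-exports above, the new trace models become importable from the package's public data namespace, e.g.:

from judgeval.data import Example, Sequence, Trace, TraceSpan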
judgeval/data/datasets/dataset.py
CHANGED
@@ -273,7 +273,6 @@ class EvalDataset:
                 None, # Example does not have comments
                 None, # Example does not have source file
                 True, # Adding an Example
-                e.trace_id
             ]
         )

@@ -295,7 +294,6 @@ class EvalDataset:
                 "comments": None, # Example does not have comments
                 "source_file": None, # Example does not have source file
                 "example": True, # Adding an Example
-                "trace_id": e.trace_id
             }
             for e in self.examples
         ],
judgeval/data/example.py
CHANGED
@@ -24,14 +24,14 @@ class ExampleParams(Enum):


 class Example(BaseModel):
-    input: Optional[str] = None
+    input: Optional[Union[str, Dict[str, Any]]] = None
     actual_output: Optional[Union[str, List[str]]] = None
     expected_output: Optional[Union[str, List[str]]] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
     additional_metadata: Optional[Dict[str, Any]] = None
     tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None
     name: Optional[str] = None
     example_id: str = Field(default_factory=lambda: str(uuid4()))
     example_index: Optional[int] = None
@@ -50,8 +50,18 @@ class Example(BaseModel):
     @field_validator('input', mode='before')
     @classmethod
     def validate_input(cls, v):
-        if v is not None
-
+        if v is not None:
+            if not isinstance(v, (str, dict)):
+                raise ValueError(f"Input must be a string or dictionary but got {v} of type {type(v)}")
+
+            # If it's a string, check that it's not empty
+            if isinstance(v, str) and not v:
+                raise ValueError(f"Input string must be non-empty but got '{v}'")
+
+            # If it's a dictionary, check that it's not empty
+            if isinstance(v, dict) and not v:
+                raise ValueError(f"Input dictionary must be non-empty but got {v}")
+
         return v

     @field_validator('actual_output', mode='before')
@@ -73,7 +83,21 @@ class Example(BaseModel):
                 raise ValueError(f"All items in expected_output must be strings but got {v}")
         return v

-    @field_validator('
+    @field_validator('expected_tools', mode='before')
+    @classmethod
+    def validate_expected_tools(cls, v):
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"Expected tools must be a list of dictionaries or None but got {v} of type {type(v)}")
+
+            # Check that each item in the list is a dictionary
+            for i, item in enumerate(v):
+                if not isinstance(item, dict):
+                    raise ValueError(f"Expected tools must be a list of dictionaries, but item at index {i} is {item} of type {type(item)}")
+
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', mode='before')
     @classmethod
     def validate_string_lists(cls, v, info):
         field_name = info.field_name
@@ -127,7 +151,6 @@ class Example(BaseModel):
             "example_id": self.example_id,
             "example_index": self.example_index,
             "timestamp": self.timestamp,
-            "trace_id": self.trace_id
         }

     def __str__(self):
@@ -144,5 +167,4 @@ class Example(BaseModel):
             f"example_id={self.example_id}, "
             f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
-            f"trace_id={self.trace_id})"
         )
judgeval/data/sequence.py
CHANGED
@@ -1,5 +1,5 @@
 from pydantic import BaseModel, Field, field_validator, model_validator
-from typing import List, Optional, Union, Any
+from typing import List, Optional, Union, Any, Dict
 from judgeval.data.example import Example
 from judgeval.scorers import JudgevalScorer, APIJudgmentScorer
 from uuid import uuid4
@@ -12,13 +12,14 @@ class Sequence(BaseModel):
     sequence_id: str = Field(default_factory=lambda: str(uuid4()))
     name: Optional[str] = "Sequence"
     created_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
-    items: List[Union["Sequence", Example]]
+    items: List[Union["Sequence", Example]] = []
     scorers: Optional[Any] = None
     parent_sequence_id: Optional[str] = None
     sequence_order: Optional[int] = 0
     root_sequence_id: Optional[str] = None
-    inputs: Optional[str] = None
-    output: Optional[
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    expected_tools: Optional[List[Dict[str, Any]]] = None

     @field_validator("scorers")
     def validate_scorer(cls, v):
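A minimal sketch of the updated Sequence model under the same assumptions as the Example sketch above (illustrative dict keys, judgeval 0.0.37 installed); items now defaults to an empty list and inputs is a dict rather than a string:

from judgeval.data import Example, Sequence

seq = Sequence(
    name="agent_run",
    inputs={"question": "What is 2 + 2?"},
    expected_tools=[{"tool_name": "calculator"}],  # hypothetical key name
    items=[Example(input="What is 2 + 2?", actual_output="4")],
)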
judgeval/data/sequence_run.py
CHANGED
@@ -1,6 +1,6 @@

 from pydantic import BaseModel
-from typing import List, Optional, Dict, Any, Union
+from typing import List, Optional, Dict, Any, Union, Callable
 from judgeval.data import Sequence
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
 from judgeval.judges import JudgevalJudge
@@ -29,8 +29,9 @@ class SequenceRun(BaseModel):
     organization_id: Optional[str] = None
     project_name: Optional[str] = None
     eval_name: Optional[str] = None
-    sequences: List[Sequence]
-
+    sequences: Optional[List[Sequence]] = None
+    scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
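Since only field declarations change here, a quick pydantic v2 metadata check (no backend access, nothing instantiated) is enough to confirm the relaxed requirements:

from judgeval.data.sequence_run import SequenceRun

assert SequenceRun.model_fields["sequences"].default is None
assert SequenceRun.model_fields["model"].default == "gpt-4.1"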
judgeval/data/trace.py
ADDED
@@ -0,0 +1,129 @@
+from pydantic import BaseModel
+from typing import Optional, Dict, Any, List
+from judgeval.evaluation_run import EvaluationRun
+import json
+from datetime import datetime, timezone
+
+class TraceSpan(BaseModel):
+    span_id: str
+    trace_id: str
+    function: Optional[str] = None
+    depth: int
+    created_at: Optional[float] = None
+    parent_span_id: Optional[str] = None
+    span_type: Optional[str] = "span"
+    inputs: Optional[Dict[str, Any]] = None
+    output: Optional[Any] = None
+    duration: Optional[float] = None
+    annotation: Optional[List[Dict[str, Any]]] = None
+    evaluation_runs: Optional[List[EvaluationRun]] = []
+
+    def model_dump(self, **kwargs):
+        return {
+            "span_id": self.span_id,
+            "trace_id": self.trace_id,
+            "depth": self.depth,
+            # "created_at": datetime.fromtimestamp(self.created_at).isoformat(),
+            "created_at": datetime.fromtimestamp(self.created_at, tz=timezone.utc).isoformat(),
+            "inputs": self._serialize_inputs(),
+            "output": self._serialize_output(),
+            "evaluation_runs": [run.model_dump() for run in self.evaluation_runs] if self.evaluation_runs else [],
+            "parent_span_id": self.parent_span_id,
+            "function": self.function,
+            "duration": self.duration,
+            "span_type": self.span_type
+        }
+
+    def print_span(self):
+        """Print the span with proper formatting and parent relationship information."""
+        indent = " " * self.depth
+        parent_info = f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
+        print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
+
+    def _serialize_inputs(self) -> dict:
+        """Helper method to serialize input data safely."""
+        if self.inputs is None:
+            return {}
+
+        serialized_inputs = {}
+        for key, value in self.inputs.items():
+            if isinstance(value, BaseModel):
+                serialized_inputs[key] = value.model_dump()
+            elif isinstance(value, (list, tuple)):
+                # Handle lists/tuples of arguments
+                serialized_inputs[key] = [
+                    item.model_dump() if isinstance(item, BaseModel)
+                    else None if not self._is_json_serializable(item)
+                    else item
+                    for item in value
+                ]
+            else:
+                if self._is_json_serializable(value):
+                    serialized_inputs[key] = value
+                else:
+                    serialized_inputs[key] = self.safe_stringify(value, self.function)
+        return serialized_inputs
+
+    def _is_json_serializable(self, obj: Any) -> bool:
+        """Helper method to check if an object is JSON serializable."""
+        try:
+            json.dumps(obj)
+            return True
+        except (TypeError, OverflowError, ValueError):
+            return False
+
+    def safe_stringify(self, output, function_name):
+        """
+        Safely converts an object to a string or repr, handling serialization issues gracefully.
+        """
+        try:
+            return str(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        try:
+            return repr(output)
+        except (TypeError, OverflowError, ValueError):
+            pass
+
+        warnings.warn(
+            f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+        )
+        return None
+
+    def _serialize_output(self) -> Any:
+        """Helper method to serialize output data safely."""
+        if self.output is None:
+            return None
+
+        def serialize_value(value):
+            if isinstance(value, BaseModel):
+                return value.model_dump()
+            elif isinstance(value, dict):
+                # Recursively serialize dictionary values
+                return {k: serialize_value(v) for k, v in value.items()}
+            elif isinstance(value, (list, tuple)):
+                # Recursively serialize list/tuple items
+                return [serialize_value(item) for item in value]
+            else:
+                # Try direct JSON serialization first
+                try:
+                    json.dumps(value)
+                    return value
+                except (TypeError, OverflowError, ValueError):
+                    # Fallback to safe stringification
+                    return self.safe_stringify(value, self.function)
+
+        # Start serialization with the top-level output
+        return serialize_value(self.output)
+
+class Trace(BaseModel):
+    trace_id: str
+    name: str
+    created_at: str
+    duration: float
+    entries: List[TraceSpan]
+    overwrite: bool = False
+    rules: Optional[Dict[str, Any]] = None
+    has_notification: Optional[bool] = False
+
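A minimal sketch of the new models in use; field values are illustrative and assume judgeval 0.0.37 is installed:

from datetime import datetime, timezone
from judgeval.data import Trace, TraceSpan

span = TraceSpan(
    span_id="span-1",
    trace_id="trace-1",
    function="call_llm",
    depth=1,
    created_at=datetime.now(timezone.utc).timestamp(),
    inputs={"prompt": "Hello"},
    output={"completion": "Hi there"},
    duration=0.42,
)
span.print_span()            # prints something like "→ call_llm (id: span-1)", indented by depth
payload = span.model_dump()  # JSON-safe dict; created_at becomes a UTC ISO timestamp

trace = Trace(
    trace_id="trace-1",
    name="demo",
    created_at=datetime.now(timezone.utc).isoformat(),
    duration=0.42,
    entries=[span],
)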
judgeval/evaluation_run.py
CHANGED
@@ -31,7 +31,7 @@ class EvaluationRun(BaseModel):
     eval_name: Optional[str] = None
     examples: Union[List[Example], List[CustomExample]]
     scorers: List[Union[APIJudgmentScorer, JudgevalScorer]]
-    model: Union[str, List[str], JudgevalJudge]
+    model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1"
     aggregator: Optional[str] = None
     metadata: Optional[Dict[str, Any]] = None
     trace_span_id: Optional[str] = None
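The same style of pydantic v2 field check confirms that the model is now optional on EvaluationRun as well:

from judgeval.evaluation_run import EvaluationRun

assert EvaluationRun.model_fields["model"].default == "gpt-4.1"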
judgeval/integrations/langgraph.py
CHANGED
@@ -68,6 +68,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         self.executed_nodes: List[str] = []
         self.executed_tools: List[str] = []
         self.executed_node_tools: List[str] = []
+        self.traces: List[Dict[str, Any]] = []
         # --- END NEW __init__ ---

     # --- MODIFIED _ensure_trace_client ---
@@ -354,7 +355,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         if self._trace_client and not self._trace_saved: # Check if not already saved
             try:
                 # TODO: Check if trace_client.save needs await if TraceClient becomes async
-                trace_id,
+                trace_id, trace_data = self._trace_client.save(overwrite=self._trace_client.overwrite) # Use client's overwrite setting
+                self.traces.append(trace_data)
                 self._log(f"Trace {trace_id} successfully saved.")
                 self._trace_saved = True # Set flag only after successful save
                 trace_saved_successfully = True # Mark success
@@ -605,6 +607,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         # More robust root detection: Often the first chain event with parent_run_id=None *is* the root.
         is_potential_root_event = parent_run_id is None

+        if 'langsmith:hidden' in tags:
+            pass
+
         if node_name:
             name = node_name # Use node name if available
             self._log(f"  LangGraph Node Start: '{name}', run_id={run_id}")
@@ -631,7 +636,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):

         # --- Start Span Tracking ---
         combined_inputs = {'inputs': inputs, 'tags': tags, 'metadata': metadata, 'kwargs': kwargs, 'serialized': serialized}
-        self._start_span_tracking(trace_client, run_id, parent_run_id, name, span_type=span_type, inputs=
+        self._start_span_tracking(trace_client, run_id, parent_run_id, name, span_type=span_type, inputs=inputs)
         # --- Store inputs for potential evaluation later ---
         self._run_id_to_start_inputs[run_id] = inputs # Store the raw inputs dict
         self._log(f"  Stored inputs for run_id {run_id}")
@@ -651,6 +656,9 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         # --- Define instance_id for logging ---
         instance_id = handler_instance_id # Use the already obtained id

+        if 'langsmith:hidden' in tags:
+            pass
+
         try:
             # Pass parent_run_id
             trace_client = self._ensure_trace_client(run_id, parent_run_id, "ChainEnd") # Corrected call
@@ -744,7 +752,8 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             try:
                 # Save might need to be async if TraceClient methods become async
                 # Pass overwrite=True based on client's setting
-                trace_id_saved,
+                trace_id_saved, trace_data = trace_client.save(overwrite=trace_client.overwrite)
+                self.traces.append(trace_data)
                 self._trace_saved = True
                 self._log(f"Trace {trace_id_saved} successfully saved.")
                 # Reset tracer's active client *after* successful save
@@ -812,11 +821,6 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
         if eval_config and span_id:
             self._log(f"{log_prefix} Submitting evaluation for span_id={span_id}")
             try:
-                # Ensure example has trace_id set if not already present
-                if not hasattr(eval_config.example, 'trace_id') or not eval_config.example.trace_id:
-                    # Use the correct variable name 'trace_client' here
-                    eval_config.example.trace_id = trace_client.trace_id
-                    self._log(f"{log_prefix} Set trace_id={trace_client.trace_id} on evaluation example.")

                 # Call async_evaluate on the TraceClient instance ('trace_client')
                 # Use the correct variable name 'trace_client' here
@@ -888,7 +892,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             return

         combined_inputs = {'input_str': input_str, 'inputs': inputs, 'tags': tags, 'metadata': metadata, 'kwargs': kwargs, 'serialized': serialized}
-        self._start_span_tracking(trace_client, run_id, parent_run_id, name, span_type="tool", inputs=
+        self._start_span_tracking(trace_client, run_id, parent_run_id, name, span_type="tool", inputs=inputs)

         # --- Track executed tools (remains the same) ---
         if name not in self.executed_tools: self.executed_tools.append(name)
@@ -962,7 +966,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
                 # print(f"{log_prefix} No trace client obtained in on_llm_start for {run_id}.")
                 return
             inputs = {'prompts': prompts, 'invocation_params': invocation_params or kwargs, 'options': options, 'tags': tags, 'metadata': metadata, 'serialized': serialized}
-            self._start_span_tracking(trace_client, run_id, parent_run_id, llm_name, span_type="llm", inputs=
+            self._start_span_tracking(trace_client, run_id, parent_run_id, llm_name, span_type="llm", inputs=prompts)
         except Exception as e:
             tc_id_on_error = id(self._trace_client) if self._trace_client else 'None'
             self._log(f"{log_prefix} UNCAUGHT EXCEPTION in on_llm_start for run_id={run_id} (TraceClient ID: {tc_id_on_error}): {e}")
@@ -1093,7 +1097,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             trace_client = self._ensure_trace_client(run_id, parent_run_id, chat_model_name) # Corrected call with parent_run_id
             if not trace_client: return
             inputs = {'messages': messages, 'invocation_params': invocation_params or kwargs, 'options': options, 'tags': tags, 'metadata': metadata, 'serialized': serialized}
-            self._start_span_tracking(trace_client, run_id, parent_run_id, chat_model_name, span_type="llm", inputs=
+            self._start_span_tracking(trace_client, run_id, parent_run_id, chat_model_name, span_type="llm", inputs=messages) # Use 'llm' span_type for consistency
         except Exception as e:
             tc_id_on_error = id(self._trace_client) if self._trace_client else 'None'
             self._log(f"{log_prefix} UNCAUGHT EXCEPTION in on_chat_model_start for run_id={run_id} (TraceClient ID: {tc_id_on_error}): {e}")
@@ -1162,6 +1166,7 @@ class AsyncJudgevalCallbackHandler(AsyncCallbackHandler):
         self.executed_nodes: List[str] = []
         self.executed_tools: List[str] = []
         self.executed_node_tools: List[str] = []
+        self.traces: List[Dict[str, Any]] = []

     # NOTE: _ensure_trace_client remains synchronous as it doesn't involve async I/O
     def _ensure_trace_client(self, run_id: UUID, event_name: str) -> Optional[TraceClient]:
@@ -1378,7 +1383,8 @@ class AsyncJudgevalCallbackHandler(AsyncCallbackHandler):
         if self._trace_client and not self._trace_saved: # Check if not already saved
             try:
                 # TODO: Check if trace_client.save needs await if TraceClient becomes async
-                trace_id,
+                trace_id, trace_data = self._trace_client.save(overwrite=self._trace_client.overwrite) # Use client's overwrite setting
+                self.traces.append(trace_data)
                 self._log(f"Trace {trace_id} successfully saved.")
                 self._trace_saved = True # Set flag only after successful save
                 trace_saved_successfully = True # Mark success
@@ -1710,11 +1716,6 @@ class AsyncJudgevalCallbackHandler(AsyncCallbackHandler):
         if eval_config and span_id: # Check eval_config *and* span_id again
             self._log(f"{log_prefix} Submitting evaluation for span_id={span_id}")
             try:
-                # Ensure example has trace_id set if not already present
-                if not hasattr(eval_config.example, 'trace_id') or not eval_config.example.trace_id:
-                    # Use the correct variable name 'client' here for the async handler
-                    eval_config.example.trace_id = client.trace_id
-                    self._log(f"{log_prefix} Set trace_id={client.trace_id} on evaluation example.")

                 # Call async_evaluate on the TraceClient instance ('client')
                 # Use the correct variable name 'client' here for the async handler