judgeval 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +48 -252
- judgeval/data/__init__.py +1 -2
- judgeval/integrations/langgraph.py +316 -0
- judgeval-0.0.25.dist-info/METADATA +156 -0
- {judgeval-0.0.23.dist-info → judgeval-0.0.25.dist-info}/RECORD +7 -9
- judgeval/data/custom_example.py +0 -98
- judgeval/data/datasets/utils.py +0 -0
- judgeval/data/ground_truth.py +0 -0
- judgeval-0.0.23.dist-info/METADATA +0 -40
- {judgeval-0.0.23.dist-info → judgeval-0.0.25.dist-info}/WHEEL +0 -0
- {judgeval-0.0.23.dist-info → judgeval-0.0.25.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -10,16 +10,12 @@ import os
|
|
10
10
|
import time
|
11
11
|
import uuid
|
12
12
|
import warnings
|
13
|
-
from contextvars import ContextVar
|
14
13
|
from contextlib import contextmanager
|
15
|
-
from collections import defaultdict
|
16
14
|
from dataclasses import dataclass, field
|
17
15
|
from datetime import datetime
|
18
16
|
from http import HTTPStatus
|
19
17
|
from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union
|
20
18
|
from rich import print as rprint
|
21
|
-
from uuid import UUID
|
22
|
-
from collections.abc import Sequence
|
23
19
|
|
24
20
|
# Third-party imports
|
25
21
|
import pika
|
@@ -48,19 +44,6 @@ from judgeval.rules import Rule
|
|
48
44
|
from judgeval.evaluation_run import EvaluationRun
|
49
45
|
from judgeval.data.result import ScoringResult
|
50
46
|
|
51
|
-
from langchain_core.language_models import BaseChatModel
|
52
|
-
from langchain_huggingface import ChatHuggingFace
|
53
|
-
from langchain_openai import ChatOpenAI
|
54
|
-
from langchain_anthropic import ChatAnthropic
|
55
|
-
from langchain_core.utils.function_calling import convert_to_openai_tool
|
56
|
-
from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
|
57
|
-
from langchain_core.agents import AgentAction, AgentFinish
|
58
|
-
from langchain_core.outputs import LLMResult
|
59
|
-
from langchain_core.tracers.context import register_configure_hook
|
60
|
-
from langchain_core.messages.ai import AIMessage
|
61
|
-
from langchain_core.messages.tool import ToolMessage
|
62
|
-
from langchain_core.messages.base import BaseMessage
|
63
|
-
from langchain_core.documents import Document
|
64
47
|
|
65
48
|
# Define type aliases for better code readability and maintainability
|
66
49
|
ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients
|
@@ -125,8 +108,7 @@ class TraceEntry:
|
|
125
108
|
if self._is_json_serializable(value):
|
126
109
|
serialized_inputs[key] = value
|
127
110
|
else:
|
128
|
-
|
129
|
-
serialized_inputs[key] = None
|
111
|
+
serialized_inputs[key] = self.safe_stringify(value, self.function)
|
130
112
|
return serialized_inputs
|
131
113
|
|
132
114
|
def _is_json_serializable(self, obj: Any) -> bool:
|
@@ -137,6 +119,25 @@ class TraceEntry:
|
|
137
119
|
except (TypeError, OverflowError, ValueError):
|
138
120
|
return False
|
139
121
|
|
122
|
+
def safe_stringify(self, output, function_name):
    """
    Best-effort conversion of ``output`` to text.

    Tries ``str(output)`` first and falls back to ``repr(output)``.

    Args:
        output: The value to convert.
        function_name: Name of the traced function; used only in the warning text.

    Returns:
        The string form of ``output``, or ``None`` (after emitting a warning)
        when both ``str()`` and ``repr()`` raise.
    """
    # A user-defined __str__/__repr__ can raise any exception type, not only
    # TypeError/OverflowError/ValueError. Catch Exception so that a single
    # misbehaving object cannot make this "safe" helper itself raise.
    for convert in (str, repr):
        try:
            return convert(output)
        except Exception:
            continue

    warnings.warn(
        f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
    )
    return None
|
140
|
+
|
140
141
|
def to_dict(self) -> dict:
|
141
142
|
"""Convert the trace entry to a dictionary format for storage/transmission."""
|
142
143
|
return {
|
@@ -160,25 +161,6 @@ class TraceEntry:
|
|
160
161
|
- We try to serialize into JSON, then string, then the base representation (__repr__)
|
161
162
|
- Non-serializable objects return None with a warning
|
162
163
|
"""
|
163
|
-
|
164
|
-
def safe_stringify(output, function_name):
|
165
|
-
"""
|
166
|
-
Safely converts an object to a string or repr, handling serialization issues gracefully.
|
167
|
-
"""
|
168
|
-
try:
|
169
|
-
return str(output)
|
170
|
-
except (TypeError, OverflowError, ValueError):
|
171
|
-
pass
|
172
|
-
|
173
|
-
try:
|
174
|
-
return repr(output)
|
175
|
-
except (TypeError, OverflowError, ValueError):
|
176
|
-
pass
|
177
|
-
|
178
|
-
warnings.warn(
|
179
|
-
f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
|
180
|
-
)
|
181
|
-
return None
|
182
164
|
|
183
165
|
if isinstance(self.output, BaseModel):
|
184
166
|
return self.output.model_dump()
|
@@ -188,7 +170,7 @@ class TraceEntry:
|
|
188
170
|
json.dumps(self.output)
|
189
171
|
return self.output
|
190
172
|
except (TypeError, OverflowError, ValueError):
|
191
|
-
return safe_stringify(self.output, self.function)
|
173
|
+
return self.safe_stringify(self.output, self.function)
|
192
174
|
|
193
175
|
|
194
176
|
class TraceManagerClient:
|
@@ -331,6 +313,8 @@ class TraceClient:
|
|
331
313
|
project_name: str = "default_project",
|
332
314
|
overwrite: bool = False,
|
333
315
|
rules: Optional[List[Rule]] = None,
|
316
|
+
enable_monitoring: bool = True,
|
317
|
+
enable_evaluations: bool = True
|
334
318
|
):
|
335
319
|
self.name = name
|
336
320
|
self.trace_id = trace_id or str(uuid.uuid4())
|
@@ -339,6 +323,8 @@ class TraceClient:
|
|
339
323
|
self.tracer = tracer
|
340
324
|
# Initialize rules with either provided rules or an empty list
|
341
325
|
self.rules = rules or []
|
326
|
+
self.enable_monitoring = enable_monitoring
|
327
|
+
self.enable_evaluations = enable_evaluations
|
342
328
|
|
343
329
|
self.client: JudgmentClient = tracer.client
|
344
330
|
self.entries: List[TraceEntry] = []
|
@@ -399,6 +385,9 @@ class TraceClient:
|
|
399
385
|
model: Optional[str] = None,
|
400
386
|
log_results: Optional[bool] = True
|
401
387
|
):
|
388
|
+
if not self.enable_evaluations:
|
389
|
+
return
|
390
|
+
|
402
391
|
start_time = time.time() # Record start time
|
403
392
|
example = Example(
|
404
393
|
input=input,
|
@@ -698,7 +687,10 @@ class Tracer:
|
|
698
687
|
api_key: str = os.getenv("JUDGMENT_API_KEY"),
|
699
688
|
project_name: str = "default_project",
|
700
689
|
rules: Optional[List[Rule]] = None, # Added rules parameter
|
701
|
-
organization_id: str = os.getenv("JUDGMENT_ORG_ID")
|
690
|
+
organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
|
691
|
+
enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
|
692
|
+
enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true"
|
693
|
+
):
|
702
694
|
if not hasattr(self, 'initialized'):
|
703
695
|
if not api_key:
|
704
696
|
raise ValueError("Tracer must be configured with a Judgment API key")
|
@@ -714,6 +706,8 @@ class Tracer:
|
|
714
706
|
self._current_trace: Optional[str] = None
|
715
707
|
self.rules: List[Rule] = rules or [] # Store rules at tracer level
|
716
708
|
self.initialized: bool = True
|
709
|
+
self.enable_monitoring: bool = enable_monitoring
|
710
|
+
self.enable_evaluations: bool = enable_evaluations
|
717
711
|
elif hasattr(self, 'project_name') and self.project_name != project_name:
|
718
712
|
warnings.warn(
|
719
713
|
f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
|
@@ -740,7 +734,9 @@ class Tracer:
|
|
740
734
|
name,
|
741
735
|
project_name=project,
|
742
736
|
overwrite=overwrite,
|
743
|
-
rules=self.rules # Pass combined rules to the trace client
|
737
|
+
rules=self.rules, # Pass combined rules to the trace client
|
738
|
+
enable_monitoring=self.enable_monitoring,
|
739
|
+
enable_evaluations=self.enable_evaluations
|
744
740
|
)
|
745
741
|
prev_trace = self._current_trace
|
746
742
|
self._current_trace = trace
|
@@ -771,6 +767,9 @@ class Tracer:
|
|
771
767
|
project_name: Optional project name override
|
772
768
|
overwrite: Whether to overwrite existing traces
|
773
769
|
"""
|
770
|
+
if not self.enable_monitoring:
|
771
|
+
return
|
772
|
+
|
774
773
|
if func is None:
|
775
774
|
return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)
|
776
775
|
|
@@ -787,7 +786,7 @@ class Tracer:
|
|
787
786
|
trace_id = str(uuid.uuid4())
|
788
787
|
trace_name = func.__name__
|
789
788
|
project = project_name if project_name is not None else self.project_name
|
790
|
-
trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
|
789
|
+
trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules, enable_monitoring=self.enable_monitoring, enable_evaluations=self.enable_evaluations)
|
791
790
|
self._current_trace = trace
|
792
791
|
# Only save empty trace for the root call
|
793
792
|
trace.save(empty_save=True, overwrite=overwrite)
|
@@ -824,7 +823,7 @@ class Tracer:
|
|
824
823
|
trace_id = str(uuid.uuid4())
|
825
824
|
trace_name = func.__name__
|
826
825
|
project = project_name if project_name is not None else self.project_name
|
827
|
-
trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
|
826
|
+
trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules, enable_monitoring=self.enable_monitoring)
|
828
827
|
self._current_trace = trace
|
829
828
|
# Only save empty trace for the root call
|
830
829
|
trace.save(empty_save=True, overwrite=overwrite)
|
@@ -872,6 +871,11 @@ class Tracer:
|
|
872
871
|
self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model="gpt-4o-mini", log_results=True)
|
873
872
|
return wrapper
|
874
873
|
|
874
|
+
def async_evaluate(self, *args, **kwargs):
    """
    Forward an evaluation request to the currently active trace.

    All positional and keyword arguments are passed unchanged to the active
    trace's ``async_evaluate``. When there is no active trace, a warning is
    emitted and nothing is evaluated. Returns ``None`` in both cases.
    """
    if self._current_trace:
        self._current_trace.async_evaluate(*args, **kwargs)
    else:
        # stacklevel=2 attributes the warning to the caller's line instead of
        # this wrapper, so users can see which call ran outside a trace.
        warnings.warn("No trace found, skipping evaluation", stacklevel=2)
|
875
879
|
|
876
880
|
|
877
881
|
def wrap(client: Any) -> Any:
|
@@ -982,212 +986,4 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
|
|
982
986
|
"output_tokens": response.usage.output_tokens,
|
983
987
|
"total_tokens": response.usage.input_tokens + response.usage.output_tokens
|
984
988
|
}
|
985
|
-
}
|
986
|
-
|
987
|
-
class JudgevalCallbackHandler(BaseCallbackHandler):
|
988
|
-
def __init__(self, trace_client: TraceClient):
|
989
|
-
self.trace_client = trace_client
|
990
|
-
self.previous_node = "__start__"
|
991
|
-
self.executed_node_tools = []
|
992
|
-
self.executed_nodes = []
|
993
|
-
self.executed_tools = []
|
994
|
-
self.openai_count = 1
|
995
|
-
|
996
|
-
def start_span(self, name: str, span_type: SpanType = "span"):
|
997
|
-
start_time = time.time()
|
998
|
-
|
999
|
-
# Record span entry
|
1000
|
-
self.trace_client.add_entry(TraceEntry(
|
1001
|
-
type="enter",
|
1002
|
-
function=name,
|
1003
|
-
depth=self.trace_client.tracer.depth,
|
1004
|
-
message=name,
|
1005
|
-
timestamp=start_time,
|
1006
|
-
span_type=span_type
|
1007
|
-
))
|
1008
|
-
|
1009
|
-
self.trace_client.tracer.depth += 1
|
1010
|
-
self.trace_client.prev_span = self.trace_client._current_span
|
1011
|
-
self.trace_client._current_span = name
|
1012
|
-
self._start_time = start_time
|
1013
|
-
|
1014
|
-
def end_span(self, name: str, span_type: SpanType = "span"):
|
1015
|
-
self.trace_client.tracer.depth -= 1
|
1016
|
-
duration = time.time() - self._start_time
|
1017
|
-
|
1018
|
-
# Record span exit
|
1019
|
-
self.trace_client.add_entry(TraceEntry(
|
1020
|
-
type="exit",
|
1021
|
-
function=name,
|
1022
|
-
depth=self.trace_client.tracer.depth,
|
1023
|
-
message=f"← {name}",
|
1024
|
-
timestamp=time.time(),
|
1025
|
-
duration=duration,
|
1026
|
-
span_type=span_type
|
1027
|
-
))
|
1028
|
-
self.trace_client._current_span = self.trace_client.prev_span
|
1029
|
-
|
1030
|
-
def on_retriever_start(
|
1031
|
-
self,
|
1032
|
-
serialized: Optional[dict[str, Any]],
|
1033
|
-
query: str,
|
1034
|
-
*,
|
1035
|
-
run_id: UUID,
|
1036
|
-
parent_run_id: Optional[UUID] = None,
|
1037
|
-
tags: Optional[list[str]] = None,
|
1038
|
-
metadata: Optional[dict[str, Any]] = None,
|
1039
|
-
**kwargs: Any,
|
1040
|
-
) -> Any:
|
1041
|
-
name = "RETRIEVER_CALL"
|
1042
|
-
if serialized and "name" in serialized:
|
1043
|
-
name = f"RETRIEVER_{serialized['name'].upper()}"
|
1044
|
-
|
1045
|
-
self.start_span(name, span_type="retriever")
|
1046
|
-
self.trace_client.record_input({
|
1047
|
-
'query': query,
|
1048
|
-
'tags': tags,
|
1049
|
-
'metadata': metadata,
|
1050
|
-
'kwargs': kwargs
|
1051
|
-
})
|
1052
|
-
|
1053
|
-
def on_retriever_end(
|
1054
|
-
self,
|
1055
|
-
documents: Sequence[Document],
|
1056
|
-
*,
|
1057
|
-
run_id: UUID,
|
1058
|
-
parent_run_id: Optional[UUID] = None,
|
1059
|
-
**kwargs: Any
|
1060
|
-
) -> Any:
|
1061
|
-
# Process the retrieved documents into a format suitable for logging
|
1062
|
-
doc_summary = []
|
1063
|
-
for i, doc in enumerate(documents):
|
1064
|
-
# Extract key information from each document
|
1065
|
-
doc_data = {
|
1066
|
-
"index": i,
|
1067
|
-
"page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
|
1068
|
-
"metadata": doc.metadata
|
1069
|
-
}
|
1070
|
-
doc_summary.append(doc_data)
|
1071
|
-
|
1072
|
-
# Record the document data
|
1073
|
-
self.trace_client.record_output({
|
1074
|
-
"document_count": len(documents),
|
1075
|
-
"documents": doc_summary
|
1076
|
-
})
|
1077
|
-
|
1078
|
-
# End the retriever span
|
1079
|
-
self.end_span(self.trace_client._current_span, span_type="retriever")
|
1080
|
-
|
1081
|
-
def on_chain_start(
|
1082
|
-
self,
|
1083
|
-
serialized: Dict[str, Any],
|
1084
|
-
inputs: Dict[str, Any],
|
1085
|
-
*,
|
1086
|
-
run_id: UUID,
|
1087
|
-
parent_run_id: Optional[UUID] = None,
|
1088
|
-
tags: Optional[List[str]] = None,
|
1089
|
-
metadata: Optional[Dict[str, Any]] = None,
|
1090
|
-
**kwargs: Any
|
1091
|
-
) -> None:
|
1092
|
-
node = metadata.get("langgraph_node")
|
1093
|
-
if node != None and node != "__start__" and node != self.previous_node:
|
1094
|
-
self.executed_node_tools.append(node)
|
1095
|
-
self.executed_nodes.append(node)
|
1096
|
-
self.previous_node = node
|
1097
|
-
|
1098
|
-
def on_tool_start(
|
1099
|
-
self,
|
1100
|
-
serialized: Optional[dict[str, Any]],
|
1101
|
-
input_str: str,
|
1102
|
-
run_id: Optional[UUID] = None,
|
1103
|
-
parent_run_id: Optional[UUID] = None,
|
1104
|
-
inputs: Optional[dict[str, Any]] = None,
|
1105
|
-
**kwargs: Any,
|
1106
|
-
):
|
1107
|
-
name = serialized["name"]
|
1108
|
-
self.start_span(name, span_type="tool")
|
1109
|
-
self.executed_node_tools.append(f"{self.previous_node}:{name}")
|
1110
|
-
self.executed_tools.append(name)
|
1111
|
-
self.trace_client.record_input({
|
1112
|
-
'args': input_str,
|
1113
|
-
'kwargs': kwargs
|
1114
|
-
})
|
1115
|
-
|
1116
|
-
def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
|
1117
|
-
self.trace_client.record_output(output)
|
1118
|
-
self.end_span(self.trace_client._current_span, span_type="tool")
|
1119
|
-
|
1120
|
-
def on_agent_action (self, action: AgentAction, **kwargs: Any) -> Any:
|
1121
|
-
print(f"Agent action: {action}")
|
1122
|
-
|
1123
|
-
def on_agent_finish(
|
1124
|
-
self,
|
1125
|
-
finish: AgentFinish,
|
1126
|
-
*,
|
1127
|
-
run_id: UUID,
|
1128
|
-
parent_run_id: Optional[UUID] = None,
|
1129
|
-
tags: Optional[list[str]] = None,
|
1130
|
-
**kwargs: Any,
|
1131
|
-
) -> None:
|
1132
|
-
print(f"Agent action: {finish}")
|
1133
|
-
|
1134
|
-
def on_llm_start(
|
1135
|
-
self,
|
1136
|
-
serialized: Optional[dict[str, Any]],
|
1137
|
-
prompts: list[str],
|
1138
|
-
*,
|
1139
|
-
run_id: UUID,
|
1140
|
-
parent_run_id: Optional[UUID] = None,
|
1141
|
-
**kwargs: Any,
|
1142
|
-
) -> Any:
|
1143
|
-
name = "LLM call"
|
1144
|
-
self.start_span(name, span_type="llm")
|
1145
|
-
self.trace_client.record_input({
|
1146
|
-
'args': prompts,
|
1147
|
-
'kwargs': kwargs
|
1148
|
-
})
|
1149
|
-
|
1150
|
-
def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
|
1151
|
-
self.trace_client.record_output(response.generations[0][0].text)
|
1152
|
-
self.end_span(self.trace_client._current_span, span_type="llm")
|
1153
|
-
|
1154
|
-
def on_chat_model_start(
|
1155
|
-
self,
|
1156
|
-
serialized: Optional[dict[str, Any]],
|
1157
|
-
messages: list[list[BaseMessage]],
|
1158
|
-
*,
|
1159
|
-
run_id: UUID,
|
1160
|
-
parent_run_id: Optional[UUID] = None,
|
1161
|
-
**kwargs: Any,
|
1162
|
-
) -> Any:
|
1163
|
-
|
1164
|
-
if "openai" in serialized["id"]:
|
1165
|
-
name = f"OPENAI_API_CALL_{self.openai_count}"
|
1166
|
-
self.openai_count += 1
|
1167
|
-
elif "anthropic" in serialized["id"]:
|
1168
|
-
name = "ANTHROPIC_API_CALL"
|
1169
|
-
elif "together" in serialized["id"]:
|
1170
|
-
name = "TOGETHER_API_CALL"
|
1171
|
-
else:
|
1172
|
-
name = "LLM call"
|
1173
|
-
|
1174
|
-
self.start_span(name, span_type="llm")
|
1175
|
-
self.trace_client.record_input({
|
1176
|
-
'args': str(messages),
|
1177
|
-
'kwargs': kwargs
|
1178
|
-
})
|
1179
|
-
|
1180
|
-
judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
|
1181
|
-
"judgeval_callback_handler", default=None
|
1182
|
-
)
|
1183
|
-
|
1184
|
-
def set_global_handler(handler: JudgevalCallbackHandler):
|
1185
|
-
judgeval_callback_handler_var.set(handler)
|
1186
|
-
|
1187
|
-
def clear_global_handler():
|
1188
|
-
judgeval_callback_handler_var.set(None)
|
1189
|
-
|
1190
|
-
register_configure_hook(
|
1191
|
-
context_var=judgeval_callback_handler_var,
|
1192
|
-
inheritable=True,
|
1193
|
-
)
|
989
|
+
}
|
judgeval/data/__init__.py
CHANGED
@@ -2,7 +2,7 @@ from judgeval.data.example import Example, ExampleParams
|
|
2
2
|
from judgeval.data.api_example import ProcessExample, create_process_example
|
3
3
|
from judgeval.data.scorer_data import ScorerData, create_scorer_data
|
4
4
|
from judgeval.data.result import ScoringResult, generate_scoring_result
|
5
|
-
|
5
|
+
|
6
6
|
__all__ = [
|
7
7
|
"Example",
|
8
8
|
"ExampleParams",
|
@@ -12,5 +12,4 @@ __all__ = [
|
|
12
12
|
"create_scorer_data",
|
13
13
|
"ScoringResult",
|
14
14
|
"generate_scoring_result",
|
15
|
-
"CustomExample",
|
16
15
|
]
|
@@ -0,0 +1,316 @@
|
|
1
|
+
from typing import Any, Dict, List, Optional, Sequence
|
2
|
+
from uuid import UUID
|
3
|
+
import time
|
4
|
+
import uuid
|
5
|
+
from contextvars import ContextVar
|
6
|
+
from judgeval.common.tracer import TraceClient, TraceEntry, Tracer, SpanType
|
7
|
+
|
8
|
+
from langchain_core.language_models import BaseChatModel
|
9
|
+
from langchain_huggingface import ChatHuggingFace
|
10
|
+
from langchain_openai import ChatOpenAI
|
11
|
+
from langchain_anthropic import ChatAnthropic
|
12
|
+
from langchain_core.utils.function_calling import convert_to_openai_tool
|
13
|
+
from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
|
14
|
+
from langchain_core.agents import AgentAction, AgentFinish
|
15
|
+
from langchain_core.outputs import LLMResult
|
16
|
+
from langchain_core.tracers.context import register_configure_hook
|
17
|
+
from langchain_core.messages.ai import AIMessage
|
18
|
+
from langchain_core.messages.tool import ToolMessage
|
19
|
+
from langchain_core.messages.base import BaseMessage
|
20
|
+
from langchain_core.documents import Document
|
21
|
+
|
22
|
+
class JudgevalCallbackHandler(BaseCallbackHandler):
|
23
|
+
def __init__(self, tracer: Tracer):
|
24
|
+
self.tracer = tracer
|
25
|
+
self.trace_client = tracer.get_current_trace() if tracer.get_current_trace() else None
|
26
|
+
self.previous_spans = [] # stack of previous spans
|
27
|
+
self.finished = False
|
28
|
+
|
29
|
+
# Attributes for users to access
|
30
|
+
self.previous_node = None
|
31
|
+
self.executed_node_tools = []
|
32
|
+
self.executed_nodes = []
|
33
|
+
self.executed_tools = []
|
34
|
+
|
35
|
+
def start_span(self, name: str, span_type: SpanType = "span"):
|
36
|
+
start_time = time.time()
|
37
|
+
|
38
|
+
# Record span entry
|
39
|
+
self.trace_client.add_entry(TraceEntry(
|
40
|
+
type="enter",
|
41
|
+
function=name,
|
42
|
+
depth=self.trace_client.tracer.depth,
|
43
|
+
message=name,
|
44
|
+
timestamp=start_time,
|
45
|
+
span_type=span_type
|
46
|
+
))
|
47
|
+
|
48
|
+
self.trace_client.tracer.depth += 1
|
49
|
+
self.previous_spans.append(self.trace_client._current_span)
|
50
|
+
self.trace_client._current_span = name
|
51
|
+
self._start_time = start_time
|
52
|
+
|
53
|
+
def end_span(self, name: str, span_type: SpanType = "span"):
|
54
|
+
self.trace_client.tracer.depth -= 1
|
55
|
+
duration = time.time() - self._start_time
|
56
|
+
|
57
|
+
# Record span exit
|
58
|
+
self.trace_client.add_entry(TraceEntry(
|
59
|
+
type="exit",
|
60
|
+
function=name,
|
61
|
+
depth=self.trace_client.tracer.depth,
|
62
|
+
message=f"{name}",
|
63
|
+
timestamp=time.time(),
|
64
|
+
duration=duration,
|
65
|
+
span_type=span_type
|
66
|
+
))
|
67
|
+
self.trace_client._current_span = self.previous_spans.pop()
|
68
|
+
|
69
|
+
if self.trace_client.tracer.depth == 0:
|
70
|
+
# Save the trace if we are the root; this happens when users don't use any @observe decorators
|
71
|
+
self.trace_client.save(empty_save=False, overwrite=True)
|
72
|
+
self.trace_client._current_trace = None
|
73
|
+
|
74
|
+
def on_retriever_start(
|
75
|
+
self,
|
76
|
+
serialized: Optional[dict[str, Any]],
|
77
|
+
query: str,
|
78
|
+
*,
|
79
|
+
run_id: UUID,
|
80
|
+
parent_run_id: Optional[UUID] = None,
|
81
|
+
tags: Optional[list[str]] = None,
|
82
|
+
metadata: Optional[dict[str, Any]] = None,
|
83
|
+
**kwargs: Any,
|
84
|
+
) -> Any:
|
85
|
+
name = "RETRIEVER_CALL"
|
86
|
+
if serialized and "name" in serialized:
|
87
|
+
name = f"RETRIEVER_{serialized['name'].upper()}"
|
88
|
+
|
89
|
+
self.start_span(name, span_type="retriever")
|
90
|
+
self.trace_client.record_input({
|
91
|
+
'query': query,
|
92
|
+
'tags': tags,
|
93
|
+
'metadata': metadata,
|
94
|
+
'kwargs': kwargs
|
95
|
+
})
|
96
|
+
|
97
|
+
def on_retriever_end(
|
98
|
+
self,
|
99
|
+
documents: Sequence[Document],
|
100
|
+
*,
|
101
|
+
run_id: UUID,
|
102
|
+
parent_run_id: Optional[UUID] = None,
|
103
|
+
**kwargs: Any
|
104
|
+
) -> Any:
|
105
|
+
# Process the retrieved documents into a format suitable for logging
|
106
|
+
doc_summary = []
|
107
|
+
for i, doc in enumerate(documents):
|
108
|
+
# Extract key information from each document
|
109
|
+
doc_data = {
|
110
|
+
"index": i,
|
111
|
+
"page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
|
112
|
+
"metadata": doc.metadata
|
113
|
+
}
|
114
|
+
doc_summary.append(doc_data)
|
115
|
+
|
116
|
+
# Record the document data
|
117
|
+
self.trace_client.record_output({
|
118
|
+
"document_count": len(documents),
|
119
|
+
"documents": doc_summary
|
120
|
+
})
|
121
|
+
|
122
|
+
# End the retriever span
|
123
|
+
self.end_span(self.trace_client._current_span, span_type="retriever")
|
124
|
+
|
125
|
+
def on_chain_start(
|
126
|
+
self,
|
127
|
+
serialized: Dict[str, Any],
|
128
|
+
inputs: Dict[str, Any],
|
129
|
+
*,
|
130
|
+
run_id: UUID,
|
131
|
+
parent_run_id: Optional[UUID] = None,
|
132
|
+
tags: Optional[List[str]] = None,
|
133
|
+
metadata: Optional[Dict[str, Any]] = None,
|
134
|
+
**kwargs: Any
|
135
|
+
) -> None:
|
136
|
+
# If the user doesn't use any @observe decorators, the first action in LangGraph workflows seems to have this attribute, so we initialize our trace client here
|
137
|
+
if kwargs.get('name') == 'LangGraph':
|
138
|
+
if not self.trace_client:
|
139
|
+
trace_id = str(uuid.uuid4())
|
140
|
+
project = self.tracer.project_name
|
141
|
+
trace = TraceClient(self.tracer, trace_id, trace_id, project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
|
142
|
+
self.trace_client = trace
|
143
|
+
self.tracer._current_trace = trace # set the trace in the original tracer object
|
144
|
+
# Only save empty trace for the root call
|
145
|
+
self.trace_client.save(empty_save=True, overwrite=False)
|
146
|
+
|
147
|
+
self.start_span("LangGraph", span_type="Main Function")
|
148
|
+
|
149
|
+
node = metadata.get("langgraph_node")
|
150
|
+
if node != None and node != self.previous_node:
|
151
|
+
self.start_span(node, span_type="node")
|
152
|
+
self.executed_node_tools.append(node)
|
153
|
+
self.executed_nodes.append(node)
|
154
|
+
self.trace_client.record_input({
|
155
|
+
'args': inputs,
|
156
|
+
'kwargs': kwargs
|
157
|
+
})
|
158
|
+
self.previous_node = node
|
159
|
+
|
160
|
+
def on_chain_end(
|
161
|
+
self,
|
162
|
+
outputs: Dict[str, Any],
|
163
|
+
*,
|
164
|
+
run_id: UUID,
|
165
|
+
parent_run_id: Optional[UUID] = None,
|
166
|
+
tags: Optional[List[str]] = None,
|
167
|
+
**kwargs: Any,
|
168
|
+
) -> Any:
|
169
|
+
if outputs == "__end__":
|
170
|
+
self.finished = True
|
171
|
+
if tags is not None and any("graph:step" in tag for tag in tags):
|
172
|
+
self.trace_client.record_output(outputs)
|
173
|
+
self.end_span(self.trace_client._current_span, span_type="node")
|
174
|
+
|
175
|
+
if self.finished:
|
176
|
+
self.end_span(self.trace_client._current_span, span_type="Main Function")
|
177
|
+
|
178
|
+
def on_chain_error(
|
179
|
+
self,
|
180
|
+
error: BaseException,
|
181
|
+
*,
|
182
|
+
run_id: UUID,
|
183
|
+
parent_run_id: Optional[UUID] = None,
|
184
|
+
**kwargs: Any,
|
185
|
+
) -> Any:
|
186
|
+
print(f"Chain error: {error}")
|
187
|
+
self.trace_client.record_output(error)
|
188
|
+
self.end_span(self.trace_client._current_span, span_type="node")
|
189
|
+
|
190
|
+
def on_tool_start(
|
191
|
+
self,
|
192
|
+
serialized: Optional[dict[str, Any]],
|
193
|
+
input_str: str,
|
194
|
+
run_id: Optional[UUID] = None,
|
195
|
+
parent_run_id: Optional[UUID] = None,
|
196
|
+
inputs: Optional[dict[str, Any]] = None,
|
197
|
+
**kwargs: Any,
|
198
|
+
):
|
199
|
+
name = serialized["name"]
|
200
|
+
self.start_span(name, span_type="tool")
|
201
|
+
self.executed_node_tools.append(f"{self.previous_node}:{name}")
|
202
|
+
self.executed_tools.append(name)
|
203
|
+
self.trace_client.record_input({
|
204
|
+
'args': input_str,
|
205
|
+
'kwargs': kwargs
|
206
|
+
})
|
207
|
+
|
208
|
+
def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
|
209
|
+
self.trace_client.record_output(output)
|
210
|
+
self.end_span(self.trace_client._current_span, span_type="tool")
|
211
|
+
|
212
|
+
def on_tool_error(
|
213
|
+
self,
|
214
|
+
error: BaseException,
|
215
|
+
*,
|
216
|
+
run_id: UUID,
|
217
|
+
parent_run_id: Optional[UUID] = None,
|
218
|
+
**kwargs: Any,
|
219
|
+
) -> Any:
|
220
|
+
print(f"Tool error: {error}")
|
221
|
+
self.trace_client.record_output(error)
|
222
|
+
self.end_span(self.trace_client._current_span, span_type="tool")
|
223
|
+
|
224
|
+
def on_agent_action(
|
225
|
+
self,
|
226
|
+
action: AgentAction,
|
227
|
+
*,
|
228
|
+
run_id: UUID,
|
229
|
+
parent_run_id: Optional[UUID] = None,
|
230
|
+
**kwargs: Any,
|
231
|
+
) -> Any:
|
232
|
+
print(f"Agent action: {action}")
|
233
|
+
|
234
|
+
def on_agent_finish(
|
235
|
+
self,
|
236
|
+
finish: AgentFinish,
|
237
|
+
*,
|
238
|
+
run_id: UUID,
|
239
|
+
parent_run_id: Optional[UUID] = None,
|
240
|
+
**kwargs: Any,
|
241
|
+
) -> Any:
|
242
|
+
print(f"Agent finish: {finish}")
|
243
|
+
|
244
|
+
def on_llm_start(
|
245
|
+
self,
|
246
|
+
serialized: Optional[dict[str, Any]],
|
247
|
+
prompts: list[str],
|
248
|
+
*,
|
249
|
+
run_id: UUID,
|
250
|
+
parent_run_id: Optional[UUID] = None,
|
251
|
+
**kwargs: Any,
|
252
|
+
) -> Any:
|
253
|
+
name = "LLM call"
|
254
|
+
self.start_span(name, span_type="llm")
|
255
|
+
self.trace_client.record_input({
|
256
|
+
'args': prompts,
|
257
|
+
'kwargs': kwargs
|
258
|
+
})
|
259
|
+
|
260
|
+
def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
|
261
|
+
self.trace_client.record_output(response.generations[0][0].text)
|
262
|
+
self.end_span(self.trace_client._current_span, span_type="llm")
|
263
|
+
|
264
|
+
def on_llm_error(
|
265
|
+
self,
|
266
|
+
error: BaseException,
|
267
|
+
*,
|
268
|
+
run_id: UUID,
|
269
|
+
parent_run_id: Optional[UUID] = None,
|
270
|
+
**kwargs: Any,
|
271
|
+
) -> Any:
|
272
|
+
print(f"LLM error: {error}")
|
273
|
+
self.trace_client.record_output(error)
|
274
|
+
self.end_span(self.trace_client._current_span, span_type="llm")
|
275
|
+
|
276
|
+
def on_chat_model_start(
|
277
|
+
self,
|
278
|
+
serialized: Optional[dict[str, Any]],
|
279
|
+
messages: list[list[BaseMessage]],
|
280
|
+
*,
|
281
|
+
run_id: UUID,
|
282
|
+
parent_run_id: Optional[UUID] = None,
|
283
|
+
**kwargs: Any,
|
284
|
+
) -> Any:
|
285
|
+
|
286
|
+
if "openai" in serialized["id"]:
|
287
|
+
name = f"OPENAI_API_CALL"
|
288
|
+
elif "anthropic" in serialized["id"]:
|
289
|
+
name = "ANTHROPIC_API_CALL"
|
290
|
+
elif "together" in serialized["id"]:
|
291
|
+
name = "TOGETHER_API_CALL"
|
292
|
+
else:
|
293
|
+
name = "LLM call"
|
294
|
+
|
295
|
+
self.start_span(name, span_type="llm")
|
296
|
+
self.trace_client.record_input({
|
297
|
+
'args': str(messages),
|
298
|
+
'kwargs': kwargs
|
299
|
+
})
|
300
|
+
|
301
|
+
judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
|
302
|
+
"judgeval_callback_handler", default=None
|
303
|
+
)
|
304
|
+
|
305
|
+
def set_global_handler(handler: JudgevalCallbackHandler):
    """Store ``handler`` in ``judgeval_callback_handler_var``.

    Does nothing when the handler's tracer has ``enable_monitoring`` turned
    off; in that case the context variable is left unchanged.
    """
    monitoring_enabled = handler.tracer.enable_monitoring
    if monitoring_enabled:
        judgeval_callback_handler_var.set(handler)
|
309
|
+
|
310
|
+
def clear_global_handler():
    """Reset ``judgeval_callback_handler_var`` to ``None``."""
    no_handler = None
    judgeval_callback_handler_var.set(no_handler)
|
312
|
+
|
313
|
+
register_configure_hook(
|
314
|
+
context_var=judgeval_callback_handler_var,
|
315
|
+
inheritable=True,
|
316
|
+
)
|
@@ -0,0 +1,156 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: judgeval
|
3
|
+
Version: 0.0.25
|
4
|
+
Summary: Judgeval Package
|
5
|
+
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
|
6
|
+
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
|
7
|
+
Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
|
8
|
+
License-Expression: Apache-2.0
|
9
|
+
License-File: LICENSE.md
|
10
|
+
Classifier: Operating System :: OS Independent
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
12
|
+
Requires-Python: >=3.11
|
13
|
+
Requires-Dist: anthropic
|
14
|
+
Requires-Dist: fastapi
|
15
|
+
Requires-Dist: langchain
|
16
|
+
Requires-Dist: langchain-anthropic
|
17
|
+
Requires-Dist: langchain-core
|
18
|
+
Requires-Dist: langchain-huggingface
|
19
|
+
Requires-Dist: langchain-openai
|
20
|
+
Requires-Dist: litellm
|
21
|
+
Requires-Dist: nest-asyncio
|
22
|
+
Requires-Dist: openai
|
23
|
+
Requires-Dist: openpyxl
|
24
|
+
Requires-Dist: pandas
|
25
|
+
Requires-Dist: pika
|
26
|
+
Requires-Dist: python-dotenv==1.0.1
|
27
|
+
Requires-Dist: requests
|
28
|
+
Requires-Dist: supabase
|
29
|
+
Requires-Dist: together
|
30
|
+
Requires-Dist: uvicorn
|
31
|
+
Provides-Extra: dev
|
32
|
+
Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
|
33
|
+
Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
|
34
|
+
Requires-Dist: pytest>=8.3.4; extra == 'dev'
|
35
|
+
Requires-Dist: tavily-python; extra == 'dev'
|
36
|
+
Description-Content-Type: text/markdown
|
37
|
+
|
38
|
+
# Judgeval SDK
|
39
|
+
|
40
|
+
Judgeval is an open-source framework for building evaluation pipelines for multi-step agent workflows, supporting both real-time and experimental evaluation setups. To learn more about Judgment or sign up for free, visit our [website](https://www.judgmentlabs.ai/) or check out our [developer docs](https://judgment.mintlify.app/getting_started).
|
41
|
+
|
42
|
+
## Features
|
43
|
+
|
44
|
+
- **Development and Production Evaluation Layer**: Offers a robust evaluation layer for multi-step agent applications, including unit-testing and performance monitoring.
|
45
|
+
- **Plug-and-Evaluate**: Integrate LLM systems with 10+ research-backed metrics, including:
|
46
|
+
- Hallucination detection
|
47
|
+
- RAG retriever quality
|
48
|
+
- And more
|
49
|
+
- **Custom Evaluation Pipelines**: Construct powerful custom evaluation pipelines tailored for your LLM systems.
|
50
|
+
- **Monitoring in Production**: Utilize state-of-the-art real-time evaluation foundation models to monitor LLM systems effectively.
|
51
|
+
|
52
|
+
## Installation
|
53
|
+
|
54
|
+
```bash
|
55
|
+
pip install judgeval
|
56
|
+
```
|
57
|
+
|
58
|
+
## Quickstart: Evaluations
|
59
|
+
|
60
|
+
You can evaluate your workflow execution data to measure quality metrics such as hallucination.
|
61
|
+
|
62
|
+
Create a file named `evaluate.py` with the following code:
|
63
|
+
|
64
|
+
```python
|
65
|
+
from judgeval import JudgmentClient
|
66
|
+
from judgeval.data import Example
|
67
|
+
from judgeval.scorers import FaithfulnessScorer
|
68
|
+
|
69
|
+
client = JudgmentClient()
|
70
|
+
|
71
|
+
example = Example(
|
72
|
+
input="What if these shoes don't fit?",
|
73
|
+
actual_output="We offer a 30-day full refund at no extra cost.",
|
74
|
+
retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
|
75
|
+
)
|
76
|
+
|
77
|
+
scorer = FaithfulnessScorer(threshold=0.5)
|
78
|
+
results = client.run_evaluation(
|
79
|
+
examples=[example],
|
80
|
+
scorers=[scorer],
|
81
|
+
model="gpt-4o",
|
82
|
+
)
|
83
|
+
print(results)
|
84
|
+
```
|
85
|
+
Click [here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation.
|
86
|
+
|
87
|
+
## Quickstart: Traces
|
88
|
+
|
89
|
+
Track your workflow execution for full observability with just a few lines of code.
|
90
|
+
|
91
|
+
Create a file named `traces.py` with the following code:
|
92
|
+
|
93
|
+
```python
|
94
|
+
from judgeval.common.tracer import Tracer, wrap
|
95
|
+
from openai import OpenAI
|
96
|
+
|
97
|
+
client = wrap(OpenAI())
|
98
|
+
judgment = Tracer(project_name="my_project")
|
99
|
+
|
100
|
+
@judgment.observe(span_type="tool")
|
101
|
+
def my_tool():
|
102
|
+
return "Hello world!"
|
103
|
+
|
104
|
+
@judgment.observe(span_type="function")
|
105
|
+
def main():
|
106
|
+
task_input = my_tool()
|
107
|
+
res = client.chat.completions.create(
|
108
|
+
model="gpt-4o",
|
109
|
+
messages=[{"role": "user", "content": f"{task_input}"}]
|
110
|
+
)
|
111
|
+
return res.choices[0].message.content
|
112
|
+
```
|
113
|
+
Click [here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation.
|
114
|
+
|
115
|
+
## Quickstart: Online Evaluations
|
116
|
+
|
117
|
+
Apply performance monitoring to measure the quality of your systems in production, not just on historical data.
|
118
|
+
|
119
|
+
Using the same `traces.py` file we created earlier, add an online evaluation inside the traced function:
|
120
|
+
|
121
|
+
```python
|
122
|
+
from judgeval.common.tracer import Tracer, wrap
|
123
|
+
from judgeval.scorers import AnswerRelevancyScorer
|
124
|
+
from openai import OpenAI
|
125
|
+
|
126
|
+
client = wrap(OpenAI())
|
127
|
+
judgment = Tracer(project_name="my_project")
|
128
|
+
|
129
|
+
@judgment.observe(span_type="tool")
|
130
|
+
def my_tool():
|
131
|
+
return "Hello world!"
|
132
|
+
|
133
|
+
@judgment.observe(span_type="function")
|
134
|
+
def main():
|
135
|
+
task_input = my_tool()
|
136
|
+
res = client.chat.completions.create(
|
137
|
+
model="gpt-4o",
|
138
|
+
messages=[{"role": "user", "content": f"{task_input}"}]
|
139
|
+
).choices[0].message.content
|
140
|
+
|
141
|
+
judgment.get_current_trace().async_evaluate(
|
142
|
+
scorers=[AnswerRelevancyScorer(threshold=0.5)],
|
143
|
+
input=task_input,
|
144
|
+
actual_output=res,
|
145
|
+
model="gpt-4o"
|
146
|
+
)
|
147
|
+
|
148
|
+
return res
|
149
|
+
```
|
150
|
+
Click [here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation.
|
151
|
+
|
152
|
+
## Documentation and Demos
|
153
|
+
|
154
|
+
For more detailed documentation, please check out our [docs](https://judgment.mintlify.app/getting_started) and some of our [demo videos](https://www.youtube.com/@AlexShan-j3o) for reference!
|
155
|
+
|
156
|
+
##
|
@@ -8,19 +8,17 @@ judgeval/run_evaluation.py,sha256=YOzkyeWl-r3vaz0jB5nM-1VULi7ALmJ9_f58ENqexXk,23
|
|
8
8
|
judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
|
9
9
|
judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
|
10
10
|
judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
|
11
|
-
judgeval/common/tracer.py,sha256=
|
11
|
+
judgeval/common/tracer.py,sha256=cc_K1poBg3Vzl2Nf7yhHlklrOe6Fb_TEekvjAVAQFSc,39958
|
12
12
|
judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
|
13
|
-
judgeval/data/__init__.py,sha256=
|
13
|
+
judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
|
14
14
|
judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
|
15
|
-
judgeval/data/custom_example.py,sha256=C-j9iVenBy52dwnL6PIjJAdKsBO1ajKjsaRr4RJthUo,3676
|
16
15
|
judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
|
17
|
-
judgeval/data/ground_truth.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
16
|
judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
|
19
17
|
judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
|
20
18
|
judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
|
21
19
|
judgeval/data/datasets/dataset.py,sha256=DjJNy-qvviXMGBl_JhiBzvgiJH1_3rYtAWeHP6Daw6E,11897
|
22
20
|
judgeval/data/datasets/eval_dataset_client.py,sha256=B4bRy0Di2oFlaBbvp4_hRx2g_9e6Cs0y3ZUT9reMyhw,10926
|
23
|
-
judgeval/
|
21
|
+
judgeval/integrations/langgraph.py,sha256=yBbZrePkY19dLLgleeIYFVzakEPaiko6YuccLbwSYcE,10957
|
24
22
|
judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
|
25
23
|
judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
|
26
24
|
judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
|
@@ -89,7 +87,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
|
|
89
87
|
judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
|
90
88
|
judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
|
91
89
|
judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
|
92
|
-
judgeval-0.0.
|
93
|
-
judgeval-0.0.
|
94
|
-
judgeval-0.0.
|
95
|
-
judgeval-0.0.
|
90
|
+
judgeval-0.0.25.dist-info/METADATA,sha256=09S16QU5qwYqwvrsdg36KVvv9-tnVcSKccgDldPqWpQ,5418
|
91
|
+
judgeval-0.0.25.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
92
|
+
judgeval-0.0.25.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
|
93
|
+
judgeval-0.0.25.dist-info/RECORD,,
|
judgeval/data/custom_example.py
DELETED
@@ -1,98 +0,0 @@
|
|
1
|
-
from pydantic import BaseModel, Field, field_validator
|
2
|
-
from typing import Optional, Dict, Any
|
3
|
-
from uuid import uuid4
|
4
|
-
from datetime import datetime
|
5
|
-
import json
|
6
|
-
import warnings
|
7
|
-
|
8
|
-
# Brainstorming what are the requirements for the fields?
|
9
|
-
class CustomExample(BaseModel):
|
10
|
-
name: Optional[str] = None
|
11
|
-
additional_metadata: Optional[Dict[str, Any]] = None
|
12
|
-
example_id: str = Field(default_factory=lambda: str(uuid4()))
|
13
|
-
example_index: Optional[int] = None
|
14
|
-
timestamp: Optional[str] = None
|
15
|
-
trace_id: Optional[str] = None
|
16
|
-
|
17
|
-
model_config = {
|
18
|
-
"extra": "allow", # Allow extra fields with any types
|
19
|
-
}
|
20
|
-
|
21
|
-
def __init__(self, **data):
|
22
|
-
if 'example_id' not in data:
|
23
|
-
data['example_id'] = str(uuid4())
|
24
|
-
# Set timestamp if not provided
|
25
|
-
if 'timestamp' not in data:
|
26
|
-
data['timestamp'] = datetime.now().isoformat()
|
27
|
-
super().__init__(**data)
|
28
|
-
|
29
|
-
@field_validator('additional_metadata', mode='before')
|
30
|
-
@classmethod
|
31
|
-
def validate_additional_metadata(cls, v):
|
32
|
-
if v is not None and not isinstance(v, dict):
|
33
|
-
raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
|
34
|
-
return v
|
35
|
-
|
36
|
-
@field_validator('example_index', mode='before')
|
37
|
-
@classmethod
|
38
|
-
def validate_example_index(cls, v):
|
39
|
-
if v is not None and not isinstance(v, int):
|
40
|
-
raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
|
41
|
-
return v
|
42
|
-
|
43
|
-
@field_validator('timestamp', mode='before')
|
44
|
-
@classmethod
|
45
|
-
def validate_timestamp(cls, v):
|
46
|
-
if v is not None and not isinstance(v, str):
|
47
|
-
raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
|
48
|
-
return v
|
49
|
-
|
50
|
-
@field_validator('trace_id', mode='before')
|
51
|
-
@classmethod
|
52
|
-
def validate_trace_id(cls, v):
|
53
|
-
if v is not None and not isinstance(v, str):
|
54
|
-
raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
|
55
|
-
return v
|
56
|
-
|
57
|
-
def to_dict(self):
|
58
|
-
return self.model_dump()
|
59
|
-
|
60
|
-
def __str__(self):
|
61
|
-
return str(self.model_dump())
|
62
|
-
|
63
|
-
def model_dump(self, **kwargs):
|
64
|
-
"""
|
65
|
-
Custom serialization that handles special cases for fields that might fail standard serialization.
|
66
|
-
"""
|
67
|
-
data = super().model_dump(**kwargs)
|
68
|
-
|
69
|
-
# Get all fields including custom ones
|
70
|
-
all_fields = self.__dict__
|
71
|
-
|
72
|
-
for field_name, value in all_fields.items():
|
73
|
-
try:
|
74
|
-
# Check if the field has its own serialization method
|
75
|
-
if hasattr(value, 'to_dict'):
|
76
|
-
data[field_name] = value.to_dict()
|
77
|
-
elif hasattr(value, 'model_dump'):
|
78
|
-
data[field_name] = value.model_dump()
|
79
|
-
# Field is already in data from super().model_dump()
|
80
|
-
elif field_name in data:
|
81
|
-
continue
|
82
|
-
else:
|
83
|
-
# Try standard JSON serialization
|
84
|
-
json.dumps(value)
|
85
|
-
data[field_name] = value
|
86
|
-
except (TypeError, OverflowError, ValueError):
|
87
|
-
# Handle non-serializable objects
|
88
|
-
try:
|
89
|
-
# Try converting to string
|
90
|
-
data[field_name] = str(value)
|
91
|
-
except Exception as _:
|
92
|
-
# If all else fails, store as None and optionally warn
|
93
|
-
warnings.warn(f"Could not serialize field {field_name}, setting to None")
|
94
|
-
data[field_name] = None
|
95
|
-
|
96
|
-
return data
|
97
|
-
|
98
|
-
|
judgeval/data/datasets/utils.py
DELETED
File without changes
|
judgeval/data/ground_truth.py
DELETED
File without changes
|
@@ -1,40 +0,0 @@
|
|
1
|
-
Metadata-Version: 2.4
|
2
|
-
Name: judgeval
|
3
|
-
Version: 0.0.23
|
4
|
-
Summary: Judgeval Package
|
5
|
-
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
|
6
|
-
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
|
7
|
-
Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
|
8
|
-
License-Expression: Apache-2.0
|
9
|
-
License-File: LICENSE.md
|
10
|
-
Classifier: Operating System :: OS Independent
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
12
|
-
Requires-Python: >=3.11
|
13
|
-
Requires-Dist: anthropic
|
14
|
-
Requires-Dist: fastapi
|
15
|
-
Requires-Dist: langchain
|
16
|
-
Requires-Dist: langchain-anthropic
|
17
|
-
Requires-Dist: langchain-core
|
18
|
-
Requires-Dist: langchain-huggingface
|
19
|
-
Requires-Dist: langchain-openai
|
20
|
-
Requires-Dist: litellm
|
21
|
-
Requires-Dist: nest-asyncio
|
22
|
-
Requires-Dist: openai
|
23
|
-
Requires-Dist: openpyxl
|
24
|
-
Requires-Dist: pandas
|
25
|
-
Requires-Dist: pika
|
26
|
-
Requires-Dist: python-dotenv==1.0.1
|
27
|
-
Requires-Dist: requests
|
28
|
-
Requires-Dist: supabase
|
29
|
-
Requires-Dist: together
|
30
|
-
Requires-Dist: uvicorn
|
31
|
-
Provides-Extra: dev
|
32
|
-
Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
|
33
|
-
Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
|
34
|
-
Requires-Dist: pytest>=8.3.4; extra == 'dev'
|
35
|
-
Requires-Dist: tavily-python; extra == 'dev'
|
36
|
-
Description-Content-Type: text/markdown
|
37
|
-
|
38
|
-
# judgeval
|
39
|
-
|
40
|
-
Judgeval is an open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.
|
File without changes
|
File without changes
|