judgeval 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
judgeval/common/tracer.py CHANGED
@@ -10,16 +10,12 @@ import os
  import time
  import uuid
  import warnings
- from contextvars import ContextVar
  from contextlib import contextmanager
- from collections import defaultdict
  from dataclasses import dataclass, field
  from datetime import datetime
  from http import HTTPStatus
  from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union
  from rich import print as rprint
- from uuid import UUID
- from collections.abc import Sequence

  # Third-party imports
  import pika
@@ -48,19 +44,6 @@ from judgeval.rules import Rule
  from judgeval.evaluation_run import EvaluationRun
  from judgeval.data.result import ScoringResult

- from langchain_core.language_models import BaseChatModel
- from langchain_huggingface import ChatHuggingFace
- from langchain_openai import ChatOpenAI
- from langchain_anthropic import ChatAnthropic
- from langchain_core.utils.function_calling import convert_to_openai_tool
- from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
- from langchain_core.agents import AgentAction, AgentFinish
- from langchain_core.outputs import LLMResult
- from langchain_core.tracers.context import register_configure_hook
- from langchain_core.messages.ai import AIMessage
- from langchain_core.messages.tool import ToolMessage
- from langchain_core.messages.base import BaseMessage
- from langchain_core.documents import Document

  # Define type aliases for better code readability and maintainability
  ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients
@@ -125,8 +108,7 @@ class TraceEntry:
  if self._is_json_serializable(value):
  serialized_inputs[key] = value
  else:
- warnings.warn(f"Input '{key}' for function {self.function} is not JSON serializable. Setting to None.")
- serialized_inputs[key] = None
+ serialized_inputs[key] = self.safe_stringify(value, self.function)
  return serialized_inputs

  def _is_json_serializable(self, obj: Any) -> bool:
@@ -137,6 +119,25 @@ class TraceEntry:
  except (TypeError, OverflowError, ValueError):
  return False

+ def safe_stringify(self, output, function_name):
+ """
+ Safely converts an object to a string or repr, handling serialization issues gracefully.
+ """
+ try:
+ return str(output)
+ except (TypeError, OverflowError, ValueError):
+ pass
+
+ try:
+ return repr(output)
+ except (TypeError, OverflowError, ValueError):
+ pass
+
+ warnings.warn(
+ f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
+ )
+ return None
+
  def to_dict(self) -> dict:
  """Convert the trace entry to a dictionary format for storage/transmission."""
  return {
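The promoted `safe_stringify` method falls back from `str()` to `repr()` before giving up with a warning. A minimal sketch of the resulting behavior; the `Opaque` class and the `entry` instance are hypothetical, purely to illustrate the fallback chain:

```python
# Hypothetical illustration of TraceEntry.safe_stringify's fallback chain.
class Opaque:
    def __str__(self):
        raise TypeError("cannot stringify")  # forces the str() branch to fail
    def __repr__(self):
        return "<Opaque>"

# Assuming `entry` is a TraceEntry:
# entry.safe_stringify(Opaque(), "my_tool")   -> "<Opaque>"  (str() failed, repr() succeeded)
# entry.safe_stringify({1, 2, 3}, "my_tool")  -> "{1, 2, 3}" (str() works even though sets are not JSON serializable)
```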
@@ -160,25 +161,6 @@ class TraceEntry:
  - We try to serialize into JSON, then string, then the base representation (__repr__)
  - Non-serializable objects return None with a warning
  """
-
- def safe_stringify(output, function_name):
- """
- Safely converts an object to a string or repr, handling serialization issues gracefully.
- """
- try:
- return str(output)
- except (TypeError, OverflowError, ValueError):
- pass
-
- try:
- return repr(output)
- except (TypeError, OverflowError, ValueError):
- pass
-
- warnings.warn(
- f"Output for function {function_name} is not JSON serializable and could not be converted to string. Setting to None."
- )
- return None

  if isinstance(self.output, BaseModel):
  return self.output.model_dump()
@@ -188,7 +170,7 @@ class TraceEntry:
  json.dumps(self.output)
  return self.output
  except (TypeError, OverflowError, ValueError):
- return safe_stringify(self.output, self.function)
+ return self.safe_stringify(self.output, self.function)


  class TraceManagerClient:
@@ -331,6 +313,8 @@ class TraceClient:
  project_name: str = "default_project",
  overwrite: bool = False,
  rules: Optional[List[Rule]] = None,
+ enable_monitoring: bool = True,
+ enable_evaluations: bool = True
  ):
  self.name = name
  self.trace_id = trace_id or str(uuid.uuid4())
@@ -339,6 +323,8 @@ class TraceClient:
  self.tracer = tracer
  # Initialize rules with either provided rules or an empty list
  self.rules = rules or []
+ self.enable_monitoring = enable_monitoring
+ self.enable_evaluations = enable_evaluations

  self.client: JudgmentClient = tracer.client
  self.entries: List[TraceEntry] = []
@@ -399,6 +385,9 @@ class TraceClient:
  model: Optional[str] = None,
  log_results: Optional[bool] = True
  ):
+ if not self.enable_evaluations:
+ return
+
  start_time = time.time() # Record start time
  example = Example(
  input=input,
@@ -698,7 +687,10 @@ class Tracer:
  api_key: str = os.getenv("JUDGMENT_API_KEY"),
  project_name: str = "default_project",
  rules: Optional[List[Rule]] = None, # Added rules parameter
- organization_id: str = os.getenv("JUDGMENT_ORG_ID")):
+ organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
+ enable_monitoring: bool = os.getenv("JUDGMENT_MONITORING", "true").lower() == "true",
+ enable_evaluations: bool = os.getenv("JUDGMENT_EVALUATIONS", "true").lower() == "true"
+ ):
  if not hasattr(self, 'initialized'):
  if not api_key:
  raise ValueError("Tracer must be configured with a Judgment API key")
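Note that both toggles are read from the environment as default argument values, so the `os.getenv` calls run when `judgeval.common.tracer` is first imported; to drive them via `JUDGMENT_MONITORING` / `JUDGMENT_EVALUATIONS`, the variables must be set before that import. A hedged sketch (the project name is illustrative):

```python
import os

# Must be set before judgeval.common.tracer is imported, because the
# os.getenv(...) defaults above are evaluated at module import time.
os.environ["JUDGMENT_MONITORING"] = "false"   # any value other than "true" disables
os.environ["JUDGMENT_EVALUATIONS"] = "false"

from judgeval.common.tracer import Tracer

# The flags can also be passed explicitly, which sidesteps the
# import-order concern entirely:
judgment = Tracer(project_name="my_project",
                  enable_monitoring=False,
                  enable_evaluations=False)
```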
@@ -714,6 +706,8 @@ class Tracer:
  self._current_trace: Optional[str] = None
  self.rules: List[Rule] = rules or [] # Store rules at tracer level
  self.initialized: bool = True
+ self.enable_monitoring: bool = enable_monitoring
+ self.enable_evaluations: bool = enable_evaluations
  elif hasattr(self, 'project_name') and self.project_name != project_name:
  warnings.warn(
  f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
@@ -740,7 +734,9 @@ class Tracer:
  name,
  project_name=project,
  overwrite=overwrite,
- rules=self.rules # Pass combined rules to the trace client
+ rules=self.rules, # Pass combined rules to the trace client
+ enable_monitoring=self.enable_monitoring,
+ enable_evaluations=self.enable_evaluations
  )
  prev_trace = self._current_trace
  self._current_trace = trace
@@ -771,6 +767,9 @@ class Tracer:
  project_name: Optional project name override
  overwrite: Whether to overwrite existing traces
  """
+ if not self.enable_monitoring:
+ return
+
  if func is None:
  return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)

@@ -787,7 +786,7 @@ class Tracer:
  trace_id = str(uuid.uuid4())
  trace_name = func.__name__
  project = project_name if project_name is not None else self.project_name
- trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
+ trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules, enable_monitoring=self.enable_monitoring, enable_evaluations=self.enable_evaluations)
  self._current_trace = trace
  # Only save empty trace for the root call
  trace.save(empty_save=True, overwrite=overwrite)
@@ -824,7 +823,7 @@ class Tracer:
  trace_id = str(uuid.uuid4())
  trace_name = func.__name__
  project = project_name if project_name is not None else self.project_name
- trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules)
+ trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite, rules=self.rules, enable_monitoring=self.enable_monitoring)
  self._current_trace = trace
  # Only save empty trace for the root call
  trace.save(empty_save=True, overwrite=overwrite)
@@ -872,6 +871,11 @@ class Tracer:
  self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model="gpt-4o-mini", log_results=True)
  return wrapper

+ def async_evaluate(self, *args, **kwargs):
+ if self._current_trace:
+ self._current_trace.async_evaluate(*args, **kwargs)
+ else:
+ warnings.warn("No trace found, skipping evaluation")


  def wrap(client: Any) -> Any:
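The new `Tracer.async_evaluate` forwards to whatever trace is currently open (and warns if there is none), so instrumented code can request an online evaluation without holding a `TraceClient` reference. A minimal sketch of the calling pattern, with the scorer choice and strings purely illustrative:

```python
from judgeval.common.tracer import Tracer
from judgeval.scorers import AnswerRelevancyScorer  # scorer choice is illustrative

judgment = Tracer(project_name="my_project")

@judgment.observe(span_type="function")
def answer(question: str) -> str:
    response = "We offer a 30-day full refund."  # stand-in for a real LLM call
    # Delegates to the active trace; emits a warning and skips if no trace is open.
    judgment.async_evaluate(
        scorers=[AnswerRelevancyScorer(threshold=0.5)],
        input=question,
        actual_output=response,
        model="gpt-4o",
    )
    return response
```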
@@ -982,212 +986,4 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
  "output_tokens": response.usage.output_tokens,
  "total_tokens": response.usage.input_tokens + response.usage.output_tokens
  }
- }
-
- class JudgevalCallbackHandler(BaseCallbackHandler):
- def __init__(self, trace_client: TraceClient):
- self.trace_client = trace_client
- self.previous_node = "__start__"
- self.executed_node_tools = []
- self.executed_nodes = []
- self.executed_tools = []
- self.openai_count = 1
-
- def start_span(self, name: str, span_type: SpanType = "span"):
- start_time = time.time()
-
- # Record span entry
- self.trace_client.add_entry(TraceEntry(
- type="enter",
- function=name,
- depth=self.trace_client.tracer.depth,
- message=name,
- timestamp=start_time,
- span_type=span_type
- ))
-
- self.trace_client.tracer.depth += 1
- self.trace_client.prev_span = self.trace_client._current_span
- self.trace_client._current_span = name
- self._start_time = start_time
-
- def end_span(self, name: str, span_type: SpanType = "span"):
- self.trace_client.tracer.depth -= 1
- duration = time.time() - self._start_time
-
- # Record span exit
- self.trace_client.add_entry(TraceEntry(
- type="exit",
- function=name,
- depth=self.trace_client.tracer.depth,
- message=f"← {name}",
- timestamp=time.time(),
- duration=duration,
- span_type=span_type
- ))
- self.trace_client._current_span = self.trace_client.prev_span
-
- def on_retriever_start(
- self,
- serialized: Optional[dict[str, Any]],
- query: str,
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- tags: Optional[list[str]] = None,
- metadata: Optional[dict[str, Any]] = None,
- **kwargs: Any,
- ) -> Any:
- name = "RETRIEVER_CALL"
- if serialized and "name" in serialized:
- name = f"RETRIEVER_{serialized['name'].upper()}"
-
- self.start_span(name, span_type="retriever")
- self.trace_client.record_input({
- 'query': query,
- 'tags': tags,
- 'metadata': metadata,
- 'kwargs': kwargs
- })
-
- def on_retriever_end(
- self,
- documents: Sequence[Document],
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- **kwargs: Any
- ) -> Any:
- # Process the retrieved documents into a format suitable for logging
- doc_summary = []
- for i, doc in enumerate(documents):
- # Extract key information from each document
- doc_data = {
- "index": i,
- "page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
- "metadata": doc.metadata
- }
- doc_summary.append(doc_data)
-
- # Record the document data
- self.trace_client.record_output({
- "document_count": len(documents),
- "documents": doc_summary
- })
-
- # End the retriever span
- self.end_span(self.trace_client._current_span, span_type="retriever")
-
- def on_chain_start(
- self,
- serialized: Dict[str, Any],
- inputs: Dict[str, Any],
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- tags: Optional[List[str]] = None,
- metadata: Optional[Dict[str, Any]] = None,
- **kwargs: Any
- ) -> None:
- node = metadata.get("langgraph_node")
- if node != None and node != "__start__" and node != self.previous_node:
- self.executed_node_tools.append(node)
- self.executed_nodes.append(node)
- self.previous_node = node
-
- def on_tool_start(
- self,
- serialized: Optional[dict[str, Any]],
- input_str: str,
- run_id: Optional[UUID] = None,
- parent_run_id: Optional[UUID] = None,
- inputs: Optional[dict[str, Any]] = None,
- **kwargs: Any,
- ):
- name = serialized["name"]
- self.start_span(name, span_type="tool")
- self.executed_node_tools.append(f"{self.previous_node}:{name}")
- self.executed_tools.append(name)
- self.trace_client.record_input({
- 'args': input_str,
- 'kwargs': kwargs
- })
-
- def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
- self.trace_client.record_output(output)
- self.end_span(self.trace_client._current_span, span_type="tool")
-
- def on_agent_action (self, action: AgentAction, **kwargs: Any) -> Any:
- print(f"Agent action: {action}")
-
- def on_agent_finish(
- self,
- finish: AgentFinish,
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- tags: Optional[list[str]] = None,
- **kwargs: Any,
- ) -> None:
- print(f"Agent action: {finish}")
-
- def on_llm_start(
- self,
- serialized: Optional[dict[str, Any]],
- prompts: list[str],
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- **kwargs: Any,
- ) -> Any:
- name = "LLM call"
- self.start_span(name, span_type="llm")
- self.trace_client.record_input({
- 'args': prompts,
- 'kwargs': kwargs
- })
-
- def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
- self.trace_client.record_output(response.generations[0][0].text)
- self.end_span(self.trace_client._current_span, span_type="llm")
-
- def on_chat_model_start(
- self,
- serialized: Optional[dict[str, Any]],
- messages: list[list[BaseMessage]],
- *,
- run_id: UUID,
- parent_run_id: Optional[UUID] = None,
- **kwargs: Any,
- ) -> Any:
-
- if "openai" in serialized["id"]:
- name = f"OPENAI_API_CALL_{self.openai_count}"
- self.openai_count += 1
- elif "anthropic" in serialized["id"]:
- name = "ANTHROPIC_API_CALL"
- elif "together" in serialized["id"]:
- name = "TOGETHER_API_CALL"
- else:
- name = "LLM call"
-
- self.start_span(name, span_type="llm")
- self.trace_client.record_input({
- 'args': str(messages),
- 'kwargs': kwargs
- })
-
- judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
- "judgeval_callback_handler", default=None
- )
-
- def set_global_handler(handler: JudgevalCallbackHandler):
- judgeval_callback_handler_var.set(handler)
-
- def clear_global_handler():
- judgeval_callback_handler_var.set(None)
-
- register_configure_hook(
- context_var=judgeval_callback_handler_var,
- inheritable=True,
- )
+ }
judgeval/data/__init__.py CHANGED
@@ -2,7 +2,7 @@ from judgeval.data.example import Example, ExampleParams
  from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result
- from judgeval.data.custom_example import CustomExample
+
  __all__ = [
  "Example",
  "ExampleParams",
@@ -12,5 +12,4 @@ __all__ = [
  "create_scorer_data",
  "ScoringResult",
  "generate_scoring_result",
- "CustomExample",
  ]
judgeval/integrations/langgraph.py ADDED
@@ -0,0 +1,316 @@
+ from typing import Any, Dict, List, Optional, Sequence
+ from uuid import UUID
+ import time
+ import uuid
+ from contextvars import ContextVar
+ from judgeval.common.tracer import TraceClient, TraceEntry, Tracer, SpanType
+
+ from langchain_core.language_models import BaseChatModel
+ from langchain_huggingface import ChatHuggingFace
+ from langchain_openai import ChatOpenAI
+ from langchain_anthropic import ChatAnthropic
+ from langchain_core.utils.function_calling import convert_to_openai_tool
+ from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
+ from langchain_core.agents import AgentAction, AgentFinish
+ from langchain_core.outputs import LLMResult
+ from langchain_core.tracers.context import register_configure_hook
+ from langchain_core.messages.ai import AIMessage
+ from langchain_core.messages.tool import ToolMessage
+ from langchain_core.messages.base import BaseMessage
+ from langchain_core.documents import Document
+
+ class JudgevalCallbackHandler(BaseCallbackHandler):
+ def __init__(self, tracer: Tracer):
+ self.tracer = tracer
+ self.trace_client = tracer.get_current_trace() if tracer.get_current_trace() else None
+ self.previous_spans = [] # stack of previous spans
+ self.finished = False
+
+ # Attributes for users to access
+ self.previous_node = None
+ self.executed_node_tools = []
+ self.executed_nodes = []
+ self.executed_tools = []
+
+ def start_span(self, name: str, span_type: SpanType = "span"):
+ start_time = time.time()
+
+ # Record span entry
+ self.trace_client.add_entry(TraceEntry(
+ type="enter",
+ function=name,
+ depth=self.trace_client.tracer.depth,
+ message=name,
+ timestamp=start_time,
+ span_type=span_type
+ ))
+
+ self.trace_client.tracer.depth += 1
+ self.previous_spans.append(self.trace_client._current_span)
+ self.trace_client._current_span = name
+ self._start_time = start_time
+
+ def end_span(self, name: str, span_type: SpanType = "span"):
+ self.trace_client.tracer.depth -= 1
+ duration = time.time() - self._start_time
+
+ # Record span exit
+ self.trace_client.add_entry(TraceEntry(
+ type="exit",
+ function=name,
+ depth=self.trace_client.tracer.depth,
+ message=f"{name}",
+ timestamp=time.time(),
+ duration=duration,
+ span_type=span_type
+ ))
+ self.trace_client._current_span = self.previous_spans.pop()
+
+ if self.trace_client.tracer.depth == 0:
+ # Save the trace if we are the root; this happens when users don't use any @observe decorators
+ self.trace_client.save(empty_save=False, overwrite=True)
+ self.trace_client._current_trace = None
+
+ def on_retriever_start(
+ self,
+ serialized: Optional[dict[str, Any]],
+ query: str,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ tags: Optional[list[str]] = None,
+ metadata: Optional[dict[str, Any]] = None,
+ **kwargs: Any,
+ ) -> Any:
+ name = "RETRIEVER_CALL"
+ if serialized and "name" in serialized:
+ name = f"RETRIEVER_{serialized['name'].upper()}"
+
+ self.start_span(name, span_type="retriever")
+ self.trace_client.record_input({
+ 'query': query,
+ 'tags': tags,
+ 'metadata': metadata,
+ 'kwargs': kwargs
+ })
+
+ def on_retriever_end(
+ self,
+ documents: Sequence[Document],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any
+ ) -> Any:
+ # Process the retrieved documents into a format suitable for logging
+ doc_summary = []
+ for i, doc in enumerate(documents):
+ # Extract key information from each document
+ doc_data = {
+ "index": i,
+ "page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
+ "metadata": doc.metadata
+ }
+ doc_summary.append(doc_data)
+
+ # Record the document data
+ self.trace_client.record_output({
+ "document_count": len(documents),
+ "documents": doc_summary
+ })
+
+ # End the retriever span
+ self.end_span(self.trace_client._current_span, span_type="retriever")
+
+ def on_chain_start(
+ self,
+ serialized: Dict[str, Any],
+ inputs: Dict[str, Any],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ tags: Optional[List[str]] = None,
+ metadata: Optional[Dict[str, Any]] = None,
+ **kwargs: Any
+ ) -> None:
+ # If the user doesn't use any @observe decorators, the first action in LangGraph workflows seems to have this attribute, so we initialize our trace client here
+ if kwargs.get('name') == 'LangGraph':
+ if not self.trace_client:
+ trace_id = str(uuid.uuid4())
+ project = self.tracer.project_name
+ trace = TraceClient(self.tracer, trace_id, trace_id, project_name=project, overwrite=False, rules=self.tracer.rules, enable_monitoring=self.tracer.enable_monitoring, enable_evaluations=self.tracer.enable_evaluations)
+ self.trace_client = trace
+ self.tracer._current_trace = trace # set the trace in the original tracer object
+ # Only save empty trace for the root call
+ self.trace_client.save(empty_save=True, overwrite=False)
+
+ self.start_span("LangGraph", span_type="Main Function")
+
+ node = metadata.get("langgraph_node")
+ if node != None and node != self.previous_node:
+ self.start_span(node, span_type="node")
+ self.executed_node_tools.append(node)
+ self.executed_nodes.append(node)
+ self.trace_client.record_input({
+ 'args': inputs,
+ 'kwargs': kwargs
+ })
+ self.previous_node = node
+
+ def on_chain_end(
+ self,
+ outputs: Dict[str, Any],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ tags: Optional[List[str]] = None,
+ **kwargs: Any,
+ ) -> Any:
+ if outputs == "__end__":
+ self.finished = True
+ if tags is not None and any("graph:step" in tag for tag in tags):
+ self.trace_client.record_output(outputs)
+ self.end_span(self.trace_client._current_span, span_type="node")
+
+ if self.finished:
+ self.end_span(self.trace_client._current_span, span_type="Main Function")
+
+ def on_chain_error(
+ self,
+ error: BaseException,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"Chain error: {error}")
+ self.trace_client.record_output(error)
+ self.end_span(self.trace_client._current_span, span_type="node")
+
+ def on_tool_start(
+ self,
+ serialized: Optional[dict[str, Any]],
+ input_str: str,
+ run_id: Optional[UUID] = None,
+ parent_run_id: Optional[UUID] = None,
+ inputs: Optional[dict[str, Any]] = None,
+ **kwargs: Any,
+ ):
+ name = serialized["name"]
+ self.start_span(name, span_type="tool")
+ self.executed_node_tools.append(f"{self.previous_node}:{name}")
+ self.executed_tools.append(name)
+ self.trace_client.record_input({
+ 'args': input_str,
+ 'kwargs': kwargs
+ })
+
+ def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
+ self.trace_client.record_output(output)
+ self.end_span(self.trace_client._current_span, span_type="tool")
+
+ def on_tool_error(
+ self,
+ error: BaseException,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"Tool error: {error}")
+ self.trace_client.record_output(error)
+ self.end_span(self.trace_client._current_span, span_type="tool")
+
+ def on_agent_action(
+ self,
+ action: AgentAction,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"Agent action: {action}")
+
+ def on_agent_finish(
+ self,
+ finish: AgentFinish,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"Agent finish: {finish}")
+
+ def on_llm_start(
+ self,
+ serialized: Optional[dict[str, Any]],
+ prompts: list[str],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ name = "LLM call"
+ self.start_span(name, span_type="llm")
+ self.trace_client.record_input({
+ 'args': prompts,
+ 'kwargs': kwargs
+ })
+
+ def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
+ self.trace_client.record_output(response.generations[0][0].text)
+ self.end_span(self.trace_client._current_span, span_type="llm")
+
+ def on_llm_error(
+ self,
+ error: BaseException,
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+ print(f"LLM error: {error}")
+ self.trace_client.record_output(error)
+ self.end_span(self.trace_client._current_span, span_type="llm")
+
+ def on_chat_model_start(
+ self,
+ serialized: Optional[dict[str, Any]],
+ messages: list[list[BaseMessage]],
+ *,
+ run_id: UUID,
+ parent_run_id: Optional[UUID] = None,
+ **kwargs: Any,
+ ) -> Any:
+
+ if "openai" in serialized["id"]:
+ name = f"OPENAI_API_CALL"
+ elif "anthropic" in serialized["id"]:
+ name = "ANTHROPIC_API_CALL"
+ elif "together" in serialized["id"]:
+ name = "TOGETHER_API_CALL"
+ else:
+ name = "LLM call"
+
+ self.start_span(name, span_type="llm")
+ self.trace_client.record_input({
+ 'args': str(messages),
+ 'kwargs': kwargs
+ })
+
+ judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
+ "judgeval_callback_handler", default=None
+ )
+
+ def set_global_handler(handler: JudgevalCallbackHandler):
+ if not handler.tracer.enable_monitoring:
+ return
+ judgeval_callback_handler_var.set(handler)
+
+ def clear_global_handler():
+ judgeval_callback_handler_var.set(None)
+
+ register_configure_hook(
+ context_var=judgeval_callback_handler_var,
+ inheritable=True,
+ )
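Because `on_chain_start` bootstraps its own `TraceClient` when none is active, the relocated handler can be used without any `@observe` decorators. A hedged usage sketch; graph construction is omitted and the project name is illustrative:

```python
from judgeval.common.tracer import Tracer
from judgeval.integrations.langgraph import JudgevalCallbackHandler, set_global_handler

judgment = Tracer(project_name="my_project")
handler = JudgevalCallbackHandler(judgment)

# Register for all LangChain runs in this context; per the code above,
# this is a no-op when monitoring is disabled on the tracer.
set_global_handler(handler)

# Alternatively, pass the handler per invocation through LangChain's
# standard config mechanism, e.g.:
#   graph.invoke({"messages": [...]}, config={"callbacks": [handler]})

# After a run, the handler exposes what executed:
#   handler.executed_nodes, handler.executed_tools, handler.executed_node_tools
```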
judgeval-0.0.25.dist-info/METADATA ADDED
@@ -0,0 +1,156 @@
+ Metadata-Version: 2.4
+ Name: judgeval
+ Version: 0.0.25
+ Summary: Judgeval Package
+ Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+ Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+ Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+ License-Expression: Apache-2.0
+ License-File: LICENSE.md
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.11
+ Requires-Dist: anthropic
+ Requires-Dist: fastapi
+ Requires-Dist: langchain
+ Requires-Dist: langchain-anthropic
+ Requires-Dist: langchain-core
+ Requires-Dist: langchain-huggingface
+ Requires-Dist: langchain-openai
+ Requires-Dist: litellm
+ Requires-Dist: nest-asyncio
+ Requires-Dist: openai
+ Requires-Dist: openpyxl
+ Requires-Dist: pandas
+ Requires-Dist: pika
+ Requires-Dist: python-dotenv==1.0.1
+ Requires-Dist: requests
+ Requires-Dist: supabase
+ Requires-Dist: together
+ Requires-Dist: uvicorn
+ Provides-Extra: dev
+ Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
+ Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
+ Requires-Dist: pytest>=8.3.4; extra == 'dev'
+ Requires-Dist: tavily-python; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # Judgeval SDK
+
+ Judgeval is an open-source framework for building evaluation pipelines for multi-step agent workflows, supporting both real-time and experimental evaluation setups. To learn more about Judgment or sign up for free, visit our [website](https://www.judgmentlabs.ai/) or check out our [developer docs](https://judgment.mintlify.app/getting_started).
+
+ ## Features
+
+ - **Development and Production Evaluation Layer**: Offers a robust evaluation layer for multi-step agent applications, including unit-testing and performance monitoring.
+ - **Plug-and-Evaluate**: Integrate LLM systems with 10+ research-backed metrics, including:
+ - Hallucination detection
+ - RAG retriever quality
+ - And more
+ - **Custom Evaluation Pipelines**: Construct powerful custom evaluation pipelines tailored for your LLM systems.
+ - **Monitoring in Production**: Utilize state-of-the-art real-time evaluation foundation models to monitor LLM systems effectively.
+
+ ## Installation
+
+ ```bash
+ pip install judgeval
+ ```
+
+ ## Quickstart: Evaluations
+
+ You can evaluate your workflow execution data to measure quality metrics such as hallucination.
+
+ Create a file named `evaluate.py` with the following code:
+
+ ```python
+ from judgeval import JudgmentClient
+ from judgeval.data import Example
+ from judgeval.scorers import FaithfulnessScorer
+
+ client = JudgmentClient()
+
+ example = Example(
+ input="What if these shoes don't fit?",
+ actual_output="We offer a 30-day full refund at no extra cost.",
+ retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
+ )
+
+ scorer = FaithfulnessScorer(threshold=0.5)
+ results = client.run_evaluation(
+ examples=[example],
+ scorers=[scorer],
+ model="gpt-4o",
+ )
+ print(results)
+ ```
+ Click [here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation
+
+ ## Quickstart: Traces
+
+ Track your workflow execution for full observability with just a few lines of code.
+
+ Create a file named `traces.py` with the following code:
+
+ ```python
+ from judgeval.common.tracer import Tracer, wrap
+ from openai import OpenAI
+
+ client = wrap(OpenAI())
+ judgment = Tracer(project_name="my_project")
+
+ @judgment.observe(span_type="tool")
+ def my_tool():
+ return "Hello world!"
+
+ @judgment.observe(span_type="function")
+ def main():
+ task_input = my_tool()
+ res = client.chat.completions.create(
+ model="gpt-4o",
+ messages=[{"role": "user", "content": f"{task_input}"}]
+ )
+ return res.choices[0].message.content
+ ```
+ Click [here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation
+
+ ## Quickstart: Online Evaluations
+
+ Apply performance monitoring to measure the quality of your systems in production, not just on historical data.
+
+ Using the same traces.py file we created earlier:
+
+ ```python
+ from judgeval.common.tracer import Tracer, wrap
+ from judgeval.scorers import AnswerRelevancyScorer
+ from openai import OpenAI
+
+ client = wrap(OpenAI())
+ judgment = Tracer(project_name="my_project")
+
+ @judgment.observe(span_type="tool")
+ def my_tool():
+ return "Hello world!"
+
+ @judgment.observe(span_type="function")
+ def main():
+ task_input = my_tool()
+ res = client.chat.completions.create(
+ model="gpt-4o",
+ messages=[{"role": "user", "content": f"{task_input}"}]
+ ).choices[0].message.content
+
+ judgment.get_current_trace().async_evaluate(
+ scorers=[AnswerRelevancyScorer(threshold=0.5)],
+ input=task_input,
+ actual_output=res,
+ model="gpt-4o"
+ )
+
+ return res
+ ```
+ Click [here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation
+
+ ## Documentation and Demos
+
+ For more detailed documentation, please check out our [docs](https://judgment.mintlify.app/getting_started) and some of our [demo videos](https://www.youtube.com/@AlexShan-j3o) for reference!
+
+ ##
judgeval-0.0.23.dist-info/RECORD → judgeval-0.0.25.dist-info/RECORD RENAMED
@@ -8,19 +8,17 @@ judgeval/run_evaluation.py,sha256=YOzkyeWl-r3vaz0jB5nM-1VULi7ALmJ9_f58ENqexXk,23
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
- judgeval/common/tracer.py,sha256=WFjFNf3NZ2BN8UAu2MG0F3Om9LgJNma3m_GrxyXgJqE,46655
+ judgeval/common/tracer.py,sha256=cc_K1poBg3Vzl2Nf7yhHlklrOe6Fb_TEekvjAVAQFSc,39958
  judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
- judgeval/data/__init__.py,sha256=6ADbugtS3AporRv23Hxm67qcghU4tj0OScS8t3xLd6U,549
+ judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
  judgeval/data/api_example.py,sha256=dzkrQ0xno08y6qNfqL2djXbapUyc2B2aQ5iANn0o4CY,3667
- judgeval/data/custom_example.py,sha256=C-j9iVenBy52dwnL6PIjJAdKsBO1ajKjsaRr4RJthUo,3676
  judgeval/data/example.py,sha256=BhGBhamFWgH6wtvrRYM8dGtDfXh-cDxDhtNL5Gbdz_M,5892
- judgeval/data/ground_truth.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  judgeval/data/result.py,sha256=4fgjKtUmT3br7K6fkRiNIxTGKUuwMeGyRLqzkpxwXKE,4436
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
  judgeval/data/datasets/dataset.py,sha256=DjJNy-qvviXMGBl_JhiBzvgiJH1_3rYtAWeHP6Daw6E,11897
  judgeval/data/datasets/eval_dataset_client.py,sha256=B4bRy0Di2oFlaBbvp4_hRx2g_9e6Cs0y3ZUT9reMyhw,10926
- judgeval/data/datasets/utils.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ judgeval/integrations/langgraph.py,sha256=yBbZrePkY19dLLgleeIYFVzakEPaiko6YuccLbwSYcE,10957
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
@@ -89,7 +87,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=Qk7lwHgRPYeGoxTOyclAh1VfGItfvHJ6l1t7Nk3SWFM,20927
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
  judgeval/utils/alerts.py,sha256=RgW5R9Dn3Jtim0OyAYDbNzjoX2s6SA4Mw16GyyaikjI,1424
- judgeval-0.0.23.dist-info/METADATA,sha256=EkRIGemm8UvM5J4RBR5KVzBfn0XTBBYvJjRM4-F0s0w,1378
- judgeval-0.0.23.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.23.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.23.dist-info/RECORD,,
+ judgeval-0.0.25.dist-info/METADATA,sha256=09S16QU5qwYqwvrsdg36KVvv9-tnVcSKccgDldPqWpQ,5418
+ judgeval-0.0.25.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.25.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.25.dist-info/RECORD,,
judgeval/data/custom_example.py DELETED
@@ -1,98 +0,0 @@
- from pydantic import BaseModel, Field, field_validator
- from typing import Optional, Dict, Any
- from uuid import uuid4
- from datetime import datetime
- import json
- import warnings
-
- # Brainstorming what are the requirements for the fields?
- class CustomExample(BaseModel):
- name: Optional[str] = None
- additional_metadata: Optional[Dict[str, Any]] = None
- example_id: str = Field(default_factory=lambda: str(uuid4()))
- example_index: Optional[int] = None
- timestamp: Optional[str] = None
- trace_id: Optional[str] = None
-
- model_config = {
- "extra": "allow", # Allow extra fields with any types
- }
-
- def __init__(self, **data):
- if 'example_id' not in data:
- data['example_id'] = str(uuid4())
- # Set timestamp if not provided
- if 'timestamp' not in data:
- data['timestamp'] = datetime.now().isoformat()
- super().__init__(**data)
-
- @field_validator('additional_metadata', mode='before')
- @classmethod
- def validate_additional_metadata(cls, v):
- if v is not None and not isinstance(v, dict):
- raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
- return v
-
- @field_validator('example_index', mode='before')
- @classmethod
- def validate_example_index(cls, v):
- if v is not None and not isinstance(v, int):
- raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
- return v
-
- @field_validator('timestamp', mode='before')
- @classmethod
- def validate_timestamp(cls, v):
- if v is not None and not isinstance(v, str):
- raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
- return v
-
- @field_validator('trace_id', mode='before')
- @classmethod
- def validate_trace_id(cls, v):
- if v is not None and not isinstance(v, str):
- raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
- return v
-
- def to_dict(self):
- return self.model_dump()
-
- def __str__(self):
- return str(self.model_dump())
-
- def model_dump(self, **kwargs):
- """
- Custom serialization that handles special cases for fields that might fail standard serialization.
- """
- data = super().model_dump(**kwargs)
-
- # Get all fields including custom ones
- all_fields = self.__dict__
-
- for field_name, value in all_fields.items():
- try:
- # Check if the field has its own serialization method
- if hasattr(value, 'to_dict'):
- data[field_name] = value.to_dict()
- elif hasattr(value, 'model_dump'):
- data[field_name] = value.model_dump()
- # Field is already in data from super().model_dump()
- elif field_name in data:
- continue
- else:
- # Try standard JSON serialization
- json.dumps(value)
- data[field_name] = value
- except (TypeError, OverflowError, ValueError):
- # Handle non-serializable objects
- try:
- # Try converting to string
- data[field_name] = str(value)
- except Exception as _:
- # If all else fails, store as None and optionally warn
- warnings.warn(f"Could not serialize field {field_name}, setting to None")
- data[field_name] = None
-
- return data
-
-
File without changes
File without changes
judgeval-0.0.23.dist-info/METADATA DELETED
@@ -1,40 +0,0 @@
- Metadata-Version: 2.4
- Name: judgeval
- Version: 0.0.23
- Summary: Judgeval Package
- Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
- Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
- Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
- License-Expression: Apache-2.0
- License-File: LICENSE.md
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3
- Requires-Python: >=3.11
- Requires-Dist: anthropic
- Requires-Dist: fastapi
- Requires-Dist: langchain
- Requires-Dist: langchain-anthropic
- Requires-Dist: langchain-core
- Requires-Dist: langchain-huggingface
- Requires-Dist: langchain-openai
- Requires-Dist: litellm
- Requires-Dist: nest-asyncio
- Requires-Dist: openai
- Requires-Dist: openpyxl
- Requires-Dist: pandas
- Requires-Dist: pika
- Requires-Dist: python-dotenv==1.0.1
- Requires-Dist: requests
- Requires-Dist: supabase
- Requires-Dist: together
- Requires-Dist: uvicorn
- Provides-Extra: dev
- Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
- Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
- Requires-Dist: pytest>=8.3.4; extra == 'dev'
- Requires-Dist: tavily-python; extra == 'dev'
- Description-Content-Type: text/markdown
-
- # judgeval
-
- Judgeval is an open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.