judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/integrations/langgraph.py (deleted)
@@ -1,843 +0,0 @@
- from typing import Any, Dict, List, Optional, Sequence
- from uuid import UUID
- import time
- import uuid
- from datetime import datetime, timezone
-
- from judgeval.common.tracer import (
-     TraceClient,
-     TraceSpan,
-     Tracer,
-     SpanType,
-     cost_per_token,
- )
- from judgeval.data.trace import TraceUsage
-
- from langchain_core.callbacks import BaseCallbackHandler
- from langchain_core.agents import AgentAction, AgentFinish
- from langchain_core.outputs import LLMResult
- from langchain_core.messages.base import BaseMessage
- from langchain_core.documents import Document
-
- # TODO: Figure out how to handle context variables. Current solution is to keep track of current span id in Tracer class
-
-
- class JudgevalCallbackHandler(BaseCallbackHandler):
-     """
-     LangChain Callback Handler using run_id/parent_run_id for hierarchy.
-     Manages its own internal TraceClient instance created upon first use.
-     Includes verbose logging and defensive checks.
-     """
-
-     # Make all properties ignored by LangChain's callback system
-     # to prevent unexpected serialization issues.
-     lc_serializable = False
-     lc_kwargs: dict = {}
-
-     def __init__(self, tracer: Tracer):
-         self.tracer = tracer
-         self.executed_nodes: List[str] = []
-         self._reset_state()
-
-     def _reset_state(self):
-         """Reset only the critical execution state for reuse across multiple executions"""
-         # Reset core execution state that must be cleared between runs
-         self._trace_client: Optional[TraceClient] = None
-         self._run_id_to_span_id: Dict[UUID, str] = {}
-         self._span_id_to_start_time: Dict[str, float] = {}
-         self._span_id_to_depth: Dict[str, int] = {}
-         self._root_run_id: Optional[UUID] = None
-         self._trace_saved: bool = False
-         self.span_id_to_token: Dict[str, Any] = {}
-         self.trace_id_to_token: Dict[str, Any] = {}
-
-         # Add timestamp to track when we last reset
-         self._last_reset_time: float = time.time()
-
-         # Also reset tracking/logging variables
-         self.executed_nodes: List[str] = []
-
-     def reset(self):
-         """Public method to manually reset handler execution state for reuse"""
-         self._reset_state()
-
-     def reset_all(self):
-         """Public method to reset ALL handler state including tracking/logging data"""
-         self._reset_state()
-
-     def _ensure_trace_client(
-         self, run_id: UUID, parent_run_id: Optional[UUID], event_name: str
-     ) -> Optional[TraceClient]:
-         """
-         Ensures the internal trace client is initialized, creating it only once
-         per handler instance lifecycle (effectively per graph invocation).
-         Returns the client or None.
-         """
-
-         # If this is a potential new root execution (no parent_run_id) and we had a previous trace saved,
-         # reset state to allow reuse of the handler
-         if parent_run_id is None and self._trace_saved:
-             self._reset_state()
-
-         # If a client already exists, return it.
-         if self._trace_client:
-             return self._trace_client
-
-         # If no client exists, initialize it NOW.
-         trace_id = str(uuid.uuid4())
-         project = self.tracer.project_name
-         try:
-             # Use event_name as the initial trace name, might be updated later by on_chain_start if root
-             client_instance = TraceClient(
-                 self.tracer,
-                 trace_id,
-                 event_name,
-                 project_name=project,
-                 enable_monitoring=self.tracer.enable_monitoring,
-                 enable_evaluations=self.tracer.enable_evaluations,
-             )
-             self._trace_client = client_instance
-             token = self.tracer.set_current_trace(self._trace_client)
-             if token:
-                 self.trace_id_to_token[trace_id] = token
-
-             if self._trace_client:
-                 self._root_run_id = run_id
-                 self._trace_saved = False
-                 self.tracer._active_trace_client = self._trace_client
-
-                 try:
-                     self._trace_client.save(final_save=False)
-                 except Exception as e:
-                     import warnings
-
-                     warnings.warn(
-                         f"Failed to save initial trace for live tracking: {e}"
-                     )
-
-                 return self._trace_client
-             else:
-                 return None
-         except Exception:
-             self._trace_client = None
-             self._root_run_id = None
-             return None
-
-     def _start_span_tracking(
-         self,
-         trace_client: TraceClient,
-         run_id: UUID,
-         parent_run_id: Optional[UUID],
-         name: str,
-         span_type: SpanType = "span",
-         inputs: Optional[Dict[str, Any]] = None,
-     ) -> None:
-         """Start tracking a span, ensuring trace client exists"""
-
-         start_time = time.time()
-         span_id = str(uuid.uuid4())
-         parent_span_id: Optional[str] = None
-         current_depth = 0
-
-         if parent_run_id and parent_run_id in self._run_id_to_span_id:
-             parent_span_id = self._run_id_to_span_id[parent_run_id]
-             if parent_span_id in self._span_id_to_depth:
-                 current_depth = self._span_id_to_depth[parent_span_id] + 1
-
-         self._run_id_to_span_id[run_id] = span_id
-         self._span_id_to_start_time[span_id] = start_time
-         self._span_id_to_depth[span_id] = current_depth
-
-         new_span = TraceSpan(
-             span_id=span_id,
-             trace_id=trace_client.trace_id,
-             parent_span_id=parent_span_id,
-             function=name,
-             depth=current_depth,
-             created_at=start_time,
-             span_type=span_type,
-         )
-
-         # Separate metadata from inputs
-         if inputs:
-             metadata = {}
-             clean_inputs = {}
-
-             # Extract metadata fields
-             metadata_fields = ["tags", "metadata", "kwargs", "serialized"]
-             for field in metadata_fields:
-                 if field in inputs:
-                     metadata[field] = inputs.pop(field)
-
-             # Store the remaining inputs
-             clean_inputs = inputs
-
-             # Set both fields on the span
-             new_span.inputs = clean_inputs
-             new_span.additional_metadata = metadata
-         else:
-             new_span.inputs = {}
-             new_span.additional_metadata = {}
-
-         trace_client.add_span(new_span)
-
-         trace_client.otel_span_processor.queue_span_update(new_span, span_state="input")
-
-         token = self.tracer.set_current_span(span_id)
-         if token:
-             self.span_id_to_token[span_id] = token
-
-     def _end_span_tracking(
-         self,
-         trace_client: TraceClient,
-         run_id: UUID,
-         outputs: Optional[Any] = None,
-         error: Optional[BaseException] = None,
-     ) -> None:
-         """End tracking a span, ensuring trace client exists"""
-
-         # Get span ID and check if it exists
-         span_id = self._run_id_to_span_id.get(run_id)
-         if span_id:
-             token = self.span_id_to_token.pop(span_id, None)
-             self.tracer.reset_current_span(token, span_id)
-
-         start_time = self._span_id_to_start_time.get(span_id) if span_id else None
-         duration = time.time() - start_time if start_time is not None else None
-
-         # Add exit entry (only if span was tracked)
-         if span_id:
-             trace_span = trace_client.span_id_to_span.get(span_id)
-             if trace_span:
-                 trace_span.duration = duration
-
-                 # Handle outputs and error
-                 if error:
-                     trace_span.output = error
-                 elif outputs:
-                     # Separate metadata from outputs
-                     metadata = {}
-                     clean_outputs = {}
-
-                     # Extract metadata fields
-                     metadata_fields = ["tags", "kwargs"]
-                     if isinstance(outputs, dict):
-                         for field in metadata_fields:
-                             if field in outputs:
-                                 metadata[field] = outputs.pop(field)
-
-                         # Store the remaining outputs
-                         clean_outputs = outputs
-                     else:
-                         clean_outputs = outputs
-
-                     # Set both fields on the span
-                     trace_span.output = clean_outputs
-                     if metadata:
-                         # Merge with existing metadata
-                         existing_metadata = trace_span.additional_metadata or {}
-                         trace_span.additional_metadata = {
-                             **existing_metadata,
-                             **metadata,
-                         }
-
-                 span_state = "error" if error else "completed"
-                 trace_client.otel_span_processor.queue_span_update(
-                     trace_span, span_state=span_state
-                 )
-
-             # Clean up dictionaries for this specific span
-             if span_id in self._span_id_to_start_time:
-                 del self._span_id_to_start_time[span_id]
-             if span_id in self._span_id_to_depth:
-                 del self._span_id_to_depth[span_id]
-
-         # Check if this is the root run ending
-         if run_id == self._root_run_id:
-             try:
-                 self._root_run_id = None
-                 if (
-                     self._trace_client and not self._trace_saved
-                 ):  # Check if not already saved
-                     complete_trace_data = {
-                         "trace_id": self._trace_client.trace_id,
-                         "name": self._trace_client.name,
-                         "created_at": datetime.fromtimestamp(
-                             self._trace_client.start_time, timezone.utc
-                         ).isoformat(),
-                         "duration": self._trace_client.get_duration(),
-                         "trace_spans": [
-                             span.model_dump() for span in self._trace_client.trace_spans
-                         ],
-                         "offline_mode": self.tracer.offline_mode,
-                         "parent_trace_id": self._trace_client.parent_trace_id,
-                         "parent_name": self._trace_client.parent_name,
-                     }
-
-                     self.tracer.flush_background_spans()
-
-                     trace_id, trace_data = self._trace_client.save(
-                         final_save=True,  # Final save with usage counter updates
-                     )
-                     token = self.trace_id_to_token.pop(trace_id, None)
-                     self.tracer.reset_current_trace(token, trace_id)
-
-                     # Store complete trace data instead of server response
-                     self.tracer.traces.append(complete_trace_data)
-                     self._trace_saved = True  # Set flag only after successful save
-             finally:
-                 # This block executes regardless of save success/failure
-                 # Reset root run id
-                 self._root_run_id = None
-                 # Reset input storage for this handler instance
-                 if self.tracer._active_trace_client == self._trace_client:
-                     self.tracer._active_trace_client = None
-
-     # --- Callback Methods ---
-     # Each method now ensures the trace client exists before proceeding
-
-     def on_retriever_start(
-         self,
-         serialized: Dict[str, Any],
-         query: str,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         **kwargs: Any,
-     ) -> Any:
-         serialized_name = (
-             serialized.get("name", "Unknown")
-             if serialized
-             else "Unknown (Serialized=None)"
-         )
-
-         name = f"RETRIEVER_{(serialized_name).upper()}"
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, name)
-         if not trace_client:
-             return
-
-         inputs = {
-             "query": query,
-             "tags": tags,
-             "metadata": metadata,
-             "kwargs": kwargs,
-             "serialized": serialized,
-         }
-         self._start_span_tracking(
-             trace_client,
-             run_id,
-             parent_run_id,
-             name,
-             span_type="retriever",
-             inputs=inputs,
-         )
-
-     def on_retriever_end(
-         self,
-         documents: Sequence[Document],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, "RetrieverEnd")
-         if not trace_client:
-             return
-         doc_summary = [
-             {
-                 "index": i,
-                 "page_content": (
-                     doc.page_content[:100] + "..."
-                     if len(doc.page_content) > 100
-                     else doc.page_content
-                 ),
-                 "metadata": doc.metadata,
-             }
-             for i, doc in enumerate(documents)
-         ]
-         outputs = {
-             "document_count": len(documents),
-             "documents": doc_summary,
-             "kwargs": kwargs,
-         }
-         self._end_span_tracking(trace_client, run_id, outputs=outputs)
-
-     def on_chain_start(
-         self,
-         serialized: Dict[str, Any],
-         inputs: Dict[str, Any],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         **kwargs: Any,
-     ) -> None:
-         serialized_name = (
-             serialized.get("name") if serialized else "Unknown (Serialized=None)"
-         )
-
-         # --- Determine Name and Span Type ---
-         span_type: SpanType = "chain"
-         name = serialized_name if serialized_name else "Unknown Chain"
-         node_name = metadata.get("langgraph_node") if metadata else None
-         is_langgraph_root_kwarg = (
-             kwargs.get("name") == "LangGraph"
-         )  # Check kwargs for explicit root name
-         # More robust root detection: Often the first chain event with parent_run_id=None *is* the root.
-         is_potential_root_event = parent_run_id is None
-
-         if node_name:
-             name = node_name  # Use node name if available
-             if name not in self.executed_nodes:
-                 self.executed_nodes.append(
-                     name
-                 )  # Leaving this in for now but can probably be removed
-         elif is_langgraph_root_kwarg and is_potential_root_event:
-             name = "LangGraph"  # Explicit root detected
-         # Add handling for other potential LangChain internal chains if needed, e.g., "RunnableSequence"
-
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, name)
-         if not trace_client:
-             return
-
-         if (
-             is_potential_root_event
-             and run_id == self._root_run_id
-             and trace_client.name != name
-         ):
-             trace_client.name = name
-
-         combined_inputs = {
-             "inputs": inputs,
-             "tags": tags,
-             "metadata": metadata,
-             "kwargs": kwargs,
-             "serialized": serialized,
-         }
-         self._start_span_tracking(
-             trace_client,
-             run_id,
-             parent_run_id,
-             name,
-             span_type=span_type,
-             inputs=combined_inputs,
-         )
-
-     def on_chain_end(
-         self,
-         outputs: Dict[str, Any],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         **kwargs: Any,
-     ) -> Any:
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, "ChainEnd")
-         if not trace_client:
-             return
-
-         span_id = self._run_id_to_span_id.get(run_id)
-         if not span_id and run_id != self._root_run_id:
-             return
-
-         combined_outputs = {"outputs": outputs, "tags": tags, "kwargs": kwargs}
-
-         self._end_span_tracking(trace_client, run_id, outputs=combined_outputs)
-
-         if run_id == self._root_run_id:
-             if trace_client and not self._trace_saved:
-                 complete_trace_data = {
-                     "trace_id": trace_client.trace_id,
-                     "name": trace_client.name,
-                     "created_at": datetime.fromtimestamp(
-                         trace_client.start_time, timezone.utc
-                     ).isoformat(),
-                     "duration": trace_client.get_duration(),
-                     "trace_spans": [
-                         span.model_dump() for span in trace_client.trace_spans
-                     ],
-                     "offline_mode": self.tracer.offline_mode,
-                     "parent_trace_id": trace_client.parent_trace_id,
-                     "parent_name": trace_client.parent_name,
-                 }
-
-                 self.tracer.flush_background_spans()
-
-                 trace_client.save(
-                     final_save=True,
-                 )
-
-                 self.tracer.traces.append(complete_trace_data)
-                 self._trace_saved = True
-                 if self.tracer._active_trace_client == trace_client:
-                     self.tracer._active_trace_client = None
-
-             self._root_run_id = None
-
-     def on_chain_error(
-         self,
-         error: BaseException,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, "ChainError")
-         if not trace_client:
-             return
-
-         span_id = self._run_id_to_span_id.get(run_id)
-
-         if not span_id and run_id != self._root_run_id:
-             return
-
-         self._end_span_tracking(trace_client, run_id, error=error)
-
-     def on_tool_start(
-         self,
-         serialized: Dict[str, Any],
-         input_str: str,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         inputs: Optional[Dict[str, Any]] = None,
-         **kwargs: Any,
-     ) -> Any:
-         name = (
-             serialized.get("name", "Unnamed Tool")
-             if serialized
-             else "Unknown Tool (Serialized=None)"
-         )
-
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, name)
-         if not trace_client:
-             return
-
-         combined_inputs = {
-             "input_str": input_str,
-             "inputs": inputs,
-             "tags": tags,
-             "metadata": metadata,
-             "kwargs": kwargs,
-             "serialized": serialized,
-         }
-         self._start_span_tracking(
-             trace_client,
-             run_id,
-             parent_run_id,
-             name,
-             span_type="tool",
-             inputs=combined_inputs,
-         )
-
-     def on_tool_end(
-         self,
-         output: Any,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, "ToolEnd")
-         if not trace_client:
-             return
-         outputs = {"output": output, "kwargs": kwargs}
-         self._end_span_tracking(trace_client, run_id, outputs=outputs)
-
-     def on_tool_error(
-         self,
-         error: BaseException,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, "ToolError")
-         if not trace_client:
-             return
-         self._end_span_tracking(trace_client, run_id, error=error)
-
-     def on_llm_start(
-         self,
-         serialized: Dict[str, Any],
-         prompts: List[str],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         invocation_params: Optional[Dict[str, Any]] = None,
-         options: Optional[Dict[str, Any]] = None,
-         name: Optional[str] = None,
-         **kwargs: Any,
-     ) -> Any:
-         llm_name = name or serialized.get("name", "LLM Call")
-
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, llm_name)
-         if not trace_client:
-             return
-         inputs = {
-             "prompts": prompts,
-             "invocation_params": invocation_params or kwargs,
-             "options": options,
-             "tags": tags,
-             "metadata": metadata,
-             "serialized": serialized,
-         }
-         self._start_span_tracking(
-             trace_client,
-             run_id,
-             parent_run_id,
-             llm_name,
-             span_type="llm",
-             inputs=inputs,
-         )
-
-     def on_llm_end(
-         self,
-         response: LLMResult,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, "LLMEnd")
-         if not trace_client:
-             return
-         outputs = {"response": response, "kwargs": kwargs}
-
-         prompt_tokens = None
-         completion_tokens = None
-         total_tokens = None
-         model_name = None
-
-         # Extract model name from response if available
-         if (
-             hasattr(response, "llm_output")
-             and response.llm_output
-             and isinstance(response.llm_output, dict)
-         ):
-             model_name = response.llm_output.get(
-                 "model_name"
-             ) or response.llm_output.get("model")
-
-         # Try to get model from the first generation if available
-         if not model_name and response.generations and len(response.generations) > 0:
-             if (
-                 hasattr(response.generations[0][0], "generation_info")
-                 and response.generations[0][0].generation_info
-             ):
-                 gen_info = response.generations[0][0].generation_info
-                 model_name = gen_info.get("model") or gen_info.get("model_name")
-
-         if response.llm_output and isinstance(response.llm_output, dict):
-             # Check for OpenAI/standard 'token_usage' first
-             if "token_usage" in response.llm_output:
-                 token_usage = response.llm_output.get("token_usage")
-                 if token_usage and isinstance(token_usage, dict):
-                     prompt_tokens = token_usage.get("prompt_tokens")
-                     completion_tokens = token_usage.get("completion_tokens")
-                     total_tokens = token_usage.get(
-                         "total_tokens"
-                     )  # OpenAI provides total
-             # Check for Anthropic 'usage'
-             elif "usage" in response.llm_output:
-                 token_usage = response.llm_output.get("usage")
-                 if token_usage and isinstance(token_usage, dict):
-                     prompt_tokens = token_usage.get(
-                         "input_tokens"
-                     )  # Anthropic uses input_tokens
-                     completion_tokens = token_usage.get(
-                         "output_tokens"
-                     )  # Anthropic uses output_tokens
-                     # Calculate total if possible
-                     if prompt_tokens is not None and completion_tokens is not None:
-                         total_tokens = prompt_tokens + completion_tokens
-
-         if prompt_tokens is not None or completion_tokens is not None:
-             prompt_cost = None
-             completion_cost = None
-             total_cost_usd = None
-
-             if (
-                 model_name
-                 and prompt_tokens is not None
-                 and completion_tokens is not None
-             ):
-                 try:
-                     prompt_cost, completion_cost = cost_per_token(
-                         model=model_name,
-                         prompt_tokens=prompt_tokens,
-                         completion_tokens=completion_tokens,
-                     )
-                     total_cost_usd = (
-                         (prompt_cost + completion_cost)
-                         if prompt_cost and completion_cost
-                         else None
-                     )
-                 except Exception as e:
-                     # If cost calculation fails, continue without costs
-                     import warnings
-
-                     warnings.warn(
-                         f"Failed to calculate token costs for model {model_name}: {e}"
-                     )
-
-             usage = TraceUsage(
-                 prompt_tokens=prompt_tokens,
-                 completion_tokens=completion_tokens,
-                 total_tokens=total_tokens
-                 or (
-                     prompt_tokens + completion_tokens
-                     if prompt_tokens and completion_tokens
-                     else None
-                 ),
-                 prompt_tokens_cost_usd=prompt_cost,
-                 completion_tokens_cost_usd=completion_cost,
-                 total_cost_usd=total_cost_usd,
-                 model_name=model_name,
-             )
-
-             span_id = self._run_id_to_span_id.get(run_id)
-             if span_id and span_id in trace_client.span_id_to_span:
-                 trace_span = trace_client.span_id_to_span[span_id]
-                 trace_span.usage = usage
-
-         self._end_span_tracking(trace_client, run_id, outputs=outputs)
-
-     def on_llm_error(
-         self,
-         error: BaseException,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, "LLMError")
-         if not trace_client:
-             return
-         self._end_span_tracking(trace_client, run_id, error=error)
-
-     def on_chat_model_start(
-         self,
-         serialized: Dict[str, Any],
-         messages: List[List[BaseMessage]],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         invocation_params: Optional[Dict[str, Any]] = None,
-         options: Optional[Dict[str, Any]] = None,
-         name: Optional[str] = None,
-         **kwargs: Any,
-     ) -> Any:
-         chat_model_name = name or serialized.get("name", "ChatModel Call")
-         is_openai = (
-             any(
-                 key.startswith("openai") for key in serialized.get("secrets", {}).keys()
-             )
-             or "openai" in chat_model_name.lower()
-         )
-         is_anthropic = (
-             any(
-                 key.startswith("anthropic")
-                 for key in serialized.get("secrets", {}).keys()
-             )
-             or "anthropic" in chat_model_name.lower()
-             or "claude" in chat_model_name.lower()
-         )
-         is_together = (
-             any(
-                 key.startswith("together")
-                 for key in serialized.get("secrets", {}).keys()
-             )
-             or "together" in chat_model_name.lower()
-         )
-
-         is_google = (
-             any(
-                 key.startswith("google") for key in serialized.get("secrets", {}).keys()
-             )
-             or "google" in chat_model_name.lower()
-             or "gemini" in chat_model_name.lower()
-         )
-
-         if is_openai and "OPENAI_API_CALL" not in chat_model_name:
-             chat_model_name = f"{chat_model_name} OPENAI_API_CALL"
-         elif is_anthropic and "ANTHROPIC_API_CALL" not in chat_model_name:
-             chat_model_name = f"{chat_model_name} ANTHROPIC_API_CALL"
-         elif is_together and "TOGETHER_API_CALL" not in chat_model_name:
-             chat_model_name = f"{chat_model_name} TOGETHER_API_CALL"
-
-         elif is_google and "GOOGLE_API_CALL" not in chat_model_name:
-             chat_model_name = f"{chat_model_name} GOOGLE_API_CALL"
-
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, chat_model_name)
-         if not trace_client:
-             return
-         inputs = {
-             "messages": messages,
-             "invocation_params": invocation_params or kwargs,
-             "options": options,
-             "tags": tags,
-             "metadata": metadata,
-             "serialized": serialized,
-         }
-         self._start_span_tracking(
-             trace_client,
-             run_id,
-             parent_run_id,
-             chat_model_name,
-             span_type="llm",
-             inputs=inputs,
-         )
-
-     def on_agent_action(
-         self,
-         action: AgentAction,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         action_tool = action.tool
-         name = f"AGENT_ACTION_{(action_tool).upper()}"
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, name)
-         if not trace_client:
-             return
-
-         inputs = {
-             "tool_input": action.tool_input,
-             "log": action.log,
-             "messages": action.messages,
-             "kwargs": kwargs,
-         }
-         self._start_span_tracking(
-             trace_client, run_id, parent_run_id, name, span_type="agent", inputs=inputs
-         )
-
-     def on_agent_finish(
-         self,
-         finish: AgentFinish,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         trace_client = self._ensure_trace_client(run_id, parent_run_id, "AgentFinish")
-         if not trace_client:
-             return
-
-         outputs = {
-             "return_values": finish.return_values,
-             "log": finish.log,
-             "messages": finish.messages,
-             "kwargs": kwargs,
-         }
-         self._end_span_tracking(trace_client, run_id, outputs=outputs)