judgeval 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff compares the contents of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (40)
  1. judgeval/__init__.py +2 -2
  2. judgeval/api/api_types.py +81 -12
  3. judgeval/cli.py +2 -1
  4. judgeval/constants.py +0 -6
  5. judgeval/data/evaluation_run.py +2 -5
  6. judgeval/data/judgment_types.py +97 -12
  7. judgeval/data/trace.py +108 -1
  8. judgeval/dataset/__init__.py +72 -23
  9. judgeval/env.py +5 -20
  10. judgeval/integrations/langgraph/__init__.py +9 -785
  11. judgeval/scorers/api_scorer.py +7 -12
  12. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -8
  13. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -8
  14. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -12
  15. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +22 -33
  16. judgeval/scorers/score.py +1 -1
  17. judgeval/scorers/utils.py +1 -4
  18. judgeval/tracer/__init__.py +175 -156
  19. judgeval/tracer/exporters/__init__.py +4 -1
  20. judgeval/tracer/keys.py +15 -25
  21. judgeval/tracer/llm/__init__.py +0 -1
  22. judgeval/tracer/llm/anthropic/__init__.py +20 -0
  23. judgeval/tracer/llm/google/__init__.py +21 -0
  24. judgeval/tracer/llm/groq/__init__.py +20 -0
  25. judgeval/tracer/llm/openai/__init__.py +32 -0
  26. judgeval/tracer/llm/providers.py +28 -79
  27. judgeval/tracer/llm/together/__init__.py +20 -0
  28. judgeval/tracer/managers.py +23 -48
  29. judgeval/tracer/processors/__init__.py +36 -75
  30. judgeval/tracer/utils.py +1 -2
  31. judgeval/utils/file_utils.py +0 -2
  32. judgeval/utils/meta.py +18 -5
  33. judgeval/utils/testing.py +0 -14
  34. judgeval/utils/version_check.py +2 -0
  35. judgeval/version.py +1 -1
  36. {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/METADATA +1 -7
  37. {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/RECORD +40 -35
  38. {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/WHEEL +0 -0
  39. {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/entry_points.txt +0 -0
  40. {judgeval-0.12.0.dist-info → judgeval-0.13.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/integrations/langgraph/__init__.py
@@ -1,789 +1,13 @@
  from __future__ import annotations

- import time
- import uuid
- from typing import Any, Dict, List, Optional, Sequence, Set, Type
- from uuid import UUID
+ from abc import ABC
+ import os

- try:
-     from langchain_core.callbacks import BaseCallbackHandler
-     from langchain_core.agents import AgentAction, AgentFinish
-     from langchain_core.outputs import LLMResult, ChatGeneration
-     from langchain_core.messages import (
-         AIMessage,
-         BaseMessage,
-         ChatMessage,
-         FunctionMessage,
-         HumanMessage,
-         SystemMessage,
-         ToolMessage,
-     )
-     from langchain_core.documents import Document
- except ImportError as e:
-     raise ImportError(
-         "Judgeval's langgraph integration requires langchain to be installed. Please install it with `pip install judgeval[langchain]`"
-     ) from e

- from judgeval.tracer import Tracer
- from judgeval.tracer.keys import AttributeKeys
- from judgeval.tracer.managers import sync_span_context
- from judgeval.utils.serialize import safe_serialize
- from judgeval.logger import judgeval_logger
- from opentelemetry.trace import Status, StatusCode, Span
-
- # Control flow exception types that should not be treated as errors
- CONTROL_FLOW_EXCEPTION_TYPES: Set[Type[BaseException]] = set()
-
- try:
-     from langgraph.errors import GraphBubbleUp
-
-     CONTROL_FLOW_EXCEPTION_TYPES.add(GraphBubbleUp)
- except ImportError:
-     pass
-
- LANGSMITH_TAG_HIDDEN: str = "langsmith:hidden"
-
-
- class JudgevalCallbackHandler(BaseCallbackHandler):
-     """
-     LangGraph/LangChain Callback Handler that creates OpenTelemetry spans
-     using the Judgeval tracer framework.
-
-     This handler tracks the execution of chains, tools, LLMs, and other components
-     in a LangGraph/LangChain application, creating proper span hierarchies for monitoring.
-     """
-
-     # Prevent LangChain serialization issues
-     lc_serializable = False
-     lc_kwargs: dict = {}
-
-     def __init__(self, tracer: Optional[Tracer] = None):
-         """
-         Initialize the callback handler.
-
-         Args:
-             tracer: Optional Tracer instance. If not provided, will try to use an active tracer.
-         """
-         self.tracer = tracer
-         if self.tracer is None:
-             # Try to get an active tracer
-             if Tracer._active_tracers:
-                 self.tracer = next(iter(Tracer._active_tracers))
-             else:
-                 judgeval_logger.warning(
-                     "No tracer provided and no active tracers found. "
-                     "Callback handler will not create spans."
-                 )
-                 return
-
-         # Track spans by run_id for proper hierarchy
-         self.spans: Dict[UUID, Span] = {}
-         self.span_start_times: Dict[UUID, float] = {}
-         self.run_id_to_span_id: Dict[UUID, str] = {}
-         self.span_id_to_depth: Dict[str, int] = {}
-         self.root_run_id: Optional[UUID] = None
-
-         # Track execution for debugging
-         self.executed_nodes: List[str] = []
-         self.executed_tools: List[str] = []
-         self.executed_node_tools: List[Dict[str, Any]] = []
-
-     def reset(self):
-         """Reset handler state for reuse across multiple executions."""
-         self.spans.clear()
-         self.span_start_times.clear()
-         self.executed_nodes.clear()
-         self.executed_tools.clear()
-         self.executed_node_tools.clear()
-
-     def _get_run_name(self, serialized: Optional[Dict[str, Any]], **kwargs: Any) -> str:
-         """Extract the name of the operation from serialized data or kwargs."""
-         if "name" in kwargs and kwargs["name"] is not None:
-             return str(kwargs["name"])
-
-         if serialized is None:
-             return "<unknown>"
-
-         try:
-             return str(serialized["name"])
-         except (KeyError, TypeError):
-             pass
-
-         try:
-             return str(serialized["id"][-1])
-         except (KeyError, TypeError):
-             pass
-
-         return "<unknown>"
-
-     def _convert_message_to_dict(self, message: BaseMessage) -> Dict[str, Any]:
-         """Convert a LangChain message to a dictionary for storage."""
-         if isinstance(message, HumanMessage):
-             message_dict = {"role": "user", "content": message.content}
-         elif isinstance(message, AIMessage):
-             message_dict = {"role": "assistant", "content": message.content}
-         elif isinstance(message, SystemMessage):
-             message_dict = {"role": "system", "content": message.content}
-         elif isinstance(message, ToolMessage):
-             message_dict = {
-                 "role": "tool",
-                 "content": message.content,
-                 "tool_call_id": message.tool_call_id,
-             }
-         elif isinstance(message, FunctionMessage):
-             message_dict = {"role": "function", "content": message.content}
-         elif isinstance(message, ChatMessage):
-             message_dict = {"role": message.role, "content": message.content}
-         else:
-             message_dict = {"role": "unknown", "content": str(message.content)}
-
-         if hasattr(message, "additional_kwargs") and message.additional_kwargs:
-             message_dict["additional_kwargs"] = str(message.additional_kwargs)
-
-         return message_dict
-
-     def _create_message_dicts(
-         self, messages: List[BaseMessage]
-     ) -> List[Dict[str, Any]]:
-         """Convert a list of LangChain messages to dictionaries."""
-         return [self._convert_message_to_dict(m) for m in messages]
-
-     def _join_tags_and_metadata(
-         self,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-     ) -> Optional[Dict[str, Any]]:
-         """Join tags and metadata into a single dictionary."""
-         final_dict = {}
-         if tags is not None and len(tags) > 0:
-             final_dict["tags"] = tags
-         if metadata is not None:
-             final_dict.update(metadata)
-         return final_dict if final_dict else None
-
-     def _start_span(
-         self,
-         run_id: UUID,
-         parent_run_id: Optional[UUID],
-         name: str,
-         span_type: str,
-         inputs: Any = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         **extra_attributes: Any,
-     ) -> None:
-         """Start a new span for the given run."""
-         if not self.tracer:
-             return
-
-         # Skip internal spans
-         if name.startswith("__") and name.endswith("__"):
-             return
-
-         try:
-             # Determine if this is a root span
-             is_root = parent_run_id is None
-             if is_root:
-                 self.root_run_id = run_id
-
-             # Calculate depth for proper hierarchy
-             current_depth = 0
-             if parent_run_id and parent_run_id in self.run_id_to_span_id:
-                 parent_span_id = self.run_id_to_span_id[parent_run_id]
-                 current_depth = self.span_id_to_depth.get(parent_span_id, 0) + 1
-
-             # Create span attributes
-             attributes = {
-                 AttributeKeys.JUDGMENT_SPAN_KIND.value: span_type,
-             }
-
-             # Add metadata and tags
-             combined_metadata = self._join_tags_and_metadata(tags, metadata)
-             if combined_metadata:
-                 metadata_str = safe_serialize(combined_metadata)
-                 attributes["metadata"] = metadata_str
-
-             # Add extra attributes
-             for key, value in extra_attributes.items():
-                 if value is not None:
-                     attributes[str(key)] = str(value)
-
-             # Create span using the tracer's context manager for proper hierarchy
-             with sync_span_context(self.tracer, name, attributes) as span:
-                 # Set input data if provided
-                 if inputs is not None:
-                     span.set_attribute(
-                         AttributeKeys.JUDGMENT_INPUT.value, safe_serialize(inputs)
-                     )
-
-                 # Store span information for tracking
-                 span_id = (
-                     str(span.get_span_context().span_id)
-                     if span.get_span_context()
-                     else str(uuid.uuid4())
-                 )
-                 self.spans[run_id] = span
-                 self.span_start_times[run_id] = time.time()
-                 self.run_id_to_span_id[run_id] = span_id
-                 self.span_id_to_depth[span_id] = current_depth
-
-         except Exception as e:
-             judgeval_logger.exception(f"Error starting span for {name}: {e}")
-
-     def _end_span(
-         self,
-         run_id: UUID,
-         outputs: Any = None,
-         error: Optional[BaseException] = None,
-         **extra_attributes: Any,
-     ) -> None:
-         """End the span for the given run."""
-         if run_id not in self.spans:
-             return
-
-         try:
-             span = self.spans[run_id]
-
-             # Set output data if provided
-             if outputs is not None:
-                 span.set_attribute(
-                     AttributeKeys.JUDGMENT_OUTPUT.value, safe_serialize(outputs)
-                 )
-
-             # Set additional attributes
-             for key, value in extra_attributes.items():
-                 if value is not None:
-                     span.set_attribute(str(key), str(value))
-
-             # Handle errors
-             if error is not None:
-                 # Check if this is a control flow exception
-                 is_control_flow = any(
-                     isinstance(error, t) for t in CONTROL_FLOW_EXCEPTION_TYPES
-                 )
-                 if not is_control_flow:
-                     span.record_exception(error)
-                     span.set_status(Status(StatusCode.ERROR, str(error)))
-                 # Control flow exceptions don't set error status
-             else:
-                 span.set_status(Status(StatusCode.OK))
-
-             # Note: The span will be ended automatically by the context manager
-
-         except Exception as e:
-             judgeval_logger.exception(f"Error ending span for run_id {run_id}: {e}")
-         finally:
-             # Cleanup tracking data
-             if run_id in self.spans:
-                 del self.spans[run_id]
-             if run_id in self.span_start_times:
-                 del self.span_start_times[run_id]
-             if run_id in self.run_id_to_span_id:
-                 span_id = self.run_id_to_span_id[run_id]
-                 del self.run_id_to_span_id[run_id]
-                 if span_id in self.span_id_to_depth:
-                     del self.span_id_to_depth[span_id]
-
-             # Check if this is the root run ending
-             if run_id == self.root_run_id:
-                 self.root_run_id = None
-
-     def _log_debug_event(
-         self,
-         event_name: str,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> None:
-         """Log debug information about callback events."""
-         judgeval_logger.debug(
-             f"Event: {event_name}, run_id: {str(run_id)[:8]}, "
-             f"parent_run_id: {str(parent_run_id)[:8] if parent_run_id else None}"
-         )
-
-     # Chain callbacks
-     def on_chain_start(
-         self,
-         serialized: Optional[Dict[str, Any]],
-         inputs: Dict[str, Any],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when a chain starts running."""
-         try:
-             self._log_debug_event(
-                 "on_chain_start", run_id, parent_run_id, inputs=inputs
-             )
-
-             name = self._get_run_name(serialized, **kwargs)
-
-             # Check for LangGraph node
-             node_name = metadata.get("langgraph_node") if metadata else None
-             if node_name:
-                 name = node_name
-                 if name not in self.executed_nodes:
-                     self.executed_nodes.append(name)
-
-             # Determine if this is a root LangGraph execution
-             is_langgraph_root = (
-                 kwargs.get("name") == "LangGraph" and parent_run_id is None
-             )
-             if is_langgraph_root:
-                 name = "LangGraph"
-
-             span_level = "DEBUG" if tags and LANGSMITH_TAG_HIDDEN in tags else None
-
-             self._start_span(
-                 run_id=run_id,
-                 parent_run_id=parent_run_id,
-                 name=name,
-                 span_type="chain",
-                 inputs=inputs,
-                 tags=tags,
-                 metadata=metadata,
-                 level=span_level,
-                 serialized=safe_serialize(serialized) if serialized else None,
-             )
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_chain_start: {e}")
-
-     def on_chain_end(
-         self,
-         outputs: Dict[str, Any],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when a chain ends successfully."""
-         try:
-             self._log_debug_event(
-                 "on_chain_end", run_id, parent_run_id, outputs=outputs
-             )
-             self._end_span(run_id=run_id, outputs=outputs)
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_chain_end: {e}")
-
-     def on_chain_error(
-         self,
-         error: BaseException,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> None:
-         """Called when a chain encounters an error."""
-         try:
-             self._log_debug_event("on_chain_error", run_id, parent_run_id, error=error)
-             self._end_span(run_id=run_id, error=error)
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_chain_error: {e}")
-
-     # LLM callbacks
-     def on_llm_start(
-         self,
-         serialized: Optional[Dict[str, Any]],
-         prompts: List[str],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when an LLM starts generating."""
-         try:
-             self._log_debug_event(
-                 "on_llm_start", run_id, parent_run_id, prompts=prompts
-             )
-
-             name = self._get_run_name(serialized, **kwargs)
-             model_name = self._extract_model_name(serialized, kwargs)
-
-             prompt_data = prompts[0] if len(prompts) == 1 else prompts
-
-             self._start_span(
-                 run_id=run_id,
-                 parent_run_id=parent_run_id,
-                 name=name,
-                 span_type="llm",
-                 inputs=prompt_data,
-                 tags=tags,
-                 metadata=metadata,
-                 model=model_name,
-                 serialized=safe_serialize(serialized) if serialized else None,
-             )
-
-             # Set GenAI specific attributes
-             if run_id in self.spans:
-                 span = self.spans[run_id]
-                 if model_name:
-                     span.set_attribute(AttributeKeys.GEN_AI_REQUEST_MODEL, model_name)
-                 span.set_attribute(
-                     AttributeKeys.GEN_AI_PROMPT, safe_serialize(prompt_data)
-                 )
-
-                 # Set model parameters if available
-                 invocation_params = kwargs.get("invocation_params", {})
-                 if "temperature" in invocation_params:
-                     span.set_attribute(
-                         AttributeKeys.GEN_AI_REQUEST_TEMPERATURE,
-                         float(invocation_params["temperature"]),
-                     )
-                 if "max_tokens" in invocation_params:
-                     span.set_attribute(
-                         AttributeKeys.GEN_AI_REQUEST_MAX_TOKENS,
-                         int(invocation_params["max_tokens"]),
-                     )
-
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_llm_start: {e}")
-
-     def on_chat_model_start(
-         self,
-         serialized: Optional[Dict[str, Any]],
-         messages: List[List[BaseMessage]],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when a chat model starts generating."""
-         try:
-             self._log_debug_event(
-                 "on_chat_model_start", run_id, parent_run_id, messages=messages
-             )
-
-             name = self._get_run_name(serialized, **kwargs)
-             model_name = self._extract_model_name(serialized, kwargs)
-
-             # Flatten messages
-             flattened_messages = []
-             for message_list in messages:
-                 flattened_messages.extend(self._create_message_dicts(message_list))
-
-             self._start_span(
-                 run_id=run_id,
-                 parent_run_id=parent_run_id,
-                 name=name,
-                 span_type="llm",
-                 inputs=flattened_messages,
-                 tags=tags,
-                 metadata=metadata,
-                 model=model_name,
-                 serialized=safe_serialize(serialized) if serialized else None,
-             )
-
-             # Set GenAI specific attributes
-             if run_id in self.spans:
-                 span = self.spans[run_id]
-                 if model_name:
-                     span.set_attribute(AttributeKeys.GEN_AI_REQUEST_MODEL, model_name)
-                 span.set_attribute(
-                     AttributeKeys.GEN_AI_PROMPT, safe_serialize(flattened_messages)
-                 )
-
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_chat_model_start: {e}")
-
-     def on_llm_end(
-         self,
-         response: LLMResult,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when an LLM finishes generating."""
-         try:
-             self._log_debug_event(
-                 "on_llm_end", run_id, parent_run_id, response=response
-             )
-
-             # Extract response content
-             output: Any
-             if response.generations:
-                 last_generation = response.generations[-1][-1]
-                 if (
-                     isinstance(last_generation, ChatGeneration)
-                     and last_generation.message
-                 ):
-                     output = self._convert_message_to_dict(last_generation.message)
-                 else:
-                     output = (
-                         last_generation.text
-                         if hasattr(last_generation, "text")
-                         else str(last_generation)
-                     )
-             else:
-                 output = ""
-
-             # Extract usage information
-             usage_attrs = {}
-             if response.llm_output and "token_usage" in response.llm_output:
-                 token_usage = response.llm_output["token_usage"]
-                 if hasattr(token_usage, "prompt_tokens"):
-                     usage_attrs[AttributeKeys.GEN_AI_USAGE_INPUT_TOKENS] = (
-                         token_usage.prompt_tokens
-                     )
-                 if hasattr(token_usage, "completion_tokens"):
-                     usage_attrs[AttributeKeys.GEN_AI_USAGE_OUTPUT_TOKENS] = (
-                         token_usage.completion_tokens
-                     )
-
-             # Set completion attribute
-             if run_id in self.spans:
-                 span = self.spans[run_id]
-                 span.set_attribute(
-                     AttributeKeys.GEN_AI_COMPLETION, safe_serialize(output)
-                 )
-
-                 # Set usage attributes
-                 for key, value in usage_attrs.items():
-                     span.set_attribute(key, value)
-
-             self._end_span(run_id=run_id, outputs=output, **usage_attrs)  # type: ignore
-
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_llm_end: {e}")
-
-     def on_llm_error(
-         self,
-         error: BaseException,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when an LLM encounters an error."""
-         try:
-             self._log_debug_event("on_llm_error", run_id, parent_run_id, error=error)
-             self._end_span(run_id=run_id, error=error)
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_llm_error: {e}")
-
-     # Tool callbacks
-     def on_tool_start(
-         self,
-         serialized: Optional[Dict[str, Any]],
-         input_str: str,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when a tool starts executing."""
-         try:
-             self._log_debug_event(
-                 "on_tool_start", run_id, parent_run_id, input_str=input_str
-             )
-
-             name = self._get_run_name(serialized, **kwargs)
-             if name not in self.executed_tools:
-                 self.executed_tools.append(name)
-
-             self._start_span(
-                 run_id=run_id,
-                 parent_run_id=parent_run_id,
-                 name=name,
-                 span_type="tool",
-                 inputs=input_str,
-                 tags=tags,
-                 metadata=metadata,
-                 serialized=safe_serialize(serialized) if serialized else None,
-             )
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_tool_start: {e}")
-
-     def on_tool_end(
-         self,
-         output: str,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when a tool finishes executing."""
-         try:
-             self._log_debug_event("on_tool_end", run_id, parent_run_id, output=output)
-             self._end_span(run_id=run_id, outputs=output)
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_tool_end: {e}")
-
-     def on_tool_error(
-         self,
-         error: BaseException,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when a tool encounters an error."""
-         try:
-             self._log_debug_event("on_tool_error", run_id, parent_run_id, error=error)
-             self._end_span(run_id=run_id, error=error)
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_tool_error: {e}")
-
-     # Agent callbacks
-     def on_agent_action(
-         self,
-         action: AgentAction,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when an agent takes an action."""
-         try:
-             self._log_debug_event(
-                 "on_agent_action", run_id, parent_run_id, action=action
-             )
-
-             if run_id in self.spans:
-                 span = self.spans[run_id]
-                 span.set_attribute("agent.action.tool", action.tool)
-                 span.set_attribute(
-                     "agent.action.tool_input", safe_serialize(action.tool_input)
-                 )
-                 span.set_attribute("agent.action.log", action.log)
-
-             self._end_span(
-                 run_id=run_id,
-                 outputs={"action": action.tool, "input": action.tool_input},
-             )
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_agent_action: {e}")
-
-     def on_agent_finish(
-         self,
-         finish: AgentFinish,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when an agent finishes."""
-         try:
-             self._log_debug_event(
-                 "on_agent_finish", run_id, parent_run_id, finish=finish
-             )
-
-             if run_id in self.spans:
-                 span = self.spans[run_id]
-                 span.set_attribute("agent.finish.log", finish.log)
-
-             self._end_span(run_id=run_id, outputs=finish.return_values)
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_agent_finish: {e}")
-
-     # Retriever callbacks
-     def on_retriever_start(
-         self,
-         serialized: Optional[Dict[str, Any]],
-         query: str,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         tags: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when a retriever starts."""
-         try:
-             self._log_debug_event(
-                 "on_retriever_start", run_id, parent_run_id, query=query
-             )
-
-             name = self._get_run_name(serialized, **kwargs)
-
-             self._start_span(
-                 run_id=run_id,
-                 parent_run_id=parent_run_id,
-                 name=name,
-                 span_type="retriever",
-                 inputs=query,
-                 tags=tags,
-                 metadata=metadata,
-                 serialized=safe_serialize(serialized) if serialized else None,
-             )
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_retriever_start: {e}")
-
-     def on_retriever_end(
-         self,
-         documents: Sequence[Document],
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when a retriever finishes."""
-         try:
-             self._log_debug_event(
-                 "on_retriever_end", run_id, parent_run_id, documents=documents
-             )
-
-             # Convert documents to serializable format
-             doc_data = [
-                 {"page_content": doc.page_content, "metadata": doc.metadata}
-                 for doc in documents
-             ]
-
-             if run_id in self.spans:
-                 span = self.spans[run_id]
-                 span.set_attribute("retriever.document_count", len(documents))
-
-             self._end_span(
-                 run_id=run_id, outputs=doc_data, document_count=len(documents)
-             )
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_retriever_end: {e}")
-
-     def on_retriever_error(
-         self,
-         error: BaseException,
-         *,
-         run_id: UUID,
-         parent_run_id: Optional[UUID] = None,
-         **kwargs: Any,
-     ) -> Any:
-         """Called when a retriever encounters an error."""
-         try:
-             self._log_debug_event(
-                 "on_retriever_error", run_id, parent_run_id, error=error
-             )
-             self._end_span(run_id=run_id, error=error)
-         except Exception as e:
-             judgeval_logger.exception(f"Error in on_retriever_error: {e}")
-
-     def _extract_model_name(
-         self, serialized: Optional[Dict[str, Any]], kwargs: Dict[str, Any]
-     ) -> Optional[str]:
-         """Extract model name from serialized data or kwargs."""
-         # Try to get from invocation params
-         invocation_params = kwargs.get("invocation_params", {})
-         if "model_name" in invocation_params:
-             return invocation_params["model_name"]
-         if "model" in invocation_params:
-             return invocation_params["model"]
-
-         # Try to get from serialized data
-         if serialized:
-             if "model_name" in serialized:
-                 return serialized["model_name"]
-             if "model" in serialized:
-                 return serialized["model"]
-
-         return None
-
-
- __all__ = ["JudgevalCallbackHandler"]
+ class Langgraph(ABC):
+     @staticmethod
+     def initialize(otel_only: bool = True):
+         os.environ["LANGSMITH_OTEL_ENABLED"] = "true"
+         os.environ["LANGSMITH_TRACING"] = "true"
+         if otel_only:
+             os.environ["LANGSMITH_OTEL_ONLY"] = "true"
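
In effect, 0.13.0 replaces the ~790-line JudgevalCallbackHandler with a thin switch that enables LangSmith's OpenTelemetry export, letting LangGraph spans flow through whatever OTel pipeline the judgeval tracer configures. A minimal usage sketch of the new API, assuming judgeval's Tracer still accepts a project_name keyword as in prior releases; the project name and graph invocation are illustrative, not taken from this diff:

from judgeval.integrations.langgraph import Langgraph
from judgeval.tracer import Tracer

# Must run before the graph executes: sets LANGSMITH_OTEL_ENABLED and
# LANGSMITH_TRACING, and with otel_only=True also LANGSMITH_OTEL_ONLY,
# so spans go only to the OTel exporters rather than to LangSmith as well.
Langgraph.initialize(otel_only=True)

tracer = Tracer(project_name="langgraph-demo")  # hypothetical project name

# graph = build_graph()            # your compiled LangGraph graph
# graph.invoke({"messages": []})   # spans arrive without a callback handler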