deepeval 3.6.4__py3-none-any.whl → 3.6.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. deepeval/__init__.py +42 -10
  2. deepeval/_version.py +1 -1
  3. deepeval/config/logging.py +33 -0
  4. deepeval/config/settings.py +167 -12
  5. deepeval/dataset/dataset.py +8 -2
  6. deepeval/evaluate/evaluate.py +8 -2
  7. deepeval/evaluate/execute.py +28 -30
  8. deepeval/evaluate/types.py +4 -1
  9. deepeval/evaluate/utils.py +46 -29
  10. deepeval/integrations/crewai/__init__.py +1 -2
  11. deepeval/integrations/crewai/handler.py +153 -81
  12. deepeval/integrations/crewai/wrapper.py +87 -0
  13. deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
  14. deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
  15. deepeval/metrics/faithfulness/faithfulness.py +8 -0
  16. deepeval/metrics/g_eval/g_eval.py +26 -15
  17. deepeval/metrics/prompt_alignment/prompt_alignment.py +41 -23
  18. deepeval/models/retry_policy.py +202 -11
  19. deepeval/test_run/__init__.py +2 -1
  20. deepeval/test_run/api.py +1 -0
  21. deepeval/test_run/test_run.py +85 -9
  22. deepeval/tracing/__init__.py +2 -0
  23. deepeval/tracing/otel/exporter.py +0 -6
  24. deepeval/tracing/otel/test_exporter.py +35 -0
  25. deepeval/tracing/otel/utils.py +57 -7
  26. deepeval/tracing/trace_context.py +14 -0
  27. deepeval/tracing/trace_test_manager.py +19 -0
  28. deepeval/tracing/tracing.py +7 -6
  29. deepeval/tracing/utils.py +2 -86
  30. deepeval/utils.py +149 -1
  31. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/METADATA +1 -1
  32. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/RECORD +35 -31
  33. deepeval/integrations/crewai/agent.py +0 -98
  34. deepeval/integrations/crewai/patch.py +0 -41
  35. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/LICENSE.md +0 -0
  36. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/WHEEL +0 -0
  37. {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/entry_points.txt +0 -0
deepeval/evaluate/utils.py

@@ -1,9 +1,10 @@
 import ast
 import inspect
-from typing import Optional, List, Callable, Union, Dict
-import os, time
-
+from typing import Optional, List, Callable, Union
+import os
+import time
 
+from deepeval.utils import format_turn
 from deepeval.test_case.conversational_test_case import Turn
 from deepeval.test_run.api import TurnApi
 from deepeval.test_run.test_run import TestRunResultDisplay
@@ -34,6 +35,29 @@ from deepeval.tracing.utils import (
 )
 
 
+def _is_metric_successful(metric_data: MetricData) -> bool:
+    """
+    Robustly determine success for a metric row.
+
+    Rationale:
+    - If the metric recorded an error, treat as failure.
+    - Be defensive: custom rows may not be MetricData at runtime.
+    """
+    if getattr(metric_data, "error", None):
+        return False
+
+    s = getattr(metric_data, "success", None)
+    if isinstance(s, bool):
+        return s
+    if s is None:
+        return False
+    if isinstance(s, (int, float)):
+        return bool(s)
+    if isinstance(s, str):
+        return s.strip().lower() in {"true", "t", "1", "yes", "y"}
+    return False
+
+
 def create_metric_data(metric: BaseMetric) -> MetricData:
     if metric.error is not None:
         return MetricData(
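Since the old inline try/except logic is now centralized, the helper's coercion rules are easy to exercise directly. A minimal sketch; the rows below are illustrative stand-ins rather than real MetricData objects, which works because the helper only reads the error and success attributes via getattr (note the underscore prefix: it is a private helper of deepeval/evaluate/utils.py):

from types import SimpleNamespace

from deepeval.evaluate.utils import _is_metric_successful

rows = [
    SimpleNamespace(error=None, success=True),             # True: bools pass through
    SimpleNamespace(error=None, success="YES"),            # True: strings are normalized
    SimpleNamespace(error=None, success=0),                # False: numbers are coerced
    SimpleNamespace(error=None, success=None),             # False: undefined success
    SimpleNamespace(error="LLM timed out", success=True),  # False: an error always fails the row
]

for row in rows:
    print(_is_metric_successful(row))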
@@ -75,6 +99,7 @@ def create_test_result(
             metrics_data=api_test_case.metrics_data,
             conversational=True,
             additional_metadata=api_test_case.additional_metadata,
+            turns=api_test_case.turns,
         )
     else:
         multimodal = (
@@ -112,6 +137,7 @@ def create_api_turn(turn: Turn, index: int) -> TurnApi:
     return TurnApi(
         role=turn.role,
         content=turn.content,
+        user_id=turn.user_id,
         retrievalContext=turn.retrieval_context,
         toolsCalled=turn.tools_called,
         additionalMetadata=turn.additional_metadata,
@@ -372,17 +398,7 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
     print("Metrics Summary\n")
 
     for metric_data in test_result.metrics_data:
-        successful = True
-        if metric_data.error is not None:
-            successful = False
-        else:
-            # This try block is for user defined custom metrics,
-            # which might not handle the score == undefined case elegantly
-            try:
-                if not metric_data.success:
-                    successful = False
-            except:
-                successful = False
+        successful = _is_metric_successful(metric_data)
 
         if not successful:
             print(
@@ -401,9 +417,14 @@ def print_test_result(test_result: TestResult, display: TestRunResultDisplay):
 
     elif test_result.conversational:
         print("For conversational test case:\n")
-        print(
-            f" - Unable to print conversational test case. Run 'deepeval login' to view conversational evaluations in full."
-        )
+        if test_result.turns:
+            print(" Turns:")
+            turns = sorted(test_result.turns, key=lambda t: t.order)
+            for t in turns:
+                print(format_turn(t))
+        else:
+            print(" - No turns recorded in this test case.")
+
     else:
         print("For test case:\n")
         print(f" - input: {test_result.input}")
@@ -470,15 +491,7 @@ def write_test_result_to_file(
         file.write("Metrics Summary\n\n")
 
         for metric_data in test_result.metrics_data:
-            successful = True
-            if metric_data.error is not None:
-                successful = False
-            else:
-                try:
-                    if not metric_data.success:
-                        successful = False
-                except:
-                    successful = False
+            successful = _is_metric_successful(metric_data)
 
             if not successful:
                 file.write(
@@ -500,9 +513,13 @@ def write_test_result_to_file(
             file.write(f" - actual output: {test_result.actual_output}\n")
         elif test_result.conversational:
             file.write("For conversational test case:\n\n")
-            file.write(
-                " - Unable to print conversational test case. Run 'deepeval login' to view conversational evaluations in full.\n"
-            )
+            if test_result.turns:
+                file.write(" Turns:\n")
+                turns = sorted(test_result.turns, key=lambda t: t.order)
+                for t in turns:
+                    file.write(format_turn(t) + "\n")
+            else:
+                file.write(" - No turns recorded in this test case.\n")
         else:
             file.write("For test case:\n\n")
             file.write(f" - input: {test_result.input}\n")
deepeval/integrations/crewai/__init__.py

@@ -1,4 +1,3 @@
 from .handler import instrument_crewai
-from .agent import Agent
 
-__all__ = ["instrument_crewai", "Agent"]
+__all__ = ["instrument_crewai"]
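With the patched Agent re-export gone (agent.py and patch.py are deleted in this release), code that imported Agent from the integration package must switch to CrewAI's own class; only instrument_crewai remains exported. A minimal sketch of the surviving import surface:

from crewai import Agent  # CrewAI's stock Agent; deepeval no longer ships a patched subclass
from deepeval.integrations.crewai import instrument_crewai  # still exported

# from deepeval.integrations.crewai import Agent  # removed in 3.6.6, now raises ImportError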
deepeval/integrations/crewai/handler.py

@@ -1,30 +1,50 @@
-from typing import Optional
+import logging
 import deepeval
-from deepeval.integrations.crewai.agent import (
-    Agent as PatchedAgent,
-    agent_registry,
-)
-from deepeval.integrations.crewai.patch import patch_build_context_for_task
+
+from typing import Optional
 from deepeval.telemetry import capture_tracing_integration
-from deepeval.tracing.types import AgentSpan, LlmSpan
+from deepeval.tracing.context import current_span_context, current_trace_context
+from deepeval.tracing.tracing import Observer
+from deepeval.tracing.types import LlmSpan
+from deepeval.config.settings import get_settings
+
+
+logger = logging.getLogger(__name__)
+
 
 try:
-    from crewai.crew import Crew
-    from crewai.llm import LLM
-    from crewai.agent import Agent
-    from crewai.utilities.events import AgentExecutionCompletedEvent
     from crewai.utilities.events.base_event_listener import BaseEventListener
-    from crewai.task import Task
-    from crewai.agents.crew_agent_executor import CrewAgentExecutor
-    from crewai.utilities.events import ToolUsageFinishedEvent
-    from crewai.tools.tool_usage import ToolUsage
-    from crewai.utilities.events import LLMCallCompletedEvent
-    from crewai.memory.contextual.contextual_memory import ContextualMemory
+    from crewai.events import (
+        CrewKickoffStartedEvent,
+        CrewKickoffCompletedEvent,
+        LLMCallStartedEvent,
+        LLMCallCompletedEvent,
+        AgentExecutionStartedEvent,
+        AgentExecutionCompletedEvent,
+        ToolUsageStartedEvent,
+        ToolUsageFinishedEvent,
+    )
 
     crewai_installed = True
-except:
+except ImportError as e:
+    if get_settings().DEEPEVAL_VERBOSE_MODE:
+        if isinstance(e, ModuleNotFoundError):
+            logger.warning(
+                "Optional crewai dependency not installed: %s",
+                e.name,
+                stacklevel=2,
+            )
+        else:
+            logger.warning(
+                "Optional crewai import failed: %s",
+                e,
+                stacklevel=2,
+            )
+
     crewai_installed = False
 
+IS_WRAPPED_ALL = False
+
 
 def is_crewai_installed():
     if not crewai_installed:
@@ -33,81 +53,114 @@ def is_crewai_installed():
         )
 
 
-from deepeval.test_case.llm_test_case import LLMTestCase
-from deepeval.tracing.tracing import (
-    observe,
-    current_span_context,
-    trace_manager,
-    current_trace_context,
-)
-
-
 class CrewAIEventsListener(BaseEventListener):
     def __init__(self):
         is_crewai_installed()
         super().__init__()
+        self.span_observers: dict[str, Observer] = {}
 
-    def setup_listeners(self, crewai_event_bus):
+    @staticmethod
+    def get_tool_execution_id(source, event) -> str:
+        source_id = id(source)
+        task_id = getattr(event, "task_id", "unknown")
+        agent_id = getattr(event, "agent_id", "unknown")
+        tool_name = getattr(event, "tool_name", "unknown")
+        execution_id = f"tool_{source_id}_{task_id}_{agent_id}_{tool_name}"
 
-        @crewai_event_bus.on(AgentExecutionCompletedEvent)
-        def on_agent_execution_completed(
-            source, event: AgentExecutionCompletedEvent
-        ):
+        return execution_id
+
+    def setup_listeners(self, crewai_event_bus):
+        @crewai_event_bus.on(CrewKickoffStartedEvent)
+        def on_crew_started(source, event: CrewKickoffStartedEvent):
+            # Assuming that this event is called in the crew.kickoff method
            current_span = current_span_context.get()
 
-            if isinstance(current_span, AgentSpan):
-                if isinstance(source, Agent):
-                    current_span.name = source.role
-                    current_span.available_tools = [
-                        tool.name for tool in source.tools
-                    ]
+            # set the input
+            if current_span:
+                current_span.input = event.inputs
+
+            # set trace input
+            current_trace = current_trace_context.get()
+            if current_trace:
+                current_trace.input = event.inputs
+
+        @crewai_event_bus.on(CrewKickoffCompletedEvent)
+        def on_crew_completed(source, event: CrewKickoffCompletedEvent):
+            # Assuming that this event is called in the crew.kickoff method
+            current_span = current_span_context.get()
 
+            # set the output
             if current_span:
-                # set llm test case
-                input = None
-                actual_output = None
-                expected_output = None
-
-                if isinstance(event.task, Task):
-                    input = event.task.prompt()
-                    actual_output = event.output
-                    expected_output = event.task.expected_output
-
-                current_span.input = input
-                current_span.output = actual_output
-                current_span.expected_output = expected_output
-
-                # set metrics
-                if isinstance(source, PatchedAgent):
-                    current_span.metrics = agent_registry.get_metrics(source)
-                    current_span.metric_collection = (
-                        agent_registry.get_metric_collection(source)
-                    )
-
-                # set offline evals
-                if current_span.metric_collection:
-                    trace_manager.integration_traces_to_evaluate.append(
-                        current_trace_context.get()
-                    )
+                current_span.output = str(event.output)
 
-        @crewai_event_bus.on(ToolUsageFinishedEvent)
-        def on_tool_usage_finished(source, event: ToolUsageFinishedEvent):
+            # set trace output
+            current_trace = current_trace_context.get()
+            if current_trace:
+                current_trace.output = str(event.output)
+
+        @crewai_event_bus.on(LLMCallStartedEvent)
+        def on_llm_started(source, event: LLMCallStartedEvent):
+            # Assuming that this event is called in the llm.call method
             current_span = current_span_context.get()
-            current_span.input = event.tool_args
-            current_span.output = event.output
-            current_span.name = event.tool_name
+
+            # set the input
+            if current_span:
+                current_span.input = event.messages
+
+            # set the model
+            if isinstance(current_span, LlmSpan):
+                current_span.model = event.model
 
         @crewai_event_bus.on(LLMCallCompletedEvent)
-        def on_llm_call_finished(source, event: LLMCallCompletedEvent):
+        def on_llm_completed(source, event: LLMCallCompletedEvent):
+            # Assuming that this event is called in the llm.call method
             current_span = current_span_context.get()
 
-            if isinstance(current_span, LlmSpan):
-                if isinstance(source, LLM):
-                    current_span.model = source.model
-
-                current_span.input = event.messages
+            # set the output
+            if current_span:
                 current_span.output = event.response
 
+        @crewai_event_bus.on(AgentExecutionStartedEvent)
+        def on_agent_started(source, event: AgentExecutionStartedEvent):
+            # Assuming that this event is called in the agent.execute_task method
+            current_span = current_span_context.get()
+
+            # set the input
+            if current_span:
+                current_span.input = event.task_prompt
+
+        @crewai_event_bus.on(AgentExecutionCompletedEvent)
+        def on_agent_completed(source, event: AgentExecutionCompletedEvent):
+            # Assuming that this event is called in the agent.execute_task method
+            current_span = current_span_context.get()
+
+            # set the output
+            if current_span:
+                current_span.output = event.output
+
+        @crewai_event_bus.on(ToolUsageStartedEvent)
+        def on_tool_started(source, event: ToolUsageStartedEvent):
+            observer = Observer(
+                span_type="tool",
+                func_name=event.tool_name,
+                function_kwargs=event.tool_args,
+            )
+            self.span_observers[self.get_tool_execution_id(source, event)] = (
+                observer
+            )
+            observer.__enter__()
+
+        @crewai_event_bus.on(ToolUsageFinishedEvent)
+        def on_tool_completed(source, event: ToolUsageFinishedEvent):
+            observer = self.span_observers.pop(
+                self.get_tool_execution_id(source, event)
+            )
+            if observer:
+                current_span = current_span_context.get()
+                if current_span:
+                    current_span.output = event.output
+                observer.__exit__(None, None, None)
+
 
 def instrument_crewai(api_key: Optional[str] = None):
     is_crewai_installed()
@@ -115,10 +168,29 @@ def instrument_crewai(api_key: Optional[str] = None):
     if api_key:
         deepeval.login(api_key)
 
-    Crew.kickoff = observe(Crew.kickoff)
-    LLM.call = observe(LLM.call, type="llm", model="")
-    Agent.execute_task = observe(Agent.execute_task, type="agent")
-    CrewAgentExecutor.invoke = observe(CrewAgentExecutor.invoke)
-    ToolUsage.use = observe(ToolUsage.use, type="tool")
-    patch_build_context_for_task()
+    wrap_all()
+
     CrewAIEventsListener()
+
+
+def wrap_all():
+    global IS_WRAPPED_ALL
+
+    if not IS_WRAPPED_ALL:
+        from deepeval.integrations.crewai.wrapper import (
+            wrap_crew_kickoff,
+            wrap_crew_kickoff_for_each,
+            wrap_crew_kickoff_async,
+            wrap_crew_kickoff_for_each_async,
+            wrap_llm_call,
+            wrap_agent_execute_task,
+        )
+
+        wrap_crew_kickoff()
+        wrap_crew_kickoff_for_each()
+        wrap_crew_kickoff_async()
+        wrap_crew_kickoff_for_each_async()
+        wrap_llm_call()
+        wrap_agent_execute_task()
+
+        IS_WRAPPED_ALL = True
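Taken together, instrumentation no longer relies on a deepeval-specific Agent subclass or per-method observe() calls: instrument_crewai() wraps the CrewAI entry points once and the event listeners fill in span inputs, outputs and models. A hedged end-to-end sketch; the agent/task constructor arguments and the model configuration are the usual CrewAI ones and are not part of this diff:

from crewai import Agent, Crew, Task
from deepeval.integrations.crewai import instrument_crewai

instrument_crewai()  # wraps Crew.kickoff, LLM.call, Agent.execute_task and registers CrewAIEventsListener

researcher = Agent(role="Researcher", goal="Answer questions", backstory="Illustrative only")
task = Task(description="Summarize {topic}", expected_output="A short summary", agent=researcher)
crew = Crew(agents=[researcher], tasks=[task])

# The wrapped kickoff opens a "crew" span; the CrewKickoffStarted/Completed,
# LLMCallStarted/Completed, AgentExecutionStarted/Completed and ToolUsage*
# events then populate inputs, outputs and the model name on the active spans.
result = crew.kickoff(inputs={"topic": "observability"})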
deepeval/integrations/crewai/wrapper.py

@@ -0,0 +1,87 @@
+from crewai.llm import LLM
+from crewai.crew import Crew
+from crewai.agent import Agent
+from functools import wraps
+from deepeval.tracing.tracing import Observer
+
+
+def wrap_crew_kickoff():
+    original_kickoff = Crew.kickoff
+
+    @wraps(original_kickoff)
+    def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff"):
+            result = original_kickoff(self, *args, **kwargs)
+
+        return result
+
+    Crew.kickoff = wrapper
+
+
+def wrap_crew_kickoff_for_each():
+    original_kickoff_for_each = Crew.kickoff_for_each
+
+    @wraps(original_kickoff_for_each)
+    def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff_for_each"):
+            result = original_kickoff_for_each(self, *args, **kwargs)
+
+        return result
+
+    Crew.kickoff_for_each = wrapper
+
+
+def wrap_crew_kickoff_async():
+    original_kickoff_async = Crew.kickoff_async
+
+    @wraps(original_kickoff_async)
+    async def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff_async"):
+            result = await original_kickoff_async(self, *args, **kwargs)
+
+        return result
+
+    Crew.kickoff_async = wrapper
+
+
+def wrap_crew_kickoff_for_each_async():
+    original_kickoff_for_each_async = Crew.kickoff_for_each_async
+
+    @wraps(original_kickoff_for_each_async)
+    async def wrapper(self, *args, **kwargs):
+        with Observer(span_type="crew", func_name="kickoff_for_each_async"):
+            result = await original_kickoff_for_each_async(
+                self, *args, **kwargs
+            )
+
+        return result
+
+    Crew.kickoff_for_each_async = wrapper
+
+
+def wrap_llm_call():
+    original_llm_call = LLM.call
+
+    @wraps(original_llm_call)
+    def wrapper(self, *args, **kwargs):
+        with Observer(
+            span_type="llm",
+            func_name="call",
+            observe_kwargs={"model": "temp_model"},
+        ):
+            result = original_llm_call(self, *args, **kwargs)
+        return result
+
+    LLM.call = wrapper
+
+
+def wrap_agent_execute_task():
+    original_execute_task = Agent.execute_task
+
+    @wraps(original_execute_task)
+    def wrapper(self, *args, **kwargs):
+        with Observer(span_type="agent", func_name="execute_task"):
+            result = original_execute_task(self, *args, **kwargs)
+        return result
+
+    Agent.execute_task = wrapper
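Each helper follows the same save-original / @wraps / Observer-as-context-manager pattern, so covering another method is mechanical. A generic sketch of that pattern; the observe_method helper below is hypothetical and not shipped in this release:

from functools import wraps

from deepeval.tracing.tracing import Observer


def observe_method(cls, method_name: str, span_type: str):
    # Hypothetical helper mirroring wrapper.py: keep a reference to the
    # original method, wrap it so a span covers the call, then rebind it.
    original = getattr(cls, method_name)

    @wraps(original)
    def wrapper(self, *args, **kwargs):
        with Observer(span_type=span_type, func_name=method_name):
            result = original(self, *args, **kwargs)
        return result

    setattr(cls, method_name, wrapper)

Note that wrap_all() in handler.py gates the shipped wrappers behind the module-level IS_WRAPPED_ALL flag, so calling instrument_crewai() more than once does not stack them.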
deepeval/integrations/pydantic_ai/instrumentator.py

@@ -1,7 +1,19 @@
 import json
+import logging
 import os
 from typing import Literal, Optional, List
 
+from deepeval.config.settings import get_settings
+from deepeval.confident.api import get_confident_api_key
+from deepeval.prompt import Prompt
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
+
+
+logger = logging.getLogger(__name__)
+
+
 try:
     from pydantic_ai.models.instrumented import InstrumentationSettings
     from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
@@ -11,7 +23,20 @@ try:
     )
 
     dependency_installed = True
-except:
+except ImportError as e:
+    if get_settings().DEEPEVAL_VERBOSE_MODE:
+        if isinstance(e, ModuleNotFoundError):
+            logger.warning(
+                "Optional tracing dependency not installed: %s",
+                e.name,
+                stacklevel=2,
+            )
+        else:
+            logger.warning(
+                "Optional tracing import failed: %s",
+                e,
+                stacklevel=2,
+            )
     dependency_installed = False
 
 
@@ -25,6 +50,10 @@ def is_dependency_installed():
 
 from deepeval.confident.api import get_confident_api_key
 from deepeval.prompt import Prompt
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
 
 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
@@ -37,6 +66,12 @@ class SpanInterceptor(SpanProcessor):
 
     def on_start(self, span, parent_context):
 
+        # set trace uuid
+        _current_trace_context = current_trace_context.get()
+        if _current_trace_context and isinstance(_current_trace_context, Trace):
+            _otel_trace_id = span.get_span_context().trace_id
+            _current_trace_context.uuid = to_hex_string(_otel_trace_id, 32)
+
         # set trace attributes
         if self.settings.thread_id:
             span.set_attribute(
@@ -148,8 +183,9 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         confident_prompt: Optional[Prompt] = None,
         llm_metric_collection: Optional[str] = None,
         agent_metric_collection: Optional[str] = None,
-        tool_metric_collection_map: dict = {},
+        tool_metric_collection_map: Optional[dict] = None,
         trace_metric_collection: Optional[str] = None,
+        is_test_mode: Optional[bool] = False,
     ):
         is_dependency_installed()
 
@@ -162,7 +198,7 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         ]:
             self.environment = _environment
 
-        self.tool_metric_collection_map = tool_metric_collection_map
+        self.tool_metric_collection_map = tool_metric_collection_map or {}
         self.name = name
         self.thread_id = thread_id
         self.user_id = user_id
@@ -185,12 +221,15 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         span_interceptor = SpanInterceptor(self)
         trace_provider.add_span_processor(span_interceptor)
 
-        trace_provider.add_span_processor(
-            BatchSpanProcessor(
-                OTLPSpanExporter(
-                    endpoint=OTLP_ENDPOINT,
-                    headers={"x-confident-api-key": api_key},
+        if is_test_mode:
+            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
+        else:
+            trace_provider.add_span_processor(
+                BatchSpanProcessor(
+                    OTLPSpanExporter(
+                        endpoint=OTLP_ENDPOINT,
+                        headers={"x-confident-api-key": api_key},
+                    )
                 )
             )
-        )
         super().__init__(tracer_provider=trace_provider)
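A hedged usage sketch for the new is_test_mode switch. The import path simply mirrors this module's location, and passing the settings via Agent(..., instrument=...) is pydantic-ai's usual InstrumentationSettings hookup rather than something this diff adds; the constructor may still expect a Confident API key or prior login:

from pydantic_ai import Agent

from deepeval.integrations.pydantic_ai.instrumentator import (
    ConfidentInstrumentationSettings,
)

# With is_test_mode=True, spans are routed to the in-memory test_exporter via
# a BatchSpanProcessor instead of the Confident OTLP endpoint.
settings = ConfidentInstrumentationSettings(
    name="unit-test-trace",
    is_test_mode=True,
)
agent = Agent("openai:gpt-4o-mini", instrument=settings)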
deepeval/metrics/faithfulness/faithfulness.py

@@ -41,6 +41,7 @@ class FaithfulnessMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
         truths_extraction_limit: Optional[int] = None,
+        penalize_ambiguous_claims: bool = False,
         evaluation_template: Type[FaithfulnessTemplate] = FaithfulnessTemplate,
     ):
         self.threshold = 1 if strict_mode else threshold
@@ -51,6 +52,7 @@
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
         self.evaluation_template = evaluation_template
+        self.penalize_ambiguous_claims = penalize_ambiguous_claims
 
         self.truths_extraction_limit = truths_extraction_limit
         if self.truths_extraction_limit is not None:
@@ -329,6 +331,12 @@
             if verdict.verdict.strip().lower() != "no":
                 faithfulness_count += 1
 
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
         score = faithfulness_count / number_of_verdicts
         return 0 if self.strict_mode and score < self.threshold else score
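The new flag only changes the score arithmetic in the verdict loop: an "idk" verdict still increments the count (it is not "no"), but with penalize_ambiguous_claims=True it also subtracts one, cancelling itself out. A small worked example with illustrative verdict counts:

from deepeval.metrics import FaithfulnessMetric

# Suppose the judge returns 8 x "yes", 1 x "idk", 1 x "no" (10 verdicts):
#   default behaviour:          score = (8 + 1) / 10     = 0.90  ("idk" counts as faithful)
#   penalize_ambiguous_claims:  score = (8 + 1 - 1) / 10 = 0.80  (each "idk" is cancelled)
metric = FaithfulnessMetric(threshold=0.8, penalize_ambiguous_claims=True)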