deepeval 3.6.4__py3-none-any.whl → 3.6.5__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,7 +1,19 @@
 import json
+import logging
 import os
 from typing import Literal, Optional, List
 
+from deepeval.config.settings import get_settings
+from deepeval.confident.api import get_confident_api_key
+from deepeval.prompt import Prompt
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
+
+
+logger = logging.getLogger(__name__)
+
+
 try:
     from pydantic_ai.models.instrumented import InstrumentationSettings
     from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
@@ -11,7 +23,20 @@ try:
     )
 
     dependency_installed = True
-except:
+except ImportError as e:
+    if get_settings().DEEPEVAL_VERBOSE_MODE:
+        if isinstance(e, ModuleNotFoundError):
+            logger.warning(
+                "Optional tracing dependency not installed: %s",
+                e.name,
+                stacklevel=2,
+            )
+        else:
+            logger.warning(
+                "Optional tracing import failed: %s",
+                e,
+                stacklevel=2,
+            )
     dependency_installed = False
 
 
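Note on the hunk above: the bare `except:` around the optional pydantic-ai/OpenTelemetry imports is narrowed to `ImportError`, with a warning emitted only in verbose mode. A standalone sketch of the pattern (the dependency name and the `VERBOSE` flag are illustrative stand-ins, not deepeval's actual settings object):

    import logging

    logger = logging.getLogger(__name__)
    VERBOSE = True  # stand-in for get_settings().DEEPEVAL_VERBOSE_MODE

    try:
        import some_optional_dependency  # hypothetical optional extra
        dependency_installed = True
    except ImportError as e:
        if VERBOSE:
            if isinstance(e, ModuleNotFoundError):
                # ModuleNotFoundError carries the missing module's name
                logger.warning("Optional dependency not installed: %s", e.name)
            else:
                logger.warning("Optional import failed: %s", e)
        dependency_installed = False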
@@ -25,6 +50,10 @@ def is_dependency_installed():
 
 from deepeval.confident.api import get_confident_api_key
 from deepeval.prompt import Prompt
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
 
 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
@@ -37,6 +66,12 @@ class SpanInterceptor(SpanProcessor):
 
     def on_start(self, span, parent_context):
 
+        # set trace uuid
+        _current_trace_context = current_trace_context.get()
+        if _current_trace_context and isinstance(_current_trace_context, Trace):
+            _otel_trace_id = span.get_span_context().trace_id
+            _current_trace_context.uuid = to_hex_string(_otel_trace_id, 32)
+
         # set trace attributes
         if self.settings.thread_id:
             span.set_attribute(
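The `on_start` hook now copies the OpenTelemetry trace id onto the active deepeval `Trace`. `to_hex_string` itself is not shown in this diff; assuming it zero-pads the integer id to a fixed-width lowercase hex string, the equivalent formatting would be:

    # Assumed behavior of to_hex_string(trace_id, 32): OTel trace ids are
    # 128-bit integers, rendered as 32 hex characters (W3C trace id format).
    trace_id = 0x0123456789ABCDEF0123456789ABCDEF
    print(format(trace_id, "032x"))  # '0123456789abcdef0123456789abcdef'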
@@ -148,8 +183,9 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         confident_prompt: Optional[Prompt] = None,
         llm_metric_collection: Optional[str] = None,
         agent_metric_collection: Optional[str] = None,
-        tool_metric_collection_map: dict = {},
+        tool_metric_collection_map: Optional[dict] = None,
         trace_metric_collection: Optional[str] = None,
+        is_test_mode: Optional[bool] = False,
     ):
         is_dependency_installed()
 
@@ -162,7 +198,7 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         ]:
             self.environment = _environment
 
-        self.tool_metric_collection_map = tool_metric_collection_map
+        self.tool_metric_collection_map = tool_metric_collection_map or {}
         self.name = name
         self.thread_id = thread_id
         self.user_id = user_id
@@ -185,12 +221,15 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         span_interceptor = SpanInterceptor(self)
         trace_provider.add_span_processor(span_interceptor)
 
-        trace_provider.add_span_processor(
-            BatchSpanProcessor(
-                OTLPSpanExporter(
-                    endpoint=OTLP_ENDPOINT,
-                    headers={"x-confident-api-key": api_key},
+        if is_test_mode:
+            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
+        else:
+            trace_provider.add_span_processor(
+                BatchSpanProcessor(
+                    OTLPSpanExporter(
+                        endpoint=OTLP_ENDPOINT,
+                        headers={"x-confident-api-key": api_key},
+                    )
                 )
             )
-        )
         super().__init__(tracer_provider=trace_provider)
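Taken together, the constructor changes above replace the mutable `{}` default for `tool_metric_collection_map` with `None` and add an `is_test_mode` flag that swaps the OTLP exporter for the in-memory `test_exporter` defined later in this diff. A hedged usage sketch; the import path is assumed, not confirmed by this diff:

    # Hypothetical import path for the pydantic-ai integration.
    from deepeval.integrations.pydantic_ai import ConfidentInstrumentationSettings

    # With is_test_mode=True, spans go to the in-memory test_exporter
    # instead of the Confident OTLP endpoint.
    settings = ConfidentInstrumentationSettings(is_test_mode=True)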
@@ -41,6 +41,7 @@ class FaithfulnessMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
         truths_extraction_limit: Optional[int] = None,
+        penalize_ambiguous_claims: bool = False,
         evaluation_template: Type[FaithfulnessTemplate] = FaithfulnessTemplate,
     ):
         self.threshold = 1 if strict_mode else threshold
@@ -51,6 +52,7 @@ class FaithfulnessMetric(BaseMetric):
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
         self.evaluation_template = evaluation_template
+        self.penalize_ambiguous_claims = penalize_ambiguous_claims
 
         self.truths_extraction_limit = truths_extraction_limit
         if self.truths_extraction_limit is not None:
@@ -329,6 +331,12 @@ class FaithfulnessMetric(BaseMetric):
             if verdict.verdict.strip().lower() != "no":
                 faithfulness_count += 1
 
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
         score = faithfulness_count / number_of_verdicts
         return 0 if self.strict_mode and score < self.threshold else score
 
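The scoring arithmetic now optionally penalizes ambiguous claims: any verdict other than "no" still counts toward `faithfulness_count`, but with `penalize_ambiguous_claims=True` each "idk" verdict also subtracts one. A standalone worked example of the rule (plain arithmetic, not the metric class itself):

    verdicts = ["yes", "idk", "no"]
    penalize_ambiguous_claims = True

    count = sum(1 for v in verdicts if v != "no")        # "yes" and "idk" pass -> 2
    if penalize_ambiguous_claims:
        count -= sum(1 for v in verdicts if v == "idk")  # each "idk" subtracts 1 -> 1

    score = count / len(verdicts)  # 1/3 with the penalty, 2/3 without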
@@ -11,7 +11,7 @@ from .test_run import (
 )
 
 from .hooks import on_test_run_end, invoke_test_run_end_hook
-from .api import MetricData
+from .api import MetricData, TurnApi
 from .hyperparameters import log_hyperparameters
 
 
@@ -28,5 +28,6 @@ __all__ = [
     "on_test_run_end",
     "invoke_test_run_end_hook",
     "MetricData",
+    "TurnApi",
     "log_hyperparameters",
 ]
deepeval/test_run/api.py CHANGED
@@ -99,6 +99,7 @@ class TurnApi(BaseModel):
     role: str
     content: str
     order: int
+    user_id: Optional[str] = Field(None, alias="userId")
     retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
     tools_called: Optional[List[ToolCall]] = Field(None, alias="toolsCalled")
     additional_metadata: Optional[Dict] = Field(
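Since `TurnApi` is now exported from `deepeval.test_run` (see the `__all__` hunk above), the new optional `user_id` field can be populated through its camelCase alias. A minimal sketch, assuming pydantic's default alias handling:

    from deepeval.test_run import TurnApi

    turn = TurnApi(role="user", content="Hello", order=0, userId="user-123")
    print(turn.user_id)  # "user-123"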
@@ -2,9 +2,8 @@ from enum import Enum
 import os
 import json
 from pydantic import BaseModel, Field
-from typing import Any, Optional, List, Dict, Union
+from typing import Any, Optional, List, Dict, Union, Tuple
 import shutil
-import webbrowser
 import sys
 import datetime
 import portalocker
@@ -27,6 +26,9 @@ from deepeval.utils import (
     delete_file_if_exists,
     get_is_running_deepeval,
     open_browser,
+    shorten,
+    format_turn,
+    len_short,
 )
 from deepeval.test_run.cache import global_test_run_cache_manager
 from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
@@ -546,7 +548,7 @@ class TestRunManager:
 
             if (
                 display == TestRunResultDisplay.PASSING
-                and test_case.success == False
+                and test_case.success is False
             ):
                 continue
             elif display == TestRunResultDisplay.FAILING and test_case.success:
@@ -618,7 +620,7 @@ class TestRunManager:
             ):
             if (
                 display == TestRunResultDisplay.PASSING
-                and conversational_test_case.success == False
+                and conversational_test_case.success is False
             ):
                 continue
             elif (
@@ -631,6 +633,65 @@ class TestRunManager:
             fail_count = 0
             conversational_test_case_name = conversational_test_case.name
 
+            if conversational_test_case.turns:
+                turns_table = Table(
+                    title=f"Conversation - {conversational_test_case_name}",
+                    show_header=True,
+                    header_style="bold",
+                )
+                turns_table.add_column("#", justify="right", width=3)
+                turns_table.add_column("Role", justify="left", width=10)
+
+                # subtract fixed widths + borders and padding;
+                # ~20 as a safe buffer
+                details_max_width = max(
+                    48, min(120, console.width - 3 - 10 - 20)
+                )
+                turns_table.add_column(
+                    "Details",
+                    justify="left",
+                    overflow="fold",
+                    max_width=details_max_width,
+                )
+
+                # truncate when too long
+                tools_max_width = min(60, max(24, console.width // 3))
+                turns_table.add_column(
+                    "Tools",
+                    justify="left",
+                    no_wrap=True,
+                    overflow="ellipsis",
+                    max_width=tools_max_width,
+                )
+
+                sorted_turns = sorted(
+                    conversational_test_case.turns, key=lambda t: t.order
+                )
+
+                for t in sorted_turns:
+                    tools = t.tools_called or []
+                    tool_names = ", ".join(tc.name for tc in tools)
+
+                    # omit order, role and tools since we show them in separate columns.
+                    details = format_turn(
+                        t,
+                        include_tools_in_header=False,
+                        include_order_role_in_header=False,
+                    )
+
+                    turns_table.add_row(
+                        str(t.order),
+                        t.role,
+                        details,
+                        shorten(tool_names, len_short()),
+                    )
+
+                console.print(turns_table)
+            else:
+                console.print(
+                    f"[dim]No turns recorded for {conversational_test_case_name}.[/dim]"
+                )
+
             if conversational_test_case.metrics_data is not None:
                 for metric_data in conversational_test_case.metrics_data:
                     if metric_data.success:
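For reference, the column-width clamps in the new turns table work out as follows on a 120-column console:

    console_width = 120
    details_max_width = max(48, min(120, console_width - 3 - 10 - 20))  # -> 87
    tools_max_width = min(60, max(24, console_width // 3))              # -> 40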
@@ -698,7 +759,7 @@ class TestRunManager:
         )
         print(table)
 
-    def post_test_run(self, test_run: TestRun) -> Optional[str]:
+    def post_test_run(self, test_run: TestRun) -> Optional[Tuple[str, str]]:
         if (
             len(test_run.test_cases) == 0
             and len(test_run.conversational_test_cases) == 0
@@ -752,6 +813,21 @@ class TestRunManager:
                     body=body,
                 )
 
+                if not isinstance(data, dict) or "id" not in data:
+                    # try to show helpful details
+                    detail = None
+                    if isinstance(data, dict):
+                        detail = (
+                            data.get("detail")
+                            or data.get("message")
+                            or data.get("error")
+                        )
+                    # fall back to repr for visibility
+                    raise RuntimeError(
+                        f"Confident API response missing 'id'. "
+                        f"detail={detail!r} raw={type(data).__name__}:{repr(data)[:500]}"
+                    )
+
                 res = TestRunHttpResponse(
                     id=data["id"],
                 )
@@ -814,7 +890,7 @@ class TestRunManager:
             )
             self.save_final_test_run_link(link)
             open_browser(link)
-        return link
+        return link, res.id
 
     def save_test_run_locally(self):
         local_folder = os.getenv("DEEPEVAL_RESULTS_FOLDER")
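Because `post_test_run` (and, below, the wrap-up path) now returns a `(link, test_run_id)` tuple rather than a bare link string, callers need a small adjustment. A hedged sketch of a hypothetical caller; `test_run_manager` and `test_run` are assumed to exist:

    result = test_run_manager.post_test_run(test_run)
    if result is not None:
        link, test_run_id = result  # previously: link = result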
@@ -841,7 +917,7 @@ class TestRunManager:
         runDuration: float,
         display_table: bool = True,
         display: Optional[TestRunResultDisplay] = TestRunResultDisplay.ALL,
-    ) -> Optional[str]:
+    ) -> Optional[Tuple[str, str]]:
         test_run = self.get_test_run()
         if test_run is None:
             print("Test Run is empty, please try again.")
@@ -868,8 +944,8 @@ class TestRunManager:
         test_run.sort_test_cases()
 
         if global_test_run_cache_manager.disable_write_cache is None:
-            global_test_run_cache_manager.disable_write_cache = (
-                get_is_running_deepeval() == False
+            global_test_run_cache_manager.disable_write_cache = not bool(
+                get_is_running_deepeval()
             )
 
         global_test_run_cache_manager.wrap_up_cached_test_run()
@@ -4,6 +4,7 @@ from .context import (
     update_retriever_span,
     update_llm_span,
 )
+from .trace_context import trace
 from .types import BaseSpan, Trace
 from .tracing import observe, trace_manager
 from .offline_evals import evaluate_thread, evaluate_trace, evaluate_span
@@ -16,6 +17,7 @@ __all__ = [
     "BaseSpan",
     "Trace",
     "observe",
+    "trace",
     "trace_manager",
     "evaluate_thread",
     "evaluate_trace",
@@ -0,0 +1,35 @@
+from typing import List, Dict, Any, Sequence
+from opentelemetry.sdk.trace import ReadableSpan
+from opentelemetry.sdk.trace.export import SpanExporter
+from opentelemetry.sdk.trace.export import SpanExportResult
+import json
+from datetime import datetime
+
+
+class TestExporter(SpanExporter):
+    """In-memory exporter used in tests; it stores exported spans as a list of dictionaries."""
+
+    span_json_list: List[Dict[str, Any]] = []
+
+    def export(
+        self, spans: Sequence[ReadableSpan], timeout_millis: int = 30000
+    ) -> SpanExportResult:
+        for span in spans:
+            _span_json = json.loads(span.to_json())
+            self.span_json_list.append(_span_json)
+
+        return SpanExportResult.SUCCESS
+
+    def get_span_json_list(self) -> List[Dict[str, Any]]:
+        return sorted(
+            self.span_json_list,
+            key=lambda x: datetime.fromisoformat(
+                x["start_time"].replace("Z", "+00:00")
+            ),
+        )
+
+    def clear_span_json_list(self):
+        self.span_json_list = []
+
+
+test_exporter = TestExporter()
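A hedged usage sketch of the new in-memory exporter, wired through the standard OTel SDK the same way `is_test_mode` wires it above (a `SimpleSpanProcessor` is used here so the example flushes synchronously; the diff itself uses `BatchSpanProcessor`):

    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import SimpleSpanProcessor

    from deepeval.tracing.otel.test_exporter import test_exporter

    provider = TracerProvider()
    provider.add_span_processor(SimpleSpanProcessor(test_exporter))

    with provider.get_tracer(__name__).start_as_current_span("demo"):
        pass  # span ends here and is exported synchronously

    print(len(test_exporter.get_span_json_list()))  # 1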
@@ -0,0 +1,14 @@
+from .context import current_trace_context
+from .tracing import trace_manager
+from contextlib import contextmanager
+
+
+@contextmanager
+def trace():
+    current_trace = current_trace_context.get()
+
+    if not current_trace:
+        current_trace = trace_manager.start_new_trace()
+        current_trace_context.set(current_trace)
+
+    yield current_trace
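A minimal usage sketch of the new `trace` context manager, which the `__init__` hunk above re-exports from `deepeval.tracing`; it yields the active `Trace`, creating one via `trace_manager.start_new_trace()` when none is set:

    from deepeval.tracing import trace

    with trace() as current_trace:
        # current_trace is the active Trace; the SpanInterceptor hunk above
        # shows its uuid being synced to the OTel trace id (attribute assumed).
        print(current_trace.uuid)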
@@ -1,3 +1,4 @@
+import os
 from typing import Any, Dict, List, Literal, Optional, Set, Union, Callable
 from time import perf_counter
 import threading
@@ -47,13 +48,12 @@ from deepeval.tracing.utils import (
     tracing_enabled,
     validate_environment,
     validate_sampling_rate,
-    dump_body_to_json_file,
-    get_deepeval_trace_mode,
 )
 from deepeval.utils import dataclass_to_dict
 from deepeval.tracing.context import current_span_context, current_trace_context
 from deepeval.tracing.types import TestCaseMetricPair
 from deepeval.tracing.api import PromptApi
+from tests.test_integrations.manager import trace_testing_manager
 
 EVAL_DUMMY_SPAN_NAME = "evals_iterator"
 
@@ -183,13 +183,14 @@ class TraceManager:
         if trace.status == TraceSpanStatus.IN_PROGRESS:
             trace.status = TraceSpanStatus.SUCCESS
 
-        mode = get_deepeval_trace_mode()
-        if mode == "gen":
+        if trace_testing_manager.test_name:
+            # Trace testing mode is enabled.
+            # Instead of posting the trace to the queue, store it in this global variable.
             body = self.create_trace_api(trace).model_dump(
                 by_alias=True, exclude_none=True
             )
-            dump_body_to_json_file(body)
-            # Post the trace to the server before removing it
+            trace_testing_manager.test_dict = make_json_serializable(body)
+        # Post the trace to the server before removing it
         elif not self.evaluating:
             self.post_trace(trace)
         else:
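The end-of-trace path now keys off `trace_testing_manager.test_name` instead of the removed `--deepeval-trace-mode` CLI flag: when a test name is set, the serialized trace body is stored on the manager rather than posted. A hedged sketch of how a test might consume it (only the `test_name` and `test_dict` attributes are confirmed by this diff):

    from tests.test_integrations.manager import trace_testing_manager

    trace_testing_manager.test_name = "my_trace_test"  # enables capture
    # ... run code that produces a trace ...
    captured = trace_testing_manager.test_dict  # JSON-serializable trace body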
deepeval/tracing/utils.py CHANGED
@@ -1,13 +1,8 @@
 import os
-import inspect
-import json
-import sys
 from datetime import datetime, timezone
 from enum import Enum
 from time import perf_counter
 from collections import deque
-from typing import Any, Dict, Optional
-
 from deepeval.constants import CONFIDENT_TRACING_ENABLED
 
 
@@ -186,84 +181,5 @@ def perf_counter_to_datetime(perf_counter_value: float) -> datetime:
 def replace_self_with_class_name(obj):
     try:
         return f"<{obj.__class__.__name__}>"
-    except Exception:
-        return "<self>"
-
-
-def get_deepeval_trace_mode() -> Optional[str]:
-    deepeval_trace_mode = None
-    try:
-        args = sys.argv
-        for idx, arg in enumerate(args):
-            if isinstance(arg, str) and arg.startswith(
-                "--deepeval-trace-mode="
-            ):
-                deepeval_trace_mode = (
-                    arg.split("=", 1)[1].strip().strip('"').strip("'").lower()
-                )
-                break
-            if arg == "--deepeval-trace-mode" and idx + 1 < len(args):
-                deepeval_trace_mode = (
-                    str(args[idx + 1]).strip().strip('"').strip("'").lower()
-                )
-                break
-    except Exception:
-        deepeval_trace_mode = None
-
-    return deepeval_trace_mode
-
-
-def dump_body_to_json_file(
-    body: Dict[str, Any], file_path: Optional[str] = None
-) -> str:
-    entry_file = None
-    try:
-        cmd0 = sys.argv[0] if sys.argv else None
-        if cmd0 and cmd0.endswith(".py"):
-            entry_file = cmd0
-        else:
-            for frame_info in reversed(inspect.stack()):
-                fp = frame_info.filename
-                if (
-                    fp
-                    and fp.endswith(".py")
-                    and "deepeval/tracing" not in fp
-                    and "site-packages" not in fp
-                ):
-                    entry_file = fp
-                    break
-    except Exception:
-        entry_file = None
-
-    if not entry_file:
-        entry_file = "unknown.py"
-
-    abs_entry = os.path.abspath(entry_file)
-    dir_path = os.path.dirname(abs_entry)
-
-    file_arg = None
-    try:
-        for idx, arg in enumerate(sys.argv):
-            if isinstance(arg, str) and arg.startswith(
-                "--deepeval-trace-file-name="
-            ):
-                file_arg = arg.split("=", 1)[1].strip().strip('"').strip("'")
-                break
-            if arg == "--deepeval-trace-file-name" and idx + 1 < len(sys.argv):
-                file_arg = str(sys.argv[idx + 1]).strip().strip('"').strip("'")
-                break
-    except Exception:
-        file_arg = None
-
-    if file_path:
-        dst_path = os.path.abspath(file_path)
-    elif file_arg:
-        dst_path = os.path.abspath(file_arg)
-    else:
-        base_name = os.path.splitext(os.path.basename(abs_entry))[0]
-        dst_path = os.path.join(dir_path, f"{base_name}.json")
-
-    actual_body = make_json_serializable(body)
-    with open(dst_path, "w", encoding="utf-8") as f:
-        json.dump(actual_body, f, ensure_ascii=False, indent=2, sort_keys=True)
-    return dst_path
+    except:
+        return f"<self>"