deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/errors.py +20 -2
  10. deepeval/evaluate/evaluate.py +5 -1
  11. deepeval/evaluate/execute.py +811 -248
  12. deepeval/evaluate/types.py +1 -0
  13. deepeval/evaluate/utils.py +33 -119
  14. deepeval/integrations/crewai/__init__.py +7 -1
  15. deepeval/integrations/crewai/handler.py +1 -1
  16. deepeval/integrations/crewai/subs.py +51 -0
  17. deepeval/integrations/crewai/tool.py +71 -0
  18. deepeval/integrations/crewai/wrapper.py +45 -5
  19. deepeval/integrations/llama_index/__init__.py +0 -4
  20. deepeval/integrations/llama_index/handler.py +20 -21
  21. deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
  22. deepeval/metrics/__init__.py +13 -0
  23. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  24. deepeval/metrics/api.py +281 -0
  25. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  26. deepeval/metrics/base_metric.py +1 -0
  27. deepeval/metrics/bias/bias.py +12 -3
  28. deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
  29. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  30. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  31. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  32. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  33. deepeval/metrics/conversational_dag/nodes.py +12 -4
  34. deepeval/metrics/conversational_g_eval/__init__.py +3 -0
  35. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
  36. deepeval/metrics/dag/dag.py +12 -0
  37. deepeval/metrics/dag/nodes.py +12 -4
  38. deepeval/metrics/dag/schema.py +1 -1
  39. deepeval/metrics/dag/templates.py +2 -2
  40. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  41. deepeval/metrics/g_eval/g_eval.py +11 -0
  42. deepeval/metrics/goal_accuracy/__init__.py +1 -0
  43. deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
  44. deepeval/metrics/goal_accuracy/schema.py +17 -0
  45. deepeval/metrics/goal_accuracy/template.py +235 -0
  46. deepeval/metrics/hallucination/hallucination.py +20 -9
  47. deepeval/metrics/indicator.py +8 -2
  48. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  49. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  50. deepeval/metrics/mcp/mcp_task_completion.py +20 -2
  51. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
  52. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
  53. deepeval/metrics/misuse/misuse.py +12 -1
  54. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  55. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  56. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  57. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  58. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  59. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
  60. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  61. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  62. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  63. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  64. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  65. deepeval/metrics/non_advice/non_advice.py +12 -0
  66. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  67. deepeval/metrics/plan_adherence/__init__.py +1 -0
  68. deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
  69. deepeval/metrics/plan_adherence/schema.py +11 -0
  70. deepeval/metrics/plan_adherence/template.py +170 -0
  71. deepeval/metrics/plan_quality/__init__.py +1 -0
  72. deepeval/metrics/plan_quality/plan_quality.py +292 -0
  73. deepeval/metrics/plan_quality/schema.py +11 -0
  74. deepeval/metrics/plan_quality/template.py +101 -0
  75. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  76. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  77. deepeval/metrics/role_violation/role_violation.py +12 -0
  78. deepeval/metrics/step_efficiency/__init__.py +1 -0
  79. deepeval/metrics/step_efficiency/schema.py +11 -0
  80. deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
  81. deepeval/metrics/step_efficiency/template.py +256 -0
  82. deepeval/metrics/summarization/summarization.py +12 -1
  83. deepeval/metrics/task_completion/task_completion.py +4 -0
  84. deepeval/metrics/tool_correctness/schema.py +6 -0
  85. deepeval/metrics/tool_correctness/template.py +88 -0
  86. deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
  87. deepeval/metrics/tool_use/__init__.py +1 -0
  88. deepeval/metrics/tool_use/schema.py +19 -0
  89. deepeval/metrics/tool_use/template.py +220 -0
  90. deepeval/metrics/tool_use/tool_use.py +458 -0
  91. deepeval/metrics/topic_adherence/__init__.py +1 -0
  92. deepeval/metrics/topic_adherence/schema.py +16 -0
  93. deepeval/metrics/topic_adherence/template.py +162 -0
  94. deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
  95. deepeval/metrics/toxicity/toxicity.py +12 -0
  96. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  97. deepeval/models/embedding_models/azure_embedding_model.py +37 -36
  98. deepeval/models/embedding_models/local_embedding_model.py +30 -32
  99. deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
  100. deepeval/models/embedding_models/openai_embedding_model.py +22 -31
  101. deepeval/models/llms/grok_model.py +1 -1
  102. deepeval/models/llms/openai_model.py +2 -0
  103. deepeval/openai/__init__.py +14 -32
  104. deepeval/openai/extractors.py +85 -50
  105. deepeval/openai/patch.py +258 -167
  106. deepeval/openai/types.py +20 -0
  107. deepeval/openai/utils.py +205 -56
  108. deepeval/prompt/__init__.py +19 -1
  109. deepeval/prompt/api.py +160 -0
  110. deepeval/prompt/prompt.py +245 -62
  111. deepeval/prompt/utils.py +186 -15
  112. deepeval/synthesizer/chunking/context_generator.py +209 -152
  113. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  114. deepeval/synthesizer/synthesizer.py +19 -15
  115. deepeval/test_case/api.py +131 -0
  116. deepeval/test_case/llm_test_case.py +6 -2
  117. deepeval/test_run/__init__.py +1 -0
  118. deepeval/test_run/hyperparameters.py +47 -8
  119. deepeval/test_run/test_run.py +292 -206
  120. deepeval/tracing/__init__.py +2 -1
  121. deepeval/tracing/api.py +3 -1
  122. deepeval/tracing/otel/exporter.py +3 -4
  123. deepeval/tracing/otel/utils.py +24 -5
  124. deepeval/tracing/trace_context.py +89 -5
  125. deepeval/tracing/tracing.py +74 -3
  126. deepeval/tracing/types.py +20 -2
  127. deepeval/tracing/utils.py +8 -0
  128. deepeval/utils.py +21 -0
  129. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
  130. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
  131. deepeval/integrations/llama_index/agent/patched.py +0 -68
  132. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
  133. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
  134. {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
@@ -254,7 +254,7 @@ class ConfidentSpanExporter(SpanExporter):
254
254
 
255
255
  base_span = None
256
256
  try:
257
- base_span = self.__prepare_boilerplate_base_span(span)
257
+ base_span = self.prepare_boilerplate_base_span(span)
258
258
  except Exception:
259
259
  pass
260
260
 
@@ -453,9 +453,8 @@ class ConfidentSpanExporter(SpanExporter):
453
453
  if span_output:
454
454
  base_span.output = span_output
455
455
 
456
- def __prepare_boilerplate_base_span(
457
- self, span: ReadableSpan
458
- ) -> Optional[BaseSpan]:
456
+ @staticmethod
457
+ def prepare_boilerplate_base_span(span: ReadableSpan) -> Optional[BaseSpan]:
459
458
 
460
459
  ################ Get Span Type ################
461
460
  span_type = span.attributes.get("confident.span.type")
@@ -3,7 +3,7 @@ import json
3
3
  from typing import List, Optional, Tuple, Any
4
4
  from opentelemetry.sdk.trace.export import ReadableSpan
5
5
 
6
- from deepeval.evaluate.utils import create_api_test_case
6
+ from deepeval.test_case.api import create_api_test_case
7
7
  from deepeval.test_run.api import LLMApiTestCase
8
8
  from deepeval.test_run.test_run import global_test_run_manager
9
9
  from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
@@ -11,7 +11,7 @@ from deepeval.tracing import trace_manager, BaseSpan
11
11
  from deepeval.tracing.utils import make_json_serializable
12
12
 
13
13
 
14
- GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "task_completion"]
14
+ GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "text_completion"]
15
15
 
16
16
 
17
17
  def to_hex_string(id_value: int | bytes, length: int = 32) -> str:
@@ -128,6 +128,10 @@ def check_llm_input_from_gen_ai_attributes(
128
128
 
129
129
  input = system_instructions + input_messages
130
130
 
131
+ model_parameters = check_model_parameters(span)
132
+ if model_parameters:
133
+ input.append(model_parameters)
134
+
131
135
  except Exception:
132
136
  pass
133
137
  try:
@@ -413,7 +417,7 @@ def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
413
417
  # return test_run_manager.post_test_run(test_run) TODO: add after test run with metric collection is implemented
414
418
 
415
419
 
416
- def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
420
+ def normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
417
421
  try:
418
422
  raw = span.attributes.get("pydantic_ai.all_messages")
419
423
  if not raw:
@@ -438,7 +442,7 @@ def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
438
442
  except Exception:
439
443
  pass
440
444
 
441
- return None
445
+ return []
442
446
 
443
447
 
444
448
  def _extract_non_thinking_part_of_last_message(message: dict) -> dict:
@@ -461,7 +465,7 @@ def check_pydantic_ai_agent_input_output(
461
465
  output_val: Optional[Any] = None
462
466
 
463
467
  # Get normalized messages once
464
- normalized = _normalize_pydantic_ai_messages(span)
468
+ normalized = normalize_pydantic_ai_messages(span)
465
469
 
466
470
  # Input (pydantic_ai.all_messages) - slice up to and including the first 'user' message
467
471
  if normalized:
@@ -523,3 +527,18 @@ def check_pydantic_ai_trace_input_output(
523
527
  input_val, output_val = check_pydantic_ai_agent_input_output(span)
524
528
 
525
529
  return input_val, output_val
530
+
531
+
532
+ def check_model_parameters(span: ReadableSpan) -> Optional[dict]:
533
+ try:
534
+ raw_model_parameters = span.attributes.get("model_request_parameters")
535
+ if raw_model_parameters and isinstance(raw_model_parameters, str):
536
+ model_parameters = json.loads(raw_model_parameters)
537
+ if isinstance(model_parameters, dict):
538
+ return {
539
+ "role": "Model Request Parameters",
540
+ "content": model_parameters,
541
+ }
542
+ except Exception:
543
+ pass
544
+ return None
@@ -1,14 +1,98 @@
1
- from .context import current_trace_context
2
- from .tracing import trace_manager
1
+ from typing import Optional, List, Dict, Any
2
+ from contextvars import ContextVar
3
3
  from contextlib import contextmanager
4
+ from dataclasses import dataclass
5
+
6
+ from .tracing import trace_manager
7
+ from .context import current_trace_context, update_current_trace
8
+ from deepeval.prompt import Prompt
9
+ from deepeval.metrics import BaseMetric
10
+ from deepeval.test_case.llm_test_case import ToolCall
11
+
12
+
13
+ @dataclass
14
+ class LlmSpanContext:
15
+ prompt: Optional[Prompt] = None
16
+ metrics: Optional[List[BaseMetric]] = None
17
+ metric_collection: Optional[str] = None
18
+ expected_output: Optional[str] = None
19
+ expected_tools: Optional[List[ToolCall]] = None
20
+ context: Optional[List[str]] = None
21
+ retrieval_context: Optional[List[str]] = None
22
+
23
+
24
+ @dataclass
25
+ class AgentSpanContext:
26
+ metrics: Optional[List[BaseMetric]] = None
27
+ metric_collection: Optional[str] = None
28
+ expected_output: Optional[str] = None
29
+ expected_tools: Optional[List[ToolCall]] = None
30
+ context: Optional[List[str]] = None
31
+ retrieval_context: Optional[List[str]] = None
32
+
33
+
34
+ current_llm_context: ContextVar[Optional[LlmSpanContext]] = ContextVar(
35
+ "current_llm_context", default=LlmSpanContext()
36
+ )
37
+
38
+ current_agent_context: ContextVar[Optional[AgentSpanContext]] = ContextVar(
39
+ "current_agent_context", default=AgentSpanContext()
40
+ )
4
41
 
5
42
 
6
43
  @contextmanager
7
- def trace():
44
+ def trace(
45
+ llm_span_context: Optional[LlmSpanContext] = None,
46
+ agent_span_context: Optional[AgentSpanContext] = None,
47
+ name: Optional[str] = None,
48
+ tags: Optional[List[str]] = None,
49
+ metadata: Optional[Dict[str, Any]] = None,
50
+ thread_id: Optional[str] = None,
51
+ user_id: Optional[str] = None,
52
+ input: Optional[Any] = None,
53
+ output: Optional[Any] = None,
54
+ retrieval_context: Optional[List[str]] = None,
55
+ context: Optional[List[str]] = None,
56
+ expected_output: Optional[str] = None,
57
+ tools_called: Optional[List[ToolCall]] = None,
58
+ expected_tools: Optional[List[ToolCall]] = None,
59
+ metrics: Optional[List[BaseMetric]] = None,
60
+ metric_collection: Optional[str] = None,
61
+ ):
8
62
  current_trace = current_trace_context.get()
9
63
 
10
64
  if not current_trace:
11
65
  current_trace = trace_manager.start_new_trace()
12
- current_trace_context.set(current_trace)
13
66
 
14
- yield current_trace
67
+ if metrics:
68
+ current_trace.metrics = metrics
69
+
70
+ if metric_collection:
71
+ current_trace.metric_collection = metric_collection
72
+
73
+ current_trace_context.set(current_trace)
74
+
75
+ update_current_trace(
76
+ name=name,
77
+ tags=tags,
78
+ metadata=metadata,
79
+ thread_id=thread_id,
80
+ user_id=user_id,
81
+ input=input,
82
+ output=output,
83
+ retrieval_context=retrieval_context,
84
+ context=context,
85
+ expected_output=expected_output,
86
+ tools_called=tools_called,
87
+ expected_tools=expected_tools,
88
+ )
89
+
90
+ if llm_span_context:
91
+ current_llm_context.set(llm_span_context)
92
+ if agent_span_context:
93
+ current_agent_context.set(agent_span_context)
94
+ try:
95
+ yield current_trace
96
+ finally:
97
+ current_llm_context.set(LlmSpanContext())
98
+ current_agent_context.set(AgentSpanContext())
@@ -1,5 +1,15 @@
1
- import os
2
- from typing import Any, Dict, List, Literal, Optional, Set, Union, Callable
1
+ import weakref
2
+ from typing import (
3
+ TYPE_CHECKING,
4
+ Any,
5
+ Callable,
6
+ Dict,
7
+ List,
8
+ Literal,
9
+ Optional,
10
+ Set,
11
+ Union,
12
+ )
3
13
  from time import perf_counter
4
14
  import threading
5
15
  import functools
@@ -20,6 +30,7 @@ from deepeval.constants import (
20
30
  )
21
31
  from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
22
32
  from deepeval.metrics import BaseMetric
33
+ from deepeval.test_case.llm_test_case import ToolCall
23
34
  from deepeval.tracing.api import (
24
35
  BaseApiSpan,
25
36
  SpanApiType,
@@ -41,6 +52,7 @@ from deepeval.tracing.types import (
41
52
  )
42
53
  from deepeval.tracing.utils import (
43
54
  Environment,
55
+ prepare_tool_call_input_parameters,
44
56
  replace_self_with_class_name,
45
57
  make_json_serializable,
46
58
  perf_counter_to_datetime,
@@ -55,6 +67,10 @@ from deepeval.tracing.types import TestCaseMetricPair
55
67
  from deepeval.tracing.api import PromptApi
56
68
  from deepeval.tracing.trace_test_manager import trace_testing_manager
57
69
 
70
+
71
+ if TYPE_CHECKING:
72
+ from deepeval.dataset.golden import Golden
73
+
58
74
  EVAL_DUMMY_SPAN_NAME = "evals_iterator"
59
75
 
60
76
 
@@ -65,6 +81,10 @@ class TraceManager:
65
81
  self.active_spans: Dict[str, BaseSpan] = (
66
82
  {}
67
83
  ) # Map of span_uuid to BaseSpan
84
+ # Map each trace created during evaluation_loop to the Golden that was active
85
+ # when it was started. This lets us evaluate traces against the correct golden
86
+ # since we cannot rely on positional indexing as the order is not guaranteed.
87
+ self.trace_uuid_to_golden: Dict[str, Golden] = {}
68
88
 
69
89
  settings = get_settings()
70
90
  # Initialize queue and worker thread for trace posting
@@ -73,6 +93,9 @@ class TraceManager:
73
93
  self._min_interval = 0.2 # Minimum time between API calls (seconds)
74
94
  self._last_post_time = 0
75
95
  self._in_flight_tasks: Set[asyncio.Task[Any]] = set()
96
+ self.task_bindings: "weakref.WeakKeyDictionary[asyncio.Task, dict]" = (
97
+ weakref.WeakKeyDictionary()
98
+ )
76
99
  self._flush_enabled = bool(settings.CONFIDENT_TRACE_FLUSH)
77
100
  self._daemon = not self._flush_enabled
78
101
 
@@ -86,7 +109,7 @@ class TraceManager:
86
109
  )
87
110
  validate_environment(self.environment)
88
111
 
89
- self.sampling_rate = settings.CONFIDENT_SAMPLE_RATE
112
+ self.sampling_rate = settings.CONFIDENT_TRACE_SAMPLE_RATE
90
113
  validate_sampling_rate(self.sampling_rate)
91
114
  self.openai_client = None
92
115
  self.tracing_enabled = True
@@ -166,6 +189,19 @@ class TraceManager:
166
189
  self.traces.append(new_trace)
167
190
  if self.evaluation_loop:
168
191
  self.traces_to_evaluate_order.append(trace_uuid)
192
+ # Associate the current Golden with this trace so we can
193
+ # later evaluate traces against the correct golden, even if more traces
194
+ # are created than goldens or the order interleaves.
195
+ try:
196
+ from deepeval.contextvars import get_current_golden
197
+
198
+ current_golden = get_current_golden()
199
+ if current_golden is not None:
200
+ self.trace_uuid_to_golden[trace_uuid] = current_golden
201
+ except Exception:
202
+ # not much we can do, but if the golden is not there during evaluation
203
+ # we will write out a verbose debug log
204
+ pass
169
205
  return new_trace
170
206
 
171
207
  def end_trace(self, trace_uuid: str):
@@ -820,6 +856,25 @@ class Observer:
820
856
  self._progress = parent_span.progress
821
857
  self._pbar_callback_id = parent_span.pbar_callback_id
822
858
 
859
+ try:
860
+ import asyncio
861
+
862
+ task = asyncio.current_task()
863
+ except Exception:
864
+ task = None
865
+
866
+ if task is not None:
867
+ binding = trace_manager.task_bindings.get(task) or {}
868
+ # record the trace the task is working on
869
+ binding["trace_uuid"] = span_instance.trace_uuid
870
+ # only set root_span_uuid when this span is a root. Don't do this for child or we will override our record.
871
+ if (
872
+ span_instance.parent_uuid is None
873
+ and "root_span_uuid" not in binding
874
+ ):
875
+ binding["root_span_uuid"] = span_instance.uuid
876
+ trace_manager.task_bindings[task] = binding
877
+
823
878
  if self._progress is not None and self._pbar_callback_id is not None:
824
879
  span_instance.progress = self._progress
825
880
  span_instance.pbar_callback_id = self._pbar_callback_id
@@ -861,6 +916,22 @@ class Observer:
861
916
  ):
862
917
  current_span.prompt = self.prompt
863
918
 
919
+ if not current_span.tools_called:
920
+ # check any tool span children
921
+ for child in current_span.children:
922
+ if isinstance(child, ToolSpan):
923
+ current_span.tools_called = current_span.tools_called or []
924
+ current_span.tools_called.append(
925
+ ToolCall(
926
+ name=child.name,
927
+ description=child.description,
928
+ input_parameters=prepare_tool_call_input_parameters(
929
+ child.input
930
+ ),
931
+ output=child.output,
932
+ )
933
+ )
934
+
864
935
  trace_manager.remove_span(self.uuid)
865
936
  if current_span.parent_uuid:
866
937
  parent_span = trace_manager.get_span_by_uuid(
deepeval/tracing/types.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from enum import Enum
2
2
  from dataclasses import dataclass, field
3
3
  from pydantic import BaseModel, Field
4
- from typing import Any, Dict, List, Optional, Union
4
+ from typing import Any, Dict, List, Optional, Union, Literal
5
5
  from rich.progress import Progress
6
6
 
7
7
  from deepeval.prompt.prompt import Prompt
@@ -10,6 +10,19 @@ from deepeval.test_case import LLMTestCase
10
10
  from deepeval.metrics import BaseMetric
11
11
 
12
12
 
13
+ class Message(BaseModel):
14
+ role: str
15
+ """To be displayed on the top of the message block."""
16
+
17
+ type: Literal["tool_calls", "tool_output", "thinking", "default"] = (
18
+ "default"
19
+ )
20
+ """Decides how the content is rendered."""
21
+
22
+ content: Any
23
+ """The content of the message."""
24
+
25
+
13
26
  class TraceWorkerStatus(Enum):
14
27
  SUCCESS = "success"
15
28
  FAILURE = "failure"
@@ -44,7 +57,7 @@ class LlmOutput(BaseModel):
44
57
  class BaseSpan(BaseModel):
45
58
  uuid: str
46
59
  status: TraceSpanStatus
47
- children: List["BaseSpan"]
60
+ children: List["BaseSpan"] = Field(default_factory=list)
48
61
  trace_uuid: str = Field(serialization_alias="traceUuid")
49
62
  parent_uuid: Optional[str] = Field(None, serialization_alias="parentUuid")
50
63
  start_time: float = Field(serialization_alias="startTime")
@@ -88,6 +101,7 @@ class AgentSpan(BaseSpan):
88
101
 
89
102
 
90
103
  class LlmSpan(BaseSpan):
104
+
91
105
  model: Optional[str] = None
92
106
  prompt: Optional[Prompt] = None
93
107
  input_token_count: Optional[float] = Field(
@@ -106,6 +120,10 @@ class LlmSpan(BaseSpan):
106
120
  None, serialization_alias="tokenTimes"
107
121
  )
108
122
 
123
+ # input_tools: Optional[List[ToolSchema]] = Field(None, serialization_alias="inputTools")
124
+ # invocation_params: Optional[Dict[str, Any]] = Field(None, serialization_alias="invocationParams")
125
+ # output_metadata: Optional[Dict[str, Any]] = Field(None, serialization_alias="outputMetadata")
126
+
109
127
  # for serializing `prompt`
110
128
  model_config = {"arbitrary_types_allowed": True}
111
129
 
deepeval/tracing/utils.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import os
2
+ from typing import Dict, Any
2
3
  from datetime import datetime, timezone
3
4
  from enum import Enum
4
5
  from time import perf_counter
@@ -183,3 +184,10 @@ def replace_self_with_class_name(obj):
183
184
  return f"<{obj.__class__.__name__}>"
184
185
  except:
185
186
  return f"<self>"
187
+
188
+
189
+ def prepare_tool_call_input_parameters(output: Any) -> Dict[str, Any]:
190
+ res = make_json_serializable(output)
191
+ if res and not isinstance(res, dict):
192
+ res = {"output": res}
193
+ return res
deepeval/utils.py CHANGED
@@ -10,6 +10,7 @@ import asyncio
10
10
  import nest_asyncio
11
11
  import uuid
12
12
  import math
13
+ import logging
13
14
 
14
15
  from contextvars import ContextVar
15
16
  from enum import Enum
@@ -747,3 +748,23 @@ my_theme = Theme(
747
748
  }
748
749
  )
749
750
  custom_console = Console(theme=my_theme)
751
+
752
+
753
+ def format_error_text(
754
+ exc: BaseException, *, with_stack: bool | None = None
755
+ ) -> str:
756
+ if with_stack is None:
757
+ with_stack = logging.getLogger("deepeval").isEnabledFor(logging.DEBUG)
758
+
759
+ text = f"{type(exc).__name__}: {exc}"
760
+
761
+ if with_stack:
762
+ import traceback
763
+
764
+ text += "\n" + "".join(
765
+ traceback.format_exception(type(exc), exc, exc.__traceback__)
766
+ )
767
+ elif get_settings().DEEPEVAL_VERBOSE_MODE:
768
+ text += " (Run with LOG_LEVEL=DEBUG for stack trace.)"
769
+
770
+ return text
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepeval
3
- Version: 3.6.6
3
+ Version: 3.6.8
4
4
  Summary: The LLM Evaluation Framework
5
5
  Home-page: https://github.com/confident-ai/deepeval
6
6
  License: Apache-2.0