deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
|
@@ -254,7 +254,7 @@ class ConfidentSpanExporter(SpanExporter):
|
|
|
254
254
|
|
|
255
255
|
base_span = None
|
|
256
256
|
try:
|
|
257
|
-
base_span = self.
|
|
257
|
+
base_span = self.prepare_boilerplate_base_span(span)
|
|
258
258
|
except Exception:
|
|
259
259
|
pass
|
|
260
260
|
|
|
@@ -453,9 +453,8 @@ class ConfidentSpanExporter(SpanExporter):
|
|
|
453
453
|
if span_output:
|
|
454
454
|
base_span.output = span_output
|
|
455
455
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
) -> Optional[BaseSpan]:
|
|
456
|
+
@staticmethod
|
|
457
|
+
def prepare_boilerplate_base_span(span: ReadableSpan) -> Optional[BaseSpan]:
|
|
459
458
|
|
|
460
459
|
################ Get Span Type ################
|
|
461
460
|
span_type = span.attributes.get("confident.span.type")
|
deepeval/tracing/otel/utils.py
CHANGED
|
@@ -3,7 +3,7 @@ import json
|
|
|
3
3
|
from typing import List, Optional, Tuple, Any
|
|
4
4
|
from opentelemetry.sdk.trace.export import ReadableSpan
|
|
5
5
|
|
|
6
|
-
from deepeval.
|
|
6
|
+
from deepeval.test_case.api import create_api_test_case
|
|
7
7
|
from deepeval.test_run.api import LLMApiTestCase
|
|
8
8
|
from deepeval.test_run.test_run import global_test_run_manager
|
|
9
9
|
from deepeval.tracing.types import Trace, LLMTestCase, ToolCall
|
|
@@ -11,7 +11,7 @@ from deepeval.tracing import trace_manager, BaseSpan
|
|
|
11
11
|
from deepeval.tracing.utils import make_json_serializable
|
|
12
12
|
|
|
13
13
|
|
|
14
|
-
GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "
|
|
14
|
+
GEN_AI_OPERATION_NAMES = ["chat", "generate_content", "text_completion"]
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def to_hex_string(id_value: int | bytes, length: int = 32) -> str:
|
|
@@ -128,6 +128,10 @@ def check_llm_input_from_gen_ai_attributes(
|
|
|
128
128
|
|
|
129
129
|
input = system_instructions + input_messages
|
|
130
130
|
|
|
131
|
+
model_parameters = check_model_parameters(span)
|
|
132
|
+
if model_parameters:
|
|
133
|
+
input.append(model_parameters)
|
|
134
|
+
|
|
131
135
|
except Exception:
|
|
132
136
|
pass
|
|
133
137
|
try:
|
|
@@ -413,7 +417,7 @@ def post_test_run(traces: List[Trace], test_run_id: Optional[str]):
|
|
|
413
417
|
# return test_run_manager.post_test_run(test_run) TODO: add after test run with metric collection is implemented
|
|
414
418
|
|
|
415
419
|
|
|
416
|
-
def
|
|
420
|
+
def normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
|
|
417
421
|
try:
|
|
418
422
|
raw = span.attributes.get("pydantic_ai.all_messages")
|
|
419
423
|
if not raw:
|
|
@@ -438,7 +442,7 @@ def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
|
|
|
438
442
|
except Exception:
|
|
439
443
|
pass
|
|
440
444
|
|
|
441
|
-
return
|
|
445
|
+
return []
|
|
442
446
|
|
|
443
447
|
|
|
444
448
|
def _extract_non_thinking_part_of_last_message(message: dict) -> dict:
|
|
@@ -461,7 +465,7 @@ def check_pydantic_ai_agent_input_output(
|
|
|
461
465
|
output_val: Optional[Any] = None
|
|
462
466
|
|
|
463
467
|
# Get normalized messages once
|
|
464
|
-
normalized =
|
|
468
|
+
normalized = normalize_pydantic_ai_messages(span)
|
|
465
469
|
|
|
466
470
|
# Input (pydantic_ai.all_messages) - slice up to and including the first 'user' message
|
|
467
471
|
if normalized:
|
|
@@ -523,3 +527,18 @@ def check_pydantic_ai_trace_input_output(
|
|
|
523
527
|
input_val, output_val = check_pydantic_ai_agent_input_output(span)
|
|
524
528
|
|
|
525
529
|
return input_val, output_val
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def check_model_parameters(span: ReadableSpan) -> Optional[dict]:
|
|
533
|
+
try:
|
|
534
|
+
raw_model_parameters = span.attributes.get("model_request_parameters")
|
|
535
|
+
if raw_model_parameters and isinstance(raw_model_parameters, str):
|
|
536
|
+
model_parameters = json.loads(raw_model_parameters)
|
|
537
|
+
if isinstance(model_parameters, dict):
|
|
538
|
+
return {
|
|
539
|
+
"role": "Model Request Parameters",
|
|
540
|
+
"content": model_parameters,
|
|
541
|
+
}
|
|
542
|
+
except Exception:
|
|
543
|
+
pass
|
|
544
|
+
return None
|
|
@@ -1,14 +1,98 @@
|
|
|
1
|
-
from
|
|
2
|
-
from
|
|
1
|
+
from typing import Optional, List, Dict, Any
|
|
2
|
+
from contextvars import ContextVar
|
|
3
3
|
from contextlib import contextmanager
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
from .tracing import trace_manager
|
|
7
|
+
from .context import current_trace_context, update_current_trace
|
|
8
|
+
from deepeval.prompt import Prompt
|
|
9
|
+
from deepeval.metrics import BaseMetric
|
|
10
|
+
from deepeval.test_case.llm_test_case import ToolCall
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
|
|
14
|
+
class LlmSpanContext:
|
|
15
|
+
prompt: Optional[Prompt] = None
|
|
16
|
+
metrics: Optional[List[BaseMetric]] = None
|
|
17
|
+
metric_collection: Optional[str] = None
|
|
18
|
+
expected_output: Optional[str] = None
|
|
19
|
+
expected_tools: Optional[List[ToolCall]] = None
|
|
20
|
+
context: Optional[List[str]] = None
|
|
21
|
+
retrieval_context: Optional[List[str]] = None
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
|
|
25
|
+
class AgentSpanContext:
|
|
26
|
+
metrics: Optional[List[BaseMetric]] = None
|
|
27
|
+
metric_collection: Optional[str] = None
|
|
28
|
+
expected_output: Optional[str] = None
|
|
29
|
+
expected_tools: Optional[List[ToolCall]] = None
|
|
30
|
+
context: Optional[List[str]] = None
|
|
31
|
+
retrieval_context: Optional[List[str]] = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
current_llm_context: ContextVar[Optional[LlmSpanContext]] = ContextVar(
|
|
35
|
+
"current_llm_context", default=LlmSpanContext()
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
current_agent_context: ContextVar[Optional[AgentSpanContext]] = ContextVar(
|
|
39
|
+
"current_agent_context", default=AgentSpanContext()
|
|
40
|
+
)
|
|
4
41
|
|
|
5
42
|
|
|
6
43
|
@contextmanager
|
|
7
|
-
def trace(
|
|
44
|
+
def trace(
|
|
45
|
+
llm_span_context: Optional[LlmSpanContext] = None,
|
|
46
|
+
agent_span_context: Optional[AgentSpanContext] = None,
|
|
47
|
+
name: Optional[str] = None,
|
|
48
|
+
tags: Optional[List[str]] = None,
|
|
49
|
+
metadata: Optional[Dict[str, Any]] = None,
|
|
50
|
+
thread_id: Optional[str] = None,
|
|
51
|
+
user_id: Optional[str] = None,
|
|
52
|
+
input: Optional[Any] = None,
|
|
53
|
+
output: Optional[Any] = None,
|
|
54
|
+
retrieval_context: Optional[List[str]] = None,
|
|
55
|
+
context: Optional[List[str]] = None,
|
|
56
|
+
expected_output: Optional[str] = None,
|
|
57
|
+
tools_called: Optional[List[ToolCall]] = None,
|
|
58
|
+
expected_tools: Optional[List[ToolCall]] = None,
|
|
59
|
+
metrics: Optional[List[BaseMetric]] = None,
|
|
60
|
+
metric_collection: Optional[str] = None,
|
|
61
|
+
):
|
|
8
62
|
current_trace = current_trace_context.get()
|
|
9
63
|
|
|
10
64
|
if not current_trace:
|
|
11
65
|
current_trace = trace_manager.start_new_trace()
|
|
12
|
-
current_trace_context.set(current_trace)
|
|
13
66
|
|
|
14
|
-
|
|
67
|
+
if metrics:
|
|
68
|
+
current_trace.metrics = metrics
|
|
69
|
+
|
|
70
|
+
if metric_collection:
|
|
71
|
+
current_trace.metric_collection = metric_collection
|
|
72
|
+
|
|
73
|
+
current_trace_context.set(current_trace)
|
|
74
|
+
|
|
75
|
+
update_current_trace(
|
|
76
|
+
name=name,
|
|
77
|
+
tags=tags,
|
|
78
|
+
metadata=metadata,
|
|
79
|
+
thread_id=thread_id,
|
|
80
|
+
user_id=user_id,
|
|
81
|
+
input=input,
|
|
82
|
+
output=output,
|
|
83
|
+
retrieval_context=retrieval_context,
|
|
84
|
+
context=context,
|
|
85
|
+
expected_output=expected_output,
|
|
86
|
+
tools_called=tools_called,
|
|
87
|
+
expected_tools=expected_tools,
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
if llm_span_context:
|
|
91
|
+
current_llm_context.set(llm_span_context)
|
|
92
|
+
if agent_span_context:
|
|
93
|
+
current_agent_context.set(agent_span_context)
|
|
94
|
+
try:
|
|
95
|
+
yield current_trace
|
|
96
|
+
finally:
|
|
97
|
+
current_llm_context.set(LlmSpanContext())
|
|
98
|
+
current_agent_context.set(AgentSpanContext())
|
deepeval/tracing/tracing.py
CHANGED
|
@@ -1,5 +1,15 @@
|
|
|
1
|
-
import
|
|
2
|
-
from typing import
|
|
1
|
+
import weakref
|
|
2
|
+
from typing import (
|
|
3
|
+
TYPE_CHECKING,
|
|
4
|
+
Any,
|
|
5
|
+
Callable,
|
|
6
|
+
Dict,
|
|
7
|
+
List,
|
|
8
|
+
Literal,
|
|
9
|
+
Optional,
|
|
10
|
+
Set,
|
|
11
|
+
Union,
|
|
12
|
+
)
|
|
3
13
|
from time import perf_counter
|
|
4
14
|
import threading
|
|
5
15
|
import functools
|
|
@@ -20,6 +30,7 @@ from deepeval.constants import (
|
|
|
20
30
|
)
|
|
21
31
|
from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
|
|
22
32
|
from deepeval.metrics import BaseMetric
|
|
33
|
+
from deepeval.test_case.llm_test_case import ToolCall
|
|
23
34
|
from deepeval.tracing.api import (
|
|
24
35
|
BaseApiSpan,
|
|
25
36
|
SpanApiType,
|
|
@@ -41,6 +52,7 @@ from deepeval.tracing.types import (
|
|
|
41
52
|
)
|
|
42
53
|
from deepeval.tracing.utils import (
|
|
43
54
|
Environment,
|
|
55
|
+
prepare_tool_call_input_parameters,
|
|
44
56
|
replace_self_with_class_name,
|
|
45
57
|
make_json_serializable,
|
|
46
58
|
perf_counter_to_datetime,
|
|
@@ -55,6 +67,10 @@ from deepeval.tracing.types import TestCaseMetricPair
|
|
|
55
67
|
from deepeval.tracing.api import PromptApi
|
|
56
68
|
from deepeval.tracing.trace_test_manager import trace_testing_manager
|
|
57
69
|
|
|
70
|
+
|
|
71
|
+
if TYPE_CHECKING:
|
|
72
|
+
from deepeval.dataset.golden import Golden
|
|
73
|
+
|
|
58
74
|
EVAL_DUMMY_SPAN_NAME = "evals_iterator"
|
|
59
75
|
|
|
60
76
|
|
|
@@ -65,6 +81,10 @@ class TraceManager:
|
|
|
65
81
|
self.active_spans: Dict[str, BaseSpan] = (
|
|
66
82
|
{}
|
|
67
83
|
) # Map of span_uuid to BaseSpan
|
|
84
|
+
# Map each trace created during evaluation_loop to the Golden that was active
|
|
85
|
+
# when it was started. This lets us evaluate traces against the correct golden
|
|
86
|
+
# since we cannot rely on positional indexing as the order is not guaranteed.
|
|
87
|
+
self.trace_uuid_to_golden: Dict[str, Golden] = {}
|
|
68
88
|
|
|
69
89
|
settings = get_settings()
|
|
70
90
|
# Initialize queue and worker thread for trace posting
|
|
@@ -73,6 +93,9 @@ class TraceManager:
|
|
|
73
93
|
self._min_interval = 0.2 # Minimum time between API calls (seconds)
|
|
74
94
|
self._last_post_time = 0
|
|
75
95
|
self._in_flight_tasks: Set[asyncio.Task[Any]] = set()
|
|
96
|
+
self.task_bindings: "weakref.WeakKeyDictionary[asyncio.Task, dict]" = (
|
|
97
|
+
weakref.WeakKeyDictionary()
|
|
98
|
+
)
|
|
76
99
|
self._flush_enabled = bool(settings.CONFIDENT_TRACE_FLUSH)
|
|
77
100
|
self._daemon = not self._flush_enabled
|
|
78
101
|
|
|
@@ -86,7 +109,7 @@ class TraceManager:
|
|
|
86
109
|
)
|
|
87
110
|
validate_environment(self.environment)
|
|
88
111
|
|
|
89
|
-
self.sampling_rate = settings.
|
|
112
|
+
self.sampling_rate = settings.CONFIDENT_TRACE_SAMPLE_RATE
|
|
90
113
|
validate_sampling_rate(self.sampling_rate)
|
|
91
114
|
self.openai_client = None
|
|
92
115
|
self.tracing_enabled = True
|
|
@@ -166,6 +189,19 @@ class TraceManager:
|
|
|
166
189
|
self.traces.append(new_trace)
|
|
167
190
|
if self.evaluation_loop:
|
|
168
191
|
self.traces_to_evaluate_order.append(trace_uuid)
|
|
192
|
+
# Associate the current Golden with this trace so we can
|
|
193
|
+
# later evaluate traces against the correct golden, even if more traces
|
|
194
|
+
# are created than goldens or the order interleaves.
|
|
195
|
+
try:
|
|
196
|
+
from deepeval.contextvars import get_current_golden
|
|
197
|
+
|
|
198
|
+
current_golden = get_current_golden()
|
|
199
|
+
if current_golden is not None:
|
|
200
|
+
self.trace_uuid_to_golden[trace_uuid] = current_golden
|
|
201
|
+
except Exception:
|
|
202
|
+
# not much we can do, but if the golden is not there during evaluation
|
|
203
|
+
# we will write out a verbose debug log
|
|
204
|
+
pass
|
|
169
205
|
return new_trace
|
|
170
206
|
|
|
171
207
|
def end_trace(self, trace_uuid: str):
|
|
@@ -820,6 +856,25 @@ class Observer:
|
|
|
820
856
|
self._progress = parent_span.progress
|
|
821
857
|
self._pbar_callback_id = parent_span.pbar_callback_id
|
|
822
858
|
|
|
859
|
+
try:
|
|
860
|
+
import asyncio
|
|
861
|
+
|
|
862
|
+
task = asyncio.current_task()
|
|
863
|
+
except Exception:
|
|
864
|
+
task = None
|
|
865
|
+
|
|
866
|
+
if task is not None:
|
|
867
|
+
binding = trace_manager.task_bindings.get(task) or {}
|
|
868
|
+
# record the trace the task is working on
|
|
869
|
+
binding["trace_uuid"] = span_instance.trace_uuid
|
|
870
|
+
# only set root_span_uuid when this span is a root. Don't do this for child or we will override our record.
|
|
871
|
+
if (
|
|
872
|
+
span_instance.parent_uuid is None
|
|
873
|
+
and "root_span_uuid" not in binding
|
|
874
|
+
):
|
|
875
|
+
binding["root_span_uuid"] = span_instance.uuid
|
|
876
|
+
trace_manager.task_bindings[task] = binding
|
|
877
|
+
|
|
823
878
|
if self._progress is not None and self._pbar_callback_id is not None:
|
|
824
879
|
span_instance.progress = self._progress
|
|
825
880
|
span_instance.pbar_callback_id = self._pbar_callback_id
|
|
@@ -861,6 +916,22 @@ class Observer:
|
|
|
861
916
|
):
|
|
862
917
|
current_span.prompt = self.prompt
|
|
863
918
|
|
|
919
|
+
if not current_span.tools_called:
|
|
920
|
+
# check any tool span children
|
|
921
|
+
for child in current_span.children:
|
|
922
|
+
if isinstance(child, ToolSpan):
|
|
923
|
+
current_span.tools_called = current_span.tools_called or []
|
|
924
|
+
current_span.tools_called.append(
|
|
925
|
+
ToolCall(
|
|
926
|
+
name=child.name,
|
|
927
|
+
description=child.description,
|
|
928
|
+
input_parameters=prepare_tool_call_input_parameters(
|
|
929
|
+
child.input
|
|
930
|
+
),
|
|
931
|
+
output=child.output,
|
|
932
|
+
)
|
|
933
|
+
)
|
|
934
|
+
|
|
864
935
|
trace_manager.remove_span(self.uuid)
|
|
865
936
|
if current_span.parent_uuid:
|
|
866
937
|
parent_span = trace_manager.get_span_by_uuid(
|
deepeval/tracing/types.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from enum import Enum
|
|
2
2
|
from dataclasses import dataclass, field
|
|
3
3
|
from pydantic import BaseModel, Field
|
|
4
|
-
from typing import Any, Dict, List, Optional, Union
|
|
4
|
+
from typing import Any, Dict, List, Optional, Union, Literal
|
|
5
5
|
from rich.progress import Progress
|
|
6
6
|
|
|
7
7
|
from deepeval.prompt.prompt import Prompt
|
|
@@ -10,6 +10,19 @@ from deepeval.test_case import LLMTestCase
|
|
|
10
10
|
from deepeval.metrics import BaseMetric
|
|
11
11
|
|
|
12
12
|
|
|
13
|
+
class Message(BaseModel):
|
|
14
|
+
role: str
|
|
15
|
+
"""To be displayed on the top of the message block."""
|
|
16
|
+
|
|
17
|
+
type: Literal["tool_calls", "tool_output", "thinking", "default"] = (
|
|
18
|
+
"default"
|
|
19
|
+
)
|
|
20
|
+
"""Decides how the content is rendered."""
|
|
21
|
+
|
|
22
|
+
content: Any
|
|
23
|
+
"""The content of the message."""
|
|
24
|
+
|
|
25
|
+
|
|
13
26
|
class TraceWorkerStatus(Enum):
|
|
14
27
|
SUCCESS = "success"
|
|
15
28
|
FAILURE = "failure"
|
|
@@ -44,7 +57,7 @@ class LlmOutput(BaseModel):
|
|
|
44
57
|
class BaseSpan(BaseModel):
|
|
45
58
|
uuid: str
|
|
46
59
|
status: TraceSpanStatus
|
|
47
|
-
children: List["BaseSpan"]
|
|
60
|
+
children: List["BaseSpan"] = Field(default_factory=list)
|
|
48
61
|
trace_uuid: str = Field(serialization_alias="traceUuid")
|
|
49
62
|
parent_uuid: Optional[str] = Field(None, serialization_alias="parentUuid")
|
|
50
63
|
start_time: float = Field(serialization_alias="startTime")
|
|
@@ -88,6 +101,7 @@ class AgentSpan(BaseSpan):
|
|
|
88
101
|
|
|
89
102
|
|
|
90
103
|
class LlmSpan(BaseSpan):
|
|
104
|
+
|
|
91
105
|
model: Optional[str] = None
|
|
92
106
|
prompt: Optional[Prompt] = None
|
|
93
107
|
input_token_count: Optional[float] = Field(
|
|
@@ -106,6 +120,10 @@ class LlmSpan(BaseSpan):
|
|
|
106
120
|
None, serialization_alias="tokenTimes"
|
|
107
121
|
)
|
|
108
122
|
|
|
123
|
+
# input_tools: Optional[List[ToolSchema]] = Field(None, serialization_alias="inputTools")
|
|
124
|
+
# invocation_params: Optional[Dict[str, Any]] = Field(None, serialization_alias="invocationParams")
|
|
125
|
+
# output_metadata: Optional[Dict[str, Any]] = Field(None, serialization_alias="outputMetadata")
|
|
126
|
+
|
|
109
127
|
# for serializing `prompt`
|
|
110
128
|
model_config = {"arbitrary_types_allowed": True}
|
|
111
129
|
|
deepeval/tracing/utils.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import os
|
|
2
|
+
from typing import Dict, Any
|
|
2
3
|
from datetime import datetime, timezone
|
|
3
4
|
from enum import Enum
|
|
4
5
|
from time import perf_counter
|
|
@@ -183,3 +184,10 @@ def replace_self_with_class_name(obj):
|
|
|
183
184
|
return f"<{obj.__class__.__name__}>"
|
|
184
185
|
except:
|
|
185
186
|
return f"<self>"
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def prepare_tool_call_input_parameters(output: Any) -> Dict[str, Any]:
|
|
190
|
+
res = make_json_serializable(output)
|
|
191
|
+
if res and not isinstance(res, dict):
|
|
192
|
+
res = {"output": res}
|
|
193
|
+
return res
|
deepeval/utils.py
CHANGED
|
@@ -10,6 +10,7 @@ import asyncio
|
|
|
10
10
|
import nest_asyncio
|
|
11
11
|
import uuid
|
|
12
12
|
import math
|
|
13
|
+
import logging
|
|
13
14
|
|
|
14
15
|
from contextvars import ContextVar
|
|
15
16
|
from enum import Enum
|
|
@@ -747,3 +748,23 @@ my_theme = Theme(
|
|
|
747
748
|
}
|
|
748
749
|
)
|
|
749
750
|
custom_console = Console(theme=my_theme)
|
|
751
|
+
|
|
752
|
+
|
|
753
|
+
def format_error_text(
|
|
754
|
+
exc: BaseException, *, with_stack: bool | None = None
|
|
755
|
+
) -> str:
|
|
756
|
+
if with_stack is None:
|
|
757
|
+
with_stack = logging.getLogger("deepeval").isEnabledFor(logging.DEBUG)
|
|
758
|
+
|
|
759
|
+
text = f"{type(exc).__name__}: {exc}"
|
|
760
|
+
|
|
761
|
+
if with_stack:
|
|
762
|
+
import traceback
|
|
763
|
+
|
|
764
|
+
text += "\n" + "".join(
|
|
765
|
+
traceback.format_exception(type(exc), exc, exc.__traceback__)
|
|
766
|
+
)
|
|
767
|
+
elif get_settings().DEEPEVAL_VERBOSE_MODE:
|
|
768
|
+
text += " (Run with LOG_LEVEL=DEBUG for stack trace.)"
|
|
769
|
+
|
|
770
|
+
return text
|