deepeval 3.6.5__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +176 -16
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +118 -60
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +37 -15
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +53 -24
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/models/retry_policy.py +202 -11
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/utils.py +58 -8
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +52 -4
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/RECORD +97 -87
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.5.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/test_case/api.py ADDED

@@ -0,0 +1,131 @@
+from typing import Union, Optional
+import os
+
+from deepeval.test_run.api import (
+    LLMApiTestCase,
+    ConversationalApiTestCase,
+    TurnApi,
+    TraceApi,
+)
+from deepeval.test_case import (
+    LLMTestCase,
+    ConversationalTestCase,
+    MLLMTestCase,
+    Turn,
+)
+from deepeval.constants import PYTEST_RUN_TEST_NAME
+
+
+def create_api_turn(turn: Turn, index: int) -> TurnApi:
+    return TurnApi(
+        role=turn.role,
+        content=turn.content,
+        user_id=turn.user_id,
+        retrievalContext=turn.retrieval_context,
+        toolsCalled=turn.tools_called,
+        additionalMetadata=turn.additional_metadata,
+        order=index,
+    )
+
+
+def create_api_test_case(
+    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+    trace: Optional[TraceApi] = None,
+    index: Optional[int] = None,
+) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
+    if isinstance(test_case, ConversationalTestCase):
+        order = (
+            test_case._dataset_rank
+            if test_case._dataset_rank is not None
+            else index
+        )
+        if test_case.name:
+            name = test_case.name
+        else:
+            name = os.getenv(
+                PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
+            )
+
+        api_test_case = ConversationalApiTestCase(
+            name=name,
+            success=True,
+            metricsData=[],
+            runDuration=0,
+            evaluationCost=None,
+            order=order,
+            scenario=test_case.scenario,
+            expectedOutcome=test_case.expected_outcome,
+            userDescription=test_case.user_description,
+            context=test_case.context,
+            tags=test_case.tags,
+            comments=test_case.comments,
+            additionalMetadata=test_case.additional_metadata,
+        )
+        api_test_case.turns = [
+            create_api_turn(
+                turn=turn,
+                index=index,
+            )
+            for index, turn in enumerate(test_case.turns)
+        ]
+
+        return api_test_case
+    else:
+        order = (
+            test_case._dataset_rank
+            if test_case._dataset_rank is not None
+            else index
+        )
+
+        success = True
+        if test_case.name is not None:
+            name = test_case.name
+        else:
+            name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
+        metrics_data = []
+
+        if isinstance(test_case, LLMTestCase):
+            api_test_case = LLMApiTestCase(
+                name=name,
+                input=test_case.input,
+                actualOutput=test_case.actual_output,
+                expectedOutput=test_case.expected_output,
+                context=test_case.context,
+                retrievalContext=test_case.retrieval_context,
+                toolsCalled=test_case.tools_called,
+                expectedTools=test_case.expected_tools,
+                tokenCost=test_case.token_cost,
+                completionTime=test_case.completion_time,
+                tags=test_case.tags,
+                success=success,
+                metricsData=metrics_data,
+                runDuration=None,
+                evaluationCost=None,
+                order=order,
+                additionalMetadata=test_case.additional_metadata,
+                comments=test_case.comments,
+                trace=trace,
+            )
+        elif isinstance(test_case, MLLMTestCase):
+            api_test_case = LLMApiTestCase(
+                name=name,
+                input="",
+                multimodalInput=test_case.input,
+                multimodalActualOutput=test_case.actual_output,
+                multimodalExpectedOutput=test_case.expected_output,
+                multimodalRetrievalContext=test_case.retrieval_context,
+                multimodalContext=test_case.context,
+                toolsCalled=test_case.tools_called,
+                expectedTools=test_case.expected_tools,
+                tokenCost=test_case.token_cost,
+                completionTime=test_case.completion_time,
+                success=success,
+                metricsData=metrics_data,
+                runDuration=None,
+                evaluationCost=None,
+                order=order,
+                additionalMetadata=test_case.additional_metadata,
+                comments=test_case.comments,
+            )
+        # llm_test_case_lookup_map[instance_id] = api_test_case
+        return api_test_case
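For context, a minimal usage sketch of the new helper above (not part of the diff; the test case values are made up, and the import path matches the one used later in deepeval/tracing/otel/utils.py):

from deepeval.test_case import LLMTestCase
from deepeval.test_case.api import create_api_test_case

# Convert a plain LLMTestCase into the API payload model built above.
tc = LLMTestCase(
    input="What does DeepEval do?",
    actual_output="It is an open-source framework for evaluating LLM apps.",
)
api_tc = create_api_test_case(tc, trace=None, index=0)
print(api_tc.name)  # falls back to "test_case_0" unless PYTEST_RUN_TEST_NAME is set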
deepeval/test_run/__init__.py CHANGED

deepeval/test_run/hyperparameters.py CHANGED

@@ -1,13 +1,15 @@
-from typing import Union, Dict
-
+from typing import Union, Dict, Optional, List
 from deepeval.test_run import global_test_run_manager
 from deepeval.prompt import Prompt
 from deepeval.prompt.api import PromptApi
 from deepeval.test_run.test_run import TEMP_FILE_PATH
+from deepeval.confident.api import is_confident
+from deepeval.test_run.test_run import PromptData


 def process_hyperparameters(
-    hyperparameters,
+    hyperparameters: Optional[Dict] = None,
+    verbose: bool = True,
 ) -> Union[Dict[str, Union[str, int, float, PromptApi]], None]:
     if hyperparameters is None:
         return None

@@ -16,6 +18,7 @@ def process_hyperparameters(
         raise TypeError("Hyperparameters must be a dictionary or None")

     processed_hyperparameters = {}
+    prompts_version_id_map = {}

     for key, value in hyperparameters.items():
         if not isinstance(key, str):

@@ -30,14 +33,21 @@ def process_hyperparameters(
             )

         if isinstance(value, Prompt):
-
+            prompt_key = f"{value.alias}_{value.version}"
+            if value._prompt_version_id is not None and value.type is not None:
                 processed_hyperparameters[key] = PromptApi(
                     id=value._prompt_version_id,
-                    type=value.
+                    type=value.type,
                 )
-
-
-
+            elif is_confident():
+                if prompt_key not in prompts_version_id_map:
+                    value.push(_verbose=verbose)
+                    prompts_version_id_map[prompt_key] = (
+                        value._prompt_version_id
+                    )
+                processed_hyperparameters[key] = PromptApi(
+                    id=prompts_version_id_map[prompt_key],
+                    type=value.type,
                 )
         else:
             processed_hyperparameters[key] = str(value)

@@ -64,3 +74,32 @@ def log_hyperparameters(func):

     # Return the wrapper function to be used as the decorator
     return wrapper
+
+
+def process_prompts(
+    hyperparameters: Dict[str, Union[str, int, float, Prompt]],
+) -> List[PromptData]:
+    prompts = []
+    if not hyperparameters:
+        return prompts
+    seen_prompts = set()
+    prompt_objects = [
+        value for value in hyperparameters.values() if isinstance(value, Prompt)
+    ]
+    for prompt in prompt_objects:
+        prompt_version = prompt.version if is_confident() else None
+        prompt_key = f"{prompt.alias}_{prompt_version}"
+        if prompt_key in seen_prompts:
+            continue
+        seen_prompts.add(prompt_key)
+        prompt_data = PromptData(
+            alias=prompt.alias,
+            version=prompt_version,
+            text_template=prompt.text_template,
+            messages_template=prompt.messages_template,
+            model_settings=prompt.model_settings,
+            output_type=prompt.output_type,
+            interpolation_type=prompt.interpolation_type,
+        )
+        prompts.append(prompt_data)
+    return prompts
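A short illustrative sketch of the two helpers above (assuming they live in deepeval.test_run.hyperparameters, as the file list suggests; the hyperparameter values are made up):

from deepeval.test_run.hyperparameters import (
    process_hyperparameters,
    process_prompts,
)

hp = {"model": "gpt-4o", "temperature": 0.7}
# Non-Prompt values are coerced to strings; Prompt values become PromptApi
# entries (and are pushed to Confident AI first when logged in).
print(process_hyperparameters(hp, verbose=False))
# Only Prompt instances are collected into PromptData; plain values are skipped.
print(process_prompts(hp))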
deepeval/test_run/test_run.py CHANGED

@@ -32,6 +32,17 @@ from deepeval.utils import (
 )
 from deepeval.test_run.cache import global_test_run_cache_manager
 from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
+from deepeval.prompt import (
+    PromptMessage,
+    ModelSettings,
+    OutputType,
+    PromptInterpolationType,
+    OutputType,
+)
+from rich.panel import Panel
+from rich.text import Text
+from rich.columns import Columns
+

 TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
 LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"

@@ -71,6 +82,16 @@ class TraceMetricScores(BaseModel):
     base: Dict[str, Dict[str, MetricScores]] = Field(default_factory=dict)


+class PromptData(BaseModel):
+    alias: Optional[str] = None
+    version: Optional[str] = None
+    text_template: Optional[str] = None
+    messages_template: Optional[List[PromptMessage]] = None
+    model_settings: Optional[ModelSettings] = None
+    output_type: Optional[OutputType] = None
+    interpolation_type: Optional[PromptInterpolationType] = None
+
+
 class MetricsAverageDict:
     def __init__(self):
         self.metric_dict = {}

@@ -123,6 +144,7 @@ class TestRun(BaseModel):
     )
     identifier: Optional[str] = None
     hyperparameters: Optional[Dict[str, Any]] = Field(None)
+    prompts: Optional[List[PromptData]] = Field(None)
     test_passed: Optional[int] = Field(None, alias="testPassed")
     test_failed: Optional[int] = Field(None, alias="testFailed")
     run_duration: float = Field(0.0, alias="runDuration")

@@ -799,6 +821,7 @@ class TestRunManager:
         test_run.test_cases = initial_batch

         try:
+            test_run.prompts = None
             body = test_run.model_dump(by_alias=True, exclude_none=True)
         except AttributeError:
             # Pydantic version below 2.0

@@ -953,6 +976,23 @@ class TestRunManager:
         if display_table:
             self.display_results_table(test_run, display)

+        if test_run.hyperparameters is None:
+            console.print(
+                "\n[bold yellow]⚠ WARNING:[/bold yellow] No hyperparameters logged.\n"
+                "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log hyperparameters[/link][/bold blue] to attribute prompts and models to your test runs.\n\n"
+                + "=" * 80
+            )
+        else:
+            if not test_run.prompts:
+                console.print(
+                    "\n[bold yellow]⚠ WARNING:[/bold yellow] No prompts logged.\n"
+                    "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log prompts[/link][/bold blue] to evaluate and optimize your prompt templates and models.\n\n"
+                    + "=" * 80
+                )
+            else:
+                console.print("\n[bold green]✓ Prompts Logged[/bold green]\n")
+                self._render_prompts_panels(prompts=test_run.prompts)
+
         self.save_test_run_locally()
         delete_file_if_exists(self.temp_file_path)
         if is_confident() and self.disable_request is False:

@@ -967,7 +1007,7 @@ class TestRunManager:
            f"» Test Results ({test_run.test_passed + test_run.test_failed} total tests):\n",
            f"  » Pass Rate: {round((test_run.test_passed / (test_run.test_passed + test_run.test_failed)) * 100, 2)}% | Passed: [bold green]{test_run.test_passed}[/bold green] | Failed: [bold red]{test_run.test_failed}[/bold red]\n\n",
            "=" * 80,
-            "\n\n»
+            "\n\n» Want to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
            "  » Run [bold]'deepeval view'[/bold] to analyze and save testing results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n",
        )

@@ -993,5 +1033,68 @@ class TestRunManager:
             pass
         return None

+    def _render_prompts_panels(self, prompts: List[PromptData]) -> None:
+
+        def format_string(
+            v, default="[dim]None[/dim]", color: Optional[str] = None
+        ):
+            formatted_string = str(v) if v not in (None, "", []) else default
+            return (
+                f"{formatted_string}"
+                if color is None or v in (None, "", [])
+                else f"[{color}]{formatted_string}[/]"
+            )
+
+        panels = []
+        for prompt in prompts:
+            lines = []
+            p_type = (
+                "messages"
+                if prompt.messages_template
+                else ("text" if prompt.text_template else "—")
+            )
+            if p_type:
+                lines.append(f"type: {format_string(p_type, color='blue')}")
+            if prompt.output_type:
+                lines.append(
+                    f"output_type: {format_string(prompt.output_type, color='blue')}"
+                )
+            if prompt.interpolation_type:
+                lines.append(
+                    f"interpolation_type: {format_string(prompt.interpolation_type, color='blue')}"
+                )
+            if prompt.model_settings:
+                ms = prompt.model_settings
+                settings_lines = [
+                    "Model Settings:",
+                    f"  – provider: {format_string(ms.provider, color='green')}",
+                    f"  – name: {format_string(ms.name, color='green')}",
+                    f"  – temperature: {format_string(ms.temperature, color='green')}",
+                    f"  – max_tokens: {format_string(ms.max_tokens, color='green')}",
+                    f"  – top_p: {format_string(ms.top_p, color='green')}",
+                    f"  – frequency_penalty: {format_string(ms.frequency_penalty, color='green')}",
+                    f"  – presence_penalty: {format_string(ms.presence_penalty, color='green')}",
+                    f"  – stop_sequence: {format_string(ms.stop_sequence, color='green')}",
+                    f"  – reasoning_effort: {format_string(ms.reasoning_effort, color='green')}",
+                    f"  – verbosity: {format_string(ms.verbosity, color='green')}",
+                ]
+                lines.append("")
+                lines.extend(settings_lines)
+            title = f"{format_string(prompt.alias)}"
+            if prompt.version:
+                title += f" (v{prompt.version})"
+            body = "\n".join(lines)
+            panel = Panel(
+                body,
+                title=title,
+                title_align="left",
+                expand=False,
+                padding=(1, 6, 1, 2),
+            )
+            panels.append(panel)
+
+        if panels:
+            console.print(Columns(panels, equal=False, expand=False))
+

 global_test_run_manager = TestRunManager()
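As an aside, a tiny sketch of the new PromptData model defined above (illustrative values only):

from deepeval.test_run.test_run import PromptData

pd = PromptData(
    alias="summarizer-prompt",
    version="00.00.01",  # illustrative version string
    text_template="Summarize: {input}",
)
print(pd)
# When a test run has prompts logged, _render_prompts_panels prints one
# rich Panel per PromptData entry like this one.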
deepeval/tracing/api.py CHANGED

@@ -1,6 +1,6 @@
 from enum import Enum
 from typing import Dict, List, Optional, Union, Literal, Any
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

 from deepeval.test_case import ToolCall


@@ -27,6 +27,8 @@ class PromptApi(BaseModel):


 class MetricData(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+
     name: str
     threshold: float
     success: bool
deepeval/tracing/message_types/messages.py ADDED

@@ -0,0 +1,14 @@
+from typing import Literal, Dict, Any
+from .base import BaseMessage
+
+
+class TextMessage(BaseMessage):
+    type: Literal["text", "thinking"]
+    content: str
+
+
+class ToolCallMessage(BaseMessage):
+    """This is a message for tool calls in response.choices[0].message.tool_calls"""
+
+    name: str
+    args: Dict[str, Any]
deepeval/tracing/message_types/tools.py ADDED

@@ -0,0 +1,18 @@
+from typing import Any, Optional, Dict
+from pydantic import BaseModel
+
+
+class BaseTool(BaseModel):
+    name: str
+    description: Optional[str] = None
+
+
+class ToolSchema(BaseTool):
+    parameters: Dict[str, Any]
+    is_called: Optional[bool] = False
+
+
+class ToolOutput(BaseTool):
+    """Output of the tool function"""
+
+    output: Any
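A minimal construction sketch for the new tool models (module path inferred from the file list; example values are made up):

from deepeval.tracing.message_types.tools import ToolSchema, ToolOutput

schema = ToolSchema(
    name="get_weather",
    description="Look up the current weather for a city",
    parameters={"type": "object", "properties": {"city": {"type": "string"}}},
)
result = ToolOutput(name="get_weather", output={"city": "Berlin", "temp_c": 21})
print(schema.is_called, result.output)  # False {'city': 'Berlin', 'temp_c': 21}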
deepeval/tracing/otel/exporter.py CHANGED

@@ -90,12 +90,6 @@ class ConfidentSpanExporter(SpanExporter):
         api_key: Optional[str] = None,  # dynamic api key,
         _test_run_id: Optional[str] = None,
     ) -> SpanExportResult:
-        # build forest of spans
-        # for span in spans:
-        #     print("--------------------------------")
-        #     print(span.to_json())
-        #     print("--------------------------------")
-        # return SpanExportResult.SUCCESS

         ################ Build Forest of Spans ################
         forest = self._build_span_forest(spans)
deepeval/tracing/otel/utils.py CHANGED

@@ -3,7 +3,7 @@ import json
 from typing import List, Optional, Tuple, Any
 from opentelemetry.sdk.trace.export import ReadableSpan

-from deepeval.
+from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run.api import LLMApiTestCase
 from deepeval.test_run.test_run import global_test_run_manager
 from deepeval.tracing.types import Trace, LLMTestCase, ToolCall

@@ -109,8 +109,24 @@ def check_llm_input_from_gen_ai_attributes(
     input = None
     output = None
     try:
-
-
+        # check for system instructions
+        system_instructions = []
+        system_instructions_raw = span.attributes.get(
+            "gen_ai.system_instructions"
+        )
+        if system_instructions_raw and isinstance(system_instructions_raw, str):
+            system_instructions_json = json.loads(system_instructions_raw)
+            system_instructions = _flatten_system_instructions(
+                system_instructions_json
+            )
+
+        input_messages = []
+        input_messages_raw = span.attributes.get("gen_ai.input.messages")
+        if input_messages_raw and isinstance(input_messages_raw, str):
+            input_messages_json = json.loads(input_messages_raw)
+            input_messages = _flatten_input(input_messages_json)
+
+        input = system_instructions + input_messages

     except Exception:
         pass

@@ -137,6 +153,20 @@ def check_llm_input_from_gen_ai_attributes(
     return input, output


+def _flatten_system_instructions(system_instructions: list) -> list:
+    if isinstance(system_instructions, list):
+        for system_instruction in system_instructions:
+            if isinstance(system_instruction, dict):
+                role = system_instruction.get("role")
+                if not role:
+                    system_instruction["role"] = "System Instruction"
+        return _flatten_input(system_instructions)
+    elif isinstance(system_instructions, str):
+        return [{"role": "System Instruction", "content": system_instructions}]
+
+    return []
+
+
 def _flatten_input(input: list) -> list:
     if input and isinstance(input, list):
         try:

@@ -411,10 +441,23 @@ def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
     return None


+def _extract_non_thinking_part_of_last_message(message: dict) -> dict:
+
+    if isinstance(message, dict) and message.get("role") == "assistant":
+        parts = message.get("parts")
+        if parts:
+            # Iterate from the last part
+            for part in reversed(parts):
+                if isinstance(part, dict) and part.get("type") == "text":
+                    # Return a modified message with only the text content
+                    return {"role": "assistant", "content": part.get("content")}
+    return None
+
+
 def check_pydantic_ai_agent_input_output(
     span: ReadableSpan,
 ) -> Tuple[Optional[Any], Optional[Any]]:
-    input_val:
+    input_val: list = []
     output_val: Optional[Any] = None

     # Get normalized messages once

@@ -445,14 +488,21 @@ def check_pydantic_ai_agent_input_output(
         if span.attributes.get("confident.span.type") == "agent":
             output_val = span.attributes.get("final_result")
             if not output_val and normalized:
-
-
+                output_val = _extract_non_thinking_part_of_last_message(
+                    normalized[-1]
+                )
     except Exception:
         pass

+    system_instructions = []
+    system_instruction_raw = span.attributes.get("gen_ai.system_instructions")
+    if system_instruction_raw and isinstance(system_instruction_raw, str):
+        system_instructions = _flatten_system_instructions(
+            json.loads(system_instruction_raw)
+        )
+
     input_val = _flatten_input(input_val)
-
-    return input_val, output_val
+    return system_instructions + input_val, output_val


 def check_tool_output(span: ReadableSpan):
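For reference, the string branch of the new _flatten_system_instructions helper behaves like this (illustrative call to an internal function):

from deepeval.tracing.otel.utils import _flatten_system_instructions

# A plain string is wrapped into a single message with a synthetic role.
print(_flatten_system_instructions("You are a helpful assistant."))
# [{'role': 'System Instruction', 'content': 'You are a helpful assistant.'}]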
deepeval/tracing/trace_context.py CHANGED

@@ -1,14 +1,83 @@
-from
-from
+from typing import Optional, List, Dict, Any
+from contextvars import ContextVar
 from contextlib import contextmanager
+from dataclasses import dataclass
+
+from .tracing import trace_manager
+from .context import current_trace_context, update_current_trace
+from deepeval.prompt import Prompt
+from deepeval.metrics import BaseMetric
+from deepeval.test_case.llm_test_case import ToolCall
+
+
+@dataclass
+class LlmContext:
+    prompt: Optional[Prompt] = None
+    metrics: Optional[List[BaseMetric]] = None
+    metric_collection: Optional[str] = None
+    expected_output: Optional[str] = None
+    expected_tools: Optional[List[ToolCall]] = None
+    context: Optional[List[str]] = None
+    retrieval_context: Optional[List[str]] = None
+
+
+current_llm_context: ContextVar[Optional[LlmContext]] = ContextVar(
+    "current_llm_context", default=LlmContext()
+)


 @contextmanager
-def trace(
+def trace(
+    prompt: Optional[Prompt] = None,
+    llm_metrics: Optional[List[BaseMetric]] = None,
+    llm_metric_collection: Optional[str] = None,
+    name: Optional[str] = None,
+    tags: Optional[List[str]] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    user_id: Optional[str] = None,
+    thread_id: Optional[str] = None,
+    expected_output: Optional[str] = None,
+    expected_tools: Optional[List[ToolCall]] = None,
+    context: Optional[List[str]] = None,
+    retrieval_context: Optional[List[str]] = None,
+    trace_metric_collection: Optional[str] = None,
+    trace_metrics: Optional[List[BaseMetric]] = None,
+):
     current_trace = current_trace_context.get()

     if not current_trace:
         current_trace = trace_manager.start_new_trace()
-
+
+    if trace_metrics:
+        current_trace.metrics = trace_metrics
+
+    if trace_metric_collection:
+        current_trace.metric_collection = trace_metric_collection
+
+    current_trace_context.set(current_trace)
+
+    current_llm_context.set(
+        LlmContext(
+            prompt=prompt,
+            metrics=llm_metrics,
+            metric_collection=llm_metric_collection,
+            expected_output=expected_output,
+            expected_tools=expected_tools,
+            context=context,
+            retrieval_context=retrieval_context,
+        )
+    )
+
+    # set the current trace attributes
+    if name:
+        update_current_trace(name=name)
+    if tags:
+        update_current_trace(tags=tags)
+    if metadata:
+        update_current_trace(metadata=metadata)
+    if user_id:
+        update_current_trace(user_id=user_id)
+    if thread_id:
+        update_current_trace(thread_id=thread_id)

     yield current_trace
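A hypothetical usage sketch of the expanded trace() context manager (argument names taken from the signature above; the import path and the example values are assumptions for illustration):

from deepeval.tracing.trace_context import trace

with trace(
    name="chat-turn",
    tags=["demo"],
    thread_id="thread-123",
    retrieval_context=["DeepEval is an open-source LLM evaluation framework."],
) as current_trace:
    # Run LLM / agent code here; nested LLM spans can read the prompt, metrics,
    # and expected values stashed in current_llm_context.
    pass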
deepeval/tracing/trace_test_manager.py ADDED

@@ -0,0 +1,19 @@
+from typing import Optional, Dict, Any
+import asyncio
+from time import monotonic
+
+
+class TraceTestingManager:
+    test_name: Optional[str] = None
+    test_dict: Optional[Dict[str, Any]] = None
+
+    async def wait_for_test_dict(
+        self, timeout: float = 10.0, poll_interval: float = 0.05
+    ) -> Dict[str, Any]:
+        deadline = monotonic() + timeout
+        while self.test_dict is None and monotonic() < deadline:
+            await asyncio.sleep(poll_interval)
+        return self.test_dict or {}
+
+
+trace_testing_manager = TraceTestingManager()
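Finally, a small sketch of how the new polling helper can be awaited (purely illustrative; module path inferred from the file list):

import asyncio
from deepeval.tracing.trace_test_manager import trace_testing_manager

async def main():
    # Resolves as soon as another task assigns trace_testing_manager.test_dict,
    # otherwise returns {} once the timeout elapses.
    result = await trace_testing_manager.wait_for_test_dict(timeout=1.0)
    print(result)

asyncio.run(main())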