deepeval-3.6.6-py3-none-any.whl → deepeval-3.6.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +97 -42
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/utils.py +1 -1
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/tracing.py +51 -3
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0

deepeval/test_case/api.py
ADDED
@@ -0,0 +1,131 @@
+from typing import Union, Optional
+import os
+
+from deepeval.test_run.api import (
+    LLMApiTestCase,
+    ConversationalApiTestCase,
+    TurnApi,
+    TraceApi,
+)
+from deepeval.test_case import (
+    LLMTestCase,
+    ConversationalTestCase,
+    MLLMTestCase,
+    Turn,
+)
+from deepeval.constants import PYTEST_RUN_TEST_NAME
+
+
+def create_api_turn(turn: Turn, index: int) -> TurnApi:
+    return TurnApi(
+        role=turn.role,
+        content=turn.content,
+        user_id=turn.user_id,
+        retrievalContext=turn.retrieval_context,
+        toolsCalled=turn.tools_called,
+        additionalMetadata=turn.additional_metadata,
+        order=index,
+    )
+
+
+def create_api_test_case(
+    test_case: Union[LLMTestCase, ConversationalTestCase, MLLMTestCase],
+    trace: Optional[TraceApi] = None,
+    index: Optional[int] = None,
+) -> Union[LLMApiTestCase, ConversationalApiTestCase]:
+    if isinstance(test_case, ConversationalTestCase):
+        order = (
+            test_case._dataset_rank
+            if test_case._dataset_rank is not None
+            else index
+        )
+        if test_case.name:
+            name = test_case.name
+        else:
+            name = os.getenv(
+                PYTEST_RUN_TEST_NAME, f"conversational_test_case_{order}"
+            )
+
+        api_test_case = ConversationalApiTestCase(
+            name=name,
+            success=True,
+            metricsData=[],
+            runDuration=0,
+            evaluationCost=None,
+            order=order,
+            scenario=test_case.scenario,
+            expectedOutcome=test_case.expected_outcome,
+            userDescription=test_case.user_description,
+            context=test_case.context,
+            tags=test_case.tags,
+            comments=test_case.comments,
+            additionalMetadata=test_case.additional_metadata,
+        )
+        api_test_case.turns = [
+            create_api_turn(
+                turn=turn,
+                index=index,
+            )
+            for index, turn in enumerate(test_case.turns)
+        ]
+
+        return api_test_case
+    else:
+        order = (
+            test_case._dataset_rank
+            if test_case._dataset_rank is not None
+            else index
+        )
+
+        success = True
+        if test_case.name is not None:
+            name = test_case.name
+        else:
+            name = os.getenv(PYTEST_RUN_TEST_NAME, f"test_case_{order}")
+        metrics_data = []
+
+        if isinstance(test_case, LLMTestCase):
+            api_test_case = LLMApiTestCase(
+                name=name,
+                input=test_case.input,
+                actualOutput=test_case.actual_output,
+                expectedOutput=test_case.expected_output,
+                context=test_case.context,
+                retrievalContext=test_case.retrieval_context,
+                toolsCalled=test_case.tools_called,
+                expectedTools=test_case.expected_tools,
+                tokenCost=test_case.token_cost,
+                completionTime=test_case.completion_time,
+                tags=test_case.tags,
+                success=success,
+                metricsData=metrics_data,
+                runDuration=None,
+                evaluationCost=None,
+                order=order,
+                additionalMetadata=test_case.additional_metadata,
+                comments=test_case.comments,
+                trace=trace,
+            )
+        elif isinstance(test_case, MLLMTestCase):
+            api_test_case = LLMApiTestCase(
+                name=name,
+                input="",
+                multimodalInput=test_case.input,
+                multimodalActualOutput=test_case.actual_output,
+                multimodalExpectedOutput=test_case.expected_output,
+                multimodalRetrievalContext=test_case.retrieval_context,
+                multimodalContext=test_case.context,
+                toolsCalled=test_case.tools_called,
+                expectedTools=test_case.expected_tools,
+                tokenCost=test_case.token_cost,
+                completionTime=test_case.completion_time,
+                success=success,
+                metricsData=metrics_data,
+                runDuration=None,
+                evaluationCost=None,
+                order=order,
+                additionalMetadata=test_case.additional_metadata,
+                comments=test_case.comments,
+            )
+        # llm_test_case_lookup_map[instance_id] = api_test_case
+        return api_test_case
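
For orientation, a hedged usage sketch of the new create_api_test_case helper follows (not part of the package diff); it assumes deepeval 3.6.7 is installed and uses illustrative test case values.

# Hedged sketch: convert an LLMTestCase into its API representation using the
# helper added in deepeval/test_case/api.py. Values are illustrative.
from deepeval.test_case import LLMTestCase
from deepeval.test_case.api import create_api_test_case

test_case = LLMTestCase(
    input="What is the capital of France?",
    actual_output="Paris",
    expected_output="Paris",
)

# `index` is only a fallback ordering used when the test case has no dataset rank;
# `trace` can attach an optional TraceApi payload.
api_test_case = create_api_test_case(test_case=test_case, trace=None, index=0)
print(api_test_case.name)  # "test_case_0" unless PYTEST_RUN_TEST_NAME is set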
deepeval/test_run/__init__.py
CHANGED

deepeval/test_run/hyperparameters.py
CHANGED
@@ -1,13 +1,15 @@
-from typing import Union, Dict
-
+from typing import Union, Dict, Optional, List
 from deepeval.test_run import global_test_run_manager
 from deepeval.prompt import Prompt
 from deepeval.prompt.api import PromptApi
 from deepeval.test_run.test_run import TEMP_FILE_PATH
+from deepeval.confident.api import is_confident
+from deepeval.test_run.test_run import PromptData


 def process_hyperparameters(
-    hyperparameters,
+    hyperparameters: Optional[Dict] = None,
+    verbose: bool = True,
 ) -> Union[Dict[str, Union[str, int, float, PromptApi]], None]:
     if hyperparameters is None:
         return None
@@ -16,6 +18,7 @@ def process_hyperparameters(
         raise TypeError("Hyperparameters must be a dictionary or None")

     processed_hyperparameters = {}
+    prompts_version_id_map = {}

     for key, value in hyperparameters.items():
         if not isinstance(key, str):
@@ -30,14 +33,21 @@ def process_hyperparameters(
             )

         if isinstance(value, Prompt):
-
+            prompt_key = f"{value.alias}_{value.version}"
+            if value._prompt_version_id is not None and value.type is not None:
                 processed_hyperparameters[key] = PromptApi(
                     id=value._prompt_version_id,
-                    type=value.
+                    type=value.type,
                 )
-
-
-
+            elif is_confident():
+                if prompt_key not in prompts_version_id_map:
+                    value.push(_verbose=verbose)
+                    prompts_version_id_map[prompt_key] = (
+                        value._prompt_version_id
+                    )
+                processed_hyperparameters[key] = PromptApi(
+                    id=prompts_version_id_map[prompt_key],
+                    type=value.type,
                 )
         else:
             processed_hyperparameters[key] = str(value)
@@ -64,3 +74,32 @@ def log_hyperparameters(func):

    # Return the wrapper function to be used as the decorator
    return wrapper
+
+
+def process_prompts(
+    hyperparameters: Dict[str, Union[str, int, float, Prompt]],
+) -> List[PromptData]:
+    prompts = []
+    if not hyperparameters:
+        return prompts
+    seen_prompts = set()
+    prompt_objects = [
+        value for value in hyperparameters.values() if isinstance(value, Prompt)
+    ]
+    for prompt in prompt_objects:
+        prompt_version = prompt.version if is_confident() else None
+        prompt_key = f"{prompt.alias}_{prompt_version}"
+        if prompt_key in seen_prompts:
+            continue
+        seen_prompts.add(prompt_key)
+        prompt_data = PromptData(
+            alias=prompt.alias,
+            version=prompt_version,
+            text_template=prompt.text_template,
+            messages_template=prompt.messages_template,
+            model_settings=prompt.model_settings,
+            output_type=prompt.output_type,
+            interpolation_type=prompt.interpolation_type,
+        )
+        prompts.append(prompt_data)
+    return prompts
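
A hedged sketch of the reworked hyperparameter handling above; it assumes these helpers live in deepeval.test_run.hyperparameters (as the file list suggests) and that deepeval 3.6.7 is installed. The dictionary values are illustrative.

# Hedged sketch: plain values are stringified, while Prompt values are resolved
# to PromptApi entries (and pushed to Confident AI when logged in).
from deepeval.test_run.hyperparameters import (  # assumed module path
    process_hyperparameters,
    process_prompts,
)

hyperparameters = {"model": "gpt-4o", "temperature": 0.7}  # illustrative values

processed = process_hyperparameters(hyperparameters, verbose=False)
print(processed)  # {'model': 'gpt-4o', 'temperature': '0.7'}

# process_prompts collects deduplicated PromptData records for Prompt values;
# with no Prompt objects present it simply returns an empty list.
print(process_prompts(hyperparameters))  # []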
deepeval/test_run/test_run.py
CHANGED
@@ -32,6 +32,17 @@ from deepeval.utils import (
 )
 from deepeval.test_run.cache import global_test_run_cache_manager
 from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
+from deepeval.prompt import (
+    PromptMessage,
+    ModelSettings,
+    OutputType,
+    PromptInterpolationType,
+    OutputType,
+)
+from rich.panel import Panel
+from rich.text import Text
+from rich.columns import Columns
+

 TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
 LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
@@ -71,6 +82,16 @@ class TraceMetricScores(BaseModel):
     base: Dict[str, Dict[str, MetricScores]] = Field(default_factory=dict)


+class PromptData(BaseModel):
+    alias: Optional[str] = None
+    version: Optional[str] = None
+    text_template: Optional[str] = None
+    messages_template: Optional[List[PromptMessage]] = None
+    model_settings: Optional[ModelSettings] = None
+    output_type: Optional[OutputType] = None
+    interpolation_type: Optional[PromptInterpolationType] = None
+
+
 class MetricsAverageDict:
     def __init__(self):
         self.metric_dict = {}
@@ -123,6 +144,7 @@ class TestRun(BaseModel):
     )
     identifier: Optional[str] = None
     hyperparameters: Optional[Dict[str, Any]] = Field(None)
+    prompts: Optional[List[PromptData]] = Field(None)
     test_passed: Optional[int] = Field(None, alias="testPassed")
     test_failed: Optional[int] = Field(None, alias="testFailed")
     run_duration: float = Field(0.0, alias="runDuration")
@@ -799,6 +821,7 @@ class TestRunManager:
             test_run.test_cases = initial_batch

             try:
+                test_run.prompts = None
                 body = test_run.model_dump(by_alias=True, exclude_none=True)
             except AttributeError:
                 # Pydantic version below 2.0
@@ -953,6 +976,23 @@ class TestRunManager:
         if display_table:
             self.display_results_table(test_run, display)

+        if test_run.hyperparameters is None:
+            console.print(
+                "\n[bold yellow]⚠ WARNING:[/bold yellow] No hyperparameters logged.\n"
+                "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log hyperparameters[/link][/bold blue] to attribute prompts and models to your test runs.\n\n"
+                + "=" * 80
+            )
+        else:
+            if not test_run.prompts:
+                console.print(
+                    "\n[bold yellow]⚠ WARNING:[/bold yellow] No prompts logged.\n"
+                    "» [bold blue][link=https://deepeval.com/docs/evaluation-prompts]Log prompts[/link][/bold blue] to evaluate and optimize your prompt templates and models.\n\n"
+                    + "=" * 80
+                )
+            else:
+                console.print("\n[bold green]✓ Prompts Logged[/bold green]\n")
+                self._render_prompts_panels(prompts=test_run.prompts)
+
         self.save_test_run_locally()
         delete_file_if_exists(self.temp_file_path)
         if is_confident() and self.disable_request is False:
@@ -967,7 +1007,7 @@ class TestRunManager:
            f"» Test Results ({test_run.test_passed + test_run.test_failed} total tests):\n",
            f"  » Pass Rate: {round((test_run.test_passed / (test_run.test_passed + test_run.test_failed)) * 100, 2)}% | Passed: [bold green]{test_run.test_passed}[/bold green] | Failed: [bold red]{test_run.test_failed}[/bold red]\n\n",
            "=" * 80,
-            "\n\n» 
+            "\n\n» Want to share evals with your team, or a place for your test cases to live? ❤️ 🏡\n"
            "  » Run [bold]'deepeval view'[/bold] to analyze and save testing results on [rgb(106,0,255)]Confident AI[/rgb(106,0,255)].\n\n",
        )
@@ -993,5 +1033,68 @@ class TestRunManager:
             pass
         return None

+    def _render_prompts_panels(self, prompts: List[PromptData]) -> None:
+
+        def format_string(
+            v, default="[dim]None[/dim]", color: Optional[str] = None
+        ):
+            formatted_string = str(v) if v not in (None, "", []) else default
+            return (
+                f"{formatted_string}"
+                if color is None or v in (None, "", [])
+                else f"[{color}]{formatted_string}[/]"
+            )
+
+        panels = []
+        for prompt in prompts:
+            lines = []
+            p_type = (
+                "messages"
+                if prompt.messages_template
+                else ("text" if prompt.text_template else "—")
+            )
+            if p_type:
+                lines.append(f"type: {format_string(p_type, color='blue')}")
+            if prompt.output_type:
+                lines.append(
+                    f"output_type: {format_string(prompt.output_type, color='blue')}"
+                )
+            if prompt.interpolation_type:
+                lines.append(
+                    f"interpolation_type: {format_string(prompt.interpolation_type, color='blue')}"
+                )
+            if prompt.model_settings:
+                ms = prompt.model_settings
+                settings_lines = [
+                    "Model Settings:",
+                    f"  – provider: {format_string(ms.provider, color='green')}",
+                    f"  – name: {format_string(ms.name, color='green')}",
+                    f"  – temperature: {format_string(ms.temperature, color='green')}",
+                    f"  – max_tokens: {format_string(ms.max_tokens, color='green')}",
+                    f"  – top_p: {format_string(ms.top_p, color='green')}",
+                    f"  – frequency_penalty: {format_string(ms.frequency_penalty, color='green')}",
+                    f"  – presence_penalty: {format_string(ms.presence_penalty, color='green')}",
+                    f"  – stop_sequence: {format_string(ms.stop_sequence, color='green')}",
+                    f"  – reasoning_effort: {format_string(ms.reasoning_effort, color='green')}",
+                    f"  – verbosity: {format_string(ms.verbosity, color='green')}",
+                ]
+                lines.append("")
+                lines.extend(settings_lines)
+            title = f"{format_string(prompt.alias)}"
+            if prompt.version:
+                title += f" (v{prompt.version})"
+            body = "\n".join(lines)
+            panel = Panel(
+                body,
+                title=title,
+                title_align="left",
+                expand=False,
+                padding=(1, 6, 1, 2),
+            )
+            panels.append(panel)
+
+        if panels:
+            console.print(Columns(panels, equal=False, expand=False))
+

 global_test_run_manager = TestRunManager()
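
The new PromptData model above can be exercised directly; a small sketch with illustrative field values follows, assuming deepeval 3.6.7.

# Hedged sketch of the PromptData record now carried on TestRun.prompts.
from deepeval.test_run.test_run import PromptData

prompt_data = PromptData(
    alias="summarizer-prompt",  # illustrative alias
    version="00.00.01",         # illustrative version string
    text_template="Summarize the following text: {text}",
)

# Every field is optional, so partial records validate; missing values render as
# "[dim]None[/dim]" in the panels drawn by _render_prompts_panels.
print(prompt_data)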
deepeval/tracing/api.py
CHANGED
@@ -1,6 +1,6 @@
 from enum import Enum
 from typing import Dict, List, Optional, Union, Literal, Any
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field

 from deepeval.test_case import ToolCall

@@ -27,6 +27,8 @@ class PromptApi(BaseModel):


 class MetricData(BaseModel):
+    model_config = ConfigDict(extra="ignore")
+
     name: str
     threshold: float
     success: bool

deepeval/tracing/message_types/messages.py
ADDED
@@ -0,0 +1,14 @@
+from typing import Literal, Dict, Any
+from .base import BaseMessage
+
+
+class TextMessage(BaseMessage):
+    type: Literal["text", "thinking"]
+    content: str
+
+
+class ToolCallMessage(BaseMessage):
+    """This is a message for tool calls in response.choices[0].message.tool_calls"""
+
+    name: str
+    args: Dict[str, Any]

deepeval/tracing/message_types/tools.py
ADDED
@@ -0,0 +1,18 @@
+from typing import Any, Optional, Dict
+from pydantic import BaseModel
+
+
+class BaseTool(BaseModel):
+    name: str
+    description: Optional[str] = None
+
+
+class ToolSchema(BaseTool):
+    parameters: Dict[str, Any]
+    is_called: Optional[bool] = False
+
+
+class ToolOutput(BaseTool):
+    """Output of the tool function"""
+
+    output: Any
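
The ConfigDict(extra="ignore") line added to MetricData makes it tolerant of unknown response fields. The self-contained pydantic v2 sketch below (not deepeval code) shows the behavior this setting enables.

# Standalone pydantic v2 sketch of extra="ignore": unknown keys are dropped
# during validation instead of raising a validation error.
from pydantic import BaseModel, ConfigDict


class MetricDataSketch(BaseModel):
    model_config = ConfigDict(extra="ignore")

    name: str
    threshold: float
    success: bool


payload = {
    "name": "Answer Relevancy",
    "threshold": 0.5,
    "success": True,
    "someNewServerField": 123,  # silently ignored
}
print(MetricDataSketch(**payload))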
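
A hedged construction sketch for the new tool models in deepeval.tracing.message_types follows; the weather tool is hypothetical. TextMessage and ToolCallMessage from messages.py follow the same pydantic pattern but also inherit from BaseMessage, whose definition is not shown in this diff.

# Hedged sketch: a hypothetical weather tool described with the new models;
# assumes deepeval 3.6.7.
from deepeval.tracing.message_types import ToolSchema, ToolOutput

weather_tool = ToolSchema(
    name="get_weather",
    description="Look up the current weather for a city",
    parameters={"type": "object", "properties": {"city": {"type": "string"}}},
    is_called=True,
)
weather_output = ToolOutput(
    name="get_weather",
    output={"city": "Paris", "temp_c": 18},
)
print(weather_tool, weather_output)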
deepeval/tracing/otel/utils.py
CHANGED
@@ -3,7 +3,7 @@ import json
 from typing import List, Optional, Tuple, Any
 from opentelemetry.sdk.trace.export import ReadableSpan

-from deepeval.
+from deepeval.test_case.api import create_api_test_case
 from deepeval.test_run.api import LLMApiTestCase
 from deepeval.test_run.test_run import global_test_run_manager
 from deepeval.tracing.types import Trace, LLMTestCase, ToolCall

deepeval/tracing/trace_context.py
CHANGED
@@ -1,14 +1,83 @@
-from
-from
+from typing import Optional, List, Dict, Any
+from contextvars import ContextVar
 from contextlib import contextmanager
+from dataclasses import dataclass
+
+from .tracing import trace_manager
+from .context import current_trace_context, update_current_trace
+from deepeval.prompt import Prompt
+from deepeval.metrics import BaseMetric
+from deepeval.test_case.llm_test_case import ToolCall
+
+
+@dataclass
+class LlmContext:
+    prompt: Optional[Prompt] = None
+    metrics: Optional[List[BaseMetric]] = None
+    metric_collection: Optional[str] = None
+    expected_output: Optional[str] = None
+    expected_tools: Optional[List[ToolCall]] = None
+    context: Optional[List[str]] = None
+    retrieval_context: Optional[List[str]] = None
+
+
+current_llm_context: ContextVar[Optional[LlmContext]] = ContextVar(
+    "current_llm_context", default=LlmContext()
+)


 @contextmanager
-def trace(
+def trace(
+    prompt: Optional[Prompt] = None,
+    llm_metrics: Optional[List[BaseMetric]] = None,
+    llm_metric_collection: Optional[str] = None,
+    name: Optional[str] = None,
+    tags: Optional[List[str]] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+    user_id: Optional[str] = None,
+    thread_id: Optional[str] = None,
+    expected_output: Optional[str] = None,
+    expected_tools: Optional[List[ToolCall]] = None,
+    context: Optional[List[str]] = None,
+    retrieval_context: Optional[List[str]] = None,
+    trace_metric_collection: Optional[str] = None,
+    trace_metrics: Optional[List[BaseMetric]] = None,
+):
     current_trace = current_trace_context.get()

     if not current_trace:
         current_trace = trace_manager.start_new_trace()
-
+
+    if trace_metrics:
+        current_trace.metrics = trace_metrics
+
+    if trace_metric_collection:
+        current_trace.metric_collection = trace_metric_collection
+
+    current_trace_context.set(current_trace)
+
+    current_llm_context.set(
+        LlmContext(
+            prompt=prompt,
+            metrics=llm_metrics,
+            metric_collection=llm_metric_collection,
+            expected_output=expected_output,
+            expected_tools=expected_tools,
+            context=context,
+            retrieval_context=retrieval_context,
+        )
+    )
+
+    # set the current trace attributes
+    if name:
+        update_current_trace(name=name)
+    if tags:
+        update_current_trace(tags=tags)
+    if metadata:
+        update_current_trace(metadata=metadata)
+    if user_id:
+        update_current_trace(user_id=user_id)
+    if thread_id:
+        update_current_trace(thread_id=thread_id)

     yield current_trace
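
A minimal usage sketch of the extended trace() context manager follows, assuming deepeval 3.6.7 with tracing otherwise configured; the name, tags, thread id, and contexts are illustrative.

# Hedged sketch: the new keyword arguments are stored on the current trace and on
# a context variable (LlmContext) that LLM spans created inside the block can read.
from deepeval.tracing.trace_context import trace

with trace(
    name="rag-pipeline",     # illustrative trace name
    tags=["example"],
    thread_id="thread-123",  # illustrative thread id
    retrieval_context=["Paris is the capital of France."],
    expected_output="Paris",
) as current_trace:
    # invoke the LLM application here; spans created in this block
    # attach to current_trace
    ...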
deepeval/tracing/tracing.py
CHANGED
@@ -1,5 +1,14 @@
-import
-
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Optional,
+    Set,
+    Union,
+)
 from time import perf_counter
 import threading
 import functools
@@ -20,6 +29,7 @@ from deepeval.constants import (
 )
 from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.metrics import BaseMetric
+from deepeval.test_case.llm_test_case import ToolCall
 from deepeval.tracing.api import (
     BaseApiSpan,
     SpanApiType,
@@ -41,6 +51,7 @@ from deepeval.tracing.types import (
 )
 from deepeval.tracing.utils import (
     Environment,
+    prepare_tool_call_input_parameters,
     replace_self_with_class_name,
     make_json_serializable,
     perf_counter_to_datetime,
@@ -55,6 +66,10 @@ from deepeval.tracing.types import TestCaseMetricPair
 from deepeval.tracing.api import PromptApi
 from deepeval.tracing.trace_test_manager import trace_testing_manager

+
+if TYPE_CHECKING:
+    from deepeval.dataset.golden import Golden
+
 EVAL_DUMMY_SPAN_NAME = "evals_iterator"


@@ -65,6 +80,10 @@ class TraceManager:
         self.active_spans: Dict[str, BaseSpan] = (
             {}
         )  # Map of span_uuid to BaseSpan
+        # Map each trace created during evaluation_loop to the Golden that was active
+        # when it was started. This lets us evaluate traces against the correct golden
+        # since we cannot rely on positional indexing as the order is not guaranteed.
+        self.trace_uuid_to_golden: Dict[str, Golden] = {}

         settings = get_settings()
         # Initialize queue and worker thread for trace posting
@@ -86,7 +105,7 @@ class TraceManager:
             )
             validate_environment(self.environment)

-        self.sampling_rate = settings.
+        self.sampling_rate = settings.CONFIDENT_TRACE_SAMPLE_RATE
         validate_sampling_rate(self.sampling_rate)
         self.openai_client = None
         self.tracing_enabled = True
@@ -166,6 +185,19 @@ class TraceManager:
         self.traces.append(new_trace)
         if self.evaluation_loop:
             self.traces_to_evaluate_order.append(trace_uuid)
+            # Associate the current Golden with this trace so we can
+            # later evaluate traces against the correct golden, even if more traces
+            # are created than goldens or the order interleaves.
+            try:
+                from deepeval.contextvars import get_current_golden
+
+                current_golden = get_current_golden()
+                if current_golden is not None:
+                    self.trace_uuid_to_golden[trace_uuid] = current_golden
+            except Exception:
+                # not much we can do, but if the golden is not there during evaluation
+                # we will write out a verbose debug log
+                pass
         return new_trace

     def end_trace(self, trace_uuid: str):
@@ -861,6 +893,22 @@ class Observer:
         ):
             current_span.prompt = self.prompt

+        if not current_span.tools_called:
+            # check any tool span children
+            for child in current_span.children:
+                if isinstance(child, ToolSpan):
+                    current_span.tools_called = current_span.tools_called or []
+                    current_span.tools_called.append(
+                        ToolCall(
+                            name=child.name,
+                            description=child.description,
+                            input_parameters=prepare_tool_call_input_parameters(
+                                child.input
+                            ),
+                            output=child.output,
+                        )
+                    )
+
         trace_manager.remove_span(self.uuid)
         if current_span.parent_uuid:
             parent_span = trace_manager.get_span_by_uuid(
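
The Observer change above folds tool child spans into the parent span's tools_called when none were recorded explicitly. Below is a hedged sketch of how such nested spans are typically produced with deepeval's @observe decorator; the decorator arguments follow the deepeval docs, and the functions themselves are illustrative.

# Hedged sketch: the tool span created by get_weather becomes a child of the
# LLM span, so (per the diff above) its call is appended to the parent span's
# tools_called when the span closes.
from deepeval.tracing import observe


@observe(type="tool")
def get_weather(city: str) -> str:
    return f"Sunny in {city}"  # illustrative tool


@observe(type="llm", model="gpt-4o")
def answer(question: str) -> str:
    weather = get_weather("Paris")
    return f"The weather right now: {weather}"


answer("What's the weather in Paris?")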
deepeval/tracing/types.py
CHANGED
@@ -3,6 +3,12 @@ from dataclasses import dataclass, field
 from pydantic import BaseModel, Field
 from typing import Any, Dict, List, Optional, Union
 from rich.progress import Progress
+from deepeval.tracing.message_types import (
+    ToolSchema,
+    ToolOutput,
+    TextMessage,
+    ToolCallMessage,
+)

 from deepeval.prompt.prompt import Prompt
 from deepeval.test_case.llm_test_case import ToolCall
@@ -88,6 +94,12 @@ class AgentSpan(BaseSpan):


 class LlmSpan(BaseSpan):
+    input: Optional[
+        Union[Any, List[Union[TextMessage, ToolCallMessage, ToolOutput]]]
+    ] = None
+    output: Optional[Union[Any, List[Union[TextMessage, ToolCallMessage]]]] = (
+        None
+    )
     model: Optional[str] = None
     prompt: Optional[Prompt] = None
     input_token_count: Optional[float] = Field(
@@ -106,6 +118,10 @@ class LlmSpan(BaseSpan):
         None, serialization_alias="tokenTimes"
     )

+    # input_tools: Optional[List[ToolSchema]] = Field(None, serialization_alias="inputTools")
+    # invocation_params: Optional[Dict[str, Any]] = Field(None, serialization_alias="invocationParams")
+    # output_metadata: Optional[Dict[str, Any]] = Field(None, serialization_alias="outputMetadata")
+
     # for serializing `prompt`
     model_config = {"arbitrary_types_allowed": True}
