deepeval 3.6.4__py3-none-any.whl → 3.6.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +42 -10
- deepeval/_version.py +1 -1
- deepeval/config/logging.py +33 -0
- deepeval/config/settings.py +167 -12
- deepeval/dataset/dataset.py +8 -2
- deepeval/evaluate/evaluate.py +8 -2
- deepeval/evaluate/execute.py +28 -30
- deepeval/evaluate/types.py +4 -1
- deepeval/evaluate/utils.py +46 -29
- deepeval/integrations/crewai/__init__.py +1 -2
- deepeval/integrations/crewai/handler.py +153 -81
- deepeval/integrations/crewai/wrapper.py +87 -0
- deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
- deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- deepeval/metrics/faithfulness/faithfulness.py +8 -0
- deepeval/metrics/g_eval/g_eval.py +26 -15
- deepeval/metrics/prompt_alignment/prompt_alignment.py +41 -23
- deepeval/models/retry_policy.py +202 -11
- deepeval/test_run/__init__.py +2 -1
- deepeval/test_run/api.py +1 -0
- deepeval/test_run/test_run.py +85 -9
- deepeval/tracing/__init__.py +2 -0
- deepeval/tracing/otel/exporter.py +0 -6
- deepeval/tracing/otel/test_exporter.py +35 -0
- deepeval/tracing/otel/utils.py +57 -7
- deepeval/tracing/trace_context.py +14 -0
- deepeval/tracing/trace_test_manager.py +19 -0
- deepeval/tracing/tracing.py +7 -6
- deepeval/tracing/utils.py +2 -86
- deepeval/utils.py +149 -1
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/METADATA +1 -1
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/RECORD +35 -31
- deepeval/integrations/crewai/agent.py +0 -98
- deepeval/integrations/crewai/patch.py +0 -41
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/WHEEL +0 -0
- {deepeval-3.6.4.dist-info → deepeval-3.6.6.dist-info}/entry_points.txt +0 -0
deepeval/test_run/test_run.py
CHANGED
|
@@ -2,9 +2,8 @@ from enum import Enum
|
|
|
2
2
|
import os
|
|
3
3
|
import json
|
|
4
4
|
from pydantic import BaseModel, Field
|
|
5
|
-
from typing import Any, Optional, List, Dict, Union
|
|
5
|
+
from typing import Any, Optional, List, Dict, Union, Tuple
|
|
6
6
|
import shutil
|
|
7
|
-
import webbrowser
|
|
8
7
|
import sys
|
|
9
8
|
import datetime
|
|
10
9
|
import portalocker
|
|
@@ -27,6 +26,9 @@ from deepeval.utils import (
|
|
|
27
26
|
delete_file_if_exists,
|
|
28
27
|
get_is_running_deepeval,
|
|
29
28
|
open_browser,
|
|
29
|
+
shorten,
|
|
30
|
+
format_turn,
|
|
31
|
+
len_short,
|
|
30
32
|
)
|
|
31
33
|
from deepeval.test_run.cache import global_test_run_cache_manager
|
|
32
34
|
from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
|
|
@@ -546,7 +548,7 @@ class TestRunManager:
|
|
|
546
548
|
|
|
547
549
|
if (
|
|
548
550
|
display == TestRunResultDisplay.PASSING
|
|
549
|
-
and test_case.success
|
|
551
|
+
and test_case.success is False
|
|
550
552
|
):
|
|
551
553
|
continue
|
|
552
554
|
elif display == TestRunResultDisplay.FAILING and test_case.success:
|
|
@@ -618,7 +620,7 @@ class TestRunManager:
|
|
|
618
620
|
):
|
|
619
621
|
if (
|
|
620
622
|
display == TestRunResultDisplay.PASSING
|
|
621
|
-
and conversational_test_case.success
|
|
623
|
+
and conversational_test_case.success is False
|
|
622
624
|
):
|
|
623
625
|
continue
|
|
624
626
|
elif (
|
|
@@ -631,6 +633,65 @@ class TestRunManager:
|
|
|
631
633
|
fail_count = 0
|
|
632
634
|
conversational_test_case_name = conversational_test_case.name
|
|
633
635
|
|
|
636
|
+
if conversational_test_case.turns:
|
|
637
|
+
turns_table = Table(
|
|
638
|
+
title=f"Conversation - {conversational_test_case_name}",
|
|
639
|
+
show_header=True,
|
|
640
|
+
header_style="bold",
|
|
641
|
+
)
|
|
642
|
+
turns_table.add_column("#", justify="right", width=3)
|
|
643
|
+
turns_table.add_column("Role", justify="left", width=10)
|
|
644
|
+
|
|
645
|
+
# subtract fixed widths + borders and padding.
|
|
646
|
+
# ~20 as a safe buffer
|
|
647
|
+
details_max_width = max(
|
|
648
|
+
48, min(120, console.width - 3 - 10 - 20)
|
|
649
|
+
)
|
|
650
|
+
turns_table.add_column(
|
|
651
|
+
"Details",
|
|
652
|
+
justify="left",
|
|
653
|
+
overflow="fold",
|
|
654
|
+
max_width=details_max_width,
|
|
655
|
+
)
|
|
656
|
+
|
|
657
|
+
# truncate when too long
|
|
658
|
+
tools_max_width = min(60, max(24, console.width // 3))
|
|
659
|
+
turns_table.add_column(
|
|
660
|
+
"Tools",
|
|
661
|
+
justify="left",
|
|
662
|
+
no_wrap=True,
|
|
663
|
+
overflow="ellipsis",
|
|
664
|
+
max_width=tools_max_width,
|
|
665
|
+
)
|
|
666
|
+
|
|
667
|
+
sorted_turns = sorted(
|
|
668
|
+
conversational_test_case.turns, key=lambda t: t.order
|
|
669
|
+
)
|
|
670
|
+
|
|
671
|
+
for t in sorted_turns:
|
|
672
|
+
tools = t.tools_called or []
|
|
673
|
+
tool_names = ", ".join(tc.name for tc in tools)
|
|
674
|
+
|
|
675
|
+
# omit order, role and tools since we show them in separate columns.
|
|
676
|
+
details = format_turn(
|
|
677
|
+
t,
|
|
678
|
+
include_tools_in_header=False,
|
|
679
|
+
include_order_role_in_header=False,
|
|
680
|
+
)
|
|
681
|
+
|
|
682
|
+
turns_table.add_row(
|
|
683
|
+
str(t.order),
|
|
684
|
+
t.role,
|
|
685
|
+
details,
|
|
686
|
+
shorten(tool_names, len_short()),
|
|
687
|
+
)
|
|
688
|
+
|
|
689
|
+
console.print(turns_table)
|
|
690
|
+
else:
|
|
691
|
+
console.print(
|
|
692
|
+
f"[dim]No turns recorded for {conversational_test_case_name}.[/dim]"
|
|
693
|
+
)
|
|
694
|
+
|
|
634
695
|
if conversational_test_case.metrics_data is not None:
|
|
635
696
|
for metric_data in conversational_test_case.metrics_data:
|
|
636
697
|
if metric_data.success:
|
|
@@ -698,7 +759,7 @@ class TestRunManager:
|
|
|
698
759
|
)
|
|
699
760
|
print(table)
|
|
700
761
|
|
|
701
|
-
def post_test_run(self, test_run: TestRun) -> Optional[str]:
|
|
762
|
+
def post_test_run(self, test_run: TestRun) -> Optional[Tuple[str, str]]:
|
|
702
763
|
if (
|
|
703
764
|
len(test_run.test_cases) == 0
|
|
704
765
|
and len(test_run.conversational_test_cases) == 0
|
|
@@ -752,6 +813,21 @@ class TestRunManager:
|
|
|
752
813
|
body=body,
|
|
753
814
|
)
|
|
754
815
|
|
|
816
|
+
if not isinstance(data, dict) or "id" not in data:
|
|
817
|
+
# try to show helpful details
|
|
818
|
+
detail = None
|
|
819
|
+
if isinstance(data, dict):
|
|
820
|
+
detail = (
|
|
821
|
+
data.get("detail")
|
|
822
|
+
or data.get("message")
|
|
823
|
+
or data.get("error")
|
|
824
|
+
)
|
|
825
|
+
# fall back to repr for visibility
|
|
826
|
+
raise RuntimeError(
|
|
827
|
+
f"Confident API response missing 'id'. "
|
|
828
|
+
f"detail={detail!r} raw={type(data).__name__}:{repr(data)[:500]}"
|
|
829
|
+
)
|
|
830
|
+
|
|
755
831
|
res = TestRunHttpResponse(
|
|
756
832
|
id=data["id"],
|
|
757
833
|
)
|
|
@@ -814,7 +890,7 @@ class TestRunManager:
|
|
|
814
890
|
)
|
|
815
891
|
self.save_final_test_run_link(link)
|
|
816
892
|
open_browser(link)
|
|
817
|
-
return link
|
|
893
|
+
return link, res.id
|
|
818
894
|
|
|
819
895
|
def save_test_run_locally(self):
|
|
820
896
|
local_folder = os.getenv("DEEPEVAL_RESULTS_FOLDER")
|
|
@@ -841,7 +917,7 @@ class TestRunManager:
|
|
|
841
917
|
runDuration: float,
|
|
842
918
|
display_table: bool = True,
|
|
843
919
|
display: Optional[TestRunResultDisplay] = TestRunResultDisplay.ALL,
|
|
844
|
-
) -> Optional[str]:
|
|
920
|
+
) -> Optional[Tuple[str, str]]:
|
|
845
921
|
test_run = self.get_test_run()
|
|
846
922
|
if test_run is None:
|
|
847
923
|
print("Test Run is empty, please try again.")
|
|
@@ -868,8 +944,8 @@ class TestRunManager:
|
|
|
868
944
|
test_run.sort_test_cases()
|
|
869
945
|
|
|
870
946
|
if global_test_run_cache_manager.disable_write_cache is None:
|
|
871
|
-
global_test_run_cache_manager.disable_write_cache = (
|
|
872
|
-
get_is_running_deepeval()
|
|
947
|
+
global_test_run_cache_manager.disable_write_cache = not bool(
|
|
948
|
+
get_is_running_deepeval()
|
|
873
949
|
)
|
|
874
950
|
|
|
875
951
|
global_test_run_cache_manager.wrap_up_cached_test_run()
|
deepeval/tracing/__init__.py
CHANGED
|
@@ -4,6 +4,7 @@ from .context import (
|
|
|
4
4
|
update_retriever_span,
|
|
5
5
|
update_llm_span,
|
|
6
6
|
)
|
|
7
|
+
from .trace_context import trace
|
|
7
8
|
from .types import BaseSpan, Trace
|
|
8
9
|
from .tracing import observe, trace_manager
|
|
9
10
|
from .offline_evals import evaluate_thread, evaluate_trace, evaluate_span
|
|
@@ -16,6 +17,7 @@ __all__ = [
|
|
|
16
17
|
"BaseSpan",
|
|
17
18
|
"Trace",
|
|
18
19
|
"observe",
|
|
20
|
+
"trace",
|
|
19
21
|
"trace_manager",
|
|
20
22
|
"evaluate_thread",
|
|
21
23
|
"evaluate_trace",
|
|
@@ -90,12 +90,6 @@ class ConfidentSpanExporter(SpanExporter):
|
|
|
90
90
|
api_key: Optional[str] = None, # dynamic api key,
|
|
91
91
|
_test_run_id: Optional[str] = None,
|
|
92
92
|
) -> SpanExportResult:
|
|
93
|
-
# build forest of spans
|
|
94
|
-
# for span in spans:
|
|
95
|
-
# print("--------------------------------")
|
|
96
|
-
# print(span.to_json())
|
|
97
|
-
# print("--------------------------------")
|
|
98
|
-
# return SpanExportResult.SUCCESS
|
|
99
93
|
|
|
100
94
|
################ Build Forest of Spans ################
|
|
101
95
|
forest = self._build_span_forest(spans)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from typing import List, Dict, Any, Sequence
|
|
2
|
+
from opentelemetry.sdk.trace import ReadableSpan
|
|
3
|
+
from opentelemetry.sdk.trace.export import SpanExporter
|
|
4
|
+
from opentelemetry.sdk.trace.export import SpanExportResult
|
|
5
|
+
import json
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TestExporter(SpanExporter):
|
|
10
|
+
"""This exporter is used to test the exporter. It will store the spans in a list of dictionaries."""
|
|
11
|
+
|
|
12
|
+
span_json_list: List[Dict[str, Any]] = []
|
|
13
|
+
|
|
14
|
+
def export(
|
|
15
|
+
self, spans: Sequence[ReadableSpan], timeout_millis: int = 30000
|
|
16
|
+
) -> SpanExportResult:
|
|
17
|
+
for span in spans:
|
|
18
|
+
_span_json = json.loads(span.to_json())
|
|
19
|
+
self.span_json_list.append(_span_json)
|
|
20
|
+
|
|
21
|
+
return SpanExportResult.SUCCESS
|
|
22
|
+
|
|
23
|
+
def get_span_json_list(self) -> List[Dict[str, Any]]:
|
|
24
|
+
return sorted(
|
|
25
|
+
self.span_json_list,
|
|
26
|
+
key=lambda x: datetime.fromisoformat(
|
|
27
|
+
x["start_time"].replace("Z", "+00:00")
|
|
28
|
+
),
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
def clear_span_json_list(self):
|
|
32
|
+
self.span_json_list = []
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
test_exporter = TestExporter()
|
deepeval/tracing/otel/utils.py
CHANGED
|
@@ -109,8 +109,24 @@ def check_llm_input_from_gen_ai_attributes(
|
|
|
109
109
|
input = None
|
|
110
110
|
output = None
|
|
111
111
|
try:
|
|
112
|
-
|
|
113
|
-
|
|
112
|
+
# check for system instructions
|
|
113
|
+
system_instructions = []
|
|
114
|
+
system_instructions_raw = span.attributes.get(
|
|
115
|
+
"gen_ai.system_instructions"
|
|
116
|
+
)
|
|
117
|
+
if system_instructions_raw and isinstance(system_instructions_raw, str):
|
|
118
|
+
system_instructions_json = json.loads(system_instructions_raw)
|
|
119
|
+
system_instructions = _flatten_system_instructions(
|
|
120
|
+
system_instructions_json
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
input_messages = []
|
|
124
|
+
input_messages_raw = span.attributes.get("gen_ai.input.messages")
|
|
125
|
+
if input_messages_raw and isinstance(input_messages_raw, str):
|
|
126
|
+
input_messages_json = json.loads(input_messages_raw)
|
|
127
|
+
input_messages = _flatten_input(input_messages_json)
|
|
128
|
+
|
|
129
|
+
input = system_instructions + input_messages
|
|
114
130
|
|
|
115
131
|
except Exception:
|
|
116
132
|
pass
|
|
@@ -137,6 +153,20 @@ def check_llm_input_from_gen_ai_attributes(
|
|
|
137
153
|
return input, output
|
|
138
154
|
|
|
139
155
|
|
|
156
|
+
def _flatten_system_instructions(system_instructions: list) -> list:
|
|
157
|
+
if isinstance(system_instructions, list):
|
|
158
|
+
for system_instruction in system_instructions:
|
|
159
|
+
if isinstance(system_instruction, dict):
|
|
160
|
+
role = system_instruction.get("role")
|
|
161
|
+
if not role:
|
|
162
|
+
system_instruction["role"] = "System Instruction"
|
|
163
|
+
return _flatten_input(system_instructions)
|
|
164
|
+
elif isinstance(system_instructions, str):
|
|
165
|
+
return [{"role": "System Instruction", "content": system_instructions}]
|
|
166
|
+
|
|
167
|
+
return []
|
|
168
|
+
|
|
169
|
+
|
|
140
170
|
def _flatten_input(input: list) -> list:
|
|
141
171
|
if input and isinstance(input, list):
|
|
142
172
|
try:
|
|
@@ -411,10 +441,23 @@ def _normalize_pydantic_ai_messages(span: ReadableSpan) -> Optional[list]:
|
|
|
411
441
|
return None
|
|
412
442
|
|
|
413
443
|
|
|
444
|
+
def _extract_non_thinking_part_of_last_message(message: dict) -> dict:
|
|
445
|
+
|
|
446
|
+
if isinstance(message, dict) and message.get("role") == "assistant":
|
|
447
|
+
parts = message.get("parts")
|
|
448
|
+
if parts:
|
|
449
|
+
# Iterate from the last part
|
|
450
|
+
for part in reversed(parts):
|
|
451
|
+
if isinstance(part, dict) and part.get("type") == "text":
|
|
452
|
+
# Return a modified message with only the text content
|
|
453
|
+
return {"role": "assistant", "content": part.get("content")}
|
|
454
|
+
return None
|
|
455
|
+
|
|
456
|
+
|
|
414
457
|
def check_pydantic_ai_agent_input_output(
|
|
415
458
|
span: ReadableSpan,
|
|
416
459
|
) -> Tuple[Optional[Any], Optional[Any]]:
|
|
417
|
-
input_val:
|
|
460
|
+
input_val: list = []
|
|
418
461
|
output_val: Optional[Any] = None
|
|
419
462
|
|
|
420
463
|
# Get normalized messages once
|
|
@@ -445,14 +488,21 @@ def check_pydantic_ai_agent_input_output(
|
|
|
445
488
|
if span.attributes.get("confident.span.type") == "agent":
|
|
446
489
|
output_val = span.attributes.get("final_result")
|
|
447
490
|
if not output_val and normalized:
|
|
448
|
-
|
|
449
|
-
|
|
491
|
+
output_val = _extract_non_thinking_part_of_last_message(
|
|
492
|
+
normalized[-1]
|
|
493
|
+
)
|
|
450
494
|
except Exception:
|
|
451
495
|
pass
|
|
452
496
|
|
|
497
|
+
system_instructions = []
|
|
498
|
+
system_instruction_raw = span.attributes.get("gen_ai.system_instructions")
|
|
499
|
+
if system_instruction_raw and isinstance(system_instruction_raw, str):
|
|
500
|
+
system_instructions = _flatten_system_instructions(
|
|
501
|
+
json.loads(system_instruction_raw)
|
|
502
|
+
)
|
|
503
|
+
|
|
453
504
|
input_val = _flatten_input(input_val)
|
|
454
|
-
|
|
455
|
-
return input_val, output_val
|
|
505
|
+
return system_instructions + input_val, output_val
|
|
456
506
|
|
|
457
507
|
|
|
458
508
|
def check_tool_output(span: ReadableSpan):
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from .context import current_trace_context
|
|
2
|
+
from .tracing import trace_manager
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@contextmanager
|
|
7
|
+
def trace():
|
|
8
|
+
current_trace = current_trace_context.get()
|
|
9
|
+
|
|
10
|
+
if not current_trace:
|
|
11
|
+
current_trace = trace_manager.start_new_trace()
|
|
12
|
+
current_trace_context.set(current_trace)
|
|
13
|
+
|
|
14
|
+
yield current_trace
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from typing import Optional, Dict, Any
|
|
2
|
+
import asyncio
|
|
3
|
+
from time import monotonic
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TraceTestingManager:
|
|
7
|
+
test_name: Optional[str] = None
|
|
8
|
+
test_dict: Optional[Dict[str, Any]] = None
|
|
9
|
+
|
|
10
|
+
async def wait_for_test_dict(
|
|
11
|
+
self, timeout: float = 10.0, poll_interval: float = 0.05
|
|
12
|
+
) -> Dict[str, Any]:
|
|
13
|
+
deadline = monotonic() + timeout
|
|
14
|
+
while self.test_dict is None and monotonic() < deadline:
|
|
15
|
+
await asyncio.sleep(poll_interval)
|
|
16
|
+
return self.test_dict or {}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
trace_testing_manager = TraceTestingManager()
|
deepeval/tracing/tracing.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import os
|
|
1
2
|
from typing import Any, Dict, List, Literal, Optional, Set, Union, Callable
|
|
2
3
|
from time import perf_counter
|
|
3
4
|
import threading
|
|
@@ -47,13 +48,12 @@ from deepeval.tracing.utils import (
|
|
|
47
48
|
tracing_enabled,
|
|
48
49
|
validate_environment,
|
|
49
50
|
validate_sampling_rate,
|
|
50
|
-
dump_body_to_json_file,
|
|
51
|
-
get_deepeval_trace_mode,
|
|
52
51
|
)
|
|
53
52
|
from deepeval.utils import dataclass_to_dict
|
|
54
53
|
from deepeval.tracing.context import current_span_context, current_trace_context
|
|
55
54
|
from deepeval.tracing.types import TestCaseMetricPair
|
|
56
55
|
from deepeval.tracing.api import PromptApi
|
|
56
|
+
from deepeval.tracing.trace_test_manager import trace_testing_manager
|
|
57
57
|
|
|
58
58
|
EVAL_DUMMY_SPAN_NAME = "evals_iterator"
|
|
59
59
|
|
|
@@ -183,13 +183,14 @@ class TraceManager:
|
|
|
183
183
|
if trace.status == TraceSpanStatus.IN_PROGRESS:
|
|
184
184
|
trace.status = TraceSpanStatus.SUCCESS
|
|
185
185
|
|
|
186
|
-
|
|
187
|
-
|
|
186
|
+
if trace_testing_manager.test_name:
|
|
187
|
+
# Trace testing mode is enabled
|
|
188
|
+
# Instead of posting the trace to the queue, it will be stored in this global variable
|
|
188
189
|
body = self.create_trace_api(trace).model_dump(
|
|
189
190
|
by_alias=True, exclude_none=True
|
|
190
191
|
)
|
|
191
|
-
|
|
192
|
-
#
|
|
192
|
+
trace_testing_manager.test_dict = make_json_serializable(body)
|
|
193
|
+
# Post the trace to the server before removing it
|
|
193
194
|
elif not self.evaluating:
|
|
194
195
|
self.post_trace(trace)
|
|
195
196
|
else:
|
deepeval/tracing/utils.py
CHANGED
|
@@ -1,13 +1,8 @@
|
|
|
1
1
|
import os
|
|
2
|
-
import inspect
|
|
3
|
-
import json
|
|
4
|
-
import sys
|
|
5
2
|
from datetime import datetime, timezone
|
|
6
3
|
from enum import Enum
|
|
7
4
|
from time import perf_counter
|
|
8
5
|
from collections import deque
|
|
9
|
-
from typing import Any, Dict, Optional
|
|
10
|
-
|
|
11
6
|
from deepeval.constants import CONFIDENT_TRACING_ENABLED
|
|
12
7
|
|
|
13
8
|
|
|
@@ -186,84 +181,5 @@ def perf_counter_to_datetime(perf_counter_value: float) -> datetime:
|
|
|
186
181
|
def replace_self_with_class_name(obj):
|
|
187
182
|
try:
|
|
188
183
|
return f"<{obj.__class__.__name__}>"
|
|
189
|
-
except
|
|
190
|
-
return "<self>"
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
def get_deepeval_trace_mode() -> Optional[str]:
|
|
194
|
-
deepeval_trace_mode = None
|
|
195
|
-
try:
|
|
196
|
-
args = sys.argv
|
|
197
|
-
for idx, arg in enumerate(args):
|
|
198
|
-
if isinstance(arg, str) and arg.startswith(
|
|
199
|
-
"--deepeval-trace-mode="
|
|
200
|
-
):
|
|
201
|
-
deepeval_trace_mode = (
|
|
202
|
-
arg.split("=", 1)[1].strip().strip('"').strip("'").lower()
|
|
203
|
-
)
|
|
204
|
-
break
|
|
205
|
-
if arg == "--deepeval-trace-mode" and idx + 1 < len(args):
|
|
206
|
-
deepeval_trace_mode = (
|
|
207
|
-
str(args[idx + 1]).strip().strip('"').strip("'").lower()
|
|
208
|
-
)
|
|
209
|
-
break
|
|
210
|
-
except Exception:
|
|
211
|
-
deepeval_trace_mode = None
|
|
212
|
-
|
|
213
|
-
return deepeval_trace_mode
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
def dump_body_to_json_file(
|
|
217
|
-
body: Dict[str, Any], file_path: Optional[str] = None
|
|
218
|
-
) -> str:
|
|
219
|
-
entry_file = None
|
|
220
|
-
try:
|
|
221
|
-
cmd0 = sys.argv[0] if sys.argv else None
|
|
222
|
-
if cmd0 and cmd0.endswith(".py"):
|
|
223
|
-
entry_file = cmd0
|
|
224
|
-
else:
|
|
225
|
-
for frame_info in reversed(inspect.stack()):
|
|
226
|
-
fp = frame_info.filename
|
|
227
|
-
if (
|
|
228
|
-
fp
|
|
229
|
-
and fp.endswith(".py")
|
|
230
|
-
and "deepeval/tracing" not in fp
|
|
231
|
-
and "site-packages" not in fp
|
|
232
|
-
):
|
|
233
|
-
entry_file = fp
|
|
234
|
-
break
|
|
235
|
-
except Exception:
|
|
236
|
-
entry_file = None
|
|
237
|
-
|
|
238
|
-
if not entry_file:
|
|
239
|
-
entry_file = "unknown.py"
|
|
240
|
-
|
|
241
|
-
abs_entry = os.path.abspath(entry_file)
|
|
242
|
-
dir_path = os.path.dirname(abs_entry)
|
|
243
|
-
|
|
244
|
-
file_arg = None
|
|
245
|
-
try:
|
|
246
|
-
for idx, arg in enumerate(sys.argv):
|
|
247
|
-
if isinstance(arg, str) and arg.startswith(
|
|
248
|
-
"--deepeval-trace-file-name="
|
|
249
|
-
):
|
|
250
|
-
file_arg = arg.split("=", 1)[1].strip().strip('"').strip("'")
|
|
251
|
-
break
|
|
252
|
-
if arg == "--deepeval-trace-file-name" and idx + 1 < len(sys.argv):
|
|
253
|
-
file_arg = str(sys.argv[idx + 1]).strip().strip('"').strip("'")
|
|
254
|
-
break
|
|
255
|
-
except Exception:
|
|
256
|
-
file_arg = None
|
|
257
|
-
|
|
258
|
-
if file_path:
|
|
259
|
-
dst_path = os.path.abspath(file_path)
|
|
260
|
-
elif file_arg:
|
|
261
|
-
dst_path = os.path.abspath(file_arg)
|
|
262
|
-
else:
|
|
263
|
-
base_name = os.path.splitext(os.path.basename(abs_entry))[0]
|
|
264
|
-
dst_path = os.path.join(dir_path, f"{base_name}.json")
|
|
265
|
-
|
|
266
|
-
actual_body = make_json_serializable(body)
|
|
267
|
-
with open(dst_path, "w", encoding="utf-8") as f:
|
|
268
|
-
json.dump(actual_body, f, ensure_ascii=False, indent=2, sort_keys=True)
|
|
269
|
-
return dst_path
|
|
184
|
+
except:
|
|
185
|
+
return f"<self>"
|
deepeval/utils.py
CHANGED
|
@@ -13,7 +13,7 @@ import math
|
|
|
13
13
|
|
|
14
14
|
from contextvars import ContextVar
|
|
15
15
|
from enum import Enum
|
|
16
|
-
from typing import Any, Optional,
|
|
16
|
+
from typing import Any, Dict, List, Optional, Protocol, Sequence, Union
|
|
17
17
|
from collections.abc import Iterable
|
|
18
18
|
from dataclasses import asdict, is_dataclass
|
|
19
19
|
from pydantic import BaseModel
|
|
@@ -28,6 +28,22 @@ from deepeval.config.utils import (
|
|
|
28
28
|
)
|
|
29
29
|
|
|
30
30
|
|
|
31
|
+
###############
|
|
32
|
+
# Local Types #
|
|
33
|
+
###############
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class TurnLike(Protocol):
|
|
37
|
+
order: int
|
|
38
|
+
role: str
|
|
39
|
+
content: str
|
|
40
|
+
user_id: Optional[str]
|
|
41
|
+
retrieval_context: Optional[Sequence[str]]
|
|
42
|
+
tools_called: Optional[Sequence[Any]]
|
|
43
|
+
additional_metadata: Optional[Dict[str, Any]]
|
|
44
|
+
comments: Optional[str]
|
|
45
|
+
|
|
46
|
+
|
|
31
47
|
def get_lcs(seq1, seq2):
|
|
32
48
|
m, n = len(seq1), len(seq2)
|
|
33
49
|
dp = [[0] * (n + 1) for _ in range(m + 1)]
|
|
@@ -419,6 +435,138 @@ def is_missing(s: Optional[str]) -> bool:
|
|
|
419
435
|
return s is None or (isinstance(s, str) and s.strip() == "")
|
|
420
436
|
|
|
421
437
|
|
|
438
|
+
def len_tiny() -> int:
|
|
439
|
+
value = get_settings().DEEPEVAL_MAXLEN_TINY
|
|
440
|
+
return value if (isinstance(value, int) and value > 0) else 40
|
|
441
|
+
|
|
442
|
+
|
|
443
|
+
def len_short() -> int:
|
|
444
|
+
value = get_settings().DEEPEVAL_MAXLEN_SHORT
|
|
445
|
+
return value if (isinstance(value, int) and value > 0) else 60
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def len_medium() -> int:
|
|
449
|
+
value = get_settings().DEEPEVAL_MAXLEN_MEDIUM
|
|
450
|
+
return value if (isinstance(value, int) and value > 0) else 120
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def len_long() -> int:
|
|
454
|
+
value = get_settings().DEEPEVAL_MAXLEN_LONG
|
|
455
|
+
return value if (isinstance(value, int) and value > 0) else 240
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def shorten(
|
|
459
|
+
text: Optional[object],
|
|
460
|
+
max_len: Optional[int] = None,
|
|
461
|
+
suffix: Optional[str] = None,
|
|
462
|
+
) -> str:
|
|
463
|
+
"""
|
|
464
|
+
Truncate text to max_len characters, appending `suffix` if truncated.
|
|
465
|
+
- Accepts None (returns "") or any object (converted with str()).
|
|
466
|
+
- Safe when max_len <= len(suffix).
|
|
467
|
+
"""
|
|
468
|
+
settings = get_settings()
|
|
469
|
+
|
|
470
|
+
if max_len is None:
|
|
471
|
+
max_len = (
|
|
472
|
+
settings.DEEPEVAL_SHORTEN_DEFAULT_MAXLEN
|
|
473
|
+
if settings.DEEPEVAL_SHORTEN_DEFAULT_MAXLEN is not None
|
|
474
|
+
else len_long()
|
|
475
|
+
)
|
|
476
|
+
if suffix is None:
|
|
477
|
+
suffix = (
|
|
478
|
+
settings.DEEPEVAL_SHORTEN_SUFFIX
|
|
479
|
+
if settings.DEEPEVAL_SHORTEN_SUFFIX is not None
|
|
480
|
+
else "..."
|
|
481
|
+
)
|
|
482
|
+
|
|
483
|
+
if text is None:
|
|
484
|
+
return ""
|
|
485
|
+
stext = str(text)
|
|
486
|
+
if max_len <= 0:
|
|
487
|
+
return ""
|
|
488
|
+
if len(stext) <= max_len:
|
|
489
|
+
return stext
|
|
490
|
+
cut = max_len - len(suffix)
|
|
491
|
+
if cut <= 0:
|
|
492
|
+
return suffix[:max_len]
|
|
493
|
+
return stext[:cut] + suffix
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def format_turn(
|
|
497
|
+
turn: TurnLike,
|
|
498
|
+
*,
|
|
499
|
+
content_length: Optional[int] = None,
|
|
500
|
+
max_context_items: Optional[int] = None,
|
|
501
|
+
context_length: Optional[int] = None,
|
|
502
|
+
meta_length: Optional[int] = None,
|
|
503
|
+
include_tools_in_header: bool = True,
|
|
504
|
+
include_order_role_in_header: bool = True,
|
|
505
|
+
) -> str:
|
|
506
|
+
"""
|
|
507
|
+
Build a multi-line, human-readable summary for a conversational turn.
|
|
508
|
+
Safe against missing fields and overly long content.
|
|
509
|
+
"""
|
|
510
|
+
if content_length is None:
|
|
511
|
+
content_length = len_long()
|
|
512
|
+
if max_context_items is None:
|
|
513
|
+
max_context_items = 2
|
|
514
|
+
if context_length is None:
|
|
515
|
+
context_length = len_medium()
|
|
516
|
+
if meta_length is None:
|
|
517
|
+
meta_length = len_medium()
|
|
518
|
+
|
|
519
|
+
tools = turn.tools_called or []
|
|
520
|
+
tool_names = ", ".join(getattr(tc, "name", str(tc)) for tc in tools)
|
|
521
|
+
content = shorten(turn.content, content_length)
|
|
522
|
+
|
|
523
|
+
lines = []
|
|
524
|
+
|
|
525
|
+
if include_order_role_in_header:
|
|
526
|
+
header = f"{turn.order:>2}. {turn.role:<9} {content}"
|
|
527
|
+
if include_tools_in_header and tool_names:
|
|
528
|
+
header += f" | tools: {tool_names}"
|
|
529
|
+
if turn.user_id:
|
|
530
|
+
header += f" | user: {shorten(turn.user_id, len_tiny())}"
|
|
531
|
+
lines.append(header)
|
|
532
|
+
indent = " "
|
|
533
|
+
else:
|
|
534
|
+
# No order or role prefix in this mode
|
|
535
|
+
# keep tools out of header as well.
|
|
536
|
+
first = content
|
|
537
|
+
if turn.user_id:
|
|
538
|
+
first += f" | user: {shorten(turn.user_id, len_tiny())}"
|
|
539
|
+
lines.append(first)
|
|
540
|
+
indent = " " # ctx and meta indent
|
|
541
|
+
|
|
542
|
+
rctx = list(turn.retrieval_context or [])
|
|
543
|
+
if rctx:
|
|
544
|
+
show = rctx[:max_context_items]
|
|
545
|
+
for i, item in enumerate(show):
|
|
546
|
+
lines.append(f"{indent}↳ ctx[{i}]: {shorten(item, context_length)}")
|
|
547
|
+
hidden = max(0, len(rctx) - len(show))
|
|
548
|
+
if hidden:
|
|
549
|
+
lines.append(f"{indent}↳ ctx: (+{hidden} more)")
|
|
550
|
+
|
|
551
|
+
if turn.comments:
|
|
552
|
+
lines.append(
|
|
553
|
+
f"{indent}↳ comment: {shorten(str(turn.comments), meta_length)}"
|
|
554
|
+
)
|
|
555
|
+
|
|
556
|
+
meta = turn.additional_metadata or {}
|
|
557
|
+
if isinstance(meta, dict):
|
|
558
|
+
for k in list(meta.keys())[:3]:
|
|
559
|
+
if k in {"user_id", "userId"}:
|
|
560
|
+
continue
|
|
561
|
+
v = meta.get(k)
|
|
562
|
+
if v is not None:
|
|
563
|
+
lines.append(
|
|
564
|
+
f"{indent}↳ meta.{k}: {shorten(str(v), meta_length)}"
|
|
565
|
+
)
|
|
566
|
+
|
|
567
|
+
return "\n".join(lines)
|
|
568
|
+
|
|
569
|
+
|
|
422
570
|
###############################################
|
|
423
571
|
# Source: https://github.com/tingofurro/summac
|
|
424
572
|
###############################################
|