deepeval 3.6.3__py3-none-any.whl → 3.6.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/config/settings.py +13 -0
- deepeval/dataset/dataset.py +8 -2
- deepeval/evaluate/evaluate.py +8 -2
- deepeval/evaluate/execute.py +6 -11
- deepeval/evaluate/types.py +4 -1
- deepeval/evaluate/utils.py +46 -29
- deepeval/integrations/crewai/__init__.py +1 -2
- deepeval/integrations/crewai/handler.py +153 -81
- deepeval/integrations/crewai/wrapper.py +87 -0
- deepeval/integrations/pydantic_ai/instrumentator.py +48 -9
- deepeval/integrations/pydantic_ai/test_instrumentator.py +0 -0
- deepeval/metrics/faithfulness/faithfulness.py +8 -0
- deepeval/prompt/prompt.py +133 -86
- deepeval/test_run/__init__.py +2 -1
- deepeval/test_run/api.py +1 -0
- deepeval/test_run/test_run.py +85 -9
- deepeval/tracing/__init__.py +2 -0
- deepeval/tracing/otel/test_exporter.py +35 -0
- deepeval/tracing/trace_context.py +14 -0
- deepeval/tracing/tracing.py +7 -6
- deepeval/tracing/utils.py +2 -86
- deepeval/utils.py +149 -1
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/METADATA +1 -1
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/RECORD +28 -26
- deepeval/integrations/crewai/agent.py +0 -98
- deepeval/integrations/crewai/patch.py +0 -41
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/WHEEL +0 -0
- {deepeval-3.6.3.dist-info → deepeval-3.6.5.dist-info}/entry_points.txt +0 -0
deepeval/integrations/pydantic_ai/instrumentator.py
CHANGED

@@ -1,7 +1,19 @@
 import json
+import logging
 import os
 from typing import Literal, Optional, List
 
+from deepeval.config.settings import get_settings
+from deepeval.confident.api import get_confident_api_key
+from deepeval.prompt import Prompt
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
+
+
+logger = logging.getLogger(__name__)
+
+
 try:
     from pydantic_ai.models.instrumented import InstrumentationSettings
     from opentelemetry.sdk.trace import SpanProcessor, TracerProvider
@@ -11,7 +23,20 @@ try:
     )
 
     dependency_installed = True
-except:
+except ImportError as e:
+    if get_settings().DEEPEVAL_VERBOSE_MODE:
+        if isinstance(e, ModuleNotFoundError):
+            logger.warning(
+                "Optional tracing dependency not installed: %s",
+                e.name,
+                stacklevel=2,
+            )
+        else:
+            logger.warning(
+                "Optional tracing import failed: %s",
+                e,
+                stacklevel=2,
+            )
     dependency_installed = False
 
 
@@ -25,6 +50,10 @@ def is_dependency_installed():
 
 from deepeval.confident.api import get_confident_api_key
 from deepeval.prompt import Prompt
+from deepeval.tracing.otel.test_exporter import test_exporter
+from deepeval.tracing.context import current_trace_context
+from deepeval.tracing.types import Trace
+from deepeval.tracing.otel.utils import to_hex_string
 
 # OTLP_ENDPOINT = "http://127.0.0.1:4318/v1/traces"
 OTLP_ENDPOINT = "https://otel.confident-ai.com/v1/traces"
@@ -37,6 +66,12 @@ class SpanInterceptor(SpanProcessor):
 
     def on_start(self, span, parent_context):
 
+        # set trace uuid
+        _current_trace_context = current_trace_context.get()
+        if _current_trace_context and isinstance(_current_trace_context, Trace):
+            _otel_trace_id = span.get_span_context().trace_id
+            _current_trace_context.uuid = to_hex_string(_otel_trace_id, 32)
+
         # set trace attributes
         if self.settings.thread_id:
             span.set_attribute(
@@ -148,8 +183,9 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         confident_prompt: Optional[Prompt] = None,
         llm_metric_collection: Optional[str] = None,
         agent_metric_collection: Optional[str] = None,
-        tool_metric_collection_map: dict = {},
+        tool_metric_collection_map: Optional[dict] = None,
         trace_metric_collection: Optional[str] = None,
+        is_test_mode: Optional[bool] = False,
     ):
         is_dependency_installed()
@@ -162,7 +198,7 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         ]:
             self.environment = _environment
 
-        self.tool_metric_collection_map = tool_metric_collection_map
+        self.tool_metric_collection_map = tool_metric_collection_map or {}
         self.name = name
         self.thread_id = thread_id
         self.user_id = user_id
@@ -185,12 +221,15 @@ class ConfidentInstrumentationSettings(InstrumentationSettings):
         span_interceptor = SpanInterceptor(self)
         trace_provider.add_span_processor(span_interceptor)
 
-        trace_provider.add_span_processor(
-            BatchSpanProcessor(
-                OTLPSpanExporter(
-                    endpoint=OTLP_ENDPOINT,
-                    headers={"x-confident-api-key": api_key},
+        if is_test_mode:
+            trace_provider.add_span_processor(BatchSpanProcessor(test_exporter))
+        else:
+            trace_provider.add_span_processor(
+                BatchSpanProcessor(
+                    OTLPSpanExporter(
+                        endpoint=OTLP_ENDPOINT,
+                        headers={"x-confident-api-key": api_key},
+                    )
                 )
             )
-        )
         super().__init__(tracer_provider=trace_provider)
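In summary, 3.6.5 replaces the bare `except:` with a logged `ImportError` handler, fixes the mutable `dict` default on `tool_metric_collection_map`, stamps the current deepeval `Trace` with the OTel trace id on span start, and adds an `is_test_mode` flag that routes spans to the in-memory `test_exporter` instead of the Confident AI OTLP endpoint. A minimal usage sketch; the `Agent` wiring and model name are illustrative assumptions, not taken from this diff:

from pydantic_ai import Agent

from deepeval.integrations.pydantic_ai.instrumentator import (
    ConfidentInstrumentationSettings,
)

# With is_test_mode=True, spans are buffered by the in-memory
# test_exporter rather than exported to otel.confident-ai.com.
settings = ConfidentInstrumentationSettings(is_test_mode=True)
agent = Agent("openai:gpt-4o-mini", instrument=settings)  # model name is a placeholder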
deepeval/integrations/pydantic_ai/test_instrumentator.py
File without changes
deepeval/metrics/faithfulness/faithfulness.py
CHANGED

@@ -41,6 +41,7 @@ class FaithfulnessMetric(BaseMetric):
         strict_mode: bool = False,
         verbose_mode: bool = False,
         truths_extraction_limit: Optional[int] = None,
+        penalize_ambiguous_claims: bool = False,
         evaluation_template: Type[FaithfulnessTemplate] = FaithfulnessTemplate,
     ):
         self.threshold = 1 if strict_mode else threshold
@@ -51,6 +52,7 @@ class FaithfulnessMetric(BaseMetric):
         self.strict_mode = strict_mode
         self.verbose_mode = verbose_mode
         self.evaluation_template = evaluation_template
+        self.penalize_ambiguous_claims = penalize_ambiguous_claims
 
         self.truths_extraction_limit = truths_extraction_limit
         if self.truths_extraction_limit is not None:
@@ -329,6 +331,12 @@ class FaithfulnessMetric(BaseMetric):
             if verdict.verdict.strip().lower() != "no":
                 faithfulness_count += 1
 
+            if (
+                self.penalize_ambiguous_claims
+                and verdict.verdict.strip().lower() == "idk"
+            ):
+                faithfulness_count -= 1
+
         score = faithfulness_count / number_of_verdicts
         return 0 if self.strict_mode and score < self.threshold else score
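The scoring loop counts any verdict other than "no" as faithful, so ambiguous "idk" verdicts previously inflated the score; with `penalize_ambiguous_claims=True`, each "idk" is incremented and then decremented, leaving it score-neutral in the numerator while still counting in the denominator. A standalone sketch of the rule (not deepeval's code):

def faithfulness_score(verdicts, penalize_ambiguous_claims=False):
    count = 0
    for v in verdicts:
        v = v.strip().lower()
        if v != "no":
            count += 1  # "yes" and "idk" both count as faithful
        if penalize_ambiguous_claims and v == "idk":
            count -= 1  # net contribution of "idk" becomes 0
    return count / len(verdicts)

print(faithfulness_score(["yes", "idk", "no"]))        # 2/3 ~ 0.67
print(faithfulness_score(["yes", "idk", "no"], True))  # 1/3 ~ 0.33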
deepeval/prompt/prompt.py
CHANGED

@@ -8,6 +8,7 @@ import os
 from pydantic import BaseModel
 import asyncio
 import portalocker
+import threading
 
 from deepeval.prompt.api import (
     PromptHttpResponse,
@@ -20,15 +21,39 @@ from deepeval.prompt.api import (
 from deepeval.prompt.utils import interpolate_text
 from deepeval.confident.api import Api, Endpoints, HttpMethods
 from deepeval.constants import HIDDEN_DIR
-from deepeval.utils import (
-    get_or_create_event_loop,
-    get_or_create_general_event_loop,
-)
 
 CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-prompt-cache.json"
 VERSION_CACHE_KEY = "version"
 LABEL_CACHE_KEY = "label"
 
+# Global background event loop for polling
+_polling_loop: Optional[asyncio.AbstractEventLoop] = None
+_polling_thread: Optional[threading.Thread] = None
+_polling_loop_lock = threading.Lock()
+
+
+def _get_or_create_polling_loop() -> asyncio.AbstractEventLoop:
+    """Get or create a background event loop for polling that runs in a daemon thread."""
+    global _polling_loop, _polling_thread
+
+    with _polling_loop_lock:
+        if _polling_loop is None or not _polling_loop.is_running():
+
+            def run_loop():
+                global _polling_loop
+                _polling_loop = asyncio.new_event_loop()
+                asyncio.set_event_loop(_polling_loop)
+                _polling_loop.run_forever()
+
+            _polling_thread = threading.Thread(target=run_loop, daemon=True)
+            _polling_thread.start()
+
+            # Wait for loop to be ready
+            while _polling_loop is None:
+                time.sleep(0.01)
+
+    return _polling_loop
+
 
 class CustomEncoder(json.JSONEncoder):
     def default(self, obj):
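The helper above replaces the removed `get_or_create_event_loop` / `get_or_create_general_event_loop` utilities with a single process-wide loop owned by a daemon thread, so polling coroutines can be scheduled from synchronous code without blocking it. A self-contained sketch of that pattern:

import asyncio
import threading

# One daemon thread owns the event loop; synchronous callers submit
# coroutines to it and may optionally wait on the returned future.
loop = asyncio.new_event_loop()
threading.Thread(target=loop.run_forever, daemon=True).start()

async def poll_once():
    await asyncio.sleep(0.1)
    return "refreshed"

future = asyncio.run_coroutine_threadsafe(poll_once(), loop)
print(future.result(timeout=1))  # "refreshed"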
@@ -80,11 +105,22 @@ class Prompt:
         self._version = None
         self._polling_tasks: Dict[str, Dict[str, asyncio.Task]] = {}
         self._refresh_map: Dict[str, Dict[str, int]] = {}
+        self._lock = (
+            threading.Lock()
+        )  # Protect instance attributes from race conditions
         if template:
             self._type = PromptType.TEXT
         elif messages_template:
             self._type = PromptType.LIST
 
+    def __del__(self):
+        """Cleanup polling tasks when instance is destroyed"""
+        try:
+            self._stop_polling()
+        except Exception:
+            # Suppress exceptions during cleanup to avoid issues in interpreter shutdown
+            pass
+
     @property
     def version(self):
         if self._version is not None and self._version != "latest":
@@ -100,33 +136,37 @@ class Prompt:
         self._version = value
 
     def interpolate(self, **kwargs):
-        if self._type == PromptType.TEXT:
-            if self._text_template is None:
+        with self._lock:
+            prompt_type = self._type
+            text_template = self._text_template
+            messages_template = self._messages_template
+            interpolation_type = self._interpolation_type
+
+        if prompt_type == PromptType.TEXT:
+            if text_template is None:
                 raise TypeError(
                     "Unable to interpolate empty prompt template. Please pull a prompt from Confident AI or set template manually to continue."
                 )
 
-            return interpolate_text(
-                self._interpolation_type, self._text_template, **kwargs
-            )
+            return interpolate_text(interpolation_type, text_template, **kwargs)
 
-        elif self._type == PromptType.LIST:
-            if self._messages_template is None:
+        elif prompt_type == PromptType.LIST:
+            if messages_template is None:
                 raise TypeError(
                     "Unable to interpolate empty prompt template messages. Please pull a prompt from Confident AI or set template manually to continue."
                 )
 
             interpolated_messages = []
-            for message in self._messages_template:
+            for message in messages_template:
                 interpolated_content = interpolate_text(
-                    self._interpolation_type, message.content, **kwargs
+                    interpolation_type, message.content, **kwargs
                 )
                 interpolated_messages.append(
                     {"role": message.role, "content": interpolated_content}
                 )
             return interpolated_messages
         else:
-            raise ValueError(f"Unsupported prompt type: {self._type}")
+            raise ValueError(f"Unsupported prompt type: {prompt_type}")
 
     def _get_versions(self) -> List:
         if self.alias is None:
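`interpolate` now snapshots the shared attributes under the lock and then works on locals, so a concurrent poll can swap templates without the method seeing a half-updated mix, and the lock is held only for the copy. The pattern in isolation (a hypothetical class, not deepeval's):

import threading

class Renderer:
    def __init__(self):
        self._lock = threading.Lock()
        self._template = "Hello {name}"

    def render(self, **kwargs):
        with self._lock:
            template = self._template      # copy shared state quickly
        return template.format(**kwargs)   # slower work outside the lock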
@@ -272,15 +312,16 @@ class Prompt:
         if not cached_prompt:
             raise ValueError("Unable to fetch prompt and load from cache")
 
-        self.version = cached_prompt.version
-        self.label = cached_prompt.label
-        self._text_template = cached_prompt.template
-        self._messages_template = cached_prompt.messages_template
-        self._prompt_version_id = cached_prompt.prompt_version_id
-        self._type = PromptType(cached_prompt.type)
-        self._interpolation_type = PromptInterpolationType(
-            cached_prompt.interpolation_type
-        )
+        with self._lock:
+            self.version = cached_prompt.version
+            self.label = cached_prompt.label
+            self._text_template = cached_prompt.template
+            self._messages_template = cached_prompt.messages_template
+            self._prompt_version_id = cached_prompt.prompt_version_id
+            self._type = PromptType(cached_prompt.type)
+            self._interpolation_type = PromptInterpolationType(
+                cached_prompt.interpolation_type
+            )
 
         end_time = time.perf_counter()
         time_taken = format(end_time - start_time, ".2f")
@@ -300,7 +341,6 @@ class Prompt:
         ):
             should_write_on_first_fetch = False
             if refresh:
-                default_to_cache = True
                 # Check if we need to bootstrap the cache
                 cached_prompt = self._read_from_cache(
                     self.alias, version=version, label=label
@@ -316,12 +356,10 @@ class Prompt:
             )
 
             # Manage background prompt polling
-                loop.run_until_complete(
-                    self.create_polling_task(version, label, refresh)
+            if refresh:
+                loop = _get_or_create_polling_loop()
+                asyncio.run_coroutine_threadsafe(
+                    self.create_polling_task(version, label, refresh), loop
                 )
 
             if default_to_cache:
@@ -330,15 +368,20 @@ class Prompt:
                     self.alias, version=version, label=label
                 )
                 if cached_prompt:
-                    self.version = cached_prompt.version
-                    self.label = cached_prompt.label
-                    self._text_template = cached_prompt.template
-                    self._messages_template = cached_prompt.messages_template
-                    self._prompt_version_id = cached_prompt.prompt_version_id
-                    self._type = PromptType(cached_prompt.type)
-                    self._interpolation_type = PromptInterpolationType(
-                        cached_prompt.interpolation_type
-                    )
+                    with self._lock:
+                        self.version = cached_prompt.version
+                        self.label = cached_prompt.label
+                        self._text_template = cached_prompt.template
+                        self._messages_template = (
+                            cached_prompt.messages_template
+                        )
+                        self._prompt_version_id = (
+                            cached_prompt.prompt_version_id
+                        )
+                        self._type = PromptType(cached_prompt.type)
+                        self._interpolation_type = PromptInterpolationType(
+                            cached_prompt.interpolation_type
+                        )
                     return
             except:
                 pass
@@ -402,13 +445,14 @@ class Prompt:
                 return
             raise
 
-        self.version = response.version
-        self.label = response.label
-        self._text_template = response.text
-        self._messages_template = response.messages
-        self._prompt_version_id = response.id
-        self._type = response.type
-        self._interpolation_type = response.interpolation_type
+        with self._lock:
+            self.version = response.version
+            self.label = response.label
+            self._text_template = response.text
+            self._messages_template = response.messages
+            self._prompt_version_id = response.id
+            self._type = response.type
+            self._interpolation_type = response.interpolation_type
 
         end_time = time.perf_counter()
         time_taken = format(end_time - start_time, ".2f")
@@ -483,11 +527,7 @@ class Prompt:
         version: Optional[str],
         label: Optional[str],
         refresh: Optional[int] = 60,
-        default_to_cache: bool = True,
     ):
-        if version is None and label is None:
-            return
-
         # If polling task doesn't exist, start it
         CACHE_KEY = LABEL_CACHE_KEY if label else VERSION_CACHE_KEY
         cache_value = label if label else version
@@ -506,9 +546,7 @@ class Prompt:
             self._refresh_map[CACHE_KEY][cache_value] = refresh
             if not polling_task:
                 self._polling_tasks[CACHE_KEY][cache_value] = (
-                    asyncio.create_task(
-                        self.poll(version, label, default_to_cache)
-                    )
+                    asyncio.create_task(self.poll(version, label))
                 )
 
         # If invalid `refresh`, stop the task
@@ -524,24 +562,12 @@ class Prompt:
         self,
         version: Optional[str] = None,
         label: Optional[str] = None,
-        default_to_cache: bool = True,
     ):
+        CACHE_KEY = LABEL_CACHE_KEY if label else VERSION_CACHE_KEY
+        cache_value = label if label else version
+
         while True:
-            cached_prompt = self._read_from_cache(
-                self.alias, version=version, label=label
-            )
-            if cached_prompt:
-                self.version = cached_prompt.version
-                self.label = cached_prompt.label
-                self._text_template = cached_prompt.template
-                self._messages_template = cached_prompt.messages_template
-                self._prompt_version_id = cached_prompt.prompt_version_id
-                self._type = PromptType(cached_prompt.type)
-                self._interpolation_type = PromptInterpolationType(
-                    cached_prompt.interpolation_type
-                )
-                return
+            await asyncio.sleep(self._refresh_map[CACHE_KEY][cache_value])
 
             api = Api()
             try:
|
|
|
573
599
|
type=data["type"],
|
|
574
600
|
interpolation_type=data["interpolationType"],
|
|
575
601
|
)
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
602
|
+
|
|
603
|
+
# Update the cache with fresh data from server
|
|
604
|
+
self._write_to_cache(
|
|
605
|
+
cache_key=CACHE_KEY,
|
|
606
|
+
version=response.version,
|
|
607
|
+
label=response.label,
|
|
608
|
+
text_template=response.text,
|
|
609
|
+
messages_template=response.messages,
|
|
610
|
+
prompt_version_id=response.id,
|
|
611
|
+
type=response.type,
|
|
612
|
+
interpolation_type=response.interpolation_type,
|
|
613
|
+
)
|
|
614
|
+
|
|
615
|
+
# Update in-memory properties with fresh data (thread-safe)
|
|
616
|
+
with self._lock:
|
|
617
|
+
self.version = response.version
|
|
618
|
+
self.label = response.label
|
|
619
|
+
self._text_template = response.text
|
|
620
|
+
self._messages_template = response.messages
|
|
621
|
+
self._prompt_version_id = response.id
|
|
622
|
+
self._type = response.type
|
|
623
|
+
self._interpolation_type = response.interpolation_type
|
|
624
|
+
|
|
625
|
+
except Exception:
|
|
590
626
|
pass
|
|
591
627
|
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
628
|
+
def _stop_polling(self):
|
|
629
|
+
loop = _polling_loop
|
|
630
|
+
if not loop or not loop.is_running():
|
|
631
|
+
return
|
|
632
|
+
|
|
633
|
+
# Stop all polling tasks
|
|
634
|
+
for ck in list(self._polling_tasks.keys()):
|
|
635
|
+
for cv in list(self._polling_tasks[ck].keys()):
|
|
636
|
+
task = self._polling_tasks[ck][cv]
|
|
637
|
+
if task and not task.done():
|
|
638
|
+
loop.call_soon_threadsafe(task.cancel)
|
|
639
|
+
self._polling_tasks[ck].clear()
|
|
640
|
+
self._refresh_map[ck].clear()
|
|
641
|
+
return
|
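End to end: pulling a prompt with refresh enabled bootstraps from cache, schedules `create_polling_task` on the shared background loop, and `poll` sleeps for the configured interval before writing fresh data to both the cache and, under `self._lock`, the instance. Assuming `pull` exposes the `refresh` parameter shown in these hunks (not fully visible in this diff), usage would look roughly like:

from deepeval.prompt import Prompt

prompt = Prompt(alias="my-prompt")   # alias is a placeholder
prompt.pull(refresh=60)              # re-fetches in the background every 60s
print(prompt.interpolate(name="Ada"))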
deepeval/test_run/__init__.py
CHANGED

@@ -11,7 +11,7 @@ from .test_run import (
 )
 
 from .hooks import on_test_run_end, invoke_test_run_end_hook
-from .api import MetricData
+from .api import MetricData, TurnApi
 from .hyperparameters import log_hyperparameters
 
 
@@ -28,5 +28,6 @@ __all__ = [
     "on_test_run_end",
     "invoke_test_run_end_hook",
     "MetricData",
+    "TurnApi",
     "log_hyperparameters",
 ]
deepeval/test_run/api.py
CHANGED

@@ -99,6 +99,7 @@ class TurnApi(BaseModel):
     role: str
     content: str
     order: int
+    user_id: Optional[str] = Field(None, alias="userId")
     retrieval_context: Optional[list] = Field(None, alias="retrievalContext")
     tools_called: Optional[List[ToolCall]] = Field(None, alias="toolsCalled")
     additional_metadata: Optional[Dict] = Field(
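The new `user_id` field follows the same camelCase alias convention as the other `TurnApi` fields. A quick illustration of how such an alias behaves in pydantic (v2 API):

from typing import Optional
from pydantic import BaseModel, Field

class Turn(BaseModel):
    user_id: Optional[str] = Field(None, alias="userId")

t = Turn.model_validate({"userId": "u-123"})
print(t.user_id)                    # u-123
print(t.model_dump(by_alias=True))  # {'userId': 'u-123'}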
deepeval/test_run/test_run.py
CHANGED

@@ -2,9 +2,8 @@ from enum import Enum
 import os
 import json
 from pydantic import BaseModel, Field
-from typing import Any, Optional, List, Dict, Union
+from typing import Any, Optional, List, Dict, Union, Tuple
 import shutil
-import webbrowser
 import sys
 import datetime
 import portalocker
@@ -27,6 +26,9 @@ from deepeval.utils import (
     delete_file_if_exists,
     get_is_running_deepeval,
     open_browser,
+    shorten,
+    format_turn,
+    len_short,
 )
 from deepeval.test_run.cache import global_test_run_cache_manager
 from deepeval.constants import CONFIDENT_TEST_CASE_BATCH_SIZE, HIDDEN_DIR
@@ -546,7 +548,7 @@ class TestRunManager:
 
             if (
                 display == TestRunResultDisplay.PASSING
-                and test_case.success
+                and test_case.success is False
             ):
                 continue
             elif display == TestRunResultDisplay.FAILING and test_case.success:
@@ -618,7 +620,7 @@ class TestRunManager:
         ):
             if (
                 display == TestRunResultDisplay.PASSING
-                and conversational_test_case.success
+                and conversational_test_case.success is False
             ):
                 continue
             elif (
@@ -631,6 +633,65 @@ class TestRunManager:
             fail_count = 0
             conversational_test_case_name = conversational_test_case.name
 
+            if conversational_test_case.turns:
+                turns_table = Table(
+                    title=f"Conversation - {conversational_test_case_name}",
+                    show_header=True,
+                    header_style="bold",
+                )
+                turns_table.add_column("#", justify="right", width=3)
+                turns_table.add_column("Role", justify="left", width=10)
+
+                # subtract fixed widths + borders and padding.
+                # ~20 as a safe buffer
+                details_max_width = max(
+                    48, min(120, console.width - 3 - 10 - 20)
+                )
+                turns_table.add_column(
+                    "Details",
+                    justify="left",
+                    overflow="fold",
+                    max_width=details_max_width,
+                )
+
+                # truncate when too long
+                tools_max_width = min(60, max(24, console.width // 3))
+                turns_table.add_column(
+                    "Tools",
+                    justify="left",
+                    no_wrap=True,
+                    overflow="ellipsis",
+                    max_width=tools_max_width,
+                )
+
+                sorted_turns = sorted(
+                    conversational_test_case.turns, key=lambda t: t.order
+                )
+
+                for t in sorted_turns:
+                    tools = t.tools_called or []
+                    tool_names = ", ".join(tc.name for tc in tools)
+
+                    # omit order, role and tools since we show them in a separate columns.
+                    details = format_turn(
+                        t,
+                        include_tools_in_header=False,
+                        include_order_role_in_header=False,
+                    )
+
+                    turns_table.add_row(
+                        str(t.order),
+                        t.role,
+                        details,
+                        shorten(tool_names, len_short()),
+                    )
+
+                console.print(turns_table)
+            else:
+                console.print(
+                    f"[dim]No turns recorded for {conversational_test_case_name}.[/dim]"
+                )
+
             if conversational_test_case.metrics_data is not None:
                 for metric_data in conversational_test_case.metrics_data:
                     if metric_data.success:
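The new turns table leans on rich's per-column overflow controls: "Details" wraps long content (`overflow="fold"`) under a console-width-derived cap, while "Tools" stays on one line and truncates (`no_wrap=True, overflow="ellipsis"`). A stripped-down sketch with placeholder rows:

from rich.console import Console
from rich.table import Table

console = Console()
table = Table(title="Conversation - demo", show_header=True, header_style="bold")
table.add_column("#", justify="right", width=3)
table.add_column("Role", justify="left", width=10)
table.add_column("Details", justify="left", overflow="fold",
                 max_width=max(48, min(120, console.width - 33)))
table.add_column("Tools", justify="left", no_wrap=True, overflow="ellipsis",
                 max_width=min(60, max(24, console.width // 3)))
table.add_row("1", "user", "What's the weather in Paris today?", "")
table.add_row("2", "assistant", "Checking a weather tool for Paris.", "get_weather")
console.print(table)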
@@ -698,7 +759,7 @@ class TestRunManager:
         )
         print(table)
 
-    def post_test_run(self, test_run: TestRun) -> Optional[str]:
+    def post_test_run(self, test_run: TestRun) -> Optional[Tuple[str, str]]:
         if (
             len(test_run.test_cases) == 0
             and len(test_run.conversational_test_cases) == 0
@@ -752,6 +813,21 @@ class TestRunManager:
                 body=body,
             )
 
+            if not isinstance(data, dict) or "id" not in data:
+                # try to show helpful details
+                detail = None
+                if isinstance(data, dict):
+                    detail = (
+                        data.get("detail")
+                        or data.get("message")
+                        or data.get("error")
+                    )
+                # fall back to repr for visibility
+                raise RuntimeError(
+                    f"Confident API response missing 'id'. "
+                    f"detail={detail!r} raw={type(data).__name__}:{repr(data)[:500]}"
+                )
+
             res = TestRunHttpResponse(
                 id=data["id"],
             )
@@ -814,7 +890,7 @@ class TestRunManager:
             )
             self.save_final_test_run_link(link)
             open_browser(link)
-            return link
+            return link, res.id
 
     def save_test_run_locally(self):
         local_folder = os.getenv("DEEPEVAL_RESULTS_FOLDER")
@@ -841,7 +917,7 @@ class TestRunManager:
         runDuration: float,
         display_table: bool = True,
         display: Optional[TestRunResultDisplay] = TestRunResultDisplay.ALL,
-    ) -> Optional[str]:
+    ) -> Optional[Tuple[str, str]]:
         test_run = self.get_test_run()
         if test_run is None:
             print("Test Run is empty, please try again.")
@@ -868,8 +944,8 @@ class TestRunManager:
         test_run.sort_test_cases()
 
         if global_test_run_cache_manager.disable_write_cache is None:
-            global_test_run_cache_manager.disable_write_cache = (
-                get_is_running_deepeval()
+            global_test_run_cache_manager.disable_write_cache = not bool(
+                get_is_running_deepeval()
             )
 
         global_test_run_cache_manager.wrap_up_cached_test_run()
deepeval/tracing/__init__.py
CHANGED

@@ -4,6 +4,7 @@ from .context import (
     update_retriever_span,
     update_llm_span,
 )
+from .trace_context import trace
 from .types import BaseSpan, Trace
 from .tracing import observe, trace_manager
 from .offline_evals import evaluate_thread, evaluate_trace, evaluate_span
@@ -16,6 +17,7 @@ __all__ = [
     "BaseSpan",
     "Trace",
     "observe",
+    "trace",
     "trace_manager",
     "evaluate_thread",
     "evaluate_trace",