deepeval 3.7.6__py3-none-any.whl → 3.7.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +2022 -759
- deepeval/cli/utils.py +208 -36
- deepeval/config/dotenv_handler.py +19 -0
- deepeval/config/settings.py +658 -262
- deepeval/config/utils.py +9 -1
- deepeval/dataset/test_run_tracer.py +4 -6
- deepeval/evaluate/execute.py +153 -94
- deepeval/integrations/pydantic_ai/instrumentator.py +4 -2
- deepeval/integrations/pydantic_ai/otel.py +5 -1
- deepeval/key_handler.py +121 -51
- deepeval/metrics/base_metric.py +9 -3
- deepeval/metrics/g_eval/g_eval.py +6 -1
- deepeval/metrics/indicator.py +8 -4
- deepeval/metrics/mcp/mcp_task_completion.py +15 -16
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +15 -15
- deepeval/metrics/mcp/schema.py +4 -0
- deepeval/metrics/mcp/template.py +8 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +6 -3
- deepeval/metrics/tool_use/schema.py +4 -0
- deepeval/metrics/tool_use/template.py +16 -2
- deepeval/metrics/tool_use/tool_use.py +30 -28
- deepeval/metrics/topic_adherence/schema.py +4 -0
- deepeval/metrics/topic_adherence/template.py +8 -1
- deepeval/metrics/topic_adherence/topic_adherence.py +15 -14
- deepeval/metrics/turn_contextual_precision/template.py +8 -1
- deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py +44 -86
- deepeval/metrics/turn_contextual_recall/template.py +8 -1
- deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py +44 -82
- deepeval/metrics/turn_contextual_relevancy/template.py +8 -1
- deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py +48 -92
- deepeval/metrics/turn_faithfulness/template.py +8 -1
- deepeval/metrics/turn_faithfulness/turn_faithfulness.py +76 -130
- deepeval/metrics/utils.py +16 -1
- deepeval/models/__init__.py +2 -0
- deepeval/models/llms/__init__.py +2 -0
- deepeval/models/llms/amazon_bedrock_model.py +5 -4
- deepeval/models/llms/anthropic_model.py +4 -3
- deepeval/models/llms/azure_model.py +4 -3
- deepeval/models/llms/deepseek_model.py +5 -8
- deepeval/models/llms/grok_model.py +5 -8
- deepeval/models/llms/kimi_model.py +5 -8
- deepeval/models/llms/litellm_model.py +2 -0
- deepeval/models/llms/local_model.py +1 -1
- deepeval/models/llms/openai_model.py +4 -3
- deepeval/models/retry_policy.py +10 -5
- deepeval/models/utils.py +1 -5
- deepeval/simulator/conversation_simulator.py +6 -2
- deepeval/simulator/template.py +3 -1
- deepeval/synthesizer/synthesizer.py +19 -17
- deepeval/test_run/test_run.py +6 -1
- deepeval/utils.py +26 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/METADATA +3 -3
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/RECORD +57 -56
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/WHEEL +0 -0
- {deepeval-3.7.6.dist-info → deepeval-3.7.8.dist-info}/entry_points.txt +0 -0
--- a/deepeval/metrics/topic_adherence/topic_adherence.py
+++ b/deepeval/metrics/topic_adherence/topic_adherence.py
@@ -18,6 +18,7 @@ from deepeval.metrics.topic_adherence.schema import (
     RelevancyVerdict,
     QAPairs,
     QAPair,
+    TopicAdherenceReason,
 )
 from deepeval.metrics.api import metric_data_manager
 
@@ -227,25 +228,25 @@ class TopicAdherenceMetric(BaseConversationalMetric):
         prompt = TopicAdherenceTemplate.generate_reason(
             self.success, self.score, self.threshold, TP, TN, FP, FN
         )
-
-
-
-
-
-
-
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TopicAdherenceReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(self, TP, TN, FP, FN):
         prompt = TopicAdherenceTemplate.generate_reason(
             self.success, self.score, self.threshold, TP, TN, FP, FN
         )
-
-
-
-
-
-
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=TopicAdherenceReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_score(self, TP, TN, FP, FN) -> float:
         true_values = TP[0] + TN[0]
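Nearly every hunk in this diff applies the same refactor: hand-rolled try/except blocks that parsed model output per metric are replaced by two shared helpers, generate_with_schema_and_extract and a_generate_with_schema_and_extract, imported from deepeval.metrics.utils. The helpers' bodies are not part of this diff; the sketch below is an assumption reconstructed from their call sites and from the removed try/except code visible in the hunks further down, not the actual deepeval source.

```python
# Hedged sketch only: reconstructed from the call sites and the removed
# try/except blocks in this diff; the real bodies in
# deepeval/metrics/utils.py may differ.
from typing import Any, Callable, Type

from pydantic import BaseModel

from deepeval.metrics.utils import trimAndLoadJson  # existing deepeval helper


def generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    try:
        # Models with native structured output return a schema_cls instance.
        res = metric.model.generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        # Fallback for models without schema support: generate free text,
        # trim/parse it as JSON, then pull out the wanted field.
        res = metric.model.generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)


async def a_generate_with_schema_and_extract(
    metric: Any,
    prompt: str,
    schema_cls: Type[BaseModel],
    extract_schema: Callable[[Any], Any],
    extract_json: Callable[[dict], Any],
) -> Any:
    try:
        res = await metric.model.a_generate(prompt, schema=schema_cls)
        return extract_schema(res)
    except TypeError:
        res = await metric.model.a_generate(prompt)
        data = trimAndLoadJson(res, metric)
        return extract_json(data)
```

The call pattern in the hunks follows from this shape: each metric supplies its schema class plus two extractors, one for the structured-output path and one for the raw-JSON fallback.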
--- a/deepeval/metrics/turn_contextual_precision/template.py
+++ b/deepeval/metrics/turn_contextual_precision/template.py
@@ -134,6 +134,13 @@ class TurnContextualPrecisionTemplate:
 Context:
 This metric evaluates conversational contextual precision by determining whether relevant nodes in retrieval context are ranked higher than irrelevant nodes for each interaction. Each interaction yields a reason indicating why relevant nodes were well-ranked or poorly-ranked. You are given all those reasons.
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <contextual_precision_score> because <your_reason>."
+}}
+
 Inputs:
 - final_score: the averaged score across all interactions.
 - success: whether the metric passed or failed
@@ -160,7 +167,7 @@ class TurnContextualPrecisionTemplate:
 
 Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-
+JSON:
 """
 )
 
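The doubled braces (`{{` and `}}`) in the added template text are Python brace escapes: these prompt templates are interpolated with str.format-style substitution, so doubled braces render as literal `{` and `}` in the final prompt while real placeholders are still filled in. A minimal illustration (the `final_score` placeholder is hypothetical, not taken from the diff):

```python
# Doubled braces survive .format() as literal braces, so the model sees a
# real JSON example; {final_score} is a hypothetical placeholder here.
template = (
    "IMPORTANT: Please make sure to only return in JSON format, "
    "with the 'reason' key providing the reason.\n"
    "Example JSON:\n"
    "{{\n"
    '    "reason": "The score is {final_score} because <your_reason>."\n'
    "}}\n"
)
print(template.format(final_score=0.7))
# IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
# Example JSON:
# {
#     "reason": "The score is 0.7 because <your_reason>."
# }
```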
--- a/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py
+++ b/deepeval/metrics/turn_contextual_precision/turn_contextual_precision.py
@@ -14,6 +14,8 @@ from deepeval.metrics.utils import (
     get_unit_interactions,
     get_turns_in_sliding_window,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_precision.template import (
@@ -279,26 +281,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-            res: Verdicts = await self.model.a_generate(
-                prompt, schema=Verdicts
-            )
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                ContextualPrecisionVerdict(**item)
-                for item in data["verdicts"]
-            ]
-            return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _generate_verdicts(
         self,
@@ -319,24 +308,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-            res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                ContextualPrecisionVerdict(**item)
-                for item in data["verdicts"]
-            ]
-            return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     async def _a_get_interaction_score_and_reason(
         self,
@@ -438,24 +416,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualPrecisionScoreReason = (
-                await self.model.a_generate(
-                    prompt, schema=ContextualPrecisionScoreReason
-                )
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(
         self,
@@ -485,22 +452,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualPrecisionScoreReason = self.model.generate(
-                prompt, schema=ContextualPrecisionScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, interaction_scores: List[InteractionContextualPrecisionScore]
@@ -533,13 +491,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionContextualPrecisionScore]
@@ -558,13 +516,13 @@ class TurnContextualPrecisionMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualPrecisionScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionContextualPrecisionScore]
--- a/deepeval/metrics/turn_contextual_recall/template.py
+++ b/deepeval/metrics/turn_contextual_recall/template.py
@@ -125,6 +125,13 @@ class TurnContextualRecallTemplate:
 Context:
 This metric evaluates conversational contextual recall by determining whether sentences in the assistant output can be attributed to the retrieval context for each interaction. Each interaction yields a reason indicating which sentences were supported or unsupported. You are given all those reasons.
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <contextual_recall_score> because <your_reason>."
+}}
+
 Inputs:
 - final_score: the averaged score across all interactions.
 - success: whether the metric passed or failed
@@ -151,7 +158,7 @@ class TurnContextualRecallTemplate:
 
 Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-
+JSON:
 """
 )
 
--- a/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py
+++ b/deepeval/metrics/turn_contextual_recall/turn_contextual_recall.py
@@ -14,6 +14,8 @@ from deepeval.metrics.utils import (
     get_unit_interactions,
     get_turns_in_sliding_window,
     initialize_model,
+    a_generate_with_schema_and_extract,
+    generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_recall.template import (
@@ -271,25 +273,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-            res: Verdicts = await self.model.a_generate(
-                prompt, schema=Verdicts
-            )
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                ContextualRecallVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     def _generate_verdicts(
         self,
@@ -308,23 +298,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-            res: Verdicts = self.model.generate(prompt, schema=Verdicts)
-            verdicts = [item for item in res.verdicts]
-            return verdicts
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts = [
-                ContextualRecallVerdict(**item) for item in data["verdicts"]
-            ]
-            return verdicts
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=Verdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
 
     async def _a_get_interaction_score_and_reason(
         self,
@@ -412,22 +392,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualRecallScoreReason = await self.model.a_generate(
-                prompt, schema=ContextualRecallScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(
         self,
@@ -456,22 +427,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualRecallScoreReason = self.model.generate(
-                prompt, schema=ContextualRecallScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, interaction_scores: List[InteractionContextualRecallScore]
@@ -504,13 +466,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionContextualRecallScore]
@@ -529,13 +491,13 @@ class TurnContextualRecallMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRecallScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionContextualRecallScore]
--- a/deepeval/metrics/turn_contextual_relevancy/template.py
+++ b/deepeval/metrics/turn_contextual_relevancy/template.py
@@ -130,6 +130,13 @@ class TurnContextualRelevancyTemplate:
 Context:
 This metric evaluates conversational contextual relevancy by determining whether statements in the retrieval context are relevant to the user message for each interaction. Each interaction yields a reason indicating which statements were relevant or irrelevant. You are given all those reasons.
 
+**
+IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
+Example JSON:
+{{
+    "reason": "The score is <contextual_relevancy_score> because <your_reason>."
+}}
+
 Inputs:
 - final_score: the averaged score across all interactions.
 - success: whether the metric passed or failed
@@ -156,6 +163,6 @@ class TurnContextualRelevancyTemplate:
 
 Now give me a final reason that explains why the metric passed or failed. Output ONLY the reason and nothing else.
 
-
+JSON:
 """
 )
--- a/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py
+++ b/deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py
@@ -14,6 +14,8 @@ from deepeval.metrics.utils import (
     get_unit_interactions,
     get_turns_in_sliding_window,
     initialize_model,
+    generate_with_schema_and_extract,
+    a_generate_with_schema_and_extract,
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.turn_contextual_relevancy.template import (
@@ -264,29 +266,15 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-
-
-                await self.model.a_generate(
-                    prompt, schema=ContextualRelevancyVerdicts
-                )
-            )
-            verdicts.extend([item for item in res.verdicts])
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts.extend(
-                [
-                    ContextualRelevancyVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-            )
+        result = await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyVerdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
+
+        verdicts.extend(result)
 
         return verdicts
 
@@ -306,27 +294,15 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-
-
-                prompt, schema=ContextualRelevancyVerdicts
-            )
-            verdicts.extend([item for item in res.verdicts])
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            verdicts.extend(
-                [
-                    ContextualRelevancyVerdict(**item)
-                    for item in data["verdicts"]
-                ]
-            )
+        result = generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyVerdicts,
+            extract_schema=lambda s: s.verdicts,
+            extract_json=lambda data: data["verdicts"],
+        )
+
+        verdicts.extend(result)
 
         return verdicts
 
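The remaining hunks continue in deepeval/metrics/turn_contextual_relevancy/turn_contextual_relevancy.py. One wrinkle specific to this metric: where precision and recall return the helper's result directly, the two verdict hunks above assign it to result and extend a running verdicts list, because relevancy collects verdicts across sliding windows before returning. A paraphrased outline of that accumulation (the loop and prompt-building helper are assumptions, not the actual source):

```python
def _generate_verdicts_outline(self, windows):
    # Paraphrased sketch, not deepeval's actual code: each sliding window
    # gets its own prompt, and the helper's extracted verdict list is
    # accumulated rather than returned on the first call.
    verdicts = []
    for window in windows:
        prompt = self._build_verdicts_prompt(window)  # hypothetical helper
        result = generate_with_schema_and_extract(
            metric=self,
            prompt=prompt,
            schema_cls=ContextualRelevancyVerdicts,
            extract_schema=lambda s: s.verdicts,
            extract_json=lambda data: data["verdicts"],
        )
        verdicts.extend(result)
    return verdicts
```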
@@ -419,24 +395,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualRelevancyScoreReason = (
-                await self.model.a_generate(
-                    prompt, schema=ContextualRelevancyScoreReason
-                )
-            )
-            return res.reason
-        except TypeError:
-            res = await self.model.a_generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_interaction_reason(
         self,
@@ -469,22 +434,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             multimodal=multimodal,
         )
 
-
-
-
-
-
-
-
-        try:
-            res: ContextualRelevancyScoreReason = self.model.generate(
-                prompt, schema=ContextualRelevancyScoreReason
-            )
-            return res.reason
-        except TypeError:
-            res = self.model.generate(prompt)
-            data = trimAndLoadJson(res, self)
-            return data["reason"]
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _get_verbose_steps(
         self, windows_scores: List[InteractionContextualRelevancyScore]
@@ -517,13 +473,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     async def _a_generate_reason(
         self, scores: List[InteractionContextualRelevancyScore]
@@ -542,13 +498,13 @@ class TurnContextualRelevancyMetric(BaseConversationalMetric):
             self.score, self.success, reasons
         )
 
-
-
-
-
-
-
-
+        return await a_generate_with_schema_and_extract(
+            metric=self,
+            prompt=prompt,
+            schema_cls=ContextualRelevancyScoreReason,
+            extract_schema=lambda s: s.reason,
+            extract_json=lambda data: data["reason"],
+        )
 
     def _calculate_score(
         self, scores: List[InteractionContextualRelevancyScore]