deepeval 3.6.6__py3-none-any.whl → 3.6.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/errors.py +20 -2
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +811 -248
- deepeval/evaluate/types.py +1 -0
- deepeval/evaluate/utils.py +33 -119
- deepeval/integrations/crewai/__init__.py +7 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/tool.py +71 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/integrations/llama_index/__init__.py +0 -4
- deepeval/integrations/llama_index/handler.py +20 -21
- deepeval/integrations/pydantic_ai/instrumentator.py +125 -76
- deepeval/metrics/__init__.py +13 -0
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/base_metric.py +1 -0
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +39 -24
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/__init__.py +3 -0
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +84 -66
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/dag/schema.py +1 -1
- deepeval/metrics/dag/templates.py +2 -2
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/goal_accuracy/__init__.py +1 -0
- deepeval/metrics/goal_accuracy/goal_accuracy.py +349 -0
- deepeval/metrics/goal_accuracy/schema.py +17 -0
- deepeval/metrics/goal_accuracy/template.py +235 -0
- deepeval/metrics/hallucination/hallucination.py +20 -9
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +20 -2
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +29 -6
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +14 -2
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +38 -25
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/plan_adherence/__init__.py +1 -0
- deepeval/metrics/plan_adherence/plan_adherence.py +292 -0
- deepeval/metrics/plan_adherence/schema.py +11 -0
- deepeval/metrics/plan_adherence/template.py +170 -0
- deepeval/metrics/plan_quality/__init__.py +1 -0
- deepeval/metrics/plan_quality/plan_quality.py +292 -0
- deepeval/metrics/plan_quality/schema.py +11 -0
- deepeval/metrics/plan_quality/template.py +101 -0
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/step_efficiency/__init__.py +1 -0
- deepeval/metrics/step_efficiency/schema.py +11 -0
- deepeval/metrics/step_efficiency/step_efficiency.py +234 -0
- deepeval/metrics/step_efficiency/template.py +256 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +4 -0
- deepeval/metrics/tool_correctness/schema.py +6 -0
- deepeval/metrics/tool_correctness/template.py +88 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +233 -21
- deepeval/metrics/tool_use/__init__.py +1 -0
- deepeval/metrics/tool_use/schema.py +19 -0
- deepeval/metrics/tool_use/template.py +220 -0
- deepeval/metrics/tool_use/tool_use.py +458 -0
- deepeval/metrics/topic_adherence/__init__.py +1 -0
- deepeval/metrics/topic_adherence/schema.py +16 -0
- deepeval/metrics/topic_adherence/template.py +162 -0
- deepeval/metrics/topic_adherence/topic_adherence.py +355 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/embedding_models/azure_embedding_model.py +37 -36
- deepeval/models/embedding_models/local_embedding_model.py +30 -32
- deepeval/models/embedding_models/ollama_embedding_model.py +18 -20
- deepeval/models/embedding_models/openai_embedding_model.py +22 -31
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +85 -50
- deepeval/openai/patch.py +258 -167
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +205 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +245 -62
- deepeval/prompt/utils.py +186 -15
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +19 -15
- deepeval/test_case/api.py +131 -0
- deepeval/test_case/llm_test_case.py +6 -2
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +292 -206
- deepeval/tracing/__init__.py +2 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/otel/exporter.py +3 -4
- deepeval/tracing/otel/utils.py +24 -5
- deepeval/tracing/trace_context.py +89 -5
- deepeval/tracing/tracing.py +74 -3
- deepeval/tracing/types.py +20 -2
- deepeval/tracing/utils.py +8 -0
- deepeval/utils.py +21 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/RECORD +133 -103
- deepeval/integrations/llama_index/agent/patched.py +0 -68
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.8.dist-info}/entry_points.txt +0 -0
deepeval/metrics/conversational_g_eval/conversational_g_eval.py
CHANGED

@@ -1,7 +1,7 @@
 """A slightly modified tailored version of the LLM evaluated metric based on the GEval framework: https://arxiv.org/pdf/2303.16634.pdf"""

 from openai.types.chat.chat_completion import ChatCompletion
-from typing import Optional, List, Tuple, Union, Dict
+from typing import Optional, List, Tuple, Union, Dict, Type
 import math
 from deepeval.metrics import BaseConversationalMetric
 from deepeval.metrics.g_eval.utils import (
@@ -11,7 +11,6 @@ from deepeval.metrics.g_eval.utils import (
     format_rubrics,
 )
 from deepeval.test_case import (
-    Turn,
     TurnParams,
     ConversationalTestCase,
 )
@@ -28,7 +27,8 @@ from deepeval.metrics.utils import (
 )
 from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.indicator import metric_progress_indicator
-
+import deepeval.metrics.conversational_g_eval.schema as cgschema
+from deepeval.metrics.api import metric_data_manager


 class ConversationalGEval(BaseConversationalMetric):
@@ -44,6 +44,9 @@ class ConversationalGEval(BaseConversationalMetric):
         async_mode: bool = True,
         strict_mode: bool = False,
         verbose_mode: bool = False,
+        evaluation_template: Type[
+            ConversationalGEvalTemplate
+        ] = ConversationalGEvalTemplate,
         _include_g_eval_suffix: bool = True,
     ):
         if evaluation_params is not None and len(evaluation_params) == 0:
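The constructor now accepts an `evaluation_template` class (defaulting to `ConversationalGEvalTemplate`), so the prompts used for step generation and scoring can be swapped out. Below is a minimal sketch of how a caller might plug in a custom template; the method name and its keyword arguments are taken from the calls visible later in this diff, while the import path, the `@staticmethod` form, and the other constructor arguments are assumptions rather than verified API.

```python
# Hedged sketch: supplying a custom prompt template through the new
# `evaluation_template` parameter. Only the parameter itself comes from this
# diff; the import path and other arguments are assumptions.
from deepeval.metrics import ConversationalGEval
from deepeval.metrics.conversational_g_eval.template import (
    ConversationalGEvalTemplate,
)
from deepeval.test_case import TurnParams


class ShortStepsTemplate(ConversationalGEvalTemplate):
    @staticmethod
    def generate_evaluation_steps(criteria: str, parameters: str) -> str:
        # Keep the step-generation prompt deliberately terse.
        return (
            "Write at most three short evaluation steps for judging a "
            f"conversation on these parameters: {parameters}.\n"
            f"Criteria:\n{criteria}"
        )


metric = ConversationalGEval(
    name="Helpfulness",
    criteria="Judge whether the assistant's turns actually help the user.",
    evaluation_params=[TurnParams.CONTENT],
    evaluation_template=ShortStepsTemplate,  # new in 3.6.8
)
```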
@@ -85,6 +88,7 @@ class ConversationalGEval(BaseConversationalMetric):
         self.strict_mode = strict_mode
         self.async_mode = async_mode
         self.verbose_mode = verbose_mode
+        self.evaluation_template = evaluation_template
         self._include_g_eval_suffix = _include_g_eval_suffix

     def measure(
@@ -92,6 +96,7 @@ class ConversationalGEval(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self.evaluation_params, self
@@ -108,6 +113,7 @@ class ConversationalGEval(BaseConversationalMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -132,6 +138,10 @@ class ConversationalGEval(BaseConversationalMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -140,6 +150,7 @@ class ConversationalGEval(BaseConversationalMetric):
         test_case: ConversationalTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_conversational_test_case_params(
             test_case, self.evaluation_params, self
@@ -173,6 +184,10 @@ class ConversationalGEval(BaseConversationalMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )

             return self.score

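Both `measure()` and `a_measure()` gain a private `_log_metric_to_confident` flag (default `True`); when it is left on, the finished metric is handed to `metric_data_manager.post_metric_if_enabled` right after the verbose logs are built. A minimal sketch of the call site follows; the test-case contents and constructor arguments are illustrative, and only the keyword itself comes from the signatures above.

```python
# Hedged sketch of the new (private) logging flag on measure().
from deepeval.metrics import ConversationalGEval
from deepeval.test_case import ConversationalTestCase, Turn, TurnParams

metric = ConversationalGEval(
    name="Helpfulness",
    criteria="Judge whether the assistant's turns actually help the user.",
    evaluation_params=[TurnParams.CONTENT],
)

test_case = ConversationalTestCase(
    turns=[
        Turn(role="user", content="Can you reset my password?"),
        Turn(role="assistant", content="Sure, I have sent you a reset link."),
    ]
)

# Default behaviour: the result is also posted via post_metric_if_enabled.
metric.measure(test_case)

# Internal escape hatch (used by DAG nodes later in this diff) to skip the post.
metric.measure(test_case, _log_metric_to_confident=False)
```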
@@ -183,16 +198,20 @@ class ConversationalGEval(BaseConversationalMetric):
         g_eval_params_str = construct_conversational_g_eval_turn_params_string(
             self.evaluation_params
         )
-        prompt = ConversationalGEvalTemplate.generate_evaluation_steps(
+        prompt = self.evaluation_template.generate_evaluation_steps(
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = await self.model.a_generate(prompt, schema=Steps)
+            res, cost = await self.model.a_generate(
+                prompt, schema=cgschema.Steps
+            )
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = await self.model.a_generate(prompt, schema=Steps)
+                res: cgschema.Steps = await self.model.a_generate(
+                    prompt, schema=cgschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = await self.model.a_generate(prompt)
@@ -206,16 +225,18 @@ class ConversationalGEval(BaseConversationalMetric):
         g_eval_params_str = construct_conversational_g_eval_turn_params_string(
             self.evaluation_params
         )
-        prompt = ConversationalGEvalTemplate.generate_evaluation_steps(
+        prompt = self.evaluation_template.generate_evaluation_steps(
             criteria=self.criteria, parameters=g_eval_params_str
         )
         if self.using_native_model:
-            res, cost = self.model.generate(prompt, schema=Steps)
+            res, cost = self.model.generate(prompt, schema=cgschema.Steps)
             self.evaluation_cost += cost
             return res.steps
         else:
             try:
-                res: Steps = self.model.generate(prompt, schema=Steps)
+                res: cgschema.Steps = self.model.generate(
+                    prompt, schema=cgschema.Steps
+                )
                 return res.steps
             except TypeError:
                 res = self.model.generate(prompt)
@@ -233,7 +254,7 @@ class ConversationalGEval(BaseConversationalMetric):
         )
         if not self.strict_mode:
             rubric_str = format_rubrics(self.rubric) if self.rubric else None
-            prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+            prompt = self.evaluation_template.generate_evaluation_results(
                 evaluation_steps=self.number_evaluation_steps(),
                 test_case_content=test_case_content,
                 turns=[
@@ -244,7 +265,7 @@ class ConversationalGEval(BaseConversationalMetric):
                 rubric=rubric_str,
             )
         else:
-            prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+            prompt = self.evaluation_template.generate_evaluation_results(
                 evaluation_steps=self.number_evaluation_steps(),
                 test_case_content=test_case_content,
                 turns=[
@@ -270,21 +291,21 @@ class ConversationalGEval(BaseConversationalMetric):
                     score, res
                 )
                 return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
         except (
             AttributeError
         ):  # This catches the case where a_generate_raw_response doesn't exist.
             if self.using_native_model:
                 res, cost = await self.model.a_generate(
-                    prompt, schema=ReasonScore
+                    prompt, schema=cgschema.ReasonScore
                 )
                 self.evaluation_cost += cost
                 return res.score, res.reason
             else:
                 try:
-                    res: ReasonScore = await self.model.a_generate(
-                        prompt, schema=ReasonScore
+                    res: cgschema.ReasonScore = await self.model.a_generate(
+                        prompt, schema=cgschema.ReasonScore
                     )
                     return res.score, res.reason
                 except TypeError:
@@ -303,7 +324,7 @@ class ConversationalGEval(BaseConversationalMetric):
         )
         if not self.strict_mode:
             rubric_str = format_rubrics(self.rubric) if self.rubric else None
-            prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+            prompt = self.evaluation_template.generate_evaluation_results(
                 evaluation_steps=self.number_evaluation_steps(),
                 test_case_content=test_case_content,
                 turns=[
@@ -314,7 +335,7 @@ class ConversationalGEval(BaseConversationalMetric):
                 rubric=rubric_str,
             )
         else:
-            prompt = ConversationalGEvalTemplate.generate_evaluation_results(
+            prompt = self.evaluation_template.generate_evaluation_results(
                 evaluation_steps=self.number_evaluation_steps(),
                 test_case_content=test_case_content,
                 turns=[
@@ -340,18 +361,20 @@ class ConversationalGEval(BaseConversationalMetric):
                     score, res
                 )
                 return weighted_summed_score, reason
-            except:
+            except (KeyError, AttributeError, TypeError, ValueError):
                 return score, reason
         except AttributeError:
             # This catches the case where a_generate_raw_response doesn't exist.
             if self.using_native_model:
-                res, cost = self.model.generate(prompt, schema=ReasonScore)
+                res, cost = self.model.generate(
+                    prompt, schema=cgschema.ReasonScore
+                )
                 self.evaluation_cost += cost
                 return res.score, res.reason
             else:
                 try:
-                    res: ReasonScore = self.model.generate(
-                        prompt, schema=ReasonScore
+                    res: cgschema.ReasonScore = self.model.generate(
+                        prompt, schema=cgschema.ReasonScore
                     )
                     return res.score, res.reason
                 except TypeError:
@@ -362,49 +385,44 @@ class ConversationalGEval(BaseConversationalMetric):
     def generate_weighted_summed_score(
         self, raw_score: int, raw_response: ChatCompletion
     ) -> Union[int, float]:
-        try:
-            ...
-            weighted_summed_score = (
-                sum_of_weighted_scores / sum_linear_probability
-            )
-            return weighted_summed_score
-        except:
-            raise
+        generated_logprobs = raw_response.choices[0].logprobs.content
+        # First, locate the token that we care for logprobs, i.e., the token matching the score
+        score_logprobs = None
+        for token_logprobs in generated_logprobs:
+            if token_logprobs.token == str(raw_score):
+                score_logprobs = token_logprobs
+                break
+        # Then, calculate the score based on the logprobs
+        token_linear_probability: Dict[int, float] = {}
+        sum_linear_probability = 0
+        # Filter out tokens with <1% linear probability, i.e., logprobs < math.log(0.01)
+        min_logprob = math.log(0.01)
+        for token_logprob in score_logprobs.top_logprobs:
+            logprob = token_logprob.logprob
+
+            # Filter out low probability tokens
+            if logprob < min_logprob:
+                continue
+            # Filter out non-decimal token to prevent errors in later int(token) conversion
+            if not token_logprob.token.isdecimal():
+                continue
+
+            # Calculate the linear probability
+            linear_prob = math.exp(logprob)
+            token_score = int(token_logprob.token)
+            if token_linear_probability.get(token_score):
+                token_linear_probability[token_score] += linear_prob
+            else:
+                token_linear_probability[token_score] = linear_prob
+            sum_linear_probability += linear_prob
+
+        sum_of_weighted_scores = 0.0
+        for score, prob in token_linear_probability.items():
+            sum_of_weighted_scores += score * prob
+
+        # Scale the sum of linear probability to 1
+        weighted_summed_score = sum_of_weighted_scores / sum_linear_probability
+        return weighted_summed_score

     def number_evaluation_steps(self):
         evaluation_steps = """"""
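The rewritten `generate_weighted_summed_score` drops the old try/except wrapper and computes the probability-weighted score directly from the logprobs of the score token. The standalone sketch below reproduces that arithmetic on a hand-made set of top-logprobs so the weighting is easy to follow; the small stand-in class and the numbers are illustrative only.

```python
import math
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class TopLogprob:
    # Minimal stand-in for one entry of `score_logprobs.top_logprobs`.
    token: str
    logprob: float


def weighted_score(top_logprobs: List[TopLogprob]) -> float:
    """Same arithmetic as the new generate_weighted_summed_score body."""
    token_linear_probability: Dict[int, float] = {}
    sum_linear_probability = 0.0
    min_logprob = math.log(0.01)  # drop tokens below 1% probability
    for entry in top_logprobs:
        if entry.logprob < min_logprob or not entry.token.isdecimal():
            continue
        linear_prob = math.exp(entry.logprob)
        token_score = int(entry.token)
        token_linear_probability[token_score] = (
            token_linear_probability.get(token_score, 0.0) + linear_prob
        )
        sum_linear_probability += linear_prob
    weighted = sum(s * p for s, p in token_linear_probability.items())
    return weighted / sum_linear_probability


# The model put ~70% of its mass on "8" and ~25% on "7":
print(weighted_score([
    TopLogprob("8", math.log(0.70)),
    TopLogprob("7", math.log(0.25)),
    TopLogprob("ten", math.log(0.04)),  # non-decimal token: ignored
    TopLogprob("2", math.log(0.005)),   # below the 1% cut-off: ignored
]))  # ≈ (8*0.70 + 7*0.25) / 0.95 ≈ 7.74
```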
@@ -417,8 +435,8 @@ class ConversationalGEval(BaseConversationalMetric):
             self.success = False
         else:
             try:
-                self.score >= self.threshold
-            except:
+                self.success = self.score >= self.threshold
+            except TypeError:
                 self.success = False
         return self.success

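The `is_successful` fix assigns the comparison result (the old code evaluated `self.score >= self.threshold` and discarded it) and narrows the bare `except:` to `TypeError`, which is what Python raises when an earlier failure has left the score as `None`. A tiny illustration of that failure mode:

```python
# Why `except TypeError` is enough here: comparing an unset score to a float
# raises TypeError, and success then falls back to False.
score = None  # e.g. the evaluation errored out before scoring
threshold = 0.5
try:
    success = score >= threshold
except TypeError:
    success = False
print(success)  # False
```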
deepeval/metrics/dag/dag.py
CHANGED

@@ -18,6 +18,7 @@ from deepeval.metrics.dag.utils import (
     is_valid_dag_from_roots,
     extract_required_params,
 )
+from deepeval.metrics.api import metric_data_manager


 class DAGMetric(BaseMetric):
@@ -59,6 +60,7 @@ class DAGMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(
             test_case,
@@ -77,6 +79,7 @@ class DAGMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -89,6 +92,10 @@ class DAGMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )
             return self.score

     async def a_measure(
@@ -96,6 +103,7 @@ class DAGMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
         check_llm_test_case_params(
             test_case,
@@ -119,6 +127,10 @@ class DAGMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
            return self.score

     def is_successful(self) -> bool:
deepeval/metrics/dag/nodes.py
CHANGED

@@ -111,7 +111,9 @@ class VerdictNode(BaseNode):
             copied_g_eval = GEval(**g_eval_args)

             copied_g_eval.measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_g_eval)
@@ -124,7 +126,9 @@ class VerdictNode(BaseNode):
             copied_metric.verbose_mode = False

             copied_metric.measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)
@@ -174,7 +178,9 @@ class VerdictNode(BaseNode):
             copied_g_eval = GEval(**g_eval_args)

             await copied_g_eval.a_measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_g_eval)
@@ -188,7 +194,9 @@ class VerdictNode(BaseNode):
             copied_metric.verbose_mode = False

             await copied_metric.a_measure(
-                test_case=test_case, _show_indicator=False
+                test_case=test_case,
+                _show_indicator=False,
+                _log_metric_to_confident=False,
             )
             metric._verbose_steps.append(
                 construct_node_verbose_log(self, depth, copied_metric)
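`VerdictNode` now calls the copied child metrics with `_show_indicator=False` and `_log_metric_to_confident=False`, so only the parent `DAGMetric` drives the progress indicator and posts a single metric record. The sketch below illustrates that delegation pattern with a hypothetical composite metric; the class is not part of deepeval, and only the two keyword arguments are taken from the hunks above.

```python
# Hypothetical composite metric (not deepeval API) showing the same delegation
# pattern as VerdictNode: children run silently and skip their own Confident
# post so the parent reports exactly once.
from typing import List

from deepeval.test_case import LLMTestCase


class CompositeMetric:
    def __init__(self, children: List):
        # Any deepeval metrics whose measure() accepts these private flags.
        self.children = children
        self.score = None

    def measure(self, test_case: LLMTestCase) -> float:
        for child in self.children:
            # Same flags VerdictNode passes to its copied child metrics.
            child.measure(
                test_case,
                _show_indicator=False,
                _log_metric_to_confident=False,
            )
        # Aggregate however the parent sees fit; here, the weakest child wins.
        self.score = min(child.score for child in self.children)
        return self.score
```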
deepeval/metrics/dag/schema.py
CHANGED

deepeval/metrics/dag/templates.py
CHANGED

@@ -60,10 +60,10 @@ class BinaryJudgementTemplate:
 {text}

 **
-IMPORTANT: Please make sure to only return a json with two keys: `verdict` (
+IMPORTANT: Please make sure to only return a json with two keys: `verdict` (True or False), and the 'reason' key providing the reason. The verdict must be a boolean only, either True or False.
 Example JSON:
 {{
-"verdict":
+"verdict": True,
 "reason": "..."
 }}
 **
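The template tweak pins the `verdict` key to a literal boolean, which matters because the node's answer is parsed into a typed schema before the DAG branches on it. A minimal, generic sketch of that kind of parsing follows; the model class here is hypothetical, not deepeval's actual schema.

```python
# Hedged sketch: why the prompt insists on a boolean `verdict`. A typed schema
# like this one (hypothetical, not deepeval's own class) gives downstream
# branching a real bool instead of free-form text.
import json

from pydantic import BaseModel


class BinaryVerdict(BaseModel):
    verdict: bool
    reason: str


raw = '{"verdict": true, "reason": "The text names the correct capital."}'
parsed = BinaryVerdict(**json.loads(raw))
print(parsed.verdict, "-", parsed.reason)  # True - The text names the correct capital.
```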
deepeval/metrics/faithfulness/faithfulness.py
CHANGED

@@ -23,6 +23,7 @@ from deepeval.metrics.faithfulness.schema import (
     Truths,
     Claims,
 )
+from deepeval.metrics.api import metric_data_manager


 class FaithfulnessMetric(BaseMetric):
@@ -63,6 +64,7 @@ class FaithfulnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -78,6 +80,7 @@ class FaithfulnessMetric(BaseMetric):
                         test_case,
                         _show_indicator=False,
                         _in_component=_in_component,
+                        _log_metric_to_confident=_log_metric_to_confident,
                     )
                 )
             else:
@@ -96,6 +99,10 @@ class FaithfulnessMetric(BaseMetric):
                         f"Score: {self.score}\nReason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -104,6 +111,7 @@ class FaithfulnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:

         check_llm_test_case_params(test_case, self._required_params, self)
@@ -132,7 +140,10 @@ class FaithfulnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_reason(self) -> str:
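FaithfulnessMetric gets the same treatment on both the sync and async paths. A small sketch of the async path with the new private flag; the test-case fields follow the metric's usual input/actual_output/retrieval_context requirements, and everything except the keyword itself is illustrative.

```python
# Hedged sketch: the async path now also accepts the private flag. Test-case
# contents are illustrative; only `_log_metric_to_confident` comes from this diff.
import asyncio

from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase


async def main() -> None:
    metric = FaithfulnessMetric(threshold=0.7)
    test_case = LLMTestCase(
        input="What does the refund policy say?",
        actual_output="Refunds are issued within 30 days of purchase.",
        retrieval_context=["Our policy: refunds are issued within 30 days."],
    )
    score = await metric.a_measure(test_case, _log_metric_to_confident=False)
    print(score, metric.reason)


asyncio.run(main())
```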
deepeval/metrics/g_eval/g_eval.py
CHANGED

@@ -31,6 +31,7 @@ from deepeval.metrics.g_eval.utils import (
     number_evaluation_steps,
     get_score_range,
 )
+from deepeval.metrics.api import metric_data_manager
 from deepeval.config.settings import get_settings


@@ -74,6 +75,7 @@ class GEval(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
         check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -122,6 +124,10 @@ class GEval(BaseMetric):
                         f"Reason: {self.reason}",
                     ],
                 )
+                if _log_metric_to_confident:
+                    metric_data_manager.post_metric_if_enabled(
+                        self, test_case=test_case
+                    )

             return self.score

@@ -130,6 +136,7 @@ class GEval(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
         _additional_context: Optional[str] = None,
     ) -> float:
         check_llm_test_case_params(test_case, self.evaluation_params, self)
@@ -165,6 +172,10 @@ class GEval(BaseMetric):
                     f"Reason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score

     async def _a_generate_evaluation_steps(self) -> List[str]:
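GEval picks up the same `_log_metric_to_confident` plumbing; callers going through the normal entry points do not need to touch it. A short sketch of standard GEval usage for orientation; the constructor arguments shown are long-standing public API rather than new in this release, and the exact score depends on the configured judge model.

```python
# Standard GEval usage; nothing here is new API except that measure() now also
# forwards metric data to Confident unless the private flag disables it.
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

correctness = GEval(
    name="Correctness",
    criteria="Is the actual output factually consistent with the expected output?",
    evaluation_params=[
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
    ],
    threshold=0.6,
)

test_case = LLMTestCase(
    input="Who wrote The Old Man and the Sea?",
    actual_output="Ernest Hemingway.",
    expected_output="Ernest Hemingway.",
)

correctness.measure(test_case)
print(correctness.score, correctness.reason, correctness.is_successful())
```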
deepeval/metrics/goal_accuracy/__init__.py
ADDED

@@ -0,0 +1 @@
+from .goal_accuracy import GoalAccuracyMetric
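The new `goal_accuracy` package re-exports its metric from the package root, so the import below should resolve once 3.6.8 is installed; the metric's constructor parameters are not part of this extract, so none are shown.

```python
# Import path taken directly from the new __init__.py above; constructor
# arguments are not visible in this diff, so none are assumed here.
from deepeval.metrics.goal_accuracy import GoalAccuracyMetric

print(GoalAccuracyMetric.__name__)
```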