deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/_version.py +1 -1
- deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
- deepeval/cli/main.py +42 -0
- deepeval/confident/api.py +1 -0
- deepeval/config/settings.py +22 -4
- deepeval/constants.py +8 -1
- deepeval/dataset/dataset.py +2 -11
- deepeval/dataset/utils.py +1 -1
- deepeval/evaluate/evaluate.py +5 -1
- deepeval/evaluate/execute.py +97 -42
- deepeval/evaluate/utils.py +20 -116
- deepeval/integrations/crewai/__init__.py +6 -1
- deepeval/integrations/crewai/handler.py +1 -1
- deepeval/integrations/crewai/subs.py +51 -0
- deepeval/integrations/crewai/wrapper.py +45 -5
- deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
- deepeval/metrics/api.py +281 -0
- deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
- deepeval/metrics/bias/bias.py +12 -3
- deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
- deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
- deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
- deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
- deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
- deepeval/metrics/conversational_dag/nodes.py +12 -4
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
- deepeval/metrics/dag/dag.py +12 -0
- deepeval/metrics/dag/nodes.py +12 -4
- deepeval/metrics/faithfulness/faithfulness.py +12 -1
- deepeval/metrics/g_eval/g_eval.py +11 -0
- deepeval/metrics/hallucination/hallucination.py +12 -1
- deepeval/metrics/indicator.py +8 -2
- deepeval/metrics/json_correctness/json_correctness.py +12 -1
- deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
- deepeval/metrics/mcp/mcp_task_completion.py +13 -0
- deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
- deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
- deepeval/metrics/misuse/misuse.py +12 -1
- deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
- deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
- deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
- deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
- deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
- deepeval/metrics/non_advice/non_advice.py +12 -0
- deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
- deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
- deepeval/metrics/role_adherence/role_adherence.py +12 -0
- deepeval/metrics/role_violation/role_violation.py +12 -0
- deepeval/metrics/summarization/summarization.py +12 -1
- deepeval/metrics/task_completion/task_completion.py +3 -0
- deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
- deepeval/metrics/toxicity/toxicity.py +12 -0
- deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
- deepeval/models/llms/grok_model.py +1 -1
- deepeval/models/llms/openai_model.py +2 -0
- deepeval/openai/__init__.py +14 -32
- deepeval/openai/extractors.py +24 -34
- deepeval/openai/patch.py +256 -161
- deepeval/openai/types.py +20 -0
- deepeval/openai/utils.py +98 -56
- deepeval/prompt/__init__.py +19 -1
- deepeval/prompt/api.py +160 -0
- deepeval/prompt/prompt.py +244 -62
- deepeval/prompt/utils.py +144 -2
- deepeval/synthesizer/chunking/context_generator.py +209 -152
- deepeval/synthesizer/chunking/doc_chunker.py +46 -12
- deepeval/synthesizer/synthesizer.py +8 -5
- deepeval/test_case/api.py +131 -0
- deepeval/test_run/__init__.py +1 -0
- deepeval/test_run/hyperparameters.py +47 -8
- deepeval/test_run/test_run.py +104 -1
- deepeval/tracing/api.py +3 -1
- deepeval/tracing/message_types/__init__.py +10 -0
- deepeval/tracing/message_types/base.py +6 -0
- deepeval/tracing/message_types/messages.py +14 -0
- deepeval/tracing/message_types/tools.py +18 -0
- deepeval/tracing/otel/utils.py +1 -1
- deepeval/tracing/trace_context.py +73 -4
- deepeval/tracing/tracing.py +51 -3
- deepeval/tracing/types.py +16 -0
- deepeval/tracing/utils.py +8 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
- {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
deepeval/integrations/crewai/subs.py
ADDED
@@ -0,0 +1,51 @@
+from typing import List, Optional, Type, TypeVar
+from pydantic import PrivateAttr
+
+from deepeval.metrics.base_metric import BaseMetric
+
+try:
+    from crewai import Crew, Agent, LLM
+
+    is_crewai_installed = True
+except ImportError:
+    is_crewai_installed = False
+
+
+def is_crewai_installed():
+    if not is_crewai_installed:
+        raise ImportError(
+            "CrewAI is not installed. Please install it with `pip install crewai`."
+        )
+
+
+T = TypeVar("T")
+
+
+def create_deepeval_class(base_class: Type[T], class_name: str) -> Type[T]:
+    """Factory function to create DeepEval-enabled CrewAI classes"""
+
+    class DeepEvalClass(base_class):
+        _metric_collection: Optional[str] = PrivateAttr(default=None)
+        _metrics: Optional[List[BaseMetric]] = PrivateAttr(default=None)
+
+        def __init__(
+            self,
+            *args,
+            metrics: Optional[List[BaseMetric]] = None,
+            metric_collection: Optional[str] = None,
+            **kwargs
+        ):
+            is_crewai_installed()
+            super().__init__(*args, **kwargs)
+            self._metric_collection = metric_collection
+            self._metrics = metrics
+
+    DeepEvalClass.__name__ = class_name
+    DeepEvalClass.__qualname__ = class_name
+    return DeepEvalClass
+
+
+# Create the classes
+DeepEvalCrew = create_deepeval_class(Crew, "DeepEvalCrew")
+DeepEvalAgent = create_deepeval_class(Agent, "DeepEvalAgent")
+DeepEvalLLM = create_deepeval_class(LLM, "DeepEvalLLM")
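As an illustrative sketch only (not from the diff) of how these generated classes are meant to be used: the snippet below attaches a metric to an agent and a metric collection to a crew. The role/goal/backstory values, the empty task list, and the collection name are placeholders; the import path simply mirrors the new module's location, and since deepeval/integrations/crewai/__init__.py was also updated in this release, the package may re-export these names from there as well.

# Hedged usage sketch (placeholder values; assumes CrewAI is installed)
from deepeval.integrations.crewai.subs import DeepEvalAgent, DeepEvalCrew
from deepeval.metrics import AnswerRelevancyMetric

agent = DeepEvalAgent(
    role="Researcher",                      # regular CrewAI Agent kwargs pass through
    goal="Answer questions accurately",
    backstory="A careful research assistant",
    metrics=[AnswerRelevancyMetric()],      # extra kwarg handled by the DeepEval subclass
)
crew = DeepEvalCrew(
    agents=[agent],
    tasks=[],                               # placeholder; real CrewAI tasks go here
    metric_collection="my-collection",      # extra kwarg handled by the DeepEval subclass
)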
deepeval/integrations/crewai/wrapper.py
CHANGED
@@ -3,6 +3,7 @@ from crewai.crew import Crew
 from crewai.agent import Agent
 from functools import wraps
 from deepeval.tracing.tracing import Observer
+from typing import Any
 
 
 def wrap_crew_kickoff():
@@ -10,7 +11,13 @@ def wrap_crew_kickoff():
 
     @wraps(original_kickoff)
     def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_kickoff(self, *args, **kwargs)
 
         return result
@@ -23,7 +30,13 @@ def wrap_crew_kickoff_for_each():
 
     @wraps(original_kickoff_for_each)
     def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_for_each",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_kickoff_for_each(self, *args, **kwargs)
 
         return result
@@ -36,7 +49,13 @@ def wrap_crew_kickoff_async():
 
     @wraps(original_kickoff_async)
     async def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_async",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = await original_kickoff_async(self, *args, **kwargs)
 
         return result
@@ -49,7 +68,13 @@ def wrap_crew_kickoff_for_each_async():
 
     @wraps(original_kickoff_for_each_async)
     async def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="crew",
+            func_name="kickoff_for_each_async",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = await original_kickoff_for_each_async(
                 self, *args, **kwargs
             )
@@ -64,10 +89,13 @@ def wrap_llm_call():
 
     @wraps(original_llm_call)
     def wrapper(self, *args, **kwargs):
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
         with Observer(
             span_type="llm",
             func_name="call",
             observe_kwargs={"model": "temp_model"},
+            metric_collection=metric_collection,
+            metrics=metrics,
         ):
             result = original_llm_call(self, *args, **kwargs)
             return result
@@ -80,8 +108,20 @@ def wrap_agent_execute_task():
 
     @wraps(original_execute_task)
     def wrapper(self, *args, **kwargs):
-
+        metric_collection, metrics = _check_metrics_and_metric_collection(self)
+        with Observer(
+            span_type="agent",
+            func_name="execute_task",
+            metric_collection=metric_collection,
+            metrics=metrics,
+        ):
             result = original_execute_task(self, *args, **kwargs)
             return result
 
     Agent.execute_task = wrapper
+
+
+def _check_metrics_and_metric_collection(obj: Any):
+    metric_collection = getattr(obj, "_metric_collection", None)
+    metrics = getattr(obj, "_metrics", None)
+    return metric_collection, metrics
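The hunks above all follow the same wrap-and-observe pattern: read any metrics attached by the DeepEval subclasses, then run the original method inside an Observer span. Purely as a hedged, self-contained sketch of that pattern, using a stand-in class rather than CrewAI and a hypothetical wrap_run helper; the span_type="custom" value is an assumption on my part (the hunks themselves use "crew", "llm", and "agent"):

from functools import wraps
from deepeval.tracing.tracing import Observer


class StandInRunner:  # stand-in for a CrewAI class, not part of deepeval
    def run(self):
        return "done"


def wrap_run():
    original_run = StandInRunner.run

    @wraps(original_run)
    def wrapper(self, *args, **kwargs):
        # Same lookup as _check_metrics_and_metric_collection above.
        metric_collection = getattr(self, "_metric_collection", None)
        metrics = getattr(self, "_metrics", None)
        with Observer(
            span_type="custom",
            func_name="run",
            metric_collection=metric_collection,
            metrics=metrics,
        ):
            return original_run(self, *args, **kwargs)

    StandInRunner.run = wrapper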
deepeval/metrics/answer_relevancy/answer_relevancy.py
CHANGED
@@ -16,6 +16,7 @@ from deepeval.models import DeepEvalBaseLLM
 from deepeval.metrics.answer_relevancy.template import AnswerRelevancyTemplate
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.answer_relevancy.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class AnswerRelevancyMetric(BaseMetric):
@@ -50,8 +51,8 @@ class AnswerRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -65,6 +66,7 @@ class AnswerRelevancyMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -85,6 +87,10 @@ class AnswerRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
 
             return self.score
 
@@ -93,8 +99,8 @@ class AnswerRelevancyMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -121,7 +127,10 @@ class AnswerRelevancyMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str) -> str:
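Across the metric files in this release the pattern is identical: measure() and a_measure() gain a _log_metric_to_confident keyword (default True) and, after scoring, hand the result to metric_data_manager.post_metric_if_enabled. As a hedged illustration only, assuming a configured evaluation model/API key and using placeholder test-case values, a caller could skip the logging for a single measurement like this:

from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

metric = AnswerRelevancyMetric()
test_case = LLMTestCase(
    input="What is DeepEval?",
    actual_output="DeepEval is an open-source LLM evaluation framework.",
)
# Passing False skips metric_data_manager.post_metric_if_enabled(...) for this
# call; the default (True) keeps the new logging behavior.
metric.measure(test_case, _log_metric_to_confident=False)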
deepeval/metrics/api.py
ADDED
@@ -0,0 +1,281 @@
+from typing import Optional, Set, Any, Dict, List, Union
+import threading
+import asyncio
+import queue
+import atexit
+from time import perf_counter
+from enum import Enum
+from pydantic import Field
+from rich.console import Console
+
+from deepeval.confident.api import Api, HttpMethods, Endpoints, is_confident
+from deepeval.constants import (
+    CONFIDENT_METRIC_LOGGING_FLUSH,
+    CONFIDENT_METRIC_LOGGING_VERBOSE,
+)
+from deepeval.metrics.base_metric import BaseConversationalMetric, BaseMetric
+from deepeval.test_case.conversational_test_case import ConversationalTestCase
+from deepeval.test_case.llm_test_case import LLMTestCase
+from deepeval.test_case.api import create_api_test_case
+from deepeval.test_run.api import LLMApiTestCase, ConversationalApiTestCase
+from deepeval.tracing.api import MetricData
+from deepeval.config.settings import get_settings
+
+
+class MetricWorkerStatus(Enum):
+    SUCCESS = "success"
+    FAILURE = "failure"
+    WARNING = "warning"
+
+
+class ApiMetricData(MetricData):
+    llm_test_case: Optional[LLMApiTestCase] = Field(None, alias="llmTestCase")
+    conversational_test_case: Optional[ConversationalApiTestCase] = Field(
+        None, alias="conversationalTestCase"
+    )
+
+
+class MetricDataManager:
+    """Manager for posting metric data asynchronously in background thread."""
+
+    def __init__(self):
+        settings = get_settings()
+        # Initialize queue and worker thread for metric posting
+        self._metric_queue = queue.Queue()
+        self._worker_thread = None
+        self._min_interval = 0.2  # Minimum time between API calls (seconds)
+        self._last_post_time = 0
+        self._in_flight_tasks: Set[asyncio.Task[Any]] = set()
+        self._flush_enabled = bool(settings.CONFIDENT_METRIC_LOGGING_FLUSH)
+        self._daemon = not self._flush_enabled
+        self._thread_lock = threading.Lock()
+        self.metric_logging_enabled = bool(
+            settings.CONFIDENT_METRIC_LOGGING_ENABLED
+        )
+
+        # Register an exit handler to warn about unprocessed metrics
+        atexit.register(self._warn_on_exit)
+
+    def post_metric_if_enabled(
+        self,
+        metric: Union[BaseMetric, BaseConversationalMetric],
+        test_case: Optional[Union[LLMTestCase, ConversationalTestCase]] = None,
+    ):
+        """Post metric data asynchronously in a background thread."""
+        if not self.metric_logging_enabled or not is_confident():
+            return
+
+        from deepeval.evaluate.utils import create_metric_data
+
+        metric_data = create_metric_data(metric)
+        api_metric_data = ApiMetricData(
+            **metric_data.model_dump(by_alias=True, exclude_none=True)
+        )
+
+        if isinstance(test_case, LLMTestCase):
+            api_metric_data.llm_test_case = create_api_test_case(test_case)
+        elif isinstance(test_case, ConversationalTestCase):
+            api_metric_data.conversational_test_case = create_api_test_case(
+                test_case
+            )
+
+        self._ensure_worker_thread_running()
+        self._metric_queue.put(api_metric_data)
+
+    def _warn_on_exit(self):
+        """Warn if there are unprocessed metrics on exit."""
+        queue_size = self._metric_queue.qsize()
+        in_flight = len(self._in_flight_tasks)
+        remaining_tasks = queue_size + in_flight
+
+        if not self._flush_enabled and remaining_tasks > 0:
+            self._print_metric_data_status(
+                metric_worker_status=MetricWorkerStatus.WARNING,
+                message=f"Exiting with {queue_size + in_flight} abandoned metric(s).",
+                description=f"Set {CONFIDENT_METRIC_LOGGING_FLUSH}=1 as an environment variable to flush remaining metrics to Confident AI.",
+            )
+
+    def _ensure_worker_thread_running(self):
+        """Ensure the background worker thread is running."""
+        with self._thread_lock:
+            if (
+                self._worker_thread is None
+                or not self._worker_thread.is_alive()
+            ):
+                self._worker_thread = threading.Thread(
+                    target=self._process_metric_queue,
+                    daemon=self._daemon,
+                )
+                self._worker_thread.start()
+
+    def _print_metric_data_status(
+        self,
+        metric_worker_status: MetricWorkerStatus,
+        message: str,
+        description: Optional[str] = None,
+    ):
+        """Print metric data worker status messages."""
+        if getattr(get_settings(), CONFIDENT_METRIC_LOGGING_VERBOSE, False):
+            console = Console()
+            message_prefix = "[dim][Confident AI Metric Data Log][/dim]"
+            if metric_worker_status == MetricWorkerStatus.SUCCESS:
+                message = f"[green]{message}[/green]"
+            elif metric_worker_status == MetricWorkerStatus.FAILURE:
+                message = f"[red]{message}[/red]"
+            elif metric_worker_status == MetricWorkerStatus.WARNING:
+                message = f"[yellow]{message}[/yellow]"
+
+            if bool(CONFIDENT_METRIC_LOGGING_VERBOSE):
+                if description:
+                    message += f": {description}"
+
+            console.print(
+                message_prefix,
+                message,
+                f"\nTo disable dev logging, set {CONFIDENT_METRIC_LOGGING_VERBOSE}=0 as an environment variable.",
+            )
+
+    def _process_metric_queue(self):
+        """Worker thread function that processes the metric queue."""
+        import threading
+
+        main_thr = threading.main_thread()
+
+        # Create a new event loop
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+
+        # Buffer for payloads that need to be sent after main exits
+        remaining_metric_request_bodies: List[Dict[str, Any]] = []
+
+        async def _a_send_metric(metric_data: ApiMetricData):
+            nonlocal remaining_metric_request_bodies
+            try:
+                # Build API object & payload
+                try:
+                    body = metric_data.model_dump(
+                        by_alias=True,
+                        exclude_none=True,
+                    )
+                except AttributeError:
+                    # Pydantic version below 2.0
+                    body = metric_data.dict(by_alias=True, exclude_none=True)
+
+                # If the main thread is still alive, send now
+                if main_thr.is_alive():
+                    api = Api()
+                    _, _ = await api.a_send_request(
+                        method=HttpMethods.POST,
+                        endpoint=Endpoints.METRIC_DATA_ENDPOINT,
+                        body=body,
+                    )
+                    queue_size = self._metric_queue.qsize()
+                    in_flight = len(self._in_flight_tasks)
+                    status = f"({queue_size} metric{'s' if queue_size!=1 else ''} remaining in queue, {in_flight} in flight)"
+                    self._print_metric_data_status(
+                        metric_worker_status=MetricWorkerStatus.SUCCESS,
+                        message=f"Successfully posted metric data {status}",
+                    )
+                elif self._flush_enabled:
+                    # Main thread gone → to be flushed
+                    remaining_metric_request_bodies.append(body)
+
+            except Exception as e:
+                queue_size = self._metric_queue.qsize()
+                in_flight = len(self._in_flight_tasks)
+                status = f"({queue_size} metric{'s' if queue_size!=1 else ''} remaining in queue, {in_flight} in flight)"
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.FAILURE,
+                    message=f"Error posting metric data {status}",
+                    description=str(e),
+                )
+            finally:
+                task = asyncio.current_task()
+                if task:
+                    self._in_flight_tasks.discard(task)
+
+        async def async_worker():
+            # Continue while user code is running or work remains
+            while (
+                main_thr.is_alive()
+                or not self._metric_queue.empty()
+                or self._in_flight_tasks
+            ):
+                try:
+                    metric_data = self._metric_queue.get(
+                        block=True, timeout=1.0
+                    )
+
+                    # Rate-limit
+                    now = perf_counter()
+                    elapsed = now - self._last_post_time
+                    if elapsed < self._min_interval:
+                        await asyncio.sleep(self._min_interval - elapsed)
+                    self._last_post_time = perf_counter()
+
+                    # Schedule async send
+                    task = asyncio.create_task(_a_send_metric(metric_data))
+                    self._in_flight_tasks.add(task)
+                    self._metric_queue.task_done()
+
+                except queue.Empty:
+                    await asyncio.sleep(0.1)
+                    continue
+                except Exception as e:
+                    self._print_metric_data_status(
+                        message="Error in metric worker",
+                        metric_worker_status=MetricWorkerStatus.FAILURE,
+                        description=str(e),
+                    )
+                    await asyncio.sleep(1.0)
+
+        try:
+            loop.run_until_complete(async_worker())
+        finally:
+            # Drain any pending tasks
+            pending = asyncio.all_tasks(loop=loop)
+            if pending:
+                loop.run_until_complete(
+                    asyncio.gather(*pending, return_exceptions=True)
+                )
+            self._flush_metrics(remaining_metric_request_bodies)
+            loop.run_until_complete(loop.shutdown_asyncgens())
+            loop.close()
+
+    def _flush_metrics(
+        self, remaining_metric_request_bodies: List[Dict[str, Any]]
+    ):
+        """Flush remaining metrics synchronously."""
+        if not remaining_metric_request_bodies:
+            return
+
+        self._print_metric_data_status(
+            MetricWorkerStatus.WARNING,
+            message=f"Flushing {len(remaining_metric_request_bodies)} remaining metric(s)",
+        )
+
+        for body in remaining_metric_request_bodies:
+            try:
+                api = Api()
+                _, link = api.send_request(
+                    method=HttpMethods.POST,
+                    endpoint=Endpoints.METRIC_DATA_ENDPOINT,
+                    body=body,
+                )
+                qs = self._metric_queue.qsize()
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.SUCCESS,
+                    message=f"Successfully posted metric data ({qs} metrics remaining in queue, 1 in flight)",
+                    description=link,
+                )
+            except Exception as e:
+                qs = self._metric_queue.qsize()
+                self._print_metric_data_status(
+                    metric_worker_status=MetricWorkerStatus.FAILURE,
+                    message="Error flushing remaining metric(s)",
+                    description=str(e),
+                )
+
+
+# Global metric manager instance
+metric_data_manager = MetricDataManager()
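The manager above is essentially a lazily started background worker draining a thread-safe queue, with a minimum interval enforced between posts. Purely for orientation, here is a self-contained toy version of that shape with a print standing in for the HTTP call; the TinyPoster name and its details are illustrative only and not part of deepeval:

import queue
import threading
import time


class TinyPoster:
    def __init__(self, min_interval: float = 0.2):
        self._q: queue.Queue = queue.Queue()
        self._worker = None
        self._lock = threading.Lock()
        self._min_interval = min_interval
        self._last_post_time = 0.0

    def post(self, payload: dict) -> None:
        # Lazily start the worker, then enqueue (mirrors post_metric_if_enabled).
        with self._lock:
            if self._worker is None or not self._worker.is_alive():
                self._worker = threading.Thread(target=self._run, daemon=True)
                self._worker.start()
        self._q.put(payload)

    def _run(self) -> None:
        while True:
            payload = self._q.get()
            # Simple rate limit between sends, like _min_interval above.
            elapsed = time.perf_counter() - self._last_post_time
            if elapsed < self._min_interval:
                time.sleep(self._min_interval - elapsed)
            self._last_post_time = time.perf_counter()
            print("posted:", payload)  # stand-in for the real API request
            self._q.task_done()


poster = TinyPoster()
poster.post({"metric": "Answer Relevancy", "score": 0.9})
poster.post({"metric": "Bias", "score": 0.0})
poster._q.join()  # wait for the toy queue to drain before the script exits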
deepeval/metrics/argument_correctness/argument_correctness.py
CHANGED
@@ -19,6 +19,7 @@ from deepeval.metrics.argument_correctness.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.argument_correctness.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class ArgumentCorrectnessMetric(BaseMetric):
@@ -53,6 +54,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -68,6 +70,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -91,7 +94,10 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -99,6 +105,7 @@ class ArgumentCorrectnessMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -130,7 +137,10 @@ class ArgumentCorrectnessMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str) -> str:
deepeval/metrics/bias/bias.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import List, Optional, Type, Union
 
 from deepeval.metrics import BaseMetric
+from deepeval.metrics.api import metric_data_manager
 from deepeval.test_case import (
     LLMTestCase,
     LLMTestCaseParams,
@@ -48,8 +49,8 @@ class BiasMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -63,6 +64,7 @@ class BiasMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -81,7 +83,10 @@ class BiasMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -89,8 +94,8 @@ class BiasMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -116,6 +121,10 @@ class BiasMetric(BaseMetric):
                 ],
             )
 
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self) -> str:
deepeval/metrics/contextual_precision/contextual_precision.py
CHANGED
@@ -18,6 +18,7 @@ from deepeval.metrics.contextual_precision.template import (
 )
 from deepeval.metrics.indicator import metric_progress_indicator
 from deepeval.metrics.contextual_precision.schema import *
+from deepeval.metrics.api import metric_data_manager
 
 
 class ContextualPrecisionMetric(BaseMetric):
@@ -53,8 +54,8 @@ class ContextualPrecisionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
-
         check_llm_test_case_params(test_case, self._required_params, self)
 
         self.evaluation_cost = 0 if self.using_native_model else None
@@ -68,6 +69,7 @@ class ContextualPrecisionMetric(BaseMetric):
                     test_case,
                     _show_indicator=False,
                     _in_component=_in_component,
+                    _log_metric_to_confident=_log_metric_to_confident,
                 )
             )
         else:
@@ -88,7 +90,10 @@ class ContextualPrecisionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
             )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def a_measure(
@@ -96,6 +101,7 @@ class ContextualPrecisionMetric(BaseMetric):
         test_case: LLMTestCase,
         _show_indicator: bool = True,
         _in_component: bool = False,
+        _log_metric_to_confident: bool = True,
     ) -> float:
 
         check_llm_test_case_params(test_case, self._required_params, self)
@@ -124,7 +130,10 @@ class ContextualPrecisionMetric(BaseMetric):
                     f"Score: {self.score}\nReason: {self.reason}",
                 ],
            )
-
+            if _log_metric_to_confident:
+                metric_data_manager.post_metric_if_enabled(
+                    self, test_case=test_case
+                )
             return self.score
 
     async def _a_generate_reason(self, input: str):