unique_toolkit 1.45.5__py3-none-any.whl → 1.45.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/agentic/evaluation/config.py +25 -6
- unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py +13 -0
- unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2} +11 -43
- unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 +15 -0
- unique_toolkit/agentic/evaluation/context_relevancy/service.py +24 -56
- unique_toolkit/agentic/evaluation/hallucination/constants.py +26 -15
- unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py +13 -0
- unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 +35 -0
- unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 +27 -0
- unique_toolkit/agentic/evaluation/hallucination/utils.py +153 -102
- unique_toolkit/agentic/evaluation/tests/fixtures.py +102 -0
- unique_toolkit/agentic/evaluation/tests/test_config.py +247 -0
- unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +141 -121
- unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py +600 -0
- unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py +1009 -0
- unique_toolkit/agentic/evaluation/tests/test_output_parser.py +82 -23
- unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py +348 -0
- unique_toolkit/agentic/evaluation/utils.py +8 -0
- {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.6.dist-info}/METADATA +4 -1
- {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.6.dist-info}/RECORD +22 -12
- unique_toolkit/agentic/evaluation/hallucination/prompts.py +0 -79
- {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.6.dist-info}/LICENSE +0 -0
- {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.6.dist-info}/WHEEL +0 -0
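The notable structural change in this release is that the hallucination and context-relevancy prompts now ship as Jinja2 `.j2` files inside `prompts/` packages instead of Python string constants (`prompts.py` is removed). As a rough orientation, the sketch below shows one way packaged `.j2` templates can be read and rendered; the `load_template` helper is hypothetical and not part of the toolkit, which instead feeds templates from `config.prompts_config` through `unique_toolkit._common.utils.jinja.render.render_template`, as the `utils.py` diff further down shows.

```python
# Hypothetical sketch: reading a bundled Jinja2 prompt template and rendering it.
# load_template is illustrative only; has_context mirrors the variable the new
# system prompt is rendered with in the diff below.
from importlib import resources

from jinja2 import Template


def load_template(package: str, filename: str) -> str:
    """Read a packaged Jinja2 template as text."""
    return resources.files(package).joinpath(filename).read_text(encoding="utf-8")


system_src = load_template(
    "unique_toolkit.agentic.evaluation.hallucination.prompts", "system_prompt.j2"
)
system_msg = Template(system_src).render(has_context=True)
print(system_msg[:200])
```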
unique_toolkit/agentic/evaluation/hallucination/utils.py

@@ -1,9 +1,13 @@
-import
-from
-from string import Template
+import re
+from logging import getLogger

+from unique_toolkit._common.utils.jinja.render import render_template
 from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
 from unique_toolkit.agentic.evaluation.exception import EvaluatorException
+from unique_toolkit.agentic.evaluation.hallucination.constants import (
+    SourceSelectionMode,
+    hallucination_required_input_fields,
+)
 from unique_toolkit.agentic.evaluation.output_parser import parse_eval_metric_result
 from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricInput,
@@ -20,19 +24,7 @@ from unique_toolkit.language_model.schemas import (
 )
 from unique_toolkit.language_model.service import LanguageModelService

-
-    SYSTEM_MSG_DEFAULT_KEY,
-    SYSTEM_MSG_KEY,
-    USER_MSG_DEFAULT_KEY,
-    USER_MSG_KEY,
-    hallucination_required_input_fields,
-)
-from .prompts import (
-    HALLUCINATION_METRIC_SYSTEM_MSG,
-    HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-    HALLUCINATION_METRIC_USER_MSG,
-    HALLUCINATION_METRIC_USER_MSG_DEFAULT,
-)
+_LOGGER = getLogger(__name__)


 async def check_hallucination(
@@ -72,15 +64,14 @@ async def check_hallucination(
         EvaluatorException: If the context texts are empty, required fields are missing, or an error occurs during the evaluation.
     """

-    logger = logging.getLogger(f"check_hallucination.{__name__}")
-
     model_name = config.language_model.name
-
+    _LOGGER.info(f"Analyzing level of hallucination with {model_name}.")

     input.validate_required_fields(hallucination_required_input_fields)

     try:
-        msgs = _get_msgs(input, config
+        msgs = _get_msgs(input, config)
+
         result = await LanguageModelService.complete_async_util(
             company_id=company_id, user_id=user_id, messages=msgs, model_name=model_name
         )
@@ -91,10 +82,12 @@ async def check_hallucination(
                 error_message=error_message,
                 user_message=error_message,
             )
-
+        result = parse_eval_metric_result(
             result_content,  # type: ignore
             EvaluationMetricName.HALLUCINATION,
         )
+
+        return result
     except Exception as e:
         error_message = "Error occurred during hallucination metric analysis"
         raise EvaluatorException(
@@ -107,131 +100,142 @@ async def check_hallucination(
 def _get_msgs(
     input: EvaluationMetricInput,
     config: EvaluationMetricConfig,
-    logger: logging.Logger,
 ):
     """
     Composes the messages for hallucination analysis based on the provided input and configuration.

-    This method
-    message texts in the
+    This method composes messages with or without context based on the availability of context texts
+    and history message texts in the input.

     Args:
         input (EvaluationMetricInput): The input data that includes context texts and history message texts
             for the analysis.
         config (EvaluationMetricConfig): The configuration settings for composing messages.
         logger (Optional[logging.Logger], optional): The logger used for logging debug information.
-            Defaults to the logger for the current module.

     Returns:
-        The composed messages as per the provided input and configuration.
-        depend on the implementation of the `compose_msgs` and `compose_msgs_default` methods.
-
+        The composed messages as per the provided input and configuration.
     """
-
-
-
+    has_context = bool(input.context_texts or input.history_messages)
+
+    if has_context:
+        _LOGGER.debug("Using context / history for hallucination evaluation.")
     else:
-
-
+        _LOGGER.debug("No contexts and history provided for hallucination evaluation.")
+
+    return _compose_msgs(input, config, has_context)


 def _compose_msgs(
     input: EvaluationMetricInput,
     config: EvaluationMetricConfig,
+    has_context: bool,
 ):
     """
-    Composes the hallucination analysis messages.
-    """
-    system_msg_content = _get_system_prompt_with_contexts(config)
-    system_msg = LanguageModelSystemMessage(content=system_msg_content)
-
-    user_msg_templ = Template(_get_user_prompt_with_contexts(config))
-    user_msg_content = user_msg_templ.substitute(
-        input_text=input.input_text,
-        contexts_text=input.get_joined_context_texts(tag_name="reference"),
-        history_messages_text=input.get_joined_history_texts(tag_name="conversation"),
-        output_text=input.output_text,
-    )
-    user_msg = LanguageModelUserMessage(content=user_msg_content)
-    return LanguageModelMessages([system_msg, user_msg])
+    Composes the hallucination analysis messages using Jinja2 templates.

+    Args:
+        input (EvaluationMetricInput): The input data for evaluation.
+        config (EvaluationMetricConfig): The configuration settings.
+        has_context (bool): Whether context/history is available.

-
-
-    config: EvaluationMetricConfig,
-):
-    """
-    Composes the hallucination analysis prompt without messages.
+    Returns:
+        LanguageModelMessages: The composed messages for evaluation.
     """
-
+    # Get templates
+    system_template = config.prompts_config.system_prompt_template
+    user_template = config.prompts_config.user_prompt_template
+
+    # Render system message
+    system_msg_content = render_template(
+        system_template,
+        has_context=has_context,
+    )
     system_msg = LanguageModelSystemMessage(content=system_msg_content)

-
-    user_msg_content =
+    # Render user message
+    user_msg_content = render_template(
+        user_template,
         input_text=input.input_text,
+        contexts_text=input.get_joined_context_texts(tag_name="reference")
+        if has_context
+        else None,
+        history_messages_text=input.get_joined_history_texts(tag_name="conversation")
+        if has_context
+        else None,
         output_text=input.output_text,
     )
     user_msg = LanguageModelUserMessage(content=user_msg_content)
-    return LanguageModelMessages([system_msg, user_msg])
-
-
-def _get_system_prompt_with_contexts(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        SYSTEM_MSG_KEY,
-        HALLUCINATION_METRIC_SYSTEM_MSG,
-    )
-
-
-def _get_user_prompt_with_contexts(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        USER_MSG_KEY,
-        HALLUCINATION_METRIC_USER_MSG,
-    )

+    return LanguageModelMessages([system_msg, user_msg])

-def _get_system_prompt_default(config: EvaluationMetricConfig):
-    return config.custom_prompts.setdefault(
-        SYSTEM_MSG_DEFAULT_KEY,
-        HALLUCINATION_METRIC_SYSTEM_MSG_DEFAULT,
-    )

+def context_text_from_stream_response(
+    response: LanguageModelStreamResponse,
+    selected_chunks: list[ContentChunk],
+    source_selection_mode: SourceSelectionMode = SourceSelectionMode.FROM_ORIGINAL_RESPONSE,
+    reference_pattern: str = r"[\[<]?source(\d+)[>\]]?",
+) -> list[str]:
+    """Extract context text from stream response based on selected chunks.

-
-
-
-
-
+    Args:
+        response: The language model stream response containing references.
+        selected_chunks: List of content chunks to select from.
+        source_selection_mode: Strategy for selecting referenced chunks.
+            - FROM_IDS: Match by chunk IDs (default)
+            - FROM_ORDER: Select by order of appearance
+            - FROM_ORIGINAL_RESPONSE: Extract from original response text using regex
+        ref_pattern: Regex pattern for extracting source numbers (only used with FROM_ORIGINAL_RESPONSE).

+    Returns:
+        List of text strings from the referenced chunks.

-
-
-
+    Raises:
+        ValueError: If source_selection_mode is invalid or required data is missing.
+    """
+    response_references = response.message.references

+    # Define selection strategies
+    strategies = {
+        SourceSelectionMode.FROM_IDS: lambda: _default_source_selection_mode(
+            response_references, selected_chunks
+        ),
+        SourceSelectionMode.FROM_ORDER: lambda: _from_order_source_selection_mode(
+            response_references, selected_chunks
+        ),
+        SourceSelectionMode.FROM_ORIGINAL_RESPONSE: lambda: _from_original_response_source_selection_mode(
+            response.message.original_text, selected_chunks, reference_pattern
+        ),
+    }

-
-
-    selected_chunks: list[ContentChunk],
-    source_selection_mode: SourceSelectionMode = SourceSelectionMode.FROM_IDS,
-):
-    response_references = response.message.references
-    match source_selection_mode:
-        case SourceSelectionMode.FROM_IDS:
-            referenced_chunks = _default_source_selection_mode(
-                response_references, selected_chunks
-            )
-        case SourceSelectionMode.FROM_ORDER:
-            referenced_chunks = _from_order_source_selection_mode(
-                response_references, selected_chunks
-            )
-        case _:
+    try:
+        if source_selection_mode not in strategies:
             raise ValueError(f"Invalid source selection mode: {source_selection_mode}")

+        _LOGGER.info(f"Selecting context text using {source_selection_mode} mode.")
+        referenced_chunks = strategies[source_selection_mode]()
+    except Exception as e:
+        _LOGGER.exception(f"Error selecting context text: {e}")
+        _LOGGER.info("Falling back to default source selection mode.")
+        referenced_chunks = _default_source_selection_mode(
+            response_references, selected_chunks
+        )
+
     return [chunk.text for chunk in referenced_chunks]


 def _default_source_selection_mode(
     references: list[ContentReference], selected_chunks: list[ContentChunk]
-):
+) -> list[ContentChunk]:
+    """Select chunks by matching reference IDs.
+
+    Args:
+        references: List of content references with source IDs.
+        selected_chunks: List of content chunks to select from.
+
+    Returns:
+        List of referenced content chunks.
+    """
     reference_ids = {reference.source_id for reference in references}

     def build_chunk_id(chunk: ContentChunk) -> str:
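The hunk above also replaces the old match/case dispatch with a dictionary of selection strategies and a try/except fallback to the ID-based default. The following standalone sketch shows that pattern with simplified stand-ins (a toy `Mode` enum and plain string chunks) rather than the toolkit's `SourceSelectionMode` and `ContentChunk` types; the strategy bodies are placeholders.

```python
# Standalone sketch of dict-based strategy dispatch with a fallback, mirroring
# the shape of context_text_from_stream_response. Types and strategy bodies are
# illustrative stand-ins, not the toolkit's implementation.
from enum import Enum


class Mode(Enum):
    FROM_IDS = "from_ids"
    FROM_ORDER = "from_order"
    FROM_ORIGINAL_RESPONSE = "from_original_response"


def select_chunks(mode: Mode, chunks: list[str]) -> list[str]:
    strategies = {
        Mode.FROM_IDS: lambda: chunks,                    # stand-in for ID-based selection
        Mode.FROM_ORDER: lambda: sorted(chunks),          # stand-in for order-based selection
        Mode.FROM_ORIGINAL_RESPONSE: lambda: chunks[:1],  # stand-in for regex-based selection
    }
    try:
        if mode not in strategies:
            raise ValueError(f"Invalid source selection mode: {mode}")
        return strategies[mode]()
    except Exception:
        # Fall back to the default strategy instead of failing the evaluation.
        return strategies[Mode.FROM_IDS]()


print(select_chunks(Mode.FROM_ORDER, ["b", "a"]))  # ['a', 'b']
```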
@@ -246,7 +250,16 @@ def _default_source_selection_mode(

 def _from_order_source_selection_mode(
     references: list[ContentReference], selected_chunks: list[ContentChunk]
-):
+) -> list[ContentChunk]:
+    """Select chunks by order of appearance in references.
+
+    Args:
+        references: List of content references with original indices.
+        selected_chunks: List of content chunks to select from.
+
+    Returns:
+        List of referenced content chunks in order of appearance.
+    """
     original_chunks_order: list[int] = []
     for reference in references:
         for original_index in reference.original_index:
@@ -258,3 +271,41 @@ def _from_order_source_selection_mode(
             referenced_chunks.append(selected_chunks[index])

     return referenced_chunks
+
+
+def _from_original_response_source_selection_mode(
+    original_text: str | None,
+    selected_chunks: list[ContentChunk],
+    reference_pattern: str,
+) -> list[ContentChunk]:
+    """Extract referenced chunks from original text using regex pattern.
+
+    Args:
+        original_text: The original response text containing source references.
+        selected_chunks: List of content chunks to select from.
+        ref_pattern: Regex pattern for extracting source numbers.
+
+    Returns:
+        List of referenced content chunks.
+    """
+    if original_text is None:
+        raise ValueError("original_text is required for FROM_ORIGINAL_RESPONSE mode")
+    _LOGGER.debug("Processing original text for source extraction")
+    source_number_matches = re.findall(reference_pattern, original_text)
+
+    # Remove duplicates and preserve order
+    source_numbers = list(dict.fromkeys(int(num) for num in source_number_matches))
+
+    # Add bounds checking
+    max_index = len(selected_chunks) - 1
+    valid_source_numbers = [idx for idx in source_numbers if 0 <= idx <= max_index]
+
+    if len(valid_source_numbers) < len(source_numbers):
+        invalid_numbers = set(source_numbers) - set(valid_source_numbers)
+        _LOGGER.warning(
+            f"Some source indices were out of bounds (max index: {max_index}). "
+            f"Valid indices: {sorted(valid_source_numbers)}, Invalid indices: {sorted(invalid_numbers)}"
+        )
+
+    referenced_chunks = [selected_chunks[idx] for idx in valid_source_numbers]
+    return referenced_chunks
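The default `reference_pattern` added above tolerates bare, bracketed, and angled source markers. A quick check of how the regex and the `dict.fromkeys` de-duplication behave (the sample text is chosen for illustration only):

```python
# Illustrative check of the reference_pattern default and order-preserving
# de-duplication used by _from_original_response_source_selection_mode.
import re

reference_pattern = r"[\[<]?source(\d+)[>\]]?"
original_text = "See [source2] and <source0>, also source2 again."

matches = re.findall(reference_pattern, original_text)          # ['2', '0', '2']
source_numbers = list(dict.fromkeys(int(num) for num in matches))
print(source_numbers)  # [2, 0]
```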
unique_toolkit/agentic/evaluation/tests/fixtures.py (new file)

@@ -0,0 +1,102 @@
+"""Centralized fixtures for evaluation tests."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
+from unique_toolkit.agentic.evaluation.context_relevancy.service import (
+    ContextRelevancyEvaluator,
+)
+from unique_toolkit.agentic.evaluation.schemas import (
+    EvaluationMetricInput,
+    EvaluationMetricName,
+)
+from unique_toolkit.app.schemas import ChatEvent
+from unique_toolkit.chat.service import LanguageModelName
+from unique_toolkit.language_model.infos import LanguageModelInfo
+
+
+@pytest.fixture
+def base_chat_event() -> MagicMock:
+    """
+    Create a base chat event mock for evaluation tests.
+
+    Returns:
+        MagicMock configured with standard test event properties.
+    """
+    event = MagicMock(spec=ChatEvent)
+    event.payload = MagicMock()
+    event.payload.user_message = MagicMock()
+    event.payload.user_message.text = "Test query"
+    event.user_id = "user_0"
+    event.company_id = "company_0"
+    return event
+
+
+@pytest.fixture
+def context_relevancy_evaluator(
+    base_chat_event: MagicMock,
+) -> ContextRelevancyEvaluator:
+    """
+    Create a ContextRelevancyEvaluator instance with base event.
+
+    Args:
+        base_chat_event: Mock chat event fixture.
+
+    Returns:
+        Configured ContextRelevancyEvaluator instance.
+    """
+    return ContextRelevancyEvaluator(base_chat_event)
+
+
+@pytest.fixture
+def basic_evaluation_config() -> EvaluationMetricConfig:
+    """
+    Create a basic evaluation config for context relevancy tests.
+
+    Returns:
+        EvaluationMetricConfig with standard settings.
+    """
+    return EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        language_model=LanguageModelInfo.from_name(
+            LanguageModelName.AZURE_GPT_4o_2024_0806
+        ),
+    )
+
+
+@pytest.fixture
+def structured_evaluation_config(
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> EvaluationMetricConfig:
+    """
+    Create evaluation config with structured output enabled.
+
+    Args:
+        basic_evaluation_config: Base config fixture.
+
+    Returns:
+        EvaluationMetricConfig configured for structured output.
+    """
+    model_info = LanguageModelInfo.from_name(LanguageModelName.AZURE_GPT_4o_2024_0806)
+    return EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        language_model=model_info,
+    )
+
+
+@pytest.fixture
+def sample_evaluation_input() -> EvaluationMetricInput:
+    """
+    Create sample evaluation input with test data.
+
+    Returns:
+        EvaluationMetricInput with test query and contexts.
+    """
+    return EvaluationMetricInput(
+        input_text="test query",
+        context_texts=["test context 1", "test context 2"],
+    )
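These shared fixtures are injected into tests by parameter name. A hypothetical consumer, shown only to illustrate the intended usage (this test is not part of the package's suite):

```python
# Hypothetical test consuming the shared fixtures above; pytest injects them by
# argument name. The assertions simply restate values the fixtures set.
from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricInput


def test_sample_input_and_basic_config(
    basic_evaluation_config: EvaluationMetricConfig,
    sample_evaluation_input: EvaluationMetricInput,
) -> None:
    assert basic_evaluation_config.enabled is True
    assert len(sample_evaluation_input.context_texts) == 2
```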
unique_toolkit/agentic/evaluation/tests/test_config.py (new file)

@@ -0,0 +1,247 @@
+"""Tests for evaluation config module."""
+
+import pytest
+
+from unique_toolkit.agentic.evaluation.config import (
+    EvaluationMetricConfig,
+    EvaluationMetricPromptsConfig,
+)
+from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricName
+from unique_toolkit.language_model.default_language_model import DEFAULT_GPT_4o
+from unique_toolkit.language_model.infos import LanguageModelInfo
+
+
+@pytest.mark.ai
+def test_evaluation_metric_prompts_config__initializes_with_empty_strings__by_default() -> (
+    None
+):
+    """
+    Purpose: Verify that EvaluationMetricPromptsConfig initializes with empty template strings.
+    Why this matters: Default initialization should not load templates automatically.
+    Setup summary: Create config with no arguments, assert empty string defaults.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig()
+
+    # Assert
+    assert config.system_prompt_template == ""
+    assert config.user_prompt_template == ""
+
+
+@pytest.mark.ai
+def test_evaluation_metric_prompts_config__accepts_custom_templates__on_initialization() -> (
+    None
+):
+    """
+    Purpose: Verify that EvaluationMetricPromptsConfig accepts custom template values.
+    Why this matters: Allows customization of prompts for different evaluation scenarios.
+    Setup summary: Initialize with custom prompts, assert they are stored correctly.
+    """
+    # Arrange
+    system_prompt: str = "Custom system prompt"
+    user_prompt: str = "Custom user prompt"
+
+    # Act
+    config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig(
+        system_prompt_template=system_prompt,
+        user_prompt_template=user_prompt,
+    )
+
+    # Assert
+    assert config.system_prompt_template == system_prompt
+    assert config.user_prompt_template == user_prompt
+
+
+@pytest.mark.ai
+def test_evaluation_metric_prompts_config__stores_strings__for_template_fields() -> (
+    None
+):
+    """
+    Purpose: Verify that prompt template fields accept and store string values.
+    Why this matters: Type safety for prompt templates is critical for rendering.
+    Setup summary: Create config with string prompts, assert type is string.
+    """
+    # Arrange
+    system_template: str = "Test system prompt"
+    user_template: str = "Test user prompt"
+
+    # Act
+    config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig(
+        system_prompt_template=system_template,
+        user_prompt_template=user_template,
+    )
+
+    # Assert
+    assert isinstance(config.system_prompt_template, str)
+    assert isinstance(config.user_prompt_template, str)
+
+
+@pytest.mark.ai
+def test_evaluation_metric_prompts_config__allows_modification__after_initialization() -> (
+    None
+):
+    """
+    Purpose: Verify that prompt config fields can be modified after creation.
+    Why this matters: Enables dynamic prompt updates during runtime.
+    Setup summary: Create config, modify fields, assert new values.
+    """
+    # Arrange
+    config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig()
+
+    # Act
+    config.system_prompt_template = "New system prompt"
+    config.user_prompt_template = "New user prompt"
+
+    # Assert
+    assert config.system_prompt_template == "New system prompt"
+    assert config.user_prompt_template == "New user prompt"
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__initializes_with_default_prompts_config__when_not_provided() -> (
+    None
+):
+    """
+    Purpose: Verify that EvaluationMetricConfig creates default prompts config.
+    Why this matters: Ensures config is always in valid state even without explicit prompts.
+    Setup summary: Create config without prompts_config, assert default empty prompts.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+    )
+
+    # Assert
+    assert isinstance(config.prompts_config, EvaluationMetricPromptsConfig)
+    assert config.prompts_config.system_prompt_template == ""
+    assert config.prompts_config.user_prompt_template == ""
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__accepts_custom_prompts_config__on_initialization() -> (
+    None
+):
+    """
+    Purpose: Verify that EvaluationMetricConfig accepts custom prompts configuration.
+    Why this matters: Allows full customization of evaluation prompts per metric.
+    Setup summary: Create custom prompts config, pass to metric config, assert values.
+    """
+    # Arrange
+    prompts_config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig(
+        system_prompt_template="Custom system",
+        user_prompt_template="Custom user",
+    )
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        prompts_config=prompts_config,
+    )
+
+    # Assert
+    assert config.prompts_config.system_prompt_template == "Custom system"
+    assert config.prompts_config.user_prompt_template == "Custom user"
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__has_all_required_fields__on_initialization() -> None:
+    """
+    Purpose: Verify that EvaluationMetricConfig has all expected configuration fields.
+    Why this matters: Ensures complete config structure for evaluation metrics.
+    Setup summary: Create config with language model, assert all fields exist.
+    """
+    # Arrange
+    language_model: LanguageModelInfo = LanguageModelInfo.from_name(DEFAULT_GPT_4o)
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.HALLUCINATION,
+        language_model=language_model,
+    )
+
+    # Assert
+    assert hasattr(config, "enabled")
+    assert hasattr(config, "name")
+    assert hasattr(config, "language_model")
+    assert hasattr(config, "additional_llm_options")
+    assert hasattr(config, "prompts_config")
+    assert hasattr(config, "score_to_label")
+    assert hasattr(config, "score_to_title")
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__defaults_to_empty_dict__for_additional_llm_options() -> (
+    None
+):
+    """
+    Purpose: Verify that additional_llm_options defaults to empty dictionary.
+    Why this matters: Provides safe default for optional LLM configuration.
+    Setup summary: Create config without options, assert empty dict default.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+    )
+
+    # Assert
+    assert config.additional_llm_options == {}
+    assert isinstance(config.additional_llm_options, dict)
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__defaults_to_empty_dicts__for_score_mappings() -> (
+    None
+):
+    """
+    Purpose: Verify that score mapping dictionaries default to empty.
+    Why this matters: Allows optional score labeling and titling per metric.
+    Setup summary: Create config without mappings, assert empty dict defaults.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+    )
+
+    # Assert
+    assert config.score_to_label == {}
+    assert config.score_to_title == {}
+
+
+@pytest.mark.ai
+def test_evaluation_metric_config__serializes_to_dict__with_all_fields() -> None:
+    """
+    Purpose: Verify that config can be serialized to dictionary format.
+    Why this matters: Required for persistence and API serialization.
+    Setup summary: Create config with custom prompts, serialize, assert structure.
+    """
+    # Arrange
+    prompts_config: EvaluationMetricPromptsConfig = EvaluationMetricPromptsConfig(
+        system_prompt_template="System",
+        user_prompt_template="User",
+    )
+
+    # Act
+    config: EvaluationMetricConfig = EvaluationMetricConfig(
+        enabled=True,
+        name=EvaluationMetricName.CONTEXT_RELEVANCY,
+        prompts_config=prompts_config,
+    )
+    config_dict: dict = config.model_dump()
+
+    # Assert
+    assert "prompts_config" in config_dict
+    assert config_dict["prompts_config"]["system_prompt_template"] == "System"
+    assert config_dict["prompts_config"]["user_prompt_template"] == "User"