unique_toolkit-1.45.5-py3-none-any.whl → unique_toolkit-1.45.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/agentic/evaluation/config.py +25 -6
- unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py +13 -0
- unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2} +11 -43
- unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 +15 -0
- unique_toolkit/agentic/evaluation/context_relevancy/service.py +24 -56
- unique_toolkit/agentic/evaluation/hallucination/constants.py +26 -15
- unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py +13 -0
- unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 +35 -0
- unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 +27 -0
- unique_toolkit/agentic/evaluation/hallucination/utils.py +153 -102
- unique_toolkit/agentic/evaluation/tests/fixtures.py +102 -0
- unique_toolkit/agentic/evaluation/tests/test_config.py +247 -0
- unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +141 -121
- unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py +600 -0
- unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py +1009 -0
- unique_toolkit/agentic/evaluation/tests/test_output_parser.py +82 -23
- unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py +348 -0
- unique_toolkit/agentic/evaluation/utils.py +8 -0
- unique_toolkit/chat/responses_api.py +49 -45
- {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.7.dist-info}/METADATA +9 -1
- {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.7.dist-info}/RECORD +23 -13
- unique_toolkit/agentic/evaluation/hallucination/prompts.py +0 -79
- {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.7.dist-info}/LICENSE +0 -0
- {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.7.dist-info}/WHEEL +0 -0
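The listing above shows the context-relevancy and hallucination prompts moving from Python string constants (`prompts.py`) into packaged Jinja2 templates (`prompts/system_prompt.j2`, `prompts/user_prompt.j2`) with a small `prompts/__init__.py` alongside them. That loader module is not rendered in this diff, so the sketch below is only a guess at its shape: the function names, template variables, and the use of `importlib.resources` with `jinja2.Template` are assumptions, not the package's actual API.

```python
# Hypothetical sketch of a Jinja2-backed prompt loader such as
# context_relevancy/prompts/__init__.py might contain. Names and
# template variables are assumptions; only the .j2 file names come
# from the file listing above.
from importlib.resources import files

from jinja2 import Template

_PROMPTS_PACKAGE = "unique_toolkit.agentic.evaluation.context_relevancy.prompts"


def _load_template(name: str) -> Template:
    """Read a .j2 template bundled with the package and compile it."""
    source = files(_PROMPTS_PACKAGE).joinpath(name).read_text(encoding="utf-8")
    return Template(source)


def render_system_prompt(**variables: str) -> str:
    """Render system_prompt.j2 with the given template variables."""
    return _load_template("system_prompt.j2").render(**variables)


def render_user_prompt(input_text: str, context_texts: list[str]) -> str:
    """Render user_prompt.j2 with the query and the retrieved contexts."""
    return _load_template("user_prompt.j2").render(
        input_text=input_text, context_texts=context_texts
    )
```

Shipping prompts as `.j2` resources keeps them editable without touching Python code, and the same loader pattern can serve both the context-relevancy and hallucination evaluators, whose `prompts/__init__.py` files appear in the listing.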
```diff
--- a/unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py
+++ b/unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py
@@ -1,103 +1,88 @@
+"""Tests for context relevancy evaluation service."""
+
 from unittest.mock import MagicMock, patch
 
 import pytest
 
 from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
-from unique_toolkit.agentic.evaluation.context_relevancy.prompts import (
-    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-)
 from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
     EvaluationSchemaStructuredOutput,
 )
-from unique_toolkit.agentic.evaluation.context_relevancy.service import (
-    ContextRelevancyEvaluator,
-)
 from unique_toolkit.agentic.evaluation.exception import EvaluatorException
 from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricInput,
-    EvaluationMetricName,
     EvaluationMetricResult,
 )
-from unique_toolkit.app.schemas import ChatEvent
-from unique_toolkit.chat.service import LanguageModelName
-from unique_toolkit.language_model.infos import (
-    LanguageModelInfo,
-)
 from unique_toolkit.language_model.schemas import (
     LanguageModelAssistantMessage,
     LanguageModelCompletionChoice,
-    LanguageModelMessages,
 )
 from unique_toolkit.language_model.service import LanguageModelResponse
 
 
-@pytest.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return EvaluationMetricConfig(
-        enabled=True,
-        name=EvaluationMetricName.CONTEXT_RELEVANCY,
-        language_model=LanguageModelInfo.from_name(
-            LanguageModelName.AZURE_GPT_4o_2024_0806
-        ),
-    )
-
-
-@pytest.fixture
-def structured_config(basic_config):
-    model_info = LanguageModelInfo.from_name(LanguageModelName.AZURE_GPT_4o_2024_0806)
-    return EvaluationMetricConfig(
-        enabled=True,
-        name=EvaluationMetricName.CONTEXT_RELEVANCY,
-        language_model=model_info,
-    )
-
-
-@pytest.fixture
-def sample_input():
-    return EvaluationMetricInput(
-        input_text="test query",
-        context_texts=["test context 1", "test context 2"],
+@pytest.mark.ai
+@pytest.mark.asyncio
+async def test_analyze__returns_none__when_disabled(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify that analyze returns None when evaluation is disabled in config.
+    Why this matters: Ensures evaluation can be toggled off without errors or side effects.
+    Setup summary: Set config.enabled=False, call analyze, assert None returned.
+    """
+    # Arrange
+    basic_evaluation_config.enabled = False
+
+    # Act
+    result = await context_relevancy_evaluator.analyze(
+        sample_evaluation_input, basic_evaluation_config
     )
 
-
-@pytest.mark.asyncio
-async def test_analyze_disabled(evaluator, sample_input, basic_config):
-    basic_config.enabled = False
-    result = await evaluator.analyze(sample_input, basic_config)
+    # Assert
     assert result is None
 
 
+@pytest.mark.ai
 @pytest.mark.asyncio
-async def
-
+async def test_analyze__raises_evaluator_exception__with_empty_context(
+    context_relevancy_evaluator: MagicMock,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify that analyze raises exception when context texts are empty.
+    Why this matters: Context relevancy evaluation requires at least one context.
+    Setup summary: Create input with empty context_texts, assert EvaluatorException raised.
+    """
+    # Arrange
+    input_with_empty_context: EvaluationMetricInput = EvaluationMetricInput(
         input_text="test query", context_texts=[]
     )
 
+    # Act & Assert
     with pytest.raises(EvaluatorException) as exc_info:
-        await
+        await context_relevancy_evaluator.analyze(
+            input_with_empty_context, basic_evaluation_config
+        )
 
     assert "No context texts provided." in str(exc_info.value)
 
 
+@pytest.mark.ai
 @pytest.mark.asyncio
-async def
-
+async def test_analyze__returns_valid_result__with_regular_output(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze successfully processes regular (non-structured) output from LLM.
+    Why this matters: Core functionality for evaluation with standard JSON responses.
+    Setup summary: Mock LLM response with JSON, call analyze, assert correct result parsing.
+    """
+    # Arrange
+    mock_result: LanguageModelResponse = LanguageModelResponse(
         choices=[
             LanguageModelCompletionChoice(
                 index=0,
@@ -112,21 +97,36 @@ async def test_analyze_regular_output(evaluator, sample_input, basic_config):
         ]
     )
 
+    # Act
     with patch.object(
-
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         return_value=mock_result,
     ) as mock_complete:
-        result = await
+        result: EvaluationMetricResult = await context_relevancy_evaluator.analyze(
+            sample_evaluation_input, basic_evaluation_config
+        )
 
+    # Assert
     assert isinstance(result, EvaluationMetricResult)
    assert result.value.lower() == "high"
     mock_complete.assert_called_once()
 
 
+@pytest.mark.ai
 @pytest.mark.asyncio
-async def
-
+async def test_analyze__returns_valid_result__with_structured_output(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    structured_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze successfully processes structured output from LLM.
+    Why this matters: Structured output provides more reliable parsing for evaluation results.
+    Setup summary: Mock LLM response with structured output, call analyze with schema, assert parsing.
+    """
+    # Arrange
+    mock_result: LanguageModelResponse = LanguageModelResponse(
         choices=[
             LanguageModelCompletionChoice(
                 index=0,
@@ -138,27 +138,42 @@ async def test_analyze_structured_output(evaluator, sample_input, structured_con
             )
         ]
     )
+    structured_output_schema: type[EvaluationSchemaStructuredOutput] = (
+        EvaluationSchemaStructuredOutput
+    )
 
-
-
+    # Act
     with patch.object(
-
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         return_value=mock_result,
     ) as mock_complete:
-        result = await
-
+        result: EvaluationMetricResult = await context_relevancy_evaluator.analyze(
+            sample_evaluation_input,
+            structured_evaluation_config,
+            structured_output_schema,
        )
+
+    # Assert
     assert isinstance(result, EvaluationMetricResult)
     assert result.value.lower() == "high"
     mock_complete.assert_called_once()
 
 
+@pytest.mark.ai
 @pytest.mark.asyncio
-async def
-
-
-
+async def test_analyze__raises_evaluator_exception__with_invalid_structured_output(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    structured_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze raises exception when structured output fails validation.
+    Why this matters: Invalid structured output should fail fast with clear error message.
+    Setup summary: Mock LLM response with invalid schema data, assert EvaluatorException raised.
+    """
+    # Arrange
+    mock_result: LanguageModelResponse = LanguageModelResponse(
         choices=[
             LanguageModelCompletionChoice(
                 index=0,
@@ -169,28 +184,42 @@ async def test_analyze_structured_output_validation_error(
             )
         ]
     )
+    structured_output_schema: type[EvaluationSchemaStructuredOutput] = (
+        EvaluationSchemaStructuredOutput
+    )
 
-
-
+    # Act & Assert
     with patch.object(
-
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         return_value=mock_result,
     ):
         with pytest.raises(EvaluatorException) as exc_info:
-            await
-
+            await context_relevancy_evaluator.analyze(
+                sample_evaluation_input,
+                structured_evaluation_config,
+                structured_output_schema,
             )
+
     assert "Error occurred during structured output validation" in str(
         exc_info.value
     )
 
 
+@pytest.mark.ai
 @pytest.mark.asyncio
-async def
-
-
-
+async def test_analyze__raises_evaluator_exception__with_empty_response(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze raises exception when LLM returns empty response content.
+    Why this matters: Empty responses should fail fast with clear error message.
+    Setup summary: Mock LLM response with empty content, assert EvaluatorException raised.
+    """
+    # Arrange
+    mock_result: LanguageModelResponse = LanguageModelResponse(
         choices=[
             LanguageModelCompletionChoice(
                 index=0,
@@ -200,54 +229,45 @@ async def test_analyze_regular_output_empty_response(
         ]
     )
 
+    # Act & Assert
     with patch.object(
-
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         return_value=mock_result,
     ):
         with pytest.raises(EvaluatorException) as exc_info:
-            await
-
-
-
-def test_compose_msgs_regular(evaluator, sample_input, basic_config):
-    messages = evaluator._compose_msgs(
-        sample_input, basic_config, enable_structured_output=False
-    )
-
-    assert isinstance(messages, LanguageModelMessages)
-    assert messages.root[0].content == CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG
-    assert isinstance(messages.root[1].content, str)
-    assert "test query" in messages.root[1].content
-    assert "test context 1" in messages.root[1].content
-    assert "test context 2" in messages.root[1].content
-
-
-def test_compose_msgs_structured(evaluator, sample_input, structured_config):
-    messages = evaluator._compose_msgs(
-        sample_input, structured_config, enable_structured_output=True
-    )
+            await context_relevancy_evaluator.analyze(
+                sample_evaluation_input, basic_evaluation_config
+            )
 
-
-    assert len(messages.root) == 2
-    assert (
-        messages.root[0].content != CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG
-    )  # Should use structured output prompt
-    assert isinstance(messages.root[1].content, str)
-    assert "test query" in messages.root[1].content
-    assert "test context 1" in messages.root[1].content
-    assert "test context 2" in messages.root[1].content
+    assert "did not return a result" in str(exc_info.value)
 
 
+@pytest.mark.ai
 @pytest.mark.asyncio
-async def
+async def test_analyze__raises_evaluator_exception__with_unknown_error(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze handles unexpected errors gracefully with wrapped exception.
+    Why this matters: Provides consistent error handling for all failure modes.
+    Setup summary: Mock LLM to raise generic exception, assert EvaluatorException wrapper.
+    """
+    # Arrange - No additional setup needed
+
+    # Act & Assert
     with patch.object(
-
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         side_effect=Exception("Unknown error"),
     ):
         with pytest.raises(EvaluatorException) as exc_info:
-            await
+            await context_relevancy_evaluator.analyze(
+                sample_evaluation_input, basic_evaluation_config
+            )
+
     assert "Unknown error occurred during context relevancy metric analysis" in str(
         exc_info.value
     )
```