unique_toolkit 1.45.5__py3-none-any.whl → 1.45.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. unique_toolkit/agentic/evaluation/config.py +25 -6
  2. unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py +13 -0
  3. unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2} +11 -43
  4. unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 +15 -0
  5. unique_toolkit/agentic/evaluation/context_relevancy/service.py +24 -56
  6. unique_toolkit/agentic/evaluation/hallucination/constants.py +26 -15
  7. unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py +13 -0
  8. unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 +35 -0
  9. unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 +27 -0
  10. unique_toolkit/agentic/evaluation/hallucination/utils.py +153 -102
  11. unique_toolkit/agentic/evaluation/tests/fixtures.py +102 -0
  12. unique_toolkit/agentic/evaluation/tests/test_config.py +247 -0
  13. unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +141 -121
  14. unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py +600 -0
  15. unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py +1009 -0
  16. unique_toolkit/agentic/evaluation/tests/test_output_parser.py +82 -23
  17. unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py +348 -0
  18. unique_toolkit/agentic/evaluation/utils.py +8 -0
  19. unique_toolkit/chat/responses_api.py +49 -45
  20. {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.7.dist-info}/METADATA +9 -1
  21. {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.7.dist-info}/RECORD +23 -13
  22. unique_toolkit/agentic/evaluation/hallucination/prompts.py +0 -79
  23. {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.7.dist-info}/LICENSE +0 -0
  24. {unique_toolkit-1.45.5.dist-info → unique_toolkit-1.45.7.dist-info}/WHEEL +0 -0
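Note on the prompt refactor reflected in files 2–9 above: the inline prompt constants in `prompts.py` are replaced by Jinja2 templates (`system_prompt.j2`, `user_prompt.j2`) plus a small `prompts/__init__.py` loader. The actual 13-line loaders are not shown in this diff; the sketch below is a hypothetical illustration of the pattern, with every name in it (`_load_template`, `SYSTEM_PROMPT_TEMPLATE`, `USER_PROMPT_TEMPLATE`) assumed rather than taken from the package.

```python
# Hypothetical illustration only: a small loader that reads the packaged .j2
# templates. The real unique_toolkit modules may load or render them differently.
from pathlib import Path

from jinja2 import Template  # assumes Jinja2 is the templating dependency

_PROMPT_DIR = Path(__file__).parent


def _load_template(filename: str) -> Template:
    # The templates ship next to this module inside the wheel, so a relative read works.
    return Template((_PROMPT_DIR / filename).read_text(encoding="utf-8"))


SYSTEM_PROMPT_TEMPLATE = _load_template("system_prompt.j2")
USER_PROMPT_TEMPLATE = _load_template("user_prompt.j2")
```

Callers would then render the templates with whatever variables they define, instead of importing string constants such as the removed `CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG`. The diff below shows how the context relevancy tests were reworked against the new structure.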
@@ -1,103 +1,88 @@
+"""Tests for context relevancy evaluation service."""
+
 from unittest.mock import MagicMock, patch

 import pytest

 from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
-from unique_toolkit.agentic.evaluation.context_relevancy.prompts import (
-    CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG,
-)
 from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
     EvaluationSchemaStructuredOutput,
 )
-from unique_toolkit.agentic.evaluation.context_relevancy.service import (
-    ContextRelevancyEvaluator,
-)
 from unique_toolkit.agentic.evaluation.exception import EvaluatorException
 from unique_toolkit.agentic.evaluation.schemas import (
     EvaluationMetricInput,
-    EvaluationMetricName,
     EvaluationMetricResult,
 )
-from unique_toolkit.app.schemas import ChatEvent
-from unique_toolkit.chat.service import LanguageModelName
-from unique_toolkit.language_model.infos import (
-    LanguageModelInfo,
-)
 from unique_toolkit.language_model.schemas import (
     LanguageModelAssistantMessage,
     LanguageModelCompletionChoice,
-    LanguageModelMessages,
 )
 from unique_toolkit.language_model.service import LanguageModelResponse


-@pytest.fixture
-def event():
-    event = MagicMock(spec=ChatEvent)
-    event.payload = MagicMock()
-    event.payload.user_message = MagicMock()
-    event.payload.user_message.text = "Test query"
-    event.user_id = "user_0"
-    event.company_id = "company_0"
-    return event
-
-
-@pytest.fixture
-def evaluator(event):
-    return ContextRelevancyEvaluator(event)
-
-
-@pytest.fixture
-def basic_config():
-    return EvaluationMetricConfig(
-        enabled=True,
-        name=EvaluationMetricName.CONTEXT_RELEVANCY,
-        language_model=LanguageModelInfo.from_name(
-            LanguageModelName.AZURE_GPT_4o_2024_0806
-        ),
-    )
-
-
-@pytest.fixture
-def structured_config(basic_config):
-    model_info = LanguageModelInfo.from_name(LanguageModelName.AZURE_GPT_4o_2024_0806)
-    return EvaluationMetricConfig(
-        enabled=True,
-        name=EvaluationMetricName.CONTEXT_RELEVANCY,
-        language_model=model_info,
-    )
-
-
-@pytest.fixture
-def sample_input():
-    return EvaluationMetricInput(
-        input_text="test query",
-        context_texts=["test context 1", "test context 2"],
+@pytest.mark.ai
+@pytest.mark.asyncio
+async def test_analyze__returns_none__when_disabled(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify that analyze returns None when evaluation is disabled in config.
+    Why this matters: Ensures evaluation can be toggled off without errors or side effects.
+    Setup summary: Set config.enabled=False, call analyze, assert None returned.
+    """
+    # Arrange
+    basic_evaluation_config.enabled = False
+
+    # Act
+    result = await context_relevancy_evaluator.analyze(
+        sample_evaluation_input, basic_evaluation_config
     )

-
-@pytest.mark.asyncio
-async def test_analyze_disabled(evaluator, sample_input, basic_config):
-    basic_config.enabled = False
-    result = await evaluator.analyze(sample_input, basic_config)
+    # Assert
     assert result is None


+@pytest.mark.ai
 @pytest.mark.asyncio
-async def test_analyze_empty_context(evaluator, basic_config):
-    input_with_empty_context = EvaluationMetricInput(
+async def test_analyze__raises_evaluator_exception__with_empty_context(
+    context_relevancy_evaluator: MagicMock,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify that analyze raises exception when context texts are empty.
+    Why this matters: Context relevancy evaluation requires at least one context.
+    Setup summary: Create input with empty context_texts, assert EvaluatorException raised.
+    """
+    # Arrange
+    input_with_empty_context: EvaluationMetricInput = EvaluationMetricInput(
         input_text="test query", context_texts=[]
     )

+    # Act & Assert
     with pytest.raises(EvaluatorException) as exc_info:
-        await evaluator.analyze(input_with_empty_context, basic_config)
+        await context_relevancy_evaluator.analyze(
+            input_with_empty_context, basic_evaluation_config
+        )

     assert "No context texts provided." in str(exc_info.value)


+@pytest.mark.ai
 @pytest.mark.asyncio
-async def test_analyze_regular_output(evaluator, sample_input, basic_config):
-    mock_result = LanguageModelResponse(
+async def test_analyze__returns_valid_result__with_regular_output(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze successfully processes regular (non-structured) output from LLM.
+    Why this matters: Core functionality for evaluation with standard JSON responses.
+    Setup summary: Mock LLM response with JSON, call analyze, assert correct result parsing.
+    """
+    # Arrange
+    mock_result: LanguageModelResponse = LanguageModelResponse(
         choices=[
             LanguageModelCompletionChoice(
                 index=0,
@@ -112,21 +97,36 @@ async def test_analyze_regular_output(evaluator, sample_input, basic_config):
         ]
     )

+    # Act
     with patch.object(
-        evaluator.language_model_service,
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         return_value=mock_result,
     ) as mock_complete:
-        result = await evaluator.analyze(sample_input, basic_config)
+        result: EvaluationMetricResult = await context_relevancy_evaluator.analyze(
+            sample_evaluation_input, basic_evaluation_config
+        )

+    # Assert
     assert isinstance(result, EvaluationMetricResult)
     assert result.value.lower() == "high"
     mock_complete.assert_called_once()


+@pytest.mark.ai
 @pytest.mark.asyncio
-async def test_analyze_structured_output(evaluator, sample_input, structured_config):
-    mock_result = LanguageModelResponse(
+async def test_analyze__returns_valid_result__with_structured_output(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    structured_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze successfully processes structured output from LLM.
+    Why this matters: Structured output provides more reliable parsing for evaluation results.
+    Setup summary: Mock LLM response with structured output, call analyze with schema, assert parsing.
+    """
+    # Arrange
+    mock_result: LanguageModelResponse = LanguageModelResponse(
         choices=[
             LanguageModelCompletionChoice(
                 index=0,
@@ -138,27 +138,42 @@ async def test_analyze_structured_output(evaluator, sample_input, structured_con
             )
         ]
     )
+    structured_output_schema: type[EvaluationSchemaStructuredOutput] = (
+        EvaluationSchemaStructuredOutput
+    )

-    structured_output_schema = EvaluationSchemaStructuredOutput
-
+    # Act
     with patch.object(
-        evaluator.language_model_service,
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         return_value=mock_result,
     ) as mock_complete:
-        result = await evaluator.analyze(
-            sample_input, structured_config, structured_output_schema
+        result: EvaluationMetricResult = await context_relevancy_evaluator.analyze(
+            sample_evaluation_input,
+            structured_evaluation_config,
+            structured_output_schema,
         )
+
+    # Assert
     assert isinstance(result, EvaluationMetricResult)
     assert result.value.lower() == "high"
     mock_complete.assert_called_once()


+@pytest.mark.ai
 @pytest.mark.asyncio
-async def test_analyze_structured_output_validation_error(
-    evaluator, sample_input, structured_config
-):
-    mock_result = LanguageModelResponse(
+async def test_analyze__raises_evaluator_exception__with_invalid_structured_output(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    structured_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze raises exception when structured output fails validation.
+    Why this matters: Invalid structured output should fail fast with clear error message.
+    Setup summary: Mock LLM response with invalid schema data, assert EvaluatorException raised.
+    """
+    # Arrange
+    mock_result: LanguageModelResponse = LanguageModelResponse(
         choices=[
             LanguageModelCompletionChoice(
                 index=0,
@@ -169,28 +184,42 @@ async def test_analyze_structured_output_validation_error(
             )
         ]
     )
+    structured_output_schema: type[EvaluationSchemaStructuredOutput] = (
+        EvaluationSchemaStructuredOutput
+    )

-    structured_output_schema = EvaluationSchemaStructuredOutput
-
+    # Act & Assert
     with patch.object(
-        evaluator.language_model_service,
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         return_value=mock_result,
     ):
         with pytest.raises(EvaluatorException) as exc_info:
-            await evaluator.analyze(
-                sample_input, structured_config, structured_output_schema
+            await context_relevancy_evaluator.analyze(
+                sample_evaluation_input,
+                structured_evaluation_config,
+                structured_output_schema,
             )
+
     assert "Error occurred during structured output validation" in str(
         exc_info.value
     )


+@pytest.mark.ai
 @pytest.mark.asyncio
-async def test_analyze_regular_output_empty_response(
-    evaluator, sample_input, basic_config
-):
-    mock_result = LanguageModelResponse(
+async def test_analyze__raises_evaluator_exception__with_empty_response(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze raises exception when LLM returns empty response content.
+    Why this matters: Empty responses should fail fast with clear error message.
+    Setup summary: Mock LLM response with empty content, assert EvaluatorException raised.
+    """
+    # Arrange
+    mock_result: LanguageModelResponse = LanguageModelResponse(
         choices=[
             LanguageModelCompletionChoice(
                 index=0,
@@ -200,54 +229,45 @@ async def test_analyze_regular_output_empty_response(
         ]
     )

+    # Act & Assert
     with patch.object(
-        evaluator.language_model_service,
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         return_value=mock_result,
     ):
         with pytest.raises(EvaluatorException) as exc_info:
-            await evaluator.analyze(sample_input, basic_config)
-        assert "did not return a result" in str(exc_info.value)
-
-
-def test_compose_msgs_regular(evaluator, sample_input, basic_config):
-    messages = evaluator._compose_msgs(
-        sample_input, basic_config, enable_structured_output=False
-    )
-
-    assert isinstance(messages, LanguageModelMessages)
-    assert messages.root[0].content == CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG
-    assert isinstance(messages.root[1].content, str)
-    assert "test query" in messages.root[1].content
-    assert "test context 1" in messages.root[1].content
-    assert "test context 2" in messages.root[1].content
-
-
-def test_compose_msgs_structured(evaluator, sample_input, structured_config):
-    messages = evaluator._compose_msgs(
-        sample_input, structured_config, enable_structured_output=True
-    )
+            await context_relevancy_evaluator.analyze(
+                sample_evaluation_input, basic_evaluation_config
+            )

-    assert isinstance(messages, LanguageModelMessages)
-    assert len(messages.root) == 2
-    assert (
-        messages.root[0].content != CONTEXT_RELEVANCY_METRIC_SYSTEM_MSG
-    )  # Should use structured output prompt
-    assert isinstance(messages.root[1].content, str)
-    assert "test query" in messages.root[1].content
-    assert "test context 1" in messages.root[1].content
-    assert "test context 2" in messages.root[1].content
+    assert "did not return a result" in str(exc_info.value)


+@pytest.mark.ai
 @pytest.mark.asyncio
-async def test_analyze_unknown_error(evaluator, sample_input, basic_config):
+async def test_analyze__raises_evaluator_exception__with_unknown_error(
+    context_relevancy_evaluator: MagicMock,
+    sample_evaluation_input: EvaluationMetricInput,
+    basic_evaluation_config: EvaluationMetricConfig,
+) -> None:
+    """
+    Purpose: Verify analyze handles unexpected errors gracefully with wrapped exception.
+    Why this matters: Provides consistent error handling for all failure modes.
+    Setup summary: Mock LLM to raise generic exception, assert EvaluatorException wrapper.
+    """
+    # Arrange - No additional setup needed
+
+    # Act & Assert
     with patch.object(
-        evaluator.language_model_service,
+        context_relevancy_evaluator.language_model_service,
         "complete_async",
         side_effect=Exception("Unknown error"),
     ):
         with pytest.raises(EvaluatorException) as exc_info:
-            await evaluator.analyze(sample_input, basic_config)
+            await context_relevancy_evaluator.analyze(
+                sample_evaluation_input, basic_evaluation_config
+            )
+
     assert "Unknown error occurred during context relevancy metric analysis" in str(
         exc_info.value
     )
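
The rewritten tests above no longer define their fixtures inline; they consume `context_relevancy_evaluator`, `sample_evaluation_input`, `basic_evaluation_config`, and `structured_evaluation_config`, which now live in the new shared `tests/fixtures.py` (file 11 above, +102 lines). A minimal sketch of what such fixtures could look like, assuming they mirror the fixtures removed in this diff — the shipped module is not reproduced here and likely contains more (the fixture name `chat_event` is a guess):

```python
# Hypothetical sketch of the shared fixtures, reconstructed from the fixtures
# removed in this diff. The packaged tests/fixtures.py may differ.
from unittest.mock import MagicMock

import pytest

from unique_toolkit.agentic.evaluation.config import EvaluationMetricConfig
from unique_toolkit.agentic.evaluation.context_relevancy.service import (
    ContextRelevancyEvaluator,
)
from unique_toolkit.agentic.evaluation.schemas import (
    EvaluationMetricInput,
    EvaluationMetricName,
)
from unique_toolkit.app.schemas import ChatEvent
from unique_toolkit.chat.service import LanguageModelName
from unique_toolkit.language_model.infos import LanguageModelInfo


@pytest.fixture
def chat_event() -> MagicMock:
    # Minimal mocked ChatEvent, as in the removed `event` fixture.
    event = MagicMock(spec=ChatEvent)
    event.payload = MagicMock()
    event.payload.user_message = MagicMock()
    event.payload.user_message.text = "Test query"
    event.user_id = "user_0"
    event.company_id = "company_0"
    return event


@pytest.fixture
def context_relevancy_evaluator(chat_event: MagicMock) -> ContextRelevancyEvaluator:
    return ContextRelevancyEvaluator(chat_event)


@pytest.fixture
def basic_evaluation_config() -> EvaluationMetricConfig:
    return EvaluationMetricConfig(
        enabled=True,
        name=EvaluationMetricName.CONTEXT_RELEVANCY,
        language_model=LanguageModelInfo.from_name(
            LanguageModelName.AZURE_GPT_4o_2024_0806
        ),
    )


@pytest.fixture
def structured_evaluation_config(
    basic_evaluation_config: EvaluationMetricConfig,
) -> EvaluationMetricConfig:
    # The removed `structured_config` fixture built an identical config,
    # so the sketch simply reuses the basic one.
    return basic_evaluation_config


@pytest.fixture
def sample_evaluation_input() -> EvaluationMetricInput:
    return EvaluationMetricInput(
        input_text="test query",
        context_texts=["test context 1", "test context 2"],
    )
```

Centralizing these fixtures lets the context relevancy, hallucination, and config test modules added in this release share one setup instead of duplicating the MagicMock wiring in each file.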