unique_toolkit 1.45.4__py3-none-any.whl → 1.45.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/agentic/evaluation/config.py +25 -6
- unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py +13 -0
- unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2} +11 -43
- unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 +15 -0
- unique_toolkit/agentic/evaluation/context_relevancy/service.py +24 -56
- unique_toolkit/agentic/evaluation/hallucination/constants.py +26 -15
- unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py +13 -0
- unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 +35 -0
- unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 +27 -0
- unique_toolkit/agentic/evaluation/hallucination/utils.py +153 -102
- unique_toolkit/agentic/evaluation/tests/fixtures.py +102 -0
- unique_toolkit/agentic/evaluation/tests/test_config.py +247 -0
- unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +141 -121
- unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py +600 -0
- unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py +1009 -0
- unique_toolkit/agentic/evaluation/tests/test_output_parser.py +82 -23
- unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py +348 -0
- unique_toolkit/agentic/evaluation/utils.py +8 -0
- unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +34 -0
- {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/METADATA +7 -1
- {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/RECORD +23 -13
- unique_toolkit/agentic/evaluation/hallucination/prompts.py +0 -79
- {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/LICENSE +0 -0
- {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/WHEEL +0 -0
--- a/unique_toolkit/agentic/evaluation/tests/test_output_parser.py
+++ b/unique_toolkit/agentic/evaluation/tests/test_output_parser.py
@@ -1,3 +1,5 @@
+"""Tests for evaluation metric output parsers."""
+
 import pytest
 
 from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
@@ -15,11 +17,21 @@ from unique_toolkit.agentic.evaluation.schemas import (
 )
 
 
-
-
-
-
+@pytest.mark.ai
+def test_parse_eval_metric_result__succeeds__with_all_fields() -> None:
+    """
+    Purpose: Verify parsing of complete evaluation metric JSON result with all fields.
+    Why this matters: Core parsing functionality for evaluation results from LLM.
+    Setup summary: Provide valid JSON with all fields, assert correct parsing and field values.
+    """
+    # Arrange
+    result_json: str = '{"value": "high", "reason": "Test reason"}'
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
+
+    # Act
+    parsed: EvaluationMetricResult = parse_eval_metric_result(result_json, metric_name)
 
+    # Assert
     assert isinstance(parsed, EvaluationMetricResult)
     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
     assert parsed.value == "high"
@@ -27,11 +39,21 @@ def test_parse_eval_metric_result_success():
     assert parsed.fact_list == []
 
 
-
-
-
-
+@pytest.mark.ai
+def test_parse_eval_metric_result__uses_default_reason__with_missing_field() -> None:
+    """
+    Purpose: Verify parsing handles missing optional fields by using defaults.
+    Why this matters: Ensures robustness when LLM returns incomplete JSON responses.
+    Setup summary: Provide JSON with only required field, assert default value for reason.
+    """
+    # Arrange
+    result_json: str = '{"value": "high"}'
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
+
+    # Act
+    parsed: EvaluationMetricResult = parse_eval_metric_result(result_json, metric_name)
 
+    # Assert
     assert isinstance(parsed, EvaluationMetricResult)
     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
     assert parsed.value == "high"
@@ -39,24 +61,49 @@ def test_parse_eval_metric_result_missing_fields():
     assert parsed.fact_list == []
 
 
-
-
-
+@pytest.mark.ai
+def test_parse_eval_metric_result__raises_evaluator_exception__with_invalid_json() -> (
+    None
+):
+    """
+    Purpose: Verify parser raises appropriate exception for malformed JSON.
+    Why this matters: Provides clear error handling for invalid LLM responses.
+    Setup summary: Provide invalid JSON string, assert EvaluatorException with descriptive message.
+    """
+    # Arrange
+    result_json: str = "invalid json"
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
+
+    # Act & Assert
     with pytest.raises(EvaluatorException) as exc_info:
-        parse_eval_metric_result(
+        parse_eval_metric_result(result_json, metric_name)
 
     assert "Error occurred during parsing the evaluation metric result" in str(
         exc_info.value
     )
 
 
-
-
-
-
-
+@pytest.mark.ai
+def test_parse_eval_metric_result_structured_output__succeeds__without_fact_list() -> (
+    None
+):
+    """
+    Purpose: Verify parsing of structured output without optional fact list.
+    Why this matters: Ensures structured output parsing works for basic evaluations.
+    Setup summary: Create structured output object without facts, assert correct parsing.
+    """
+    # Arrange
+    result: EvaluationSchemaStructuredOutput = EvaluationSchemaStructuredOutput(
+        value="high", reason="Test reason"
     )
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
 
+    # Act
+    parsed: EvaluationMetricResult = parse_eval_metric_result_structured_output(
+        result, metric_name
+    )
+
+    # Assert
     assert isinstance(parsed, EvaluationMetricResult)
     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
     assert parsed.value == "high"
@@ -64,9 +111,17 @@ def test_parse_eval_metric_result_structured_output_basic():
     assert parsed.fact_list == []
 
 
-
-
-
+@pytest.mark.ai
+def test_parse_eval_metric_result_structured_output__includes_facts__with_fact_list() -> (
+    None
+):
+    """
+    Purpose: Verify parsing of structured output with fact list extracts all facts.
+    Why this matters: Fact extraction is critical for detailed evaluation feedback.
+    Setup summary: Create structured output with multiple facts, assert all facts extracted.
+    """
+    # Arrange
+    result: EvaluationSchemaStructuredOutput = EvaluationSchemaStructuredOutput(
         value="high",
         reason="Test reason",
         fact_list=[
@@ -74,14 +129,18 @@ def test_parse_eval_metric_result_structured_output_with_facts():
             Fact(fact="Fact 2"),
         ],
     )
-
-
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
+
+    # Act
+    parsed: EvaluationMetricResult = parse_eval_metric_result_structured_output(
+        result, metric_name
     )
 
+    # Assert
     assert isinstance(parsed, EvaluationMetricResult)
     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
     assert parsed.value == "high"
     assert parsed.reason == "Test reason"
     assert parsed.fact_list == ["Fact 1", "Fact 2"]
     assert isinstance(parsed.fact_list, list)
-    assert len(parsed.fact_list) == 2
+    assert len(parsed.fact_list) == 2
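For orientation, here is a minimal sketch of the contract these tests pin down for `parse_eval_metric_result`. The actual implementation lives in `unique_toolkit/agentic/evaluation/output_parser.py`, which this diff does not show; the `EvaluatorException` stand-in, the returned dict shape, and the exact default for a missing `reason` are assumptions.

```python
import json


class EvaluatorException(Exception):
    """Stand-in for the toolkit's exception type (real signature not shown here)."""


def parse_eval_metric_result_sketch(result: str, metric_name):
    """Parse an LLM JSON payload into the fields the tests above assert on."""
    try:
        data = json.loads(result)
    except json.JSONDecodeError as e:
        # The tests only pin down this message fragment.
        raise EvaluatorException(
            f"Error occurred during parsing the evaluation metric result: {e}"
        ) from e
    return {
        "name": metric_name,
        "value": data["value"],
        # The missing-field test implies a default for "reason"; its exact
        # value is not visible in this diff.
        "reason": data.get("reason"),
        "fact_list": data.get("fact_list", []),  # defaults to an empty list
    }
```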
--- /dev/null
+++ b/unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py
@@ -0,0 +1,348 @@
+"""Tests for prompt loader functions."""
+
+import pytest
+
+from unique_toolkit.agentic.evaluation.context_relevancy.prompts import (
+    system_prompt_loader as context_system_prompt_loader,
+)
+from unique_toolkit.agentic.evaluation.context_relevancy.prompts import (
+    user_prompt_loader as context_user_prompt_loader,
+)
+from unique_toolkit.agentic.evaluation.hallucination.prompts import (
+    system_prompt_loader as hallucination_system_prompt_loader,
+)
+from unique_toolkit.agentic.evaluation.hallucination.prompts import (
+    user_prompt_loader as hallucination_user_prompt_loader,
+)
+
+
+@pytest.mark.ai
+def test_context_system_prompt_loader__returns_non_empty_string__on_call() -> None:
+    """
+    Purpose: Verify that the context relevancy system prompt loader returns a valid string.
+    Why this matters: The system prompt is critical for guiding the evaluation LLM's behavior.
+    Setup summary: Call the loader function and assert type and non-empty content.
+    """
+    # Arrange - No setup needed for this test
+
+    # Act
+    prompt: str = context_system_prompt_loader()
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert len(prompt) > 0
+
+
+@pytest.mark.ai
+def test_context_user_prompt_loader__returns_non_empty_string__on_call() -> None:
+    """
+    Purpose: Verify that the context relevancy user prompt loader returns a valid string.
+    Why this matters: The user prompt template must be valid for evaluation requests.
+    Setup summary: Call the loader function and assert type and non-empty content.
+    """
+    # Arrange - No setup needed for this test
+
+    # Act
+    prompt: str = context_user_prompt_loader()
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert len(prompt) > 0
+
+
+@pytest.mark.ai
+def test_context_system_prompt__contains_jinja_syntax__for_structured_output() -> None:
+    """
+    Purpose: Verify that system prompt contains Jinja2 template syntax for structured output control.
+    Why this matters: Template must support conditional rendering based on structured_output flag.
+    Setup summary: Load system prompt and check for Jinja2 conditional blocks.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = context_system_prompt_loader()
+
+    # Assert
+    assert "{% if structured_output %}" in prompt or "{%" in prompt
+
+
+@pytest.mark.ai
+def test_context_user_prompt__contains_jinja_variables__for_input_and_context() -> None:
+    """
+    Purpose: Verify that user prompt contains required Jinja2 variable placeholders.
+    Why this matters: Template must support dynamic insertion of input text and context texts.
+    Setup summary: Load user prompt and check for expected variable placeholders.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = context_user_prompt_loader()
+
+    # Assert
+    assert "{{ input_text }}" in prompt
+    assert "{{ context_texts }}" in prompt
+
+
+@pytest.mark.ai
+def test_context_system_prompt__has_both_structured_and_regular_modes__in_template() -> (
+    None
+):
+    """
+    Purpose: Verify that system prompt template supports both structured and regular output modes.
+    Why this matters: Template must handle both evaluation output formats correctly.
+    Setup summary: Load system prompt and check for conditional blocks for both modes.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = context_system_prompt_loader()
+
+    # Assert
+    assert "{% if structured_output %}" in prompt
+    assert "{% else %}" in prompt or "{% endif %}" in prompt
+
+
+@pytest.mark.ai
+def test_context_user_prompt__has_conditional_json_instruction__for_unstructured_mode() -> (
+    None
+):
+    """
+    Purpose: Verify that user prompt has conditional JSON instruction for unstructured mode.
+    Why this matters: Non-structured mode requires explicit JSON formatting instructions.
+    Setup summary: Load user prompt and check for conditional JSON instruction block.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = context_user_prompt_loader()
+
+    # Assert
+    assert "{% if not structured_output %}" in prompt or "{%" in prompt
+
+
+@pytest.mark.ai
+def test_context_prompts__are_consistent_between_calls__for_determinism() -> None:
+    """
+    Purpose: Verify that prompt loaders return consistent content across multiple invocations.
+    Why this matters: Ensures deterministic behavior and no hidden state in loaders.
+    Setup summary: Call loaders twice and compare results for equality.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    system_prompt_1: str = context_system_prompt_loader()
+    system_prompt_2: str = context_system_prompt_loader()
+    user_prompt_1: str = context_user_prompt_loader()
+    user_prompt_2: str = context_user_prompt_loader()
+
+    # Assert
+    assert system_prompt_1 == system_prompt_2
+    assert user_prompt_1 == user_prompt_2
+
+
+@pytest.mark.ai
+def test_hallucination_system_prompt_loader__returns_non_empty_string__on_call() -> (
+    None
+):
+    """
+    Purpose: Verify that the hallucination system prompt loader returns a valid string.
+    Why this matters: The system prompt is critical for hallucination detection behavior.
+    Setup summary: Call the loader function and assert type and non-empty content.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_system_prompt_loader()
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert len(prompt) > 0
+
+
+@pytest.mark.ai
+def test_hallucination_user_prompt_loader__returns_non_empty_string__on_call() -> None:
+    """
+    Purpose: Verify that the hallucination user prompt loader returns a valid string.
+    Why this matters: The user prompt template must be valid for hallucination evaluation.
+    Setup summary: Call the loader function and assert type and non-empty content.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert len(prompt) > 0
+
+
+@pytest.mark.ai
+def test_hallucination_system_prompt__contains_jinja_syntax__for_has_context() -> None:
+    """
+    Purpose: Verify that system prompt contains Jinja2 template syntax for context handling.
+    Why this matters: Template must support conditional rendering based on has_context flag.
+    Setup summary: Load system prompt and check for Jinja2 conditional blocks.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_system_prompt_loader()
+
+    # Assert
+    assert "{% if has_context %}" in prompt or "{%" in prompt
+
+
+@pytest.mark.ai
+def test_hallucination_user_prompt__contains_jinja_variables__for_input_and_output() -> (
+    None
+):
+    """
+    Purpose: Verify that user prompt contains required Jinja2 variable placeholders.
+    Why this matters: Template must support dynamic insertion of input and output texts.
+    Setup summary: Load user prompt and check for expected variable placeholders.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert "{{ input_text }}" in prompt
+    assert "{{ output_text }}" in prompt
+
+
+@pytest.mark.ai
+def test_hallucination_system_prompt__has_context_conditional__in_template() -> None:
+    """
+    Purpose: Verify that system prompt template has conditional logic for has_context.
+    Why this matters: Template must handle both context and non-context evaluation scenarios.
+    Setup summary: Load system prompt and check for conditional blocks with else/endif.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_system_prompt_loader()
+
+    # Assert
+    assert "{% if has_context %}" in prompt
+    assert "{% else %}" in prompt or "{% endif %}" in prompt
+
+
+@pytest.mark.ai
+def test_hallucination_user_prompt__has_optional_context_fields__in_template() -> None:
+    """
+    Purpose: Verify that user prompt has conditional blocks for optional context fields.
+    Why this matters: Template must support optional contexts_text and history_messages_text.
+    Setup summary: Load user prompt and check for conditional blocks or variable placeholders.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert "{% if contexts_text %}" in prompt or "{{ contexts_text }}" in prompt
+    assert (
+        "{% if history_messages_text %}" in prompt
+        or "{{ history_messages_text }}" in prompt
+    )
+
+
+@pytest.mark.ai
+def test_hallucination_system_prompt__mentions_hallucination_concepts__in_content() -> (
+    None
+):
+    """
+    Purpose: Verify that system prompt mentions hallucination-related concepts.
+    Why this matters: Ensures prompt properly guides model to detect hallucinations.
+    Setup summary: Load system prompt and check for hallucination-related keywords.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_system_prompt_loader()
+    prompt_lower: str = prompt.lower()
+
+    # Assert
+    assert (
+        "hallucination" in prompt_lower
+        or "grounded" in prompt_lower
+        or "supported" in prompt_lower
+    )
+
+
+@pytest.mark.ai
+def test_hallucination_user_prompt__contains_data_sections__for_input_and_output() -> (
+    None
+):
+    """
+    Purpose: Verify that user prompt has sections for input and output data.
+    Why this matters: Template must clearly separate input and output for evaluation.
+    Setup summary: Load user prompt and check for input/output section markers.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert "Input:" in prompt or "input" in prompt.lower()
+    assert "Output:" in prompt or "output" in prompt.lower()
+
+
+@pytest.mark.ai
+def test_hallucination_prompts__are_consistent_between_calls__for_determinism() -> None:
+    """
+    Purpose: Verify that hallucination prompt loaders return consistent content.
+    Why this matters: Ensures deterministic behavior and no hidden state in loaders.
+    Setup summary: Call loaders twice and compare results for equality.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    system_prompt_1: str = hallucination_system_prompt_loader()
+    system_prompt_2: str = hallucination_system_prompt_loader()
+    user_prompt_1: str = hallucination_user_prompt_loader()
+    user_prompt_2: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert system_prompt_1 == system_prompt_2
+    assert user_prompt_1 == user_prompt_2
+
+
+@pytest.mark.ai
+def test_context_relevancy_loaders__can_access_template_files__without_errors() -> None:
+    """
+    Purpose: Verify that context relevancy loaders can successfully access template files.
+    Why this matters: Ensures template files exist and are readable at runtime.
+    Setup summary: Call loaders and assert no FileNotFoundError is raised.
+    """
+    # Arrange - No setup needed
+
+    # Act & Assert
+    try:
+        system_prompt: str = context_system_prompt_loader()
+        user_prompt: str = context_user_prompt_loader()
+        assert system_prompt is not None
+        assert user_prompt is not None
+    except FileNotFoundError as e:
+        pytest.fail(f"Prompt loader failed to access template file: {e}")
+
+
+@pytest.mark.ai
+def test_hallucination_loaders__can_access_template_files__without_errors() -> None:
+    """
+    Purpose: Verify that hallucination loaders can successfully access template files.
+    Why this matters: Ensures template files exist and are readable at runtime.
+    Setup summary: Call loaders and assert no FileNotFoundError is raised.
+    """
+    # Arrange - No setup needed
+
+    # Act & Assert
+    try:
+        system_prompt: str = hallucination_system_prompt_loader()
+        user_prompt: str = hallucination_user_prompt_loader()
+        assert system_prompt is not None
+        assert user_prompt is not None
+    except FileNotFoundError as e:
+        pytest.fail(f"Prompt loader failed to access template file: {e}")
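The loaders imported above come from the new `prompts/__init__.py` modules added in this release (355 and 343 bytes per the RECORD below), which ship alongside the `.j2` templates. The loader bodies are not part of this diff; a minimal sketch of what such a module could look like, assuming it simply reads the packaged template source via `importlib.resources`:

```python
# Sketch of a prompts/__init__.py exposing the two loaders the tests import.
# This is an assumed implementation, not the toolkit's actual code.
from importlib.resources import files


def system_prompt_loader() -> str:
    # Return the raw Jinja2 source; rendering with variables such as
    # structured_output / has_context / input_text happens at evaluation time.
    return files(__package__).joinpath("system_prompt.j2").read_text(encoding="utf-8")


def user_prompt_loader() -> str:
    return files(__package__).joinpath("user_prompt.j2").read_text(encoding="utf-8")
```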
--- a/unique_toolkit/agentic/responses_api/postprocessors/generated_files.py
+++ b/unique_toolkit/agentic/responses_api/postprocessors/generated_files.py
@@ -156,6 +156,7 @@ class DisplayCodeInterpreterFilesPostProcessor(
                 continue
 
             is_image = (guess_type(filename)[0] or "").startswith("image/")
+            is_html = (guess_type(filename)[0] or "") == "text/html"
 
             # Images
             if is_image:
@@ -168,6 +169,15 @@ class DisplayCodeInterpreterFilesPostProcessor(
                 )
                 changed |= replaced
 
+            # HTML
+            elif is_html:
+                loop_response.message.text, replaced = _replace_container_html_citation(
+                    text=loop_response.message.text,
+                    filename=filename,
+                    content_id=content_id,
+                )
+                changed |= replaced
+
             # Files
             else:
                 loop_response.message.text, replaced = _replace_container_file_citation(
@@ -302,6 +312,30 @@ def _replace_container_image_citation(
     ), True
 
 
+def _replace_container_html_citation(
+    text: str, filename: str, content_id: str
+) -> tuple[str, bool]:
+    html_markdown = rf"!?\[.*?\]\(sandbox:/mnt/data/{re.escape(filename)}\)"
+
+    if not re.search(html_markdown, text):
+        logger.info("No HTML markdown found for %s", filename)
+        return text, False
+
+    logger.info("Displaying HTML %s", filename)
+    html_rendering_block = f"""```HtmlRendering
+100%
+500px
+
+unique://content/{content_id}
+
+```"""
+    return re.sub(
+        html_markdown,
+        html_rendering_block,
+        text,
+    ), True
+
+
 def _replace_container_file_citation(
     text: str, filename: str, ref_number: int
 ) -> tuple[str, bool]:
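The new `_replace_container_html_citation` rewrites a `sandbox:/mnt/data/<file>` markdown link for a generated HTML file into an `HtmlRendering` block (100% wide, 500px tall) pointing at the uploaded content. A standalone sketch of that substitution, with the toolkit's logger omitted and a made-up content id; the `FENCE` indirection only avoids a literal backtick run inside this snippet:

```python
import re

FENCE = "`" * 3  # builds the literal triple-backtick fence used by HtmlRendering


def replace_container_html_citation(
    text: str, filename: str, content_id: str
) -> tuple[str, bool]:
    # Same pattern as the diff: an (optionally image-style) markdown link to the sandbox file.
    html_markdown = rf"!?\[.*?\]\(sandbox:/mnt/data/{re.escape(filename)}\)"
    if not re.search(html_markdown, text):
        return text, False
    block = (
        f"{FENCE}HtmlRendering\n100%\n500px\n\nunique://content/{content_id}\n\n{FENCE}"
    )
    return re.sub(html_markdown, block, text), True


# Example usage with a hypothetical content id:
message = "Here is the report: [report.html](sandbox:/mnt/data/report.html)"
new_text, changed = replace_container_html_citation(message, "report.html", "cont_123")
assert changed and "unique://content/cont_123" in new_text
```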
--- a/unique_toolkit-1.45.4.dist-info/METADATA
+++ b/unique_toolkit-1.45.6.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unique_toolkit
-Version: 1.45.4
+Version: 1.45.6
 Summary: 
 License: Proprietary
 Author: Cedric Klinkert
@@ -125,6 +125,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.45.6] - 2026-01-30
+- hallucination evaluator: Use original response to retrieve referenced chunk
+
+## [1.45.5] - 2026-01-29
+- Add HTML rendering support for code interpreter generated files
+
 ## [1.45.4] - 2026-01-26
 - Add ArtifactType `AGENTIC_REPORT`
 
--- a/unique_toolkit-1.45.4.dist-info/RECORD
+++ b/unique_toolkit-1.45.6.dist-info/RECORD
@@ -69,21 +69,31 @@ unique_toolkit/_common/validators.py,sha256=ElnkMsyEY24TfzfTVHvireyT39EnZgW5N40T
 unique_toolkit/agentic/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 unique_toolkit/agentic/debug_info_manager/debug_info_manager.py,sha256=30ZZaw0vffjZjiu9AYdO1Sm8G9FN6XR2ehdOEUCKqh0,891
 unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py,sha256=_fIS6_DHA8A3AB64-LPgHgUGa1w0CFUWwtgV-ZbhkzA,10535
-unique_toolkit/agentic/evaluation/config.py,sha256=
-unique_toolkit/agentic/evaluation/context_relevancy/prompts.py,sha256=
+unique_toolkit/agentic/evaluation/config.py,sha256=u5-iuT-4mfA2_9UZjZ-TN3YutVf35aR4EsF9l61Odnk,1696
+unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py,sha256=IKGvHW2viBylTUjEJNKvIXO4YrYwdJXBHSxA9rKtiRI,355
+unique_toolkit/agentic/evaluation/context_relevancy/prompts/system_prompt.j2,sha256=PFT9Y7QQ85h-ronwK8sq6zJJrxfrZhvZkYHXspMdDHU,985
+unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2,sha256=9HXm0Qg0xAPIE5mnaUiHrscolD_qO7f83qlIjXiP1n4,150
 unique_toolkit/agentic/evaluation/context_relevancy/schema.py,sha256=lZd0TPzH43ifgWWGg3WO6b1AQX8aK2R9y51yH0d1DHM,2919
-unique_toolkit/agentic/evaluation/context_relevancy/service.py,sha256=
+unique_toolkit/agentic/evaluation/context_relevancy/service.py,sha256=dsgpfKRSg9B4kjLhHJD_Kath4GVhHE-ZOVAGRkiCz20,8729
 unique_toolkit/agentic/evaluation/evaluation_manager.py,sha256=wDN_Uuut9kEGek8JY3QeInKpF-ukbvOSKOVd7DHFT3Q,8121
 unique_toolkit/agentic/evaluation/exception.py,sha256=7lcVbCyoN4Md1chNJDFxpUYyWbVrcr9dcc3TxWykJTc,115
-unique_toolkit/agentic/evaluation/hallucination/constants.py,sha256
+unique_toolkit/agentic/evaluation/hallucination/constants.py,sha256=-PnZ3N9VpwgbIe6hcUye40nvJa-JIRuTidCZAQwZ3GA,2473
 unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py,sha256=x5ta2Fum4fE5ySgIXPKlnbTtmV140z0IazSATd0-REg,4092
-unique_toolkit/agentic/evaluation/hallucination/prompts.py,sha256=
+unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py,sha256=4KFYMZsB3fJUKzoiUJE1npZ0gueWgvceB32EUrN-v7A,343
+unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2,sha256=sDUX6G645Ba40D_qKu4cUI8g-sJOfG8JpZreTNFgf7M,2616
+unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2,sha256=mD_qE9fOkyc1XXrebFt097ddx8bTlA6lbY04hKSQmWs,273
 unique_toolkit/agentic/evaluation/hallucination/service.py,sha256=WJF1f45uHnYLx1S4TW31bSFobFpV-YlOS3G_zMhuBVU,2512
-unique_toolkit/agentic/evaluation/hallucination/utils.py,sha256=
+unique_toolkit/agentic/evaluation/hallucination/utils.py,sha256=fxT7H1PQ6xANNvtViuhhR_9ac5ggDmFx-YfjcKUZRcg,12013
 unique_toolkit/agentic/evaluation/output_parser.py,sha256=0FDo8YY_Dc4qlTNeYyQkznzIFj9aX9wMrLOTbhhTl6g,1418
 unique_toolkit/agentic/evaluation/schemas.py,sha256=m9JMCUmeqP8KhsJOVEzsz6dRXUe1uKw-bxRDtn5qwvM,3156
-unique_toolkit/agentic/evaluation/tests/
-unique_toolkit/agentic/evaluation/tests/
+unique_toolkit/agentic/evaluation/tests/fixtures.py,sha256=Q-ughTfDiAdsMKbBVGzFiBucFdAx-FXgJ9iqp5xMyPs,2801
+unique_toolkit/agentic/evaluation/tests/test_config.py,sha256=p7xFQ7KE_yU8jGpqYA7ntAYe5Vln33wd6nwv3FM9XfI,8327
+unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py,sha256=NcSOyBJ_lqYehtlraZPo9RLutCitTP76kvkuyogSD2A,9477
+unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py,sha256=jT61WxKic-jDUJT1BeVjzhck02EnaMi1ng2H82-Aq_Q,19348
+unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py,sha256=PKyGR073HxT0J_g8626kCURbMSlrMgkg-xPP7dPHD-0,31838
+unique_toolkit/agentic/evaluation/tests/test_output_parser.py,sha256=KfltytmvqnPWLhmZpBXqcRmnlYorw_USwM5rkLVv8so,5179
+unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py,sha256=zBREdlKf5tdDyB8XSaNgpQv3-tuZJoYteeJrp6WMWDM,11897
+unique_toolkit/agentic/evaluation/utils.py,sha256=HmyPaDV8wdW-_gOjjW-wDaMKgdrsP5-SHP7OqTmGI_A,264
 unique_toolkit/agentic/feature_flags/__init__.py,sha256=LhE2cHoa9AYBOR7TjiIToOn46sttm9paKcrzE7gnDPM,149
 unique_toolkit/agentic/feature_flags/feature_flags.py,sha256=4jPH0GGGt5-tQ6PJWNpMBIlYzNrQIIqBLx8W02lwxD0,1140
 unique_toolkit/agentic/history_manager/history_construction_with_contents.py,sha256=TwamOOnYTYZMQdY1mAzj6_MZOe3T5RsjFDarT1tCtYo,8150
@@ -111,7 +121,7 @@ unique_toolkit/agentic/postprocessor/postprocessor_manager.py,sha256=CoKzVFeLIr1
 unique_toolkit/agentic/reference_manager/reference_manager.py,sha256=x51CT0D8HHu2LzgXdHGy0leOYpjnsxVbPZ2nc28G9mA,4005
 unique_toolkit/agentic/responses_api/__init__.py,sha256=9WTO-ef7fGE9Y1QtZJFm8Q_jkwK8Srtl-HWvpAD2Wxs,668
 unique_toolkit/agentic/responses_api/postprocessors/code_display.py,sha256=h6ZqPR0kPQnxM0ynshYQTa1BrcN8XGbUz9p03m8rOj0,2339
-unique_toolkit/agentic/responses_api/postprocessors/generated_files.py,sha256=
+unique_toolkit/agentic/responses_api/postprocessors/generated_files.py,sha256=janOVTJtGDXWvWsUjB1pnRfLUVXPSkjjbXHxUS0IjjE,12685
 unique_toolkit/agentic/responses_api/stream_handler.py,sha256=Y1IM0uiPBdlab5UuOTCsHTaVX-fd9MxfS3xkwhdFie4,647
 unique_toolkit/agentic/short_term_memory_manager/persistent_short_term_memory_manager.py,sha256=g8I64dKkpwWIXfwpxD1-rLte00hh_PoQ9-fXUAcNQCo,5817
 unique_toolkit/agentic/thinking_manager/thinking_manager.py,sha256=41QWFsdRrbWlQHBfYCFv726UDom4WbcvaRfjCmoUOQI,4183
@@ -244,7 +254,7 @@ unique_toolkit/short_term_memory/service.py,sha256=5PeVBu1ZCAfyDb2HLVvlmqSbyzBBu
 unique_toolkit/smart_rules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unique_toolkit/smart_rules/compile.py,sha256=Ozhh70qCn2yOzRWr9d8WmJeTo7AQurwd3tStgBMPFLA,1246
 unique_toolkit/test_utilities/events.py,sha256=_mwV2bs5iLjxS1ynDCjaIq-gjjKhXYCK-iy3dRfvO3g,6410
-unique_toolkit-1.45.
-unique_toolkit-1.45.
-unique_toolkit-1.45.
-unique_toolkit-1.45.
+unique_toolkit-1.45.6.dist-info/LICENSE,sha256=GlN8wHNdh53xwOPg44URnwag6TEolCjoq3YD_KrWgss,193
+unique_toolkit-1.45.6.dist-info/METADATA,sha256=Ojrf6ABO88IO5h892iywEweJuc9p67MWzHMXa--40gE,49243
+unique_toolkit-1.45.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+unique_toolkit-1.45.6.dist-info/RECORD,,