unique_toolkit 1.45.4__py3-none-any.whl → 1.45.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. unique_toolkit/agentic/evaluation/config.py +25 -6
  2. unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py +13 -0
  3. unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2} +11 -43
  4. unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 +15 -0
  5. unique_toolkit/agentic/evaluation/context_relevancy/service.py +24 -56
  6. unique_toolkit/agentic/evaluation/hallucination/constants.py +26 -15
  7. unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py +13 -0
  8. unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 +35 -0
  9. unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 +27 -0
  10. unique_toolkit/agentic/evaluation/hallucination/utils.py +153 -102
  11. unique_toolkit/agentic/evaluation/tests/fixtures.py +102 -0
  12. unique_toolkit/agentic/evaluation/tests/test_config.py +247 -0
  13. unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +141 -121
  14. unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py +600 -0
  15. unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py +1009 -0
  16. unique_toolkit/agentic/evaluation/tests/test_output_parser.py +82 -23
  17. unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py +348 -0
  18. unique_toolkit/agentic/evaluation/utils.py +8 -0
  19. unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +34 -0
  20. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/METADATA +7 -1
  21. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/RECORD +23 -13
  22. unique_toolkit/agentic/evaluation/hallucination/prompts.py +0 -79
  23. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/LICENSE +0 -0
  24. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/WHEEL +0 -0
@@ -1,3 +1,5 @@
+"""Tests for evaluation metric output parsers."""
+
 import pytest
 
 from unique_toolkit.agentic.evaluation.context_relevancy.schema import (
@@ -15,11 +17,21 @@ from unique_toolkit.agentic.evaluation.schemas import (
 )
 
 
-def test_parse_eval_metric_result_success():
-    # Test successful parsing with all fields
-    result = '{"value": "high", "reason": "Test reason"}'
-    parsed = parse_eval_metric_result(result, EvaluationMetricName.CONTEXT_RELEVANCY)
+@pytest.mark.ai
+def test_parse_eval_metric_result__succeeds__with_all_fields() -> None:
+    """
+    Purpose: Verify parsing of complete evaluation metric JSON result with all fields.
+    Why this matters: Core parsing functionality for evaluation results from LLM.
+    Setup summary: Provide valid JSON with all fields, assert correct parsing and field values.
+    """
+    # Arrange
+    result_json: str = '{"value": "high", "reason": "Test reason"}'
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
+
+    # Act
+    parsed: EvaluationMetricResult = parse_eval_metric_result(result_json, metric_name)
 
+    # Assert
     assert isinstance(parsed, EvaluationMetricResult)
     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
     assert parsed.value == "high"
@@ -27,11 +39,21 @@ def test_parse_eval_metric_result_success():
     assert parsed.fact_list == []
 
 
-def test_parse_eval_metric_result_missing_fields():
-    # Test parsing with missing fields (should use default "None")
-    result = '{"value": "high"}'
-    parsed = parse_eval_metric_result(result, EvaluationMetricName.CONTEXT_RELEVANCY)
+@pytest.mark.ai
+def test_parse_eval_metric_result__uses_default_reason__with_missing_field() -> None:
+    """
+    Purpose: Verify parsing handles missing optional fields by using defaults.
+    Why this matters: Ensures robustness when LLM returns incomplete JSON responses.
+    Setup summary: Provide JSON with only required field, assert default value for reason.
+    """
+    # Arrange
+    result_json: str = '{"value": "high"}'
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
+
+    # Act
+    parsed: EvaluationMetricResult = parse_eval_metric_result(result_json, metric_name)
 
+    # Assert
     assert isinstance(parsed, EvaluationMetricResult)
     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
     assert parsed.value == "high"
@@ -39,24 +61,49 @@ def test_parse_eval_metric_result_missing_fields():
     assert parsed.fact_list == []
 
 
-def test_parse_eval_metric_result_invalid_json():
-    # Test parsing with invalid JSON
-    result = "invalid json"
+@pytest.mark.ai
+def test_parse_eval_metric_result__raises_evaluator_exception__with_invalid_json() -> (
+    None
+):
+    """
+    Purpose: Verify parser raises appropriate exception for malformed JSON.
+    Why this matters: Provides clear error handling for invalid LLM responses.
+    Setup summary: Provide invalid JSON string, assert EvaluatorException with descriptive message.
+    """
+    # Arrange
+    result_json: str = "invalid json"
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
+
+    # Act & Assert
     with pytest.raises(EvaluatorException) as exc_info:
-        parse_eval_metric_result(result, EvaluationMetricName.CONTEXT_RELEVANCY)
+        parse_eval_metric_result(result_json, metric_name)
 
     assert "Error occurred during parsing the evaluation metric result" in str(
         exc_info.value
     )
 
 
-def test_parse_eval_metric_result_structured_output_basic():
-    # Test basic structured output without fact list
-    result = EvaluationSchemaStructuredOutput(value="high", reason="Test reason")
-    parsed = parse_eval_metric_result_structured_output(
-        result, EvaluationMetricName.CONTEXT_RELEVANCY
+@pytest.mark.ai
+def test_parse_eval_metric_result_structured_output__succeeds__without_fact_list() -> (
+    None
+):
+    """
+    Purpose: Verify parsing of structured output without optional fact list.
+    Why this matters: Ensures structured output parsing works for basic evaluations.
+    Setup summary: Create structured output object without facts, assert correct parsing.
+    """
+    # Arrange
+    result: EvaluationSchemaStructuredOutput = EvaluationSchemaStructuredOutput(
+        value="high", reason="Test reason"
     )
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
 
+    # Act
+    parsed: EvaluationMetricResult = parse_eval_metric_result_structured_output(
+        result, metric_name
+    )
+
+    # Assert
     assert isinstance(parsed, EvaluationMetricResult)
     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
     assert parsed.value == "high"
@@ -64,9 +111,17 @@ def test_parse_eval_metric_result_structured_output_basic():
     assert parsed.fact_list == []
 
 
-def test_parse_eval_metric_result_structured_output_with_facts():
-    # Test structured output with fact list
-    result = EvaluationSchemaStructuredOutput(
+@pytest.mark.ai
+def test_parse_eval_metric_result_structured_output__includes_facts__with_fact_list() -> (
+    None
+):
+    """
+    Purpose: Verify parsing of structured output with fact list extracts all facts.
+    Why this matters: Fact extraction is critical for detailed evaluation feedback.
+    Setup summary: Create structured output with multiple facts, assert all facts extracted.
+    """
+    # Arrange
+    result: EvaluationSchemaStructuredOutput = EvaluationSchemaStructuredOutput(
         value="high",
         reason="Test reason",
         fact_list=[
@@ -74,14 +129,18 @@ def test_parse_eval_metric_result_structured_output_with_facts():
             Fact(fact="Fact 2"),
         ],
     )
-    parsed = parse_eval_metric_result_structured_output(
-        result, EvaluationMetricName.CONTEXT_RELEVANCY
+    metric_name: EvaluationMetricName = EvaluationMetricName.CONTEXT_RELEVANCY
+
+    # Act
+    parsed: EvaluationMetricResult = parse_eval_metric_result_structured_output(
+        result, metric_name
     )
 
+    # Assert
     assert isinstance(parsed, EvaluationMetricResult)
     assert parsed.name == EvaluationMetricName.CONTEXT_RELEVANCY
     assert parsed.value == "high"
     assert parsed.reason == "Test reason"
     assert parsed.fact_list == ["Fact 1", "Fact 2"]
     assert isinstance(parsed.fact_list, list)
-    assert len(parsed.fact_list) == 2  # None fact should be filtered out
+    assert len(parsed.fact_list) == 2
@@ -0,0 +1,348 @@
+"""Tests for prompt loader functions."""
+
+import pytest
+
+from unique_toolkit.agentic.evaluation.context_relevancy.prompts import (
+    system_prompt_loader as context_system_prompt_loader,
+)
+from unique_toolkit.agentic.evaluation.context_relevancy.prompts import (
+    user_prompt_loader as context_user_prompt_loader,
+)
+from unique_toolkit.agentic.evaluation.hallucination.prompts import (
+    system_prompt_loader as hallucination_system_prompt_loader,
+)
+from unique_toolkit.agentic.evaluation.hallucination.prompts import (
+    user_prompt_loader as hallucination_user_prompt_loader,
+)
+
+
+@pytest.mark.ai
+def test_context_system_prompt_loader__returns_non_empty_string__on_call() -> None:
+    """
+    Purpose: Verify that the context relevancy system prompt loader returns a valid string.
+    Why this matters: The system prompt is critical for guiding the evaluation LLM's behavior.
+    Setup summary: Call the loader function and assert type and non-empty content.
+    """
+    # Arrange - No setup needed for this test
+
+    # Act
+    prompt: str = context_system_prompt_loader()
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert len(prompt) > 0
+
+
+@pytest.mark.ai
+def test_context_user_prompt_loader__returns_non_empty_string__on_call() -> None:
+    """
+    Purpose: Verify that the context relevancy user prompt loader returns a valid string.
+    Why this matters: The user prompt template must be valid for evaluation requests.
+    Setup summary: Call the loader function and assert type and non-empty content.
+    """
+    # Arrange - No setup needed for this test
+
+    # Act
+    prompt: str = context_user_prompt_loader()
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert len(prompt) > 0
+
+
+@pytest.mark.ai
+def test_context_system_prompt__contains_jinja_syntax__for_structured_output() -> None:
+    """
+    Purpose: Verify that system prompt contains Jinja2 template syntax for structured output control.
+    Why this matters: Template must support conditional rendering based on structured_output flag.
+    Setup summary: Load system prompt and check for Jinja2 conditional blocks.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = context_system_prompt_loader()
+
+    # Assert
+    assert "{% if structured_output %}" in prompt or "{%" in prompt
+
+
+@pytest.mark.ai
+def test_context_user_prompt__contains_jinja_variables__for_input_and_context() -> None:
+    """
+    Purpose: Verify that user prompt contains required Jinja2 variable placeholders.
+    Why this matters: Template must support dynamic insertion of input text and context texts.
+    Setup summary: Load user prompt and check for expected variable placeholders.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = context_user_prompt_loader()
+
+    # Assert
+    assert "{{ input_text }}" in prompt
+    assert "{{ context_texts }}" in prompt
+
+
+@pytest.mark.ai
+def test_context_system_prompt__has_both_structured_and_regular_modes__in_template() -> (
+    None
+):
+    """
+    Purpose: Verify that system prompt template supports both structured and regular output modes.
+    Why this matters: Template must handle both evaluation output formats correctly.
+    Setup summary: Load system prompt and check for conditional blocks for both modes.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = context_system_prompt_loader()
+
+    # Assert
+    assert "{% if structured_output %}" in prompt
+    assert "{% else %}" in prompt or "{% endif %}" in prompt
+
+
+@pytest.mark.ai
+def test_context_user_prompt__has_conditional_json_instruction__for_unstructured_mode() -> (
+    None
+):
+    """
+    Purpose: Verify that user prompt has conditional JSON instruction for unstructured mode.
+    Why this matters: Non-structured mode requires explicit JSON formatting instructions.
+    Setup summary: Load user prompt and check for conditional JSON instruction block.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = context_user_prompt_loader()
+
+    # Assert
+    assert "{% if not structured_output %}" in prompt or "{%" in prompt
+
+
+@pytest.mark.ai
+def test_context_prompts__are_consistent_between_calls__for_determinism() -> None:
+    """
+    Purpose: Verify that prompt loaders return consistent content across multiple invocations.
+    Why this matters: Ensures deterministic behavior and no hidden state in loaders.
+    Setup summary: Call loaders twice and compare results for equality.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    system_prompt_1: str = context_system_prompt_loader()
+    system_prompt_2: str = context_system_prompt_loader()
+    user_prompt_1: str = context_user_prompt_loader()
+    user_prompt_2: str = context_user_prompt_loader()
+
+    # Assert
+    assert system_prompt_1 == system_prompt_2
+    assert user_prompt_1 == user_prompt_2
+
+
+@pytest.mark.ai
+def test_hallucination_system_prompt_loader__returns_non_empty_string__on_call() -> (
+    None
+):
+    """
+    Purpose: Verify that the hallucination system prompt loader returns a valid string.
+    Why this matters: The system prompt is critical for hallucination detection behavior.
+    Setup summary: Call the loader function and assert type and non-empty content.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_system_prompt_loader()
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert len(prompt) > 0
+
+
+@pytest.mark.ai
+def test_hallucination_user_prompt_loader__returns_non_empty_string__on_call() -> None:
+    """
+    Purpose: Verify that the hallucination user prompt loader returns a valid string.
+    Why this matters: The user prompt template must be valid for hallucination evaluation.
+    Setup summary: Call the loader function and assert type and non-empty content.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert isinstance(prompt, str)
+    assert len(prompt) > 0
+
+
+@pytest.mark.ai
+def test_hallucination_system_prompt__contains_jinja_syntax__for_has_context() -> None:
+    """
+    Purpose: Verify that system prompt contains Jinja2 template syntax for context handling.
+    Why this matters: Template must support conditional rendering based on has_context flag.
+    Setup summary: Load system prompt and check for Jinja2 conditional blocks.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_system_prompt_loader()
+
+    # Assert
+    assert "{% if has_context %}" in prompt or "{%" in prompt
+
+
+@pytest.mark.ai
+def test_hallucination_user_prompt__contains_jinja_variables__for_input_and_output() -> (
+    None
+):
+    """
+    Purpose: Verify that user prompt contains required Jinja2 variable placeholders.
+    Why this matters: Template must support dynamic insertion of input and output texts.
+    Setup summary: Load user prompt and check for expected variable placeholders.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert "{{ input_text }}" in prompt
+    assert "{{ output_text }}" in prompt
+
+
+@pytest.mark.ai
+def test_hallucination_system_prompt__has_context_conditional__in_template() -> None:
+    """
+    Purpose: Verify that system prompt template has conditional logic for has_context.
+    Why this matters: Template must handle both context and non-context evaluation scenarios.
+    Setup summary: Load system prompt and check for conditional blocks with else/endif.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_system_prompt_loader()
+
+    # Assert
+    assert "{% if has_context %}" in prompt
+    assert "{% else %}" in prompt or "{% endif %}" in prompt
+
+
+@pytest.mark.ai
+def test_hallucination_user_prompt__has_optional_context_fields__in_template() -> None:
+    """
+    Purpose: Verify that user prompt has conditional blocks for optional context fields.
+    Why this matters: Template must support optional contexts_text and history_messages_text.
+    Setup summary: Load user prompt and check for conditional blocks or variable placeholders.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert "{% if contexts_text %}" in prompt or "{{ contexts_text }}" in prompt
+    assert (
+        "{% if history_messages_text %}" in prompt
+        or "{{ history_messages_text }}" in prompt
+    )
+
+
+@pytest.mark.ai
+def test_hallucination_system_prompt__mentions_hallucination_concepts__in_content() -> (
+    None
+):
+    """
+    Purpose: Verify that system prompt mentions hallucination-related concepts.
+    Why this matters: Ensures prompt properly guides model to detect hallucinations.
+    Setup summary: Load system prompt and check for hallucination-related keywords.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_system_prompt_loader()
+    prompt_lower: str = prompt.lower()
+
+    # Assert
+    assert (
+        "hallucination" in prompt_lower
+        or "grounded" in prompt_lower
+        or "supported" in prompt_lower
+    )
+
+
+@pytest.mark.ai
+def test_hallucination_user_prompt__contains_data_sections__for_input_and_output() -> (
+    None
+):
+    """
+    Purpose: Verify that user prompt has sections for input and output data.
+    Why this matters: Template must clearly separate input and output for evaluation.
+    Setup summary: Load user prompt and check for input/output section markers.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    prompt: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert "Input:" in prompt or "input" in prompt.lower()
+    assert "Output:" in prompt or "output" in prompt.lower()
+
+
+@pytest.mark.ai
+def test_hallucination_prompts__are_consistent_between_calls__for_determinism() -> None:
+    """
+    Purpose: Verify that hallucination prompt loaders return consistent content.
+    Why this matters: Ensures deterministic behavior and no hidden state in loaders.
+    Setup summary: Call loaders twice and compare results for equality.
+    """
+    # Arrange - No setup needed
+
+    # Act
+    system_prompt_1: str = hallucination_system_prompt_loader()
+    system_prompt_2: str = hallucination_system_prompt_loader()
+    user_prompt_1: str = hallucination_user_prompt_loader()
+    user_prompt_2: str = hallucination_user_prompt_loader()
+
+    # Assert
+    assert system_prompt_1 == system_prompt_2
+    assert user_prompt_1 == user_prompt_2
+
+
+@pytest.mark.ai
+def test_context_relevancy_loaders__can_access_template_files__without_errors() -> None:
+    """
+    Purpose: Verify that context relevancy loaders can successfully access template files.
+    Why this matters: Ensures template files exist and are readable at runtime.
+    Setup summary: Call loaders and assert no FileNotFoundError is raised.
+    """
+    # Arrange - No setup needed
+
+    # Act & Assert
+    try:
+        system_prompt: str = context_system_prompt_loader()
+        user_prompt: str = context_user_prompt_loader()
+        assert system_prompt is not None
+        assert user_prompt is not None
+    except FileNotFoundError as e:
+        pytest.fail(f"Prompt loader failed to access template file: {e}")
+
+
+@pytest.mark.ai
+def test_hallucination_loaders__can_access_template_files__without_errors() -> None:
+    """
+    Purpose: Verify that hallucination loaders can successfully access template files.
+    Why this matters: Ensures template files exist and are readable at runtime.
+    Setup summary: Call loaders and assert no FileNotFoundError is raised.
+    """
+    # Arrange - No setup needed
+
+    # Act & Assert
+    try:
+        system_prompt: str = hallucination_system_prompt_loader()
+        user_prompt: str = hallucination_user_prompt_loader()
+        assert system_prompt is not None
+        assert user_prompt is not None
+    except FileNotFoundError as e:
+        pytest.fail(f"Prompt loader failed to access template file: {e}")
@@ -0,0 +1,8 @@
+from pathlib import Path
+
+
+def load_template(parent_dir: Path, filename: str) -> str:
+    """Load a Jinja2 template file from the hallucination directory."""
+    template_path = parent_dir / filename
+    with open(template_path, "r") as f:
+        return f.read()
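The helper returns raw template text; the structured_output and has_context conditionals the tests probe for are rendered downstream. A minimal rendering sketch, assuming the jinja2 package is available (utils.py itself does not import it; the source string here is invented):

    # Sketch: rendering loaded template source with Jinja2. The flag name
    # structured_output is taken from the tests above.
    from jinja2 import Template

    source = "{% if structured_output %}Return JSON.{% else %}Return prose.{% endif %}"
    rendered = Template(source).render(structured_output=True)
    assert rendered == "Return JSON."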
@@ -156,6 +156,7 @@ class DisplayCodeInterpreterFilesPostProcessor(
                 continue
 
             is_image = (guess_type(filename)[0] or "").startswith("image/")
+            is_html = (guess_type(filename)[0] or "") == "text/html"
 
             # Images
             if is_image:
@@ -168,6 +169,15 @@ class DisplayCodeInterpreterFilesPostProcessor(
                 )
                 changed |= replaced
 
+            # HTML
+            elif is_html:
+                loop_response.message.text, replaced = _replace_container_html_citation(
+                    text=loop_response.message.text,
+                    filename=filename,
+                    content_id=content_id,
+                )
+                changed |= replaced
+
             # Files
             else:
                 loop_response.message.text, replaced = _replace_container_file_citation(
@@ -302,6 +312,30 @@ def _replace_container_image_citation(
     ), True
 
 
+def _replace_container_html_citation(
+    text: str, filename: str, content_id: str
+) -> tuple[str, bool]:
+    html_markdown = rf"!?\[.*?\]\(sandbox:/mnt/data/{re.escape(filename)}\)"
+
+    if not re.search(html_markdown, text):
+        logger.info("No HTML markdown found for %s", filename)
+        return text, False
+
+    logger.info("Displaying HTML %s", filename)
+    html_rendering_block = f"""```HtmlRendering
+100%
+500px
+
+unique://content/{content_id}
+
+```"""
+    return re.sub(
+        html_markdown,
+        html_rendering_block,
+        text,
+    ), True
+
+
 def _replace_container_file_citation(
     text: str, filename: str, ref_number: int
 ) -> tuple[str, bool]:
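For illustration, a self-contained sketch of the substitution this new branch performs; the filename and content id below are invented example values, not taken from the package:

    # Reproduces the rewrite from _replace_container_html_citation above.
    import re

    filename = "report.html"   # invented example
    content_id = "cont_123"    # invented example
    text = "See the chart: [report.html](sandbox:/mnt/data/report.html)"

    pattern = rf"!?\[.*?\]\(sandbox:/mnt/data/{re.escape(filename)}\)"
    block = f"```HtmlRendering\n100%\n500px\n\nunique://content/{content_id}\n\n```"
    # The sandbox markdown link is replaced by an HtmlRendering block that
    # points the frontend at unique://content/cont_123.
    print(re.sub(pattern, block, text))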
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: unique_toolkit
-Version: 1.45.4
+Version: 1.45.6
 Summary:
 License: Proprietary
 Author: Cedric Klinkert
@@ -125,6 +125,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.45.6] - 2026-01-30
+- hallucination evaluator: Use original response to retrieve referenced chunk
+
+## [1.45.5] - 2026-01-29
+- Add HTML rendering support for code interpreter generated files
+
 ## [1.45.4] - 2026-01-26
 - Add ArtifactType `AGENTIC_REPORT`
 
@@ -69,21 +69,31 @@ unique_toolkit/_common/validators.py,sha256=ElnkMsyEY24TfzfTVHvireyT39EnZgW5N40T
 unique_toolkit/agentic/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 unique_toolkit/agentic/debug_info_manager/debug_info_manager.py,sha256=30ZZaw0vffjZjiu9AYdO1Sm8G9FN6XR2ehdOEUCKqh0,891
 unique_toolkit/agentic/debug_info_manager/test/test_debug_info_manager.py,sha256=_fIS6_DHA8A3AB64-LPgHgUGa1w0CFUWwtgV-ZbhkzA,10535
-unique_toolkit/agentic/evaluation/config.py,sha256=Xbti9Tx7vgyUySed9Ovwx0uKL_tFWakMGll5MfOkvBM,905
-unique_toolkit/agentic/evaluation/context_relevancy/prompts.py,sha256=EdHFUOB581yVxcOL8482KUv_LzaRjuiem71EF8udYMc,1331
+unique_toolkit/agentic/evaluation/config.py,sha256=u5-iuT-4mfA2_9UZjZ-TN3YutVf35aR4EsF9l61Odnk,1696
+unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py,sha256=IKGvHW2viBylTUjEJNKvIXO4YrYwdJXBHSxA9rKtiRI,355
+unique_toolkit/agentic/evaluation/context_relevancy/prompts/system_prompt.j2,sha256=PFT9Y7QQ85h-ronwK8sq6zJJrxfrZhvZkYHXspMdDHU,985
+unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2,sha256=9HXm0Qg0xAPIE5mnaUiHrscolD_qO7f83qlIjXiP1n4,150
 unique_toolkit/agentic/evaluation/context_relevancy/schema.py,sha256=lZd0TPzH43ifgWWGg3WO6b1AQX8aK2R9y51yH0d1DHM,2919
-unique_toolkit/agentic/evaluation/context_relevancy/service.py,sha256=2NM1_PCP6fXsRm0r-MGrUg5Z3WO00FBCqmiU8f5Kagg,9661
+unique_toolkit/agentic/evaluation/context_relevancy/service.py,sha256=dsgpfKRSg9B4kjLhHJD_Kath4GVhHE-ZOVAGRkiCz20,8729
 unique_toolkit/agentic/evaluation/evaluation_manager.py,sha256=wDN_Uuut9kEGek8JY3QeInKpF-ukbvOSKOVd7DHFT3Q,8121
 unique_toolkit/agentic/evaluation/exception.py,sha256=7lcVbCyoN4Md1chNJDFxpUYyWbVrcr9dcc3TxWykJTc,115
-unique_toolkit/agentic/evaluation/hallucination/constants.py,sha256=XyRTAlL9nVWuHXTe6MP97fOkAtnbEhtw7KCl08QJydU,2150
+unique_toolkit/agentic/evaluation/hallucination/constants.py,sha256=-PnZ3N9VpwgbIe6hcUye40nvJa-JIRuTidCZAQwZ3GA,2473
 unique_toolkit/agentic/evaluation/hallucination/hallucination_evaluation.py,sha256=x5ta2Fum4fE5ySgIXPKlnbTtmV140z0IazSATd0-REg,4092
-unique_toolkit/agentic/evaluation/hallucination/prompts.py,sha256=O3Hi_rOzZlujvnO2wn2jhoPmrYLjzVtRWwxn5Q81m9Y,3405
+unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py,sha256=4KFYMZsB3fJUKzoiUJE1npZ0gueWgvceB32EUrN-v7A,343
+unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2,sha256=sDUX6G645Ba40D_qKu4cUI8g-sJOfG8JpZreTNFgf7M,2616
+unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2,sha256=mD_qE9fOkyc1XXrebFt097ddx8bTlA6lbY04hKSQmWs,273
 unique_toolkit/agentic/evaluation/hallucination/service.py,sha256=WJF1f45uHnYLx1S4TW31bSFobFpV-YlOS3G_zMhuBVU,2512
-unique_toolkit/agentic/evaluation/hallucination/utils.py,sha256=uHKTJw4kJyq0_Gi-EOhbocBAij4_Vzn3dW1wTxAuFg4,9706
+unique_toolkit/agentic/evaluation/hallucination/utils.py,sha256=fxT7H1PQ6xANNvtViuhhR_9ac5ggDmFx-YfjcKUZRcg,12013
 unique_toolkit/agentic/evaluation/output_parser.py,sha256=0FDo8YY_Dc4qlTNeYyQkznzIFj9aX9wMrLOTbhhTl6g,1418
 unique_toolkit/agentic/evaluation/schemas.py,sha256=m9JMCUmeqP8KhsJOVEzsz6dRXUe1uKw-bxRDtn5qwvM,3156
-unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py,sha256=4tDxHTApbaTMxN1sNS8WCqj2BweRk6YqZ5_zHP45jto,7977
-unique_toolkit/agentic/evaluation/tests/test_output_parser.py,sha256=RN_HcBbU6qy_e_PoYyUFcjWnp3ymJ6-gLj6TgEOupAI,3107
+unique_toolkit/agentic/evaluation/tests/fixtures.py,sha256=Q-ughTfDiAdsMKbBVGzFiBucFdAx-FXgJ9iqp5xMyPs,2801
+unique_toolkit/agentic/evaluation/tests/test_config.py,sha256=p7xFQ7KE_yU8jGpqYA7ntAYe5Vln33wd6nwv3FM9XfI,8327
+unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py,sha256=NcSOyBJ_lqYehtlraZPo9RLutCitTP76kvkuyogSD2A,9477
+unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py,sha256=jT61WxKic-jDUJT1BeVjzhck02EnaMi1ng2H82-Aq_Q,19348
+unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py,sha256=PKyGR073HxT0J_g8626kCURbMSlrMgkg-xPP7dPHD-0,31838
+unique_toolkit/agentic/evaluation/tests/test_output_parser.py,sha256=KfltytmvqnPWLhmZpBXqcRmnlYorw_USwM5rkLVv8so,5179
+unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py,sha256=zBREdlKf5tdDyB8XSaNgpQv3-tuZJoYteeJrp6WMWDM,11897
+unique_toolkit/agentic/evaluation/utils.py,sha256=HmyPaDV8wdW-_gOjjW-wDaMKgdrsP5-SHP7OqTmGI_A,264
 unique_toolkit/agentic/feature_flags/__init__.py,sha256=LhE2cHoa9AYBOR7TjiIToOn46sttm9paKcrzE7gnDPM,149
 unique_toolkit/agentic/feature_flags/feature_flags.py,sha256=4jPH0GGGt5-tQ6PJWNpMBIlYzNrQIIqBLx8W02lwxD0,1140
 unique_toolkit/agentic/history_manager/history_construction_with_contents.py,sha256=TwamOOnYTYZMQdY1mAzj6_MZOe3T5RsjFDarT1tCtYo,8150
@@ -111,7 +121,7 @@ unique_toolkit/agentic/postprocessor/postprocessor_manager.py,sha256=CoKzVFeLIr1
 unique_toolkit/agentic/reference_manager/reference_manager.py,sha256=x51CT0D8HHu2LzgXdHGy0leOYpjnsxVbPZ2nc28G9mA,4005
 unique_toolkit/agentic/responses_api/__init__.py,sha256=9WTO-ef7fGE9Y1QtZJFm8Q_jkwK8Srtl-HWvpAD2Wxs,668
 unique_toolkit/agentic/responses_api/postprocessors/code_display.py,sha256=h6ZqPR0kPQnxM0ynshYQTa1BrcN8XGbUz9p03m8rOj0,2339
-unique_toolkit/agentic/responses_api/postprocessors/generated_files.py,sha256=JWJGxxCfIrBGyf45ic50MwFN4AqOhYSuNFrPXrIhPWI,11727
+unique_toolkit/agentic/responses_api/postprocessors/generated_files.py,sha256=janOVTJtGDXWvWsUjB1pnRfLUVXPSkjjbXHxUS0IjjE,12685
 unique_toolkit/agentic/responses_api/stream_handler.py,sha256=Y1IM0uiPBdlab5UuOTCsHTaVX-fd9MxfS3xkwhdFie4,647
 unique_toolkit/agentic/short_term_memory_manager/persistent_short_term_memory_manager.py,sha256=g8I64dKkpwWIXfwpxD1-rLte00hh_PoQ9-fXUAcNQCo,5817
 unique_toolkit/agentic/thinking_manager/thinking_manager.py,sha256=41QWFsdRrbWlQHBfYCFv726UDom4WbcvaRfjCmoUOQI,4183
@@ -244,7 +254,7 @@ unique_toolkit/short_term_memory/service.py,sha256=5PeVBu1ZCAfyDb2HLVvlmqSbyzBBu
 unique_toolkit/smart_rules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 unique_toolkit/smart_rules/compile.py,sha256=Ozhh70qCn2yOzRWr9d8WmJeTo7AQurwd3tStgBMPFLA,1246
 unique_toolkit/test_utilities/events.py,sha256=_mwV2bs5iLjxS1ynDCjaIq-gjjKhXYCK-iy3dRfvO3g,6410
-unique_toolkit-1.45.4.dist-info/LICENSE,sha256=GlN8wHNdh53xwOPg44URnwag6TEolCjoq3YD_KrWgss,193
-unique_toolkit-1.45.4.dist-info/METADATA,sha256=CF-1ODs9g_f-JcGdbVQhhOYuU4d5zpyIK1Sa1MdFqtU,49046
-unique_toolkit-1.45.4.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-unique_toolkit-1.45.4.dist-info/RECORD,,
+unique_toolkit-1.45.6.dist-info/LICENSE,sha256=GlN8wHNdh53xwOPg44URnwag6TEolCjoq3YD_KrWgss,193
+unique_toolkit-1.45.6.dist-info/METADATA,sha256=Ojrf6ABO88IO5h892iywEweJuc9p67MWzHMXa--40gE,49243
+unique_toolkit-1.45.6.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+unique_toolkit-1.45.6.dist-info/RECORD,,