unique_toolkit-1.45.4-py3-none-any.whl → unique_toolkit-1.45.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_toolkit/agentic/evaluation/config.py +25 -6
- unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py +13 -0
- unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2} +11 -43
- unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 +15 -0
- unique_toolkit/agentic/evaluation/context_relevancy/service.py +24 -56
- unique_toolkit/agentic/evaluation/hallucination/constants.py +26 -15
- unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py +13 -0
- unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 +35 -0
- unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 +27 -0
- unique_toolkit/agentic/evaluation/hallucination/utils.py +153 -102
- unique_toolkit/agentic/evaluation/tests/fixtures.py +102 -0
- unique_toolkit/agentic/evaluation/tests/test_config.py +247 -0
- unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +141 -121
- unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py +600 -0
- unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py +1009 -0
- unique_toolkit/agentic/evaluation/tests/test_output_parser.py +82 -23
- unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py +348 -0
- unique_toolkit/agentic/evaluation/utils.py +8 -0
- unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +34 -0
- {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/METADATA +7 -1
- {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/RECORD +23 -13
- unique_toolkit/agentic/evaluation/hallucination/prompts.py +0 -79
- {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/LICENSE +0 -0
- {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/WHEEL +0 -0
@@ -0,0 +1,1009 @@
"""Tests for hallucination evaluation utils."""

from typing import List, Optional

import pytest

from unique_toolkit.agentic.evaluation.hallucination.constants import (
    HallucinationConfig,
    SourceSelectionMode,
)
from unique_toolkit.agentic.evaluation.hallucination.utils import (
    _compose_msgs,
    _default_source_selection_mode,
    _from_order_source_selection_mode,
    _from_original_response_source_selection_mode,
    _get_msgs,
    context_text_from_stream_response,
)
from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricInput
from unique_toolkit.content import ContentReference
from unique_toolkit.content.schemas import ContentChunk
from unique_toolkit.language_model.schemas import (
    LanguageModelMessageRole,
    LanguageModelMessages,
    LanguageModelStreamResponse,
    LanguageModelStreamResponseMessage,
)


@pytest.fixture
def sample_chunks() -> List[ContentChunk]:
    """Create sample content chunks for testing."""
    return [
        ContentChunk(
            id="cont_123",
            chunk_id="chunk_001",
            text="First chunk text",
            order=0,
        ),
        ContentChunk(
            id="cont_123",
            chunk_id="chunk_002",
            text="Second chunk text",
            order=1,
        ),
        ContentChunk(
            id="cont_456",
            chunk_id="chunk_003",
            text="Third chunk text",
            order=0,
        ),
        ContentChunk(
            id="cont_456",
            chunk_id="chunk_004",
            text="Fourth chunk text",
            order=1,
        ),
    ]


@pytest.fixture
def sample_references() -> List[ContentReference]:
    """Create sample content references for testing."""
    return [
        ContentReference(
            name="Reference 1",
            sequence_number=1,
            source="test_source",
            source_id="cont_123_chunk_001",
            url="http://example.com/1",
            original_index=[0],
        ),
        ContentReference(
            name="Reference 2",
            sequence_number=2,
            source="test_source",
            source_id="cont_456_chunk_003",
            url="http://example.com/2",
            original_index=[2],
        ),
    ]


@pytest.fixture
def hallucination_config() -> HallucinationConfig:
    """Create a hallucination config for testing."""
    return HallucinationConfig(enabled=True)


@pytest.fixture
def evaluation_input() -> EvaluationMetricInput:
    """Create an evaluation input for testing."""
    return EvaluationMetricInput(
        input_text="Test question",
        context_texts=["Context 1", "Context 2"],
        output_text="Test output",
    )


@pytest.mark.ai
def test_default_source_selection_mode__selects_chunks__by_source_id_match(
    sample_references: List[ContentReference],
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that chunks are selected by matching source_id from references.
    Why this matters: FROM_IDS mode is the most precise chunk selection method.
    Setup summary: Provide references with source IDs, assert matching chunks selected.
    """
    # Arrange - Fixtures provide data

    # Act
    result: List[ContentChunk] = _default_source_selection_mode(
        sample_references, sample_chunks
    )

    # Assert
    assert len(result) == 2
    assert result[0].chunk_id == "chunk_001"
    assert result[1].chunk_id == "chunk_003"


@pytest.mark.ai
def test_default_source_selection_mode__returns_empty_list__when_no_matches_found(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that empty list is returned when no references match chunks.
    Why this matters: Graceful handling of missing references is critical.
    Setup summary: Provide non-matching reference, assert empty result.
    """
    # Arrange
    references: List[ContentReference] = [
        ContentReference(
            name="No Match",
            sequence_number=1,
            source="test",
            source_id="nonexistent_id",
            url="http://example.com",
            original_index=[0],
        )
    ]

    # Act
    result: List[ContentChunk] = _default_source_selection_mode(
        references, sample_chunks
    )

    # Assert
    assert len(result) == 0


@pytest.mark.ai
def test_default_source_selection_mode__returns_empty_list__with_empty_references(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that empty list is returned when references list is empty.
    Why this matters: Handles edge case of no references gracefully.
    Setup summary: Provide empty references list, assert empty result.
    """
    # Arrange
    references: List[ContentReference] = []

    # Act
    result: List[ContentChunk] = _default_source_selection_mode(
        references, sample_chunks
    )

    # Assert
    assert len(result) == 0


@pytest.mark.ai
def test_default_source_selection_mode__returns_empty_list__with_empty_chunks(
    sample_references: List[ContentReference],
) -> None:
    """
    Purpose: Verify that empty list is returned when chunks list is empty.
    Why this matters: Handles edge case of no available chunks gracefully.
    Setup summary: Provide empty chunks list, assert empty result.
    """
    # Arrange
    chunks: List[ContentChunk] = []

    # Act
    result: List[ContentChunk] = _default_source_selection_mode(
        sample_references, chunks
    )

    # Assert
    assert len(result) == 0


@pytest.mark.ai
def test_default_source_selection_mode__builds_chunk_id_correctly__for_matching(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that chunk IDs are built correctly using id_chunkId format.
    Why this matters: Correct ID construction is critical for chunk matching.
    Setup summary: Provide reference with specific ID format, assert correct chunk found.
    """
    # Arrange
    references: List[ContentReference] = [
        ContentReference(
            name="Ref",
            sequence_number=1,
            source="test",
            source_id="cont_123_chunk_001",
            url="http://example.com",
            original_index=[0],
        )
    ]

    # Act
    result: List[ContentChunk] = _default_source_selection_mode(
        references, sample_chunks
    )

    # Assert
    assert len(result) == 1
    assert result[0].id == "cont_123"
    assert result[0].chunk_id == "chunk_001"


@pytest.mark.ai
def test_from_order_source_selection_mode__selects_chunks__by_original_index_order(
    sample_references: List[ContentReference],
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that chunks are selected by original_index values from references.
    Why this matters: FROM_ORDER mode enables position-based chunk selection.
    Setup summary: Provide references with original indices, assert chunks at those positions.
    """
    # Arrange - Fixtures provide data

    # Act
    result: List[ContentChunk] = _from_order_source_selection_mode(
        sample_references, sample_chunks
    )

    # Assert
    assert len(result) == 2
    assert result[0] == sample_chunks[0]  # original_index [0]
    assert result[1] == sample_chunks[2]  # original_index [2]


@pytest.mark.ai
def test_from_order_source_selection_mode__handles_multiple_indices__in_single_reference(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify handling of references with multiple original indices.
    Why this matters: Single reference may cite multiple chunks.
    Setup summary: Provide reference with multiple indices, assert all chunks selected.
    """
    # Arrange
    references: List[ContentReference] = [
        ContentReference(
            name="Multi",
            sequence_number=1,
            source="test",
            source_id="test_id",
            url="http://example.com",
            original_index=[0, 2, 3],
        )
    ]

    # Act
    result: List[ContentChunk] = _from_order_source_selection_mode(
        references, sample_chunks
    )

    # Assert
    assert len(result) == 3
    assert result[0] == sample_chunks[0]
    assert result[1] == sample_chunks[2]
    assert result[2] == sample_chunks[3]


@pytest.mark.ai
def test_from_order_source_selection_mode__removes_duplicate_indices__while_preserving_order(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that duplicate indices across references are deduplicated.
    Why this matters: Prevents duplicate chunks in context while maintaining order.
    Setup summary: Provide overlapping indices, assert deduplicated result.
    """
    # Arrange
    references: List[ContentReference] = [
        ContentReference(
            name="Ref1",
            sequence_number=1,
            source="test",
            source_id="id1",
            url="http://example.com",
            original_index=[0, 1],
        ),
        ContentReference(
            name="Ref2",
            sequence_number=2,
            source="test",
            source_id="id2",
            url="http://example.com",
            original_index=[1, 2],  # 1 is duplicate
        ),
    ]

    # Act
    result: List[ContentChunk] = _from_order_source_selection_mode(
        references, sample_chunks
    )

    # Assert
    assert len(result) == 3  # 0, 1, 2 (no duplicate 1)
    assert result[0] == sample_chunks[0]
    assert result[1] == sample_chunks[1]
    assert result[2] == sample_chunks[2]


@pytest.mark.ai
def test_from_order_source_selection_mode__returns_empty_list__with_empty_references(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that empty list is returned when references list is empty.
    Why this matters: Handles edge case gracefully without errors.
    Setup summary: Provide empty references, assert empty result.
    """
    # Arrange
    references: List[ContentReference] = []

    # Act
    result: List[ContentChunk] = _from_order_source_selection_mode(
        references, sample_chunks
    )

    # Assert
    assert len(result) == 0


@pytest.mark.ai
def test_from_order_source_selection_mode__preserves_order__from_reference_appearance(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that chunk order matches the order indices appear in references.
    Why this matters: Order preservation maintains citation flow from original response.
    Setup summary: Provide indices in specific order, assert result matches that order.
    """
    # Arrange
    references: List[ContentReference] = [
        ContentReference(
            name="Ref",
            sequence_number=1,
            source="test",
            source_id="id",
            url="http://example.com",
            original_index=[3, 1, 0],  # Specific order
        )
    ]

    # Act
    result: List[ContentChunk] = _from_order_source_selection_mode(
        references, sample_chunks
    )

    # Assert
    assert len(result) == 3
    assert result[0] == sample_chunks[3]
    assert result[1] == sample_chunks[1]
    assert result[2] == sample_chunks[0]


@pytest.mark.ai
def test_from_original_response_source_selection_mode__extracts_source_numbers__from_text(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify extraction of source numbers from text using regex pattern.
    Why this matters: Enables detection of actually cited sources in generated text.
    Setup summary: Provide text with source citations, assert correct chunks extracted.
    """
    # Arrange
    original_text: str = "Based on [source0] and [source2], we can conclude..."
    pattern: str = r"[\[<]?source(\d+)[>\]]?"

    # Act
    result: List[ContentChunk] = _from_original_response_source_selection_mode(
        original_text, sample_chunks, pattern
    )

    # Assert
    assert len(result) == 2
    assert result[0] == sample_chunks[0]
    assert result[1] == sample_chunks[2]


@pytest.mark.ai
def test_from_original_response_source_selection_mode__handles_different_reference_formats(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify extraction works with multiple reference format variations.
    Why this matters: Different systems may use different citation formats.
    Setup summary: Provide text with mixed citation formats, assert all extracted.
    """
    # Arrange
    original_text: str = "From <source0> and source1 and [source3]"
    pattern: str = r"[\[<]?source(\d+)[>\]]?"

    # Act
    result: List[ContentChunk] = _from_original_response_source_selection_mode(
        original_text, sample_chunks, pattern
    )

    # Assert
    assert len(result) == 3
    assert result[0] == sample_chunks[0]
    assert result[1] == sample_chunks[1]
    assert result[2] == sample_chunks[3]


@pytest.mark.ai
def test_from_original_response_source_selection_mode__removes_duplicate_references__while_preserving_order(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that duplicate source citations are deduplicated.
    Why this matters: Prevents duplicate chunks in evaluation context.
    Setup summary: Provide text with repeated citations, assert deduplicated result.
    """
    # Arrange
    original_text: str = "[source0] [source1] [source0] [source2]"
    pattern: str = r"[\[<]?source(\d+)[>\]]?"

    # Act
    result: List[ContentChunk] = _from_original_response_source_selection_mode(
        original_text, sample_chunks, pattern
    )

    # Assert
    assert len(result) == 3  # 0, 1, 2 (no duplicate 0)
    assert result[0] == sample_chunks[0]
    assert result[1] == sample_chunks[1]
    assert result[2] == sample_chunks[2]


@pytest.mark.ai
def test_from_original_response_source_selection_mode__raises_value_error__when_original_text_is_none(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that ValueError is raised when original_text is None.
    Why this matters: This mode requires original text to extract citations.
    Setup summary: Call with None text, assert ValueError with descriptive message.
    """
    # Arrange
    original_text: Optional[str] = None
    pattern: str = r"[\[<]?source(\d+)[>\]]?"

    # Act & Assert
    with pytest.raises(ValueError) as exc_info:
        _from_original_response_source_selection_mode(
            original_text,
            sample_chunks,
            pattern,  # type: ignore
        )

    assert "original_text is required" in str(exc_info.value)


@pytest.mark.ai
def test_from_original_response_source_selection_mode__filters_out_of_bounds_indices(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that source indices beyond chunk list length are filtered out.
    Why this matters: Prevents index errors and gracefully handles invalid references.
    Setup summary: Provide text with out-of-bounds index, assert it's filtered.
    """
    # Arrange
    original_text: str = "[source0] [source10] [source2]"  # source10 is out of bounds
    pattern: str = r"[\[<]?source(\d+)[>\]]?"

    # Act
    result: List[ContentChunk] = _from_original_response_source_selection_mode(
        original_text, sample_chunks, pattern
    )

    # Assert
    assert len(result) == 2
    assert result[0] == sample_chunks[0]
    assert result[1] == sample_chunks[2]


@pytest.mark.ai
def test_from_original_response_source_selection_mode__returns_empty_list__when_no_references_found(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that empty list is returned when no citations found in text.
    Why this matters: Handles case of text without source citations gracefully.
    Setup summary: Provide text with no citations, assert empty result.
    """
    # Arrange
    original_text: str = "No references in this text"
    pattern: str = r"[\[<]?source(\d+)[>\]]?"

    # Act
    result: List[ContentChunk] = _from_original_response_source_selection_mode(
        original_text, sample_chunks, pattern
    )

    # Assert
    assert len(result) == 0


@pytest.mark.ai
def test_from_original_response_source_selection_mode__works_with_custom_regex_pattern(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that custom regex patterns can be used for extraction.
    Why this matters: Enables support for organization-specific citation formats.
    Setup summary: Provide custom pattern and matching text, assert extraction works.
    """
    # Arrange
    original_text: str = "See ref:0 and ref:2 for details"
    pattern: str = r"ref:(\d+)"

    # Act
    result: List[ContentChunk] = _from_original_response_source_selection_mode(
        original_text, sample_chunks, pattern
    )

    # Assert
    assert len(result) == 2
    assert result[0] == sample_chunks[0]
    assert result[1] == sample_chunks[2]


@pytest.mark.ai
def test_from_original_response_source_selection_mode__preserves_order__from_text_appearance(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that chunk order matches citation order in original text.
    Why this matters: Order preservation maintains logical flow of cited sources.
    Setup summary: Provide text with specific citation order, assert result matches.
    """
    # Arrange
    original_text: str = "[source3] [source1] [source0]"
    pattern: str = r"[\[<]?source(\d+)[>\]]?"

    # Act
    result: List[ContentChunk] = _from_original_response_source_selection_mode(
        original_text, sample_chunks, pattern
    )

    # Assert
    assert len(result) == 3
    assert result[0] == sample_chunks[3]
    assert result[1] == sample_chunks[1]
    assert result[2] == sample_chunks[0]


@pytest.mark.ai
def test_context_text_from_stream_response__extracts_context__using_from_ids_mode(
    sample_chunks: List[ContentChunk],
    sample_references: List[ContentReference],
) -> None:
    """
    Purpose: Verify context extraction using FROM_IDS source selection mode.
    Why this matters: FROM_IDS is most accurate mode for known reference IDs.
    Setup summary: Create response with references, use FROM_IDS mode, assert correct texts.
    """
    # Arrange
    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
        message=LanguageModelStreamResponseMessage(
            id="msg_1",
            previous_message_id=None,
            role=LanguageModelMessageRole.ASSISTANT,
            text="Test",
            references=sample_references,
        )
    )

    # Act
    result: List[str] = context_text_from_stream_response(
        response, sample_chunks, SourceSelectionMode.FROM_IDS
    )

    # Assert
    assert len(result) == 2
    assert result[0] == "First chunk text"
    assert result[1] == "Third chunk text"


@pytest.mark.ai
def test_context_text_from_stream_response__extracts_context__using_from_order_mode(
    sample_chunks: List[ContentChunk],
    sample_references: List[ContentReference],
) -> None:
    """
    Purpose: Verify context extraction using FROM_ORDER source selection mode.
    Why this matters: FROM_ORDER enables index-based chunk selection.
    Setup summary: Create response with references, use FROM_ORDER mode, assert strings returned.
    """
    # Arrange
    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
        message=LanguageModelStreamResponseMessage(
            id="msg_1",
            previous_message_id=None,
            role=LanguageModelMessageRole.ASSISTANT,
            text="Test",
            references=sample_references,
        )
    )

    # Act
    result: List[str] = context_text_from_stream_response(
        response, sample_chunks, SourceSelectionMode.FROM_ORDER
    )

    # Assert
    assert len(result) == 2
    assert isinstance(result[0], str)
    assert isinstance(result[1], str)


@pytest.mark.ai
def test_context_text_from_stream_response__extracts_context__using_from_original_response_mode(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify context extraction using FROM_ORIGINAL_RESPONSE mode.
    Why this matters: Extracts only sources actually cited in generated text.
    Setup summary: Create response with original_text citations, assert extraction works.
    """
    # Arrange
    references: List[ContentReference] = [
        ContentReference(
            name="Ref",
            sequence_number=1,
            source="test",
            source_id="id",
            url="http://example.com",
            original_index=[0],
        )
    ]
    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
        message=LanguageModelStreamResponseMessage(
            id="msg_1",
            previous_message_id=None,
            role=LanguageModelMessageRole.ASSISTANT,
            text="Test",
            original_text="Based on [source0] and [source2]",
            references=references,
        )
    )
    pattern: str = r"[\[<]?source(\d+)[>\]]?"

    # Act
    result: List[str] = context_text_from_stream_response(
        response,
        sample_chunks,
        SourceSelectionMode.FROM_ORIGINAL_RESPONSE,
        pattern,
    )

    # Assert
    assert len(result) == 2
    assert result[0] == "First chunk text"
    assert result[1] == "Third chunk text"


@pytest.mark.ai
def test_context_text_from_stream_response__falls_back_to_default__with_invalid_mode(
    sample_chunks: List[ContentChunk],
    sample_references: List[ContentReference],
) -> None:
    """
    Purpose: Verify that invalid mode falls back to FROM_IDS mode gracefully.
    Why this matters: Ensures robustness against configuration errors.
    Setup summary: Use invalid mode string, assert fallback returns results.
    """
    # Arrange
    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
        message=LanguageModelStreamResponseMessage(
            id="msg_1",
            previous_message_id=None,
            role=LanguageModelMessageRole.ASSISTANT,
            text="Test",
            references=sample_references,
        )
    )

    # Act
    result: List[str] = context_text_from_stream_response(
        response,
        sample_chunks,
        "INVALID_MODE",  # type: ignore
    )

    # Assert
    assert len(result) == 2


@pytest.mark.ai
def test_context_text_from_stream_response__falls_back_to_default__on_extraction_error(
    sample_chunks: List[ContentChunk],
) -> None:
    """
    Purpose: Verify that extraction errors trigger fallback to default mode.
    Why this matters: Ensures evaluation continues even with malformed data.
    Setup summary: Create scenario that causes error, assert fallback succeeds.
    """
    # Arrange
    references: List[ContentReference] = [
        ContentReference(
            name="Ref",
            sequence_number=1,
            source="test",
            source_id="cont_123_chunk_001",
            url="http://example.com",
            original_index=[0],
        )
    ]
    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
        message=LanguageModelStreamResponseMessage(
            id="msg_1",
            previous_message_id=None,
            role=LanguageModelMessageRole.ASSISTANT,
            text="Test",
            original_text=None,  # Will cause error in FROM_ORIGINAL_RESPONSE
            references=references,
        )
    )

    # Act
    result: List[str] = context_text_from_stream_response(
        response, sample_chunks, SourceSelectionMode.FROM_ORIGINAL_RESPONSE
    )

    # Assert
    assert len(result) == 1
    assert result[0] == "First chunk text"


@pytest.mark.ai
def test_context_text_from_stream_response__returns_text_strings__not_chunk_objects(
    sample_chunks: List[ContentChunk],
    sample_references: List[ContentReference],
) -> None:
    """
    Purpose: Verify that function returns list of text strings, not ContentChunk objects.
    Why this matters: Evaluation expects string context, not chunk objects.
    Setup summary: Call function, assert all results are strings not ContentChunk instances.
    """
    # Arrange
    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
        message=LanguageModelStreamResponseMessage(
            id="msg_1",
            previous_message_id=None,
            role=LanguageModelMessageRole.ASSISTANT,
            text="Test",
            references=sample_references,
        )
    )

    # Act
    result: List[str] = context_text_from_stream_response(
        response, sample_chunks, SourceSelectionMode.FROM_IDS
    )

    # Assert
    assert all(isinstance(text, str) for text in result)
    assert not any(isinstance(text, ContentChunk) for text in result)


@pytest.mark.ai
def test_get_msgs__composes_messages__with_context_and_history(
    hallucination_config: HallucinationConfig,
) -> None:
    """
    Purpose: Verify message composition with both context and history provided.
    Why this matters: Full context enables accurate hallucination detection.
    Setup summary: Create input with all fields, assert message structure.
    """
    # Arrange
    input_data: EvaluationMetricInput = EvaluationMetricInput(
        input_text="Question",
        context_texts=["Context 1"],
        history_messages=[],
        output_text="Output",
    )

    # Act
    result: LanguageModelMessages = _get_msgs(input_data, hallucination_config)

    # Assert
    assert isinstance(result, LanguageModelMessages)
    assert len(result.root) == 2


@pytest.mark.ai
def test_get_msgs__composes_messages__without_context_or_history(
    hallucination_config: HallucinationConfig,
) -> None:
    """
    Purpose: Verify message composition works without context or history.
    Why this matters: Hallucination can be detected even without grounding context.
    Setup summary: Create input with only input/output, assert message structure.
    """
    # Arrange
    input_data: EvaluationMetricInput = EvaluationMetricInput(
        input_text="Question",
        output_text="Output",
    )

    # Act
    result: LanguageModelMessages = _get_msgs(input_data, hallucination_config)

    # Assert
    assert isinstance(result, LanguageModelMessages)
    assert len(result.root) == 2


@pytest.mark.ai
def test_get_msgs__composes_messages__with_context_texts_only(
    hallucination_config: HallucinationConfig,
) -> None:
    """
    Purpose: Verify message composition with context texts but no history.
    Why this matters: Common scenario for single-turn evaluations.
    Setup summary: Create input with context but no history, assert message structure.
    """
    # Arrange
    input_data: EvaluationMetricInput = EvaluationMetricInput(
        input_text="Question",
        context_texts=["Context 1", "Context 2"],
        output_text="Output",
    )

    # Act
    result: LanguageModelMessages = _get_msgs(input_data, hallucination_config)

    # Assert
    assert isinstance(result, LanguageModelMessages)
    assert len(result.root) == 2


@pytest.mark.ai
def test_compose_msgs__creates_valid_messages__with_context(
    hallucination_config: HallucinationConfig,
) -> None:
    """
    Purpose: Verify that messages are composed correctly when context is available.
    Why this matters: Context affects prompt template rendering.
    Setup summary: Call with has_context=True, assert message structure.
    """
    # Arrange
    input_data: EvaluationMetricInput = EvaluationMetricInput(
        input_text="Question",
        context_texts=["Context 1"],
        output_text="Output",
    )

    # Act
    result: LanguageModelMessages = _compose_msgs(
        input_data, hallucination_config, has_context=True
    )

    # Assert
    assert isinstance(result, LanguageModelMessages)
    assert len(result.root) == 2
    assert result.root[0].role == "system"
    assert result.root[1].role == "user"


@pytest.mark.ai
def test_compose_msgs__creates_valid_messages__without_context(
    hallucination_config: HallucinationConfig,
) -> None:
    """
    Purpose: Verify that messages are composed correctly when context is absent.
    Why this matters: No-context mode uses different prompt template.
    Setup summary: Call with has_context=False, assert message structure.
    """
    # Arrange
    input_data: EvaluationMetricInput = EvaluationMetricInput(
        input_text="Question",
        output_text="Output",
    )

    # Act
    result: LanguageModelMessages = _compose_msgs(
        input_data, hallucination_config, has_context=False
    )

    # Assert
    assert isinstance(result, LanguageModelMessages)
    assert len(result.root) == 2


@pytest.mark.ai
def test_compose_msgs__uses_different_system_prompts__based_on_context_flag(
    hallucination_config: HallucinationConfig,
) -> None:
    """
    Purpose: Verify that system message content differs based on has_context flag.
    Why this matters: Different prompts needed for grounded vs ungrounded evaluation.
    Setup summary: Call with both flags, assert system prompts differ.
    """
    # Arrange
    input_data: EvaluationMetricInput = EvaluationMetricInput(
        input_text="Question",
        output_text="Output",
    )

    # Act
    result_with_context: LanguageModelMessages = _compose_msgs(
        input_data, hallucination_config, has_context=True
    )
    result_without_context: LanguageModelMessages = _compose_msgs(
        input_data, hallucination_config, has_context=False
    )

    # Assert
    assert result_with_context.root[0].content != result_without_context.root[0].content


@pytest.mark.ai
def test_compose_msgs__includes_input_text__in_user_message(
    hallucination_config: HallucinationConfig,
) -> None:
    """
    Purpose: Verify that user message contains the input text from evaluation input.
    Why this matters: Input text provides context for hallucination evaluation.
    Setup summary: Create input with specific text, assert it appears in user message.
    """
    # Arrange
    input_text: str = "What is the capital of France?"
    input_data: EvaluationMetricInput = EvaluationMetricInput(
        input_text=input_text,
        output_text="Output",
    )

    # Act
    result: LanguageModelMessages = _compose_msgs(
        input_data, hallucination_config, has_context=False
    )

    # Assert
    assert input_text in result.root[1].content


@pytest.mark.ai
def test_compose_msgs__includes_output_text__in_user_message(
    hallucination_config: HallucinationConfig,
) -> None:
    """
    Purpose: Verify that user message contains the output text being evaluated.
    Why this matters: Output text is the primary target of hallucination detection.
    Setup summary: Create input with specific output, assert it appears in user message.
    """
    # Arrange
    output_text: str = "The capital is Paris."
    input_data: EvaluationMetricInput = EvaluationMetricInput(
        input_text="Question",
        output_text=output_text,
    )

    # Act
    result: LanguageModelMessages = _compose_msgs(
        input_data, hallucination_config, has_context=False
    )

    # Assert
    assert output_text in result.root[1].content


@pytest.mark.ai
def test_compose_msgs__generates_non_empty_messages__from_config_prompts(
    hallucination_config: HallucinationConfig,
) -> None:
    """
    Purpose: Verify that messages use prompts from config and generate non-empty content.
    Why this matters: Ensures prompt templates are properly rendered.
    Setup summary: Compose messages, assert both system and user messages have content.
    """
    # Arrange
    input_data: EvaluationMetricInput = EvaluationMetricInput(
        input_text="Question",
        output_text="Output",
    )

    # Act
    result: LanguageModelMessages = _compose_msgs(
        input_data, hallucination_config, has_context=True
    )

    # Assert
    assert len(result.root[0].content) > 0
    assert len(result.root[1].content) > 0