unique_toolkit-1.45.4-py3-none-any.whl → unique_toolkit-1.45.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. unique_toolkit/agentic/evaluation/config.py +25 -6
  2. unique_toolkit/agentic/evaluation/context_relevancy/prompts/__init__.py +13 -0
  3. unique_toolkit/agentic/evaluation/context_relevancy/{prompts.py → prompts/system_prompt.j2} +11 -43
  4. unique_toolkit/agentic/evaluation/context_relevancy/prompts/user_prompt.j2 +15 -0
  5. unique_toolkit/agentic/evaluation/context_relevancy/service.py +24 -56
  6. unique_toolkit/agentic/evaluation/hallucination/constants.py +26 -15
  7. unique_toolkit/agentic/evaluation/hallucination/prompts/__init__.py +13 -0
  8. unique_toolkit/agentic/evaluation/hallucination/prompts/system_prompt.j2 +35 -0
  9. unique_toolkit/agentic/evaluation/hallucination/prompts/user_prompt.j2 +27 -0
  10. unique_toolkit/agentic/evaluation/hallucination/utils.py +153 -102
  11. unique_toolkit/agentic/evaluation/tests/fixtures.py +102 -0
  12. unique_toolkit/agentic/evaluation/tests/test_config.py +247 -0
  13. unique_toolkit/agentic/evaluation/tests/test_context_relevancy_service.py +141 -121
  14. unique_toolkit/agentic/evaluation/tests/test_hallucination_constants.py +600 -0
  15. unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py +1009 -0
  16. unique_toolkit/agentic/evaluation/tests/test_output_parser.py +82 -23
  17. unique_toolkit/agentic/evaluation/tests/test_prompt_loaders.py +348 -0
  18. unique_toolkit/agentic/evaluation/utils.py +8 -0
  19. unique_toolkit/agentic/responses_api/postprocessors/generated_files.py +34 -0
  20. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/METADATA +7 -1
  21. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/RECORD +23 -13
  22. unique_toolkit/agentic/evaluation/hallucination/prompts.py +0 -79
  23. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/LICENSE +0 -0
  24. {unique_toolkit-1.45.4.dist-info → unique_toolkit-1.45.6.dist-info}/WHEEL +0 -0
unique_toolkit/agentic/evaluation/tests/test_hallucination_utils.py
@@ -0,0 +1,1009 @@
+"""Tests for hallucination evaluation utils."""
+
+from typing import List, Optional
+
+import pytest
+
+from unique_toolkit.agentic.evaluation.hallucination.constants import (
+    HallucinationConfig,
+    SourceSelectionMode,
+)
+from unique_toolkit.agentic.evaluation.hallucination.utils import (
+    _compose_msgs,
+    _default_source_selection_mode,
+    _from_order_source_selection_mode,
+    _from_original_response_source_selection_mode,
+    _get_msgs,
+    context_text_from_stream_response,
+)
+from unique_toolkit.agentic.evaluation.schemas import EvaluationMetricInput
+from unique_toolkit.content import ContentReference
+from unique_toolkit.content.schemas import ContentChunk
+from unique_toolkit.language_model.schemas import (
+    LanguageModelMessageRole,
+    LanguageModelMessages,
+    LanguageModelStreamResponse,
+    LanguageModelStreamResponseMessage,
+)
+
+
+@pytest.fixture
+def sample_chunks() -> List[ContentChunk]:
+    """Create sample content chunks for testing."""
+    return [
+        ContentChunk(
+            id="cont_123",
+            chunk_id="chunk_001",
+            text="First chunk text",
+            order=0,
+        ),
+        ContentChunk(
+            id="cont_123",
+            chunk_id="chunk_002",
+            text="Second chunk text",
+            order=1,
+        ),
+        ContentChunk(
+            id="cont_456",
+            chunk_id="chunk_003",
+            text="Third chunk text",
+            order=0,
+        ),
+        ContentChunk(
+            id="cont_456",
+            chunk_id="chunk_004",
+            text="Fourth chunk text",
+            order=1,
+        ),
+    ]
+
+
+@pytest.fixture
+def sample_references() -> List[ContentReference]:
+    """Create sample content references for testing."""
+    return [
+        ContentReference(
+            name="Reference 1",
+            sequence_number=1,
+            source="test_source",
+            source_id="cont_123_chunk_001",
+            url="http://example.com/1",
+            original_index=[0],
+        ),
+        ContentReference(
+            name="Reference 2",
+            sequence_number=2,
+            source="test_source",
+            source_id="cont_456_chunk_003",
+            url="http://example.com/2",
+            original_index=[2],
+        ),
+    ]
+
+
+@pytest.fixture
+def hallucination_config() -> HallucinationConfig:
+    """Create a hallucination config for testing."""
+    return HallucinationConfig(enabled=True)
+
+
+@pytest.fixture
+def evaluation_input() -> EvaluationMetricInput:
+    """Create an evaluation input for testing."""
+    return EvaluationMetricInput(
+        input_text="Test question",
+        context_texts=["Context 1", "Context 2"],
+        output_text="Test output",
+    )
+
+
+@pytest.mark.ai
+def test_default_source_selection_mode__selects_chunks__by_source_id_match(
+    sample_references: List[ContentReference],
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that chunks are selected by matching source_id from references.
+    Why this matters: FROM_IDS mode is the most precise chunk selection method.
+    Setup summary: Provide references with source IDs, assert matching chunks selected.
+    """
+    # Arrange - Fixtures provide data
+
+    # Act
+    result: List[ContentChunk] = _default_source_selection_mode(
+        sample_references, sample_chunks
+    )
+
+    # Assert
+    assert len(result) == 2
+    assert result[0].chunk_id == "chunk_001"
+    assert result[1].chunk_id == "chunk_003"
+
+
+@pytest.mark.ai
+def test_default_source_selection_mode__returns_empty_list__when_no_matches_found(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that empty list is returned when no references match chunks.
+    Why this matters: Graceful handling of missing references is critical.
+    Setup summary: Provide non-matching reference, assert empty result.
+    """
+    # Arrange
+    references: List[ContentReference] = [
+        ContentReference(
+            name="No Match",
+            sequence_number=1,
+            source="test",
+            source_id="nonexistent_id",
+            url="http://example.com",
+            original_index=[0],
+        )
+    ]
+
+    # Act
+    result: List[ContentChunk] = _default_source_selection_mode(
+        references, sample_chunks
+    )
+
+    # Assert
+    assert len(result) == 0
+
+
+@pytest.mark.ai
+def test_default_source_selection_mode__returns_empty_list__with_empty_references(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that empty list is returned when references list is empty.
+    Why this matters: Handles edge case of no references gracefully.
+    Setup summary: Provide empty references list, assert empty result.
+    """
+    # Arrange
+    references: List[ContentReference] = []
+
+    # Act
+    result: List[ContentChunk] = _default_source_selection_mode(
+        references, sample_chunks
+    )
+
+    # Assert
+    assert len(result) == 0
+
+
+@pytest.mark.ai
+def test_default_source_selection_mode__returns_empty_list__with_empty_chunks(
+    sample_references: List[ContentReference],
+) -> None:
+    """
+    Purpose: Verify that empty list is returned when chunks list is empty.
+    Why this matters: Handles edge case of no available chunks gracefully.
+    Setup summary: Provide empty chunks list, assert empty result.
+    """
+    # Arrange
+    chunks: List[ContentChunk] = []
+
+    # Act
+    result: List[ContentChunk] = _default_source_selection_mode(
+        sample_references, chunks
+    )
+
+    # Assert
+    assert len(result) == 0
+
+
+@pytest.mark.ai
+def test_default_source_selection_mode__builds_chunk_id_correctly__for_matching(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that chunk IDs are built correctly using id_chunkId format.
+    Why this matters: Correct ID construction is critical for chunk matching.
+    Setup summary: Provide reference with specific ID format, assert correct chunk found.
+    """
+    # Arrange
+    references: List[ContentReference] = [
+        ContentReference(
+            name="Ref",
+            sequence_number=1,
+            source="test",
+            source_id="cont_123_chunk_001",
+            url="http://example.com",
+            original_index=[0],
+        )
+    ]
+
+    # Act
+    result: List[ContentChunk] = _default_source_selection_mode(
+        references, sample_chunks
+    )
+
+    # Assert
+    assert len(result) == 1
+    assert result[0].id == "cont_123"
+    assert result[0].chunk_id == "chunk_001"
+
+
+@pytest.mark.ai
+def test_from_order_source_selection_mode__selects_chunks__by_original_index_order(
+    sample_references: List[ContentReference],
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that chunks are selected by original_index values from references.
+    Why this matters: FROM_ORDER mode enables position-based chunk selection.
+    Setup summary: Provide references with original indices, assert chunks at those positions.
+    """
+    # Arrange - Fixtures provide data
+
+    # Act
+    result: List[ContentChunk] = _from_order_source_selection_mode(
+        sample_references, sample_chunks
+    )
+
+    # Assert
+    assert len(result) == 2
+    assert result[0] == sample_chunks[0]  # original_index [0]
+    assert result[1] == sample_chunks[2]  # original_index [2]
+
+
+@pytest.mark.ai
+def test_from_order_source_selection_mode__handles_multiple_indices__in_single_reference(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify handling of references with multiple original indices.
+    Why this matters: Single reference may cite multiple chunks.
+    Setup summary: Provide reference with multiple indices, assert all chunks selected.
+    """
+    # Arrange
+    references: List[ContentReference] = [
+        ContentReference(
+            name="Multi",
+            sequence_number=1,
+            source="test",
+            source_id="test_id",
+            url="http://example.com",
+            original_index=[0, 2, 3],
+        )
+    ]
+
+    # Act
+    result: List[ContentChunk] = _from_order_source_selection_mode(
+        references, sample_chunks
+    )
+
+    # Assert
+    assert len(result) == 3
+    assert result[0] == sample_chunks[0]
+    assert result[1] == sample_chunks[2]
+    assert result[2] == sample_chunks[3]
+
+
+@pytest.mark.ai
+def test_from_order_source_selection_mode__removes_duplicate_indices__while_preserving_order(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that duplicate indices across references are deduplicated.
+    Why this matters: Prevents duplicate chunks in context while maintaining order.
+    Setup summary: Provide overlapping indices, assert deduplicated result.
+    """
+    # Arrange
+    references: List[ContentReference] = [
+        ContentReference(
+            name="Ref1",
+            sequence_number=1,
+            source="test",
+            source_id="id1",
+            url="http://example.com",
+            original_index=[0, 1],
+        ),
+        ContentReference(
+            name="Ref2",
+            sequence_number=2,
+            source="test",
+            source_id="id2",
+            url="http://example.com",
+            original_index=[1, 2],  # 1 is duplicate
+        ),
+    ]
+
+    # Act
+    result: List[ContentChunk] = _from_order_source_selection_mode(
+        references, sample_chunks
+    )
+
+    # Assert
+    assert len(result) == 3  # 0, 1, 2 (no duplicate 1)
+    assert result[0] == sample_chunks[0]
+    assert result[1] == sample_chunks[1]
+    assert result[2] == sample_chunks[2]
+
+
+@pytest.mark.ai
+def test_from_order_source_selection_mode__returns_empty_list__with_empty_references(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that empty list is returned when references list is empty.
+    Why this matters: Handles edge case gracefully without errors.
+    Setup summary: Provide empty references, assert empty result.
+    """
+    # Arrange
+    references: List[ContentReference] = []
+
+    # Act
+    result: List[ContentChunk] = _from_order_source_selection_mode(
+        references, sample_chunks
+    )
+
+    # Assert
+    assert len(result) == 0
+
+
+@pytest.mark.ai
+def test_from_order_source_selection_mode__preserves_order__from_reference_appearance(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that chunk order matches the order indices appear in references.
+    Why this matters: Order preservation maintains citation flow from original response.
+    Setup summary: Provide indices in specific order, assert result matches that order.
+    """
+    # Arrange
+    references: List[ContentReference] = [
+        ContentReference(
+            name="Ref",
+            sequence_number=1,
+            source="test",
+            source_id="id",
+            url="http://example.com",
+            original_index=[3, 1, 0],  # Specific order
+        )
+    ]
+
+    # Act
+    result: List[ContentChunk] = _from_order_source_selection_mode(
+        references, sample_chunks
+    )
+
+    # Assert
+    assert len(result) == 3
+    assert result[0] == sample_chunks[3]
+    assert result[1] == sample_chunks[1]
+    assert result[2] == sample_chunks[0]
+
+
+@pytest.mark.ai
+def test_from_original_response_source_selection_mode__extracts_source_numbers__from_text(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify extraction of source numbers from text using regex pattern.
+    Why this matters: Enables detection of actually cited sources in generated text.
+    Setup summary: Provide text with source citations, assert correct chunks extracted.
+    """
+    # Arrange
+    original_text: str = "Based on [source0] and [source2], we can conclude..."
+    pattern: str = r"[\[<]?source(\d+)[>\]]?"
+
+    # Act
+    result: List[ContentChunk] = _from_original_response_source_selection_mode(
+        original_text, sample_chunks, pattern
+    )
+
+    # Assert
+    assert len(result) == 2
+    assert result[0] == sample_chunks[0]
+    assert result[1] == sample_chunks[2]
+
+
+@pytest.mark.ai
+def test_from_original_response_source_selection_mode__handles_different_reference_formats(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify extraction works with multiple reference format variations.
+    Why this matters: Different systems may use different citation formats.
+    Setup summary: Provide text with mixed citation formats, assert all extracted.
+    """
+    # Arrange
+    original_text: str = "From <source0> and source1 and [source3]"
+    pattern: str = r"[\[<]?source(\d+)[>\]]?"
+
+    # Act
+    result: List[ContentChunk] = _from_original_response_source_selection_mode(
+        original_text, sample_chunks, pattern
+    )
+
+    # Assert
+    assert len(result) == 3
+    assert result[0] == sample_chunks[0]
+    assert result[1] == sample_chunks[1]
+    assert result[2] == sample_chunks[3]
+
+
+@pytest.mark.ai
+def test_from_original_response_source_selection_mode__removes_duplicate_references__while_preserving_order(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that duplicate source citations are deduplicated.
+    Why this matters: Prevents duplicate chunks in evaluation context.
+    Setup summary: Provide text with repeated citations, assert deduplicated result.
+    """
+    # Arrange
+    original_text: str = "[source0] [source1] [source0] [source2]"
+    pattern: str = r"[\[<]?source(\d+)[>\]]?"
+
+    # Act
+    result: List[ContentChunk] = _from_original_response_source_selection_mode(
+        original_text, sample_chunks, pattern
+    )
+
+    # Assert
+    assert len(result) == 3  # 0, 1, 2 (no duplicate 0)
+    assert result[0] == sample_chunks[0]
+    assert result[1] == sample_chunks[1]
+    assert result[2] == sample_chunks[2]
+
+
+@pytest.mark.ai
+def test_from_original_response_source_selection_mode__raises_value_error__when_original_text_is_none(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that ValueError is raised when original_text is None.
+    Why this matters: This mode requires original text to extract citations.
+    Setup summary: Call with None text, assert ValueError with descriptive message.
+    """
+    # Arrange
+    original_text: Optional[str] = None
+    pattern: str = r"[\[<]?source(\d+)[>\]]?"
+
+    # Act & Assert
+    with pytest.raises(ValueError) as exc_info:
+        _from_original_response_source_selection_mode(
+            original_text,
+            sample_chunks,
+            pattern,  # type: ignore
+        )
+
+    assert "original_text is required" in str(exc_info.value)
+
+
+@pytest.mark.ai
+def test_from_original_response_source_selection_mode__filters_out_of_bounds_indices(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that source indices beyond chunk list length are filtered out.
+    Why this matters: Prevents index errors and gracefully handles invalid references.
+    Setup summary: Provide text with out-of-bounds index, assert it's filtered.
+    """
+    # Arrange
+    original_text: str = "[source0] [source10] [source2]"  # source10 is out of bounds
+    pattern: str = r"[\[<]?source(\d+)[>\]]?"
+
+    # Act
+    result: List[ContentChunk] = _from_original_response_source_selection_mode(
+        original_text, sample_chunks, pattern
+    )
+
+    # Assert
+    assert len(result) == 2
+    assert result[0] == sample_chunks[0]
+    assert result[1] == sample_chunks[2]
+
+
+@pytest.mark.ai
+def test_from_original_response_source_selection_mode__returns_empty_list__when_no_references_found(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that empty list is returned when no citations found in text.
+    Why this matters: Handles case of text without source citations gracefully.
+    Setup summary: Provide text with no citations, assert empty result.
+    """
+    # Arrange
+    original_text: str = "No references in this text"
+    pattern: str = r"[\[<]?source(\d+)[>\]]?"
+
+    # Act
+    result: List[ContentChunk] = _from_original_response_source_selection_mode(
+        original_text, sample_chunks, pattern
+    )
+
+    # Assert
+    assert len(result) == 0
+
+
+@pytest.mark.ai
+def test_from_original_response_source_selection_mode__works_with_custom_regex_pattern(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that custom regex patterns can be used for extraction.
+    Why this matters: Enables support for organization-specific citation formats.
+    Setup summary: Provide custom pattern and matching text, assert extraction works.
+    """
+    # Arrange
+    original_text: str = "See ref:0 and ref:2 for details"
+    pattern: str = r"ref:(\d+)"
+
+    # Act
+    result: List[ContentChunk] = _from_original_response_source_selection_mode(
+        original_text, sample_chunks, pattern
+    )
+
+    # Assert
+    assert len(result) == 2
+    assert result[0] == sample_chunks[0]
+    assert result[1] == sample_chunks[2]
+
+
+@pytest.mark.ai
+def test_from_original_response_source_selection_mode__preserves_order__from_text_appearance(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that chunk order matches citation order in original text.
+    Why this matters: Order preservation maintains logical flow of cited sources.
+    Setup summary: Provide text with specific citation order, assert result matches.
+    """
+    # Arrange
+    original_text: str = "[source3] [source1] [source0]"
+    pattern: str = r"[\[<]?source(\d+)[>\]]?"
+
+    # Act
+    result: List[ContentChunk] = _from_original_response_source_selection_mode(
+        original_text, sample_chunks, pattern
+    )
+
+    # Assert
+    assert len(result) == 3
+    assert result[0] == sample_chunks[3]
+    assert result[1] == sample_chunks[1]
+    assert result[2] == sample_chunks[0]
+
+
+@pytest.mark.ai
+def test_context_text_from_stream_response__extracts_context__using_from_ids_mode(
+    sample_chunks: List[ContentChunk],
+    sample_references: List[ContentReference],
+) -> None:
+    """
+    Purpose: Verify context extraction using FROM_IDS source selection mode.
+    Why this matters: FROM_IDS is most accurate mode for known reference IDs.
+    Setup summary: Create response with references, use FROM_IDS mode, assert correct texts.
+    """
+    # Arrange
+    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
+        message=LanguageModelStreamResponseMessage(
+            id="msg_1",
+            previous_message_id=None,
+            role=LanguageModelMessageRole.ASSISTANT,
+            text="Test",
+            references=sample_references,
+        )
+    )
+
+    # Act
+    result: List[str] = context_text_from_stream_response(
+        response, sample_chunks, SourceSelectionMode.FROM_IDS
+    )
+
+    # Assert
+    assert len(result) == 2
+    assert result[0] == "First chunk text"
+    assert result[1] == "Third chunk text"
+
+
+@pytest.mark.ai
+def test_context_text_from_stream_response__extracts_context__using_from_order_mode(
+    sample_chunks: List[ContentChunk],
+    sample_references: List[ContentReference],
+) -> None:
+    """
+    Purpose: Verify context extraction using FROM_ORDER source selection mode.
+    Why this matters: FROM_ORDER enables index-based chunk selection.
+    Setup summary: Create response with references, use FROM_ORDER mode, assert strings returned.
+    """
+    # Arrange
+    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
+        message=LanguageModelStreamResponseMessage(
+            id="msg_1",
+            previous_message_id=None,
+            role=LanguageModelMessageRole.ASSISTANT,
+            text="Test",
+            references=sample_references,
+        )
+    )
+
+    # Act
+    result: List[str] = context_text_from_stream_response(
+        response, sample_chunks, SourceSelectionMode.FROM_ORDER
+    )
+
+    # Assert
+    assert len(result) == 2
+    assert isinstance(result[0], str)
+    assert isinstance(result[1], str)
+
+
+@pytest.mark.ai
+def test_context_text_from_stream_response__extracts_context__using_from_original_response_mode(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify context extraction using FROM_ORIGINAL_RESPONSE mode.
+    Why this matters: Extracts only sources actually cited in generated text.
+    Setup summary: Create response with original_text citations, assert extraction works.
+    """
+    # Arrange
+    references: List[ContentReference] = [
+        ContentReference(
+            name="Ref",
+            sequence_number=1,
+            source="test",
+            source_id="id",
+            url="http://example.com",
+            original_index=[0],
+        )
+    ]
+    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
+        message=LanguageModelStreamResponseMessage(
+            id="msg_1",
+            previous_message_id=None,
+            role=LanguageModelMessageRole.ASSISTANT,
+            text="Test",
+            original_text="Based on [source0] and [source2]",
+            references=references,
+        )
+    )
+    pattern: str = r"[\[<]?source(\d+)[>\]]?"
+
+    # Act
+    result: List[str] = context_text_from_stream_response(
+        response,
+        sample_chunks,
+        SourceSelectionMode.FROM_ORIGINAL_RESPONSE,
+        pattern,
+    )
+
+    # Assert
+    assert len(result) == 2
+    assert result[0] == "First chunk text"
+    assert result[1] == "Third chunk text"
+
+
+@pytest.mark.ai
+def test_context_text_from_stream_response__falls_back_to_default__with_invalid_mode(
+    sample_chunks: List[ContentChunk],
+    sample_references: List[ContentReference],
+) -> None:
+    """
+    Purpose: Verify that invalid mode falls back to FROM_IDS mode gracefully.
+    Why this matters: Ensures robustness against configuration errors.
+    Setup summary: Use invalid mode string, assert fallback returns results.
+    """
+    # Arrange
+    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
+        message=LanguageModelStreamResponseMessage(
+            id="msg_1",
+            previous_message_id=None,
+            role=LanguageModelMessageRole.ASSISTANT,
+            text="Test",
+            references=sample_references,
+        )
+    )
+
+    # Act
+    result: List[str] = context_text_from_stream_response(
+        response,
+        sample_chunks,
+        "INVALID_MODE",  # type: ignore
+    )
+
+    # Assert
+    assert len(result) == 2
+
+
+@pytest.mark.ai
+def test_context_text_from_stream_response__falls_back_to_default__on_extraction_error(
+    sample_chunks: List[ContentChunk],
+) -> None:
+    """
+    Purpose: Verify that extraction errors trigger fallback to default mode.
+    Why this matters: Ensures evaluation continues even with malformed data.
+    Setup summary: Create scenario that causes error, assert fallback succeeds.
+    """
+    # Arrange
+    references: List[ContentReference] = [
+        ContentReference(
+            name="Ref",
+            sequence_number=1,
+            source="test",
+            source_id="cont_123_chunk_001",
+            url="http://example.com",
+            original_index=[0],
+        )
+    ]
+    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
+        message=LanguageModelStreamResponseMessage(
+            id="msg_1",
+            previous_message_id=None,
+            role=LanguageModelMessageRole.ASSISTANT,
+            text="Test",
+            original_text=None,  # Will cause error in FROM_ORIGINAL_RESPONSE
+            references=references,
+        )
+    )
+
+    # Act
+    result: List[str] = context_text_from_stream_response(
+        response, sample_chunks, SourceSelectionMode.FROM_ORIGINAL_RESPONSE
+    )
+
+    # Assert
+    assert len(result) == 1
+    assert result[0] == "First chunk text"
+
+
+@pytest.mark.ai
+def test_context_text_from_stream_response__returns_text_strings__not_chunk_objects(
+    sample_chunks: List[ContentChunk],
+    sample_references: List[ContentReference],
+) -> None:
+    """
+    Purpose: Verify that function returns list of text strings, not ContentChunk objects.
+    Why this matters: Evaluation expects string context, not chunk objects.
+    Setup summary: Call function, assert all results are strings not ContentChunk instances.
+    """
+    # Arrange
+    response: LanguageModelStreamResponse = LanguageModelStreamResponse(
+        message=LanguageModelStreamResponseMessage(
+            id="msg_1",
+            previous_message_id=None,
+            role=LanguageModelMessageRole.ASSISTANT,
+            text="Test",
+            references=sample_references,
+        )
+    )
+
+    # Act
+    result: List[str] = context_text_from_stream_response(
+        response, sample_chunks, SourceSelectionMode.FROM_IDS
+    )
+
+    # Assert
+    assert all(isinstance(text, str) for text in result)
+    assert not any(isinstance(text, ContentChunk) for text in result)
+
+
+@pytest.mark.ai
+def test_get_msgs__composes_messages__with_context_and_history(
+    hallucination_config: HallucinationConfig,
+) -> None:
+    """
+    Purpose: Verify message composition with both context and history provided.
+    Why this matters: Full context enables accurate hallucination detection.
+    Setup summary: Create input with all fields, assert message structure.
+    """
+    # Arrange
+    input_data: EvaluationMetricInput = EvaluationMetricInput(
+        input_text="Question",
+        context_texts=["Context 1"],
+        history_messages=[],
+        output_text="Output",
+    )
+
+    # Act
+    result: LanguageModelMessages = _get_msgs(input_data, hallucination_config)
+
+    # Assert
+    assert isinstance(result, LanguageModelMessages)
+    assert len(result.root) == 2
+
+
+@pytest.mark.ai
+def test_get_msgs__composes_messages__without_context_or_history(
+    hallucination_config: HallucinationConfig,
+) -> None:
+    """
+    Purpose: Verify message composition works without context or history.
+    Why this matters: Hallucination can be detected even without grounding context.
+    Setup summary: Create input with only input/output, assert message structure.
+    """
+    # Arrange
+    input_data: EvaluationMetricInput = EvaluationMetricInput(
+        input_text="Question",
+        output_text="Output",
+    )
+
+    # Act
+    result: LanguageModelMessages = _get_msgs(input_data, hallucination_config)
+
+    # Assert
+    assert isinstance(result, LanguageModelMessages)
+    assert len(result.root) == 2
+
+
+@pytest.mark.ai
+def test_get_msgs__composes_messages__with_context_texts_only(
+    hallucination_config: HallucinationConfig,
+) -> None:
+    """
+    Purpose: Verify message composition with context texts but no history.
+    Why this matters: Common scenario for single-turn evaluations.
+    Setup summary: Create input with context but no history, assert message structure.
+    """
+    # Arrange
+    input_data: EvaluationMetricInput = EvaluationMetricInput(
+        input_text="Question",
+        context_texts=["Context 1", "Context 2"],
+        output_text="Output",
+    )
+
+    # Act
+    result: LanguageModelMessages = _get_msgs(input_data, hallucination_config)
+
+    # Assert
+    assert isinstance(result, LanguageModelMessages)
+    assert len(result.root) == 2
+
+
+@pytest.mark.ai
+def test_compose_msgs__creates_valid_messages__with_context(
+    hallucination_config: HallucinationConfig,
+) -> None:
+    """
+    Purpose: Verify that messages are composed correctly when context is available.
+    Why this matters: Context affects prompt template rendering.
+    Setup summary: Call with has_context=True, assert message structure.
+    """
+    # Arrange
+    input_data: EvaluationMetricInput = EvaluationMetricInput(
+        input_text="Question",
+        context_texts=["Context 1"],
+        output_text="Output",
+    )
+
+    # Act
+    result: LanguageModelMessages = _compose_msgs(
+        input_data, hallucination_config, has_context=True
+    )
+
+    # Assert
+    assert isinstance(result, LanguageModelMessages)
+    assert len(result.root) == 2
+    assert result.root[0].role == "system"
+    assert result.root[1].role == "user"
+
+
+@pytest.mark.ai
+def test_compose_msgs__creates_valid_messages__without_context(
+    hallucination_config: HallucinationConfig,
+) -> None:
+    """
+    Purpose: Verify that messages are composed correctly when context is absent.
+    Why this matters: No-context mode uses different prompt template.
+    Setup summary: Call with has_context=False, assert message structure.
+    """
+    # Arrange
+    input_data: EvaluationMetricInput = EvaluationMetricInput(
+        input_text="Question",
+        output_text="Output",
+    )
+
+    # Act
+    result: LanguageModelMessages = _compose_msgs(
+        input_data, hallucination_config, has_context=False
+    )
+
+    # Assert
+    assert isinstance(result, LanguageModelMessages)
+    assert len(result.root) == 2
+
+
+@pytest.mark.ai
+def test_compose_msgs__uses_different_system_prompts__based_on_context_flag(
+    hallucination_config: HallucinationConfig,
+) -> None:
+    """
+    Purpose: Verify that system message content differs based on has_context flag.
+    Why this matters: Different prompts needed for grounded vs ungrounded evaluation.
+    Setup summary: Call with both flags, assert system prompts differ.
+    """
+    # Arrange
+    input_data: EvaluationMetricInput = EvaluationMetricInput(
+        input_text="Question",
+        output_text="Output",
+    )
+
+    # Act
+    result_with_context: LanguageModelMessages = _compose_msgs(
+        input_data, hallucination_config, has_context=True
+    )
+    result_without_context: LanguageModelMessages = _compose_msgs(
+        input_data, hallucination_config, has_context=False
+    )
+
+    # Assert
+    assert result_with_context.root[0].content != result_without_context.root[0].content
+
+
+@pytest.mark.ai
+def test_compose_msgs__includes_input_text__in_user_message(
+    hallucination_config: HallucinationConfig,
+) -> None:
+    """
+    Purpose: Verify that user message contains the input text from evaluation input.
+    Why this matters: Input text provides context for hallucination evaluation.
+    Setup summary: Create input with specific text, assert it appears in user message.
+    """
+    # Arrange
+    input_text: str = "What is the capital of France?"
+    input_data: EvaluationMetricInput = EvaluationMetricInput(
+        input_text=input_text,
+        output_text="Output",
+    )
+
+    # Act
+    result: LanguageModelMessages = _compose_msgs(
+        input_data, hallucination_config, has_context=False
+    )
+
+    # Assert
+    assert input_text in result.root[1].content
+
+
+@pytest.mark.ai
+def test_compose_msgs__includes_output_text__in_user_message(
+    hallucination_config: HallucinationConfig,
+) -> None:
+    """
+    Purpose: Verify that user message contains the output text being evaluated.
+    Why this matters: Output text is the primary target of hallucination detection.
+    Setup summary: Create input with specific output, assert it appears in user message.
+    """
+    # Arrange
+    output_text: str = "The capital is Paris."
+    input_data: EvaluationMetricInput = EvaluationMetricInput(
+        input_text="Question",
+        output_text=output_text,
+    )
+
+    # Act
+    result: LanguageModelMessages = _compose_msgs(
+        input_data, hallucination_config, has_context=False
+    )
+
+    # Assert
+    assert output_text in result.root[1].content
+
+
+@pytest.mark.ai
+def test_compose_msgs__generates_non_empty_messages__from_config_prompts(
+    hallucination_config: HallucinationConfig,
+) -> None:
+    """
+    Purpose: Verify that messages use prompts from config and generate non-empty content.
+    Why this matters: Ensures prompt templates are properly rendered.
+    Setup summary: Compose messages, assert both system and user messages have content.
+    """
+    # Arrange
+    input_data: EvaluationMetricInput = EvaluationMetricInput(
+        input_text="Question",
+        output_text="Output",
+    )
+
+    # Act
+    result: LanguageModelMessages = _compose_msgs(
+        input_data, hallucination_config, has_context=True
+    )
+
+    # Assert
+    assert len(result.root[0].content) > 0
+    assert len(result.root[1].content) > 0