haiku.rag 0.5.4__tar.gz → 0.5.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of haiku.rag might be problematic.
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/PKG-INFO +1 -1
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/docs/python.md +4 -1
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/pyproject.toml +1 -1
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/client.py +7 -3
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_client.py +89 -91
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/uv.lock +1 -1
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/.github/FUNDING.yml +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/.github/workflows/build-docs.yml +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/.github/workflows/build-publish.yml +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/.gitignore +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/.pre-commit-config.yaml +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/.python-version +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/LICENSE +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/README.md +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/docs/benchmarks.md +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/docs/cli.md +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/docs/configuration.md +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/docs/index.md +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/docs/installation.md +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/docs/mcp.md +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/docs/server.md +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/mkdocs.yml +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/__init__.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/app.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/chunker.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/cli.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/config.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/embeddings/__init__.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/embeddings/base.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/embeddings/ollama.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/embeddings/openai.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/embeddings/voyageai.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/logging.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/mcp.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/monitor.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/qa/__init__.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/qa/anthropic.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/qa/base.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/qa/ollama.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/qa/openai.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/qa/prompts.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/reader.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/reranking/__init__.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/reranking/base.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/reranking/cohere.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/reranking/mxbai.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/reranking/ollama.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/__init__.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/engine.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/models/__init__.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/models/chunk.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/models/document.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/repositories/__init__.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/repositories/base.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/repositories/chunk.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/repositories/document.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/repositories/settings.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/upgrades/__init__.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/store/upgrades/v0_3_4.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/src/haiku/rag/utils.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/__init__.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/conftest.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/generate_benchmark_db.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/llm_judge.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_app.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_chunk.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_chunker.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_cli.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_document.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_embedder.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_monitor.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_qa.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_reader.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_rebuild.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_reranker.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_search.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_settings.py +0 -0
- {haiku_rag-0.5.4 → haiku_rag-0.5.5}/tests/test_utils.py +0 -0
docs/python.md

```diff
@@ -138,9 +138,12 @@ Expand search results with adjacent chunks for more complete context:
 # Get initial search results
 search_results = await client.search("machine learning", limit=3)
 
-# Expand with adjacent chunks
+# Expand with adjacent chunks using config setting
 expanded_results = await client.expand_context(search_results)
 
+# Or specify a custom radius
+expanded_results = await client.expand_context(search_results, radius=2)
+
 # The expanded results contain chunks with combined content from adjacent chunks
 for chunk, score in expanded_results:
     print(f"Expanded content: {chunk.content}")  # Now includes before/after chunks
```
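The documented flow is short enough to run end to end. A minimal sketch, assuming `HaikuRAG` is importable from `haiku.rag.client` (the module changed in this diff), accepts `":memory:"` the way the updated tests below use it, and that a content-only `create_document` call works; the search/expand calls are as documented above:

```python
import asyncio

from haiku.rag.client import HaikuRAG  # assumed import path (src/haiku/rag/client.py)


async def main() -> None:
    async with HaikuRAG(":memory:") as client:
        # Index something to search over (content-only call is an assumption;
        # the tests in this diff also pass pre-built chunks).
        await client.create_document(content="Machine learning builds models from data.")

        # Get initial search results
        search_results = await client.search("machine learning", limit=3)

        # radius=2 pulls in up to two chunks on each side of every hit
        expanded_results = await client.expand_context(search_results, radius=2)
        for chunk, score in expanded_results:
            print(f"{score:.2f}: {chunk.content}")


asyncio.run(main())
```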
src/haiku/rag/client.py

```diff
@@ -349,17 +349,21 @@ class HaikuRAG:
         return reranked_results
 
     async def expand_context(
-        self, search_results: list[tuple[Chunk, float]]
+        self,
+        search_results: list[tuple[Chunk, float]],
+        radius: int = Config.CONTEXT_CHUNK_RADIUS,
     ) -> list[tuple[Chunk, float]]:
         """Expand search results with adjacent chunks, merging overlapping chunks.
 
         Args:
             search_results: List of (chunk, score) tuples from search.
+            radius: Number of adjacent chunks to include before/after each chunk.
+                Defaults to CONTEXT_CHUNK_RADIUS config setting.
 
         Returns:
             List of (chunk, score) tuples with expanded and merged context chunks.
         """
-        if Config.CONTEXT_CHUNK_RADIUS == 0:
+        if radius == 0:
             return search_results
 
         # Group chunks by document_id to handle merging within documents
@@ -377,7 +381,7 @@ class HaikuRAG:
             expanded_ranges = []
             for chunk, score in doc_chunks:
                 adjacent_chunks = await self.chunk_repository.get_adjacent_chunks(
-                    chunk,
+                    chunk, radius
                 )
 
                 all_chunks = adjacent_chunks + [chunk]
```
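One consequence of `radius: int = Config.CONTEXT_CHUNK_RADIUS` worth noting: Python evaluates a default argument once, when the function is defined, not on each call. A standalone sketch of that behavior, with a hypothetical `Config` stand-in (not the haiku.rag `Config` class):

```python
class Config:
    """Hypothetical stand-in for a settings class with a class attribute."""

    CONTEXT_CHUNK_RADIUS = 1


def expand(radius: int = Config.CONTEXT_CHUNK_RADIUS) -> int:
    # The default above was evaluated at `def` time and is now baked in.
    return radius


Config.CONTEXT_CHUNK_RADIUS = 3  # changing the setting later...
print(expand())                  # ...still prints 1: the old default was captured
print(expand(radius=3))          # callers can always override per call
```

If the real `Config` is populated before `client.py` is imported, the distinction rarely matters, but it is why passing `radius` explicitly, as the updated tests do, is the reliable override.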
tests/test_client.py (hunk @@ -644,109 +644,107 @@, following test_client_expand_context_multiple_chunks). Both context-expansion tests were rewritten: they now run inside `async with HaikuRAG(":memory:") as client:` and pass an explicit `radius=1` to `expand_context`, which the old versions called without a radius argument. The 0.5.5 versions:

```python
@pytest.mark.asyncio
async def test_client_expand_context_merges_overlapping_chunks():
    """Test that overlapping expanded chunks are merged into one."""
    async with HaikuRAG(":memory:") as client:
        # Create document with 5 chunks
        manual_chunks = [
            Chunk(content="Chunk 0", metadata={"order": 0}),
            Chunk(content="Chunk 1", metadata={"order": 1}),
            Chunk(content="Chunk 2", metadata={"order": 2}),
            Chunk(content="Chunk 3", metadata={"order": 3}),
            Chunk(content="Chunk 4", metadata={"order": 4}),
        ]

        doc = await client.create_document(
            content="Full document content", chunks=manual_chunks
        )

        assert doc.id is not None
        chunks = await client.chunk_repository.get_by_document_id(doc.id)

        # Get adjacent chunks (orders 1 and 2) - these will overlap when expanded
        chunk1 = next(c for c in chunks if c.metadata.get("order") == 1)
        chunk2 = next(c for c in chunks if c.metadata.get("order") == 2)

        # With radius=1:
        # chunk1 expanded would be [0,1,2]
        # chunk2 expanded would be [1,2,3]
        # These should merge into one chunk containing [0,1,2,3]
        search_results = [(chunk1, 0.8), (chunk2, 0.7)]
        expanded_results = await client.expand_context(search_results, radius=1)

        # Should have only 1 merged result instead of 2 overlapping ones
        assert len(expanded_results) == 1

        merged_chunk, score = expanded_results[0]

        # Should contain all chunks from 0 to 3
        assert "Chunk 0" in merged_chunk.content
        assert "Chunk 1" in merged_chunk.content
        assert "Chunk 2" in merged_chunk.content
        assert "Chunk 3" in merged_chunk.content
        assert "Chunk 4" not in merged_chunk.content  # Should not include chunk 4

        # Should use the higher score (0.8)
        assert score == 0.8


@pytest.mark.asyncio
async def test_client_expand_context_keeps_separate_non_overlapping():
    """Test that non-overlapping expanded chunks remain separate."""
    async with HaikuRAG(":memory:") as client:
        # Create document with chunks far apart
        manual_chunks = [
            Chunk(content="Chunk 0", metadata={"order": 0}),
            Chunk(content="Chunk 1", metadata={"order": 1}),
            Chunk(content="Chunk 2", metadata={"order": 2}),
            Chunk(content="Chunk 5", metadata={"order": 5}),  # Gap here
            Chunk(content="Chunk 6", metadata={"order": 6}),
            Chunk(content="Chunk 7", metadata={"order": 7}),
        ]

        doc = await client.create_document(
            content="Full document content", chunks=manual_chunks
        )

        assert doc.id is not None
        chunks = await client.chunk_repository.get_by_document_id(doc.id)

        # Get chunks by index - they will have sequential orders 0,1,2,3,4,5
        # So get chunk with order=0 and chunk with order=5 (far enough apart)
        chunk0 = next(
            c for c in chunks if c.metadata.get("order") == 0
        )  # Content: "Chunk 0"
        chunk5 = next(
            c for c in chunks if c.metadata.get("order") == 5
        )  # Content: "Chunk 7"

        # chunk0 expanded: [0,1] with radius=1 (orders 0,1)
        # chunk5 expanded: [4,5] with radius=1 (orders 4,5)
        # These should remain separate (max_order 1 < min_order 4 - 1)
        search_results = [(chunk0, 0.8), (chunk5, 0.7)]
        expanded_results = await client.expand_context(search_results, radius=1)

        # Should have 2 separate results
        assert len(expanded_results) == 2

        # Sort by score to ensure predictable order
        expanded_results.sort(key=lambda x: x[1], reverse=True)

        chunk0_expanded, score1 = expanded_results[0]
        chunk5_expanded, score2 = expanded_results[1]

        # First chunk (order=0) expanded should contain orders [0,1]
        # Content should be "Chunk 0" + "Chunk 1"
        assert "Chunk 0" in chunk0_expanded.content
        assert "Chunk 1" in chunk0_expanded.content
        assert (
            "Chunk 7" not in chunk0_expanded.content
        )  # Should not have chunk 7 content
        assert score1 == 0.8

        # Second chunk (order=5) expanded should contain orders [4,5]
        # Content should be "Chunk 6" + "Chunk 7" (orders 4 and 5)
        assert "Chunk 6" in chunk5_expanded.content  # Order 4 content
        assert "Chunk 7" in chunk5_expanded.content  # Order 5 content
        assert "Chunk 0" not in chunk5_expanded.content
        assert score2 == 0.7
```
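The merge behavior these tests pin down reduces to interval arithmetic on chunk orders: each hit expands to [order - radius, order + radius], and ranges that overlap or sit flush against each other collapse into one, matching the `max_order < min_order - 1` separation check in the test comments. A standalone sketch of that rule (not the haiku.rag implementation):

```python
def merge_expanded_ranges(orders: list[int], radius: int) -> list[tuple[int, int]]:
    """Expand each hit's order by radius and merge overlapping/adjacent ranges."""
    ranges = sorted((max(order - radius, 0), order + radius) for order in orders)
    merged: list[tuple[int, int]] = []
    for lo, hi in ranges:
        # Adjacent counts as mergeable: [0, 2] and [3, 5] share no chunk but
        # leave no gap, i.e. they stay separate only when hi < next lo - 1.
        if merged and lo <= merged[-1][1] + 1:
            merged[-1] = (merged[-1][0], max(merged[-1][1], hi))
        else:
            merged.append((lo, hi))
    return merged


print(merge_expanded_ranges([1, 2], radius=1))  # [(0, 3)] -- one merged result
print(merge_expanded_ranges([0, 5], radius=1))  # [(0, 1), (4, 6)] -- two results
```

The two printed cases mirror the tests above: hits at orders 1 and 2 merge into a single [0, 3] range, while hits at orders 0 and 5 stay separate.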
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|