haiku.rag 0.4.3__tar.gz → 0.5.0__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic. Click here for more details.
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/PKG-INFO +1 -1
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/docs/configuration.md +0 -3
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/pyproject.toml +1 -1
- haiku_rag-0.5.0/src/haiku/rag/chunker.py +60 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/config.py +0 -1
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/reader.py +2 -1
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/settings.py +0 -1
- haiku_rag-0.5.0/tests/test_chunker.py +34 -0
- haiku_rag-0.5.0/tests/test_reader.py +22 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_search.py +5 -1
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/uv.lock +1 -1
- haiku_rag-0.4.3/src/haiku/rag/chunker.py +0 -67
- haiku_rag-0.4.3/tests/test_chunker.py +0 -44
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/.github/FUNDING.yml +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/.github/workflows/build-docs.yml +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/.github/workflows/build-publish.yml +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/.gitignore +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/.pre-commit-config.yaml +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/.python-version +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/LICENSE +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/README.md +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/docs/benchmarks.md +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/docs/cli.md +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/docs/index.md +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/docs/installation.md +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/docs/mcp.md +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/docs/python.md +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/docs/server.md +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/mkdocs.yml +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/__init__.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/app.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/cli.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/client.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/__init__.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/base.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/ollama.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/openai.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/embeddings/voyageai.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/logging.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/mcp.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/monitor.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/qa/__init__.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/qa/anthropic.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/qa/base.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/qa/ollama.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/qa/openai.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/qa/prompts.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/reranking/__init__.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/reranking/base.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/reranking/cohere.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/reranking/mxbai.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/__init__.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/engine.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/models/__init__.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/models/chunk.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/models/document.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/__init__.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/base.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/chunk.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/repositories/document.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/upgrades/__init__.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/store/upgrades/v0_3_4.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/src/haiku/rag/utils.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/__init__.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/conftest.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/generate_benchmark_db.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/llm_judge.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_app.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_chunk.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_cli.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_client.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_document.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_embedder.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_monitor.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_qa.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_rebuild.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_reranker.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_settings.py +0 -0
- {haiku_rag-0.4.3 → haiku_rag-0.5.0}/tests/test_utils.py +0 -0
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from io import BytesIO
|
|
2
|
+
from typing import ClassVar
|
|
3
|
+
|
|
4
|
+
import tiktoken
|
|
5
|
+
from docling.chunking import HybridChunker # type: ignore
|
|
6
|
+
from docling.document_converter import DocumentConverter
|
|
7
|
+
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
|
|
8
|
+
from docling_core.types.io import DocumentStream
|
|
9
|
+
|
|
10
|
+
from haiku.rag.config import Config
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Chunker:
|
|
14
|
+
"""A class that chunks text into smaller pieces for embedding and retrieval.
|
|
15
|
+
|
|
16
|
+
Uses docling's structure-aware chunking to create semantically meaningful chunks
|
|
17
|
+
that respect document boundaries.
|
|
18
|
+
|
|
19
|
+
Args:
|
|
20
|
+
chunk_size: The maximum size of a chunk in tokens.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
encoder: ClassVar[tiktoken.Encoding] = tiktoken.encoding_for_model("gpt-4o")
|
|
24
|
+
|
|
25
|
+
def __init__(
|
|
26
|
+
self,
|
|
27
|
+
chunk_size: int = Config.CHUNK_SIZE,
|
|
28
|
+
):
|
|
29
|
+
self.chunk_size = chunk_size
|
|
30
|
+
tokenizer = OpenAITokenizer(
|
|
31
|
+
tokenizer=tiktoken.encoding_for_model("gpt-4o"), max_tokens=chunk_size
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
self.chunker = HybridChunker(tokenizer=tokenizer) # type: ignore
|
|
35
|
+
|
|
36
|
+
async def chunk(self, text: str) -> list[str]:
|
|
37
|
+
"""Split the text into chunks using docling's structure-aware chunking.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
text: The text to be split into chunks.
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
A list of text chunks with semantic boundaries.
|
|
44
|
+
"""
|
|
45
|
+
if not text:
|
|
46
|
+
return []
|
|
47
|
+
|
|
48
|
+
# Convert to docling document
|
|
49
|
+
bytes_io = BytesIO(text.encode("utf-8"))
|
|
50
|
+
doc_stream = DocumentStream(name="text.md", stream=bytes_io)
|
|
51
|
+
converter = DocumentConverter()
|
|
52
|
+
result = converter.convert(doc_stream)
|
|
53
|
+
doc = result.document
|
|
54
|
+
|
|
55
|
+
# Chunk using docling's hybrid chunker
|
|
56
|
+
chunks = list(self.chunker.chunk(doc))
|
|
57
|
+
return [self.chunker.contextualize(chunk) for chunk in chunks]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
chunker = Chunker()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
from datasets import Dataset
|
|
3
|
+
|
|
4
|
+
from haiku.rag.chunker import Chunker
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@pytest.mark.asyncio
|
|
8
|
+
async def test_chunker(qa_corpus: Dataset):
|
|
9
|
+
chunker = Chunker()
|
|
10
|
+
doc = qa_corpus[0]["document_extracted"]
|
|
11
|
+
chunks = await Chunker().chunk(doc)
|
|
12
|
+
|
|
13
|
+
# Ensure that the text is split into multiple chunks
|
|
14
|
+
assert len(chunks) > 1
|
|
15
|
+
|
|
16
|
+
# Ensure that chunks are reasonably sized (allowing more flexibility for structure-aware chunking)
|
|
17
|
+
total_tokens = 0
|
|
18
|
+
for chunk in chunks:
|
|
19
|
+
encoded_tokens = Chunker.encoder.encode(chunk, disallowed_special=())
|
|
20
|
+
token_count = len(encoded_tokens)
|
|
21
|
+
total_tokens += token_count
|
|
22
|
+
|
|
23
|
+
# Each chunk should be reasonably sized (allowing more flexibility than the old strict limits)
|
|
24
|
+
assert (
|
|
25
|
+
token_count <= chunker.chunk_size * 1.2
|
|
26
|
+
) # Allow some flexibility for semantic boundaries
|
|
27
|
+
assert token_count > 5 # Ensure chunks aren't too small
|
|
28
|
+
|
|
29
|
+
# Ensure that all chunks together contain roughly the same content as original
|
|
30
|
+
original_tokens = len(Chunker.encoder.encode(doc, disallowed_special=()))
|
|
31
|
+
|
|
32
|
+
# Due to structure-aware chunking, we might have some variation in token count
|
|
33
|
+
# but it should be reasonable
|
|
34
|
+
assert abs(total_tokens - original_tokens) <= original_tokens * 0.1
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import tempfile
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from haiku.rag.reader import FileReader
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def test_code_file_wrapped_in_code_block():
|
|
8
|
+
"""Test that code files are wrapped in markdown code blocks."""
|
|
9
|
+
python_code = '''def hello_world():
|
|
10
|
+
print("Hello, World!")
|
|
11
|
+
return "success"'''
|
|
12
|
+
|
|
13
|
+
with tempfile.NamedTemporaryFile(mode="w", suffix=".py") as f:
|
|
14
|
+
f.write(python_code)
|
|
15
|
+
f.flush()
|
|
16
|
+
temp_path = Path(f.name)
|
|
17
|
+
|
|
18
|
+
result = FileReader.parse_file(temp_path)
|
|
19
|
+
|
|
20
|
+
assert result.startswith("```python\n")
|
|
21
|
+
assert result.endswith("\n```")
|
|
22
|
+
assert "def hello_world():" in result
|
|
@@ -36,7 +36,7 @@ async def test_search_qa_corpus(qa_corpus: Dataset):
|
|
|
36
36
|
created_document = await doc_repo.create(document)
|
|
37
37
|
documents.append((created_document, doc_data))
|
|
38
38
|
|
|
39
|
-
for i in range(
|
|
39
|
+
for i in range(5): # Test with first few documents
|
|
40
40
|
target_document, doc_data = documents[i]
|
|
41
41
|
question = doc_data["question"]
|
|
42
42
|
|
|
@@ -50,6 +50,10 @@ async def test_search_qa_corpus(qa_corpus: Dataset):
|
|
|
50
50
|
target_document_ids = {chunk.document_id for chunk, _ in fts_results}
|
|
51
51
|
assert target_document.id in target_document_ids
|
|
52
52
|
|
|
53
|
+
for i in range(num_documents): # Test with first few documents
|
|
54
|
+
target_document, doc_data = documents[i]
|
|
55
|
+
question = doc_data["question"]
|
|
56
|
+
|
|
53
57
|
# Test hybrid search
|
|
54
58
|
hybrid_results = await chunk_repo.search_chunks_hybrid(question, limit=5)
|
|
55
59
|
target_document_ids = {chunk.document_id for chunk, _ in hybrid_results}
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
from typing import ClassVar
|
|
2
|
-
|
|
3
|
-
import tiktoken
|
|
4
|
-
|
|
5
|
-
from haiku.rag.config import Config
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class Chunker:
|
|
9
|
-
"""A class that chunks text into smaller pieces for embedding and retrieval.
|
|
10
|
-
|
|
11
|
-
Args:
|
|
12
|
-
chunk_size: The maximum size of a chunk in tokens.
|
|
13
|
-
chunk_overlap: The number of tokens of overlap between chunks.
|
|
14
|
-
"""
|
|
15
|
-
|
|
16
|
-
encoder: ClassVar[tiktoken.Encoding] = tiktoken.encoding_for_model("gpt-4o")
|
|
17
|
-
|
|
18
|
-
def __init__(
|
|
19
|
-
self,
|
|
20
|
-
chunk_size: int = Config.CHUNK_SIZE,
|
|
21
|
-
chunk_overlap: int = Config.CHUNK_OVERLAP,
|
|
22
|
-
):
|
|
23
|
-
self.chunk_size = chunk_size
|
|
24
|
-
self.chunk_overlap = chunk_overlap
|
|
25
|
-
|
|
26
|
-
async def chunk(self, text: str) -> list[str]:
|
|
27
|
-
"""Split the text into chunks based on token boundaries.
|
|
28
|
-
|
|
29
|
-
Args:
|
|
30
|
-
text: The text to be split into chunks.
|
|
31
|
-
|
|
32
|
-
Returns:
|
|
33
|
-
A list of text chunks with token-based boundaries and overlap.
|
|
34
|
-
"""
|
|
35
|
-
if not text:
|
|
36
|
-
return []
|
|
37
|
-
|
|
38
|
-
encoded_tokens = self.encoder.encode(text, disallowed_special=())
|
|
39
|
-
|
|
40
|
-
if self.chunk_size > len(encoded_tokens):
|
|
41
|
-
return [text]
|
|
42
|
-
|
|
43
|
-
chunks = []
|
|
44
|
-
i = 0
|
|
45
|
-
split_id_counter = 0
|
|
46
|
-
while i < len(encoded_tokens):
|
|
47
|
-
# Overlap
|
|
48
|
-
start_i = i
|
|
49
|
-
end_i = min(i + self.chunk_size, len(encoded_tokens))
|
|
50
|
-
|
|
51
|
-
chunk_tokens = encoded_tokens[start_i:end_i]
|
|
52
|
-
chunk_text = self.encoder.decode(chunk_tokens)
|
|
53
|
-
|
|
54
|
-
chunks.append(chunk_text)
|
|
55
|
-
split_id_counter += 1
|
|
56
|
-
|
|
57
|
-
# Exit loop if this was the last possible chunk
|
|
58
|
-
if end_i == len(encoded_tokens):
|
|
59
|
-
break
|
|
60
|
-
|
|
61
|
-
i += (
|
|
62
|
-
self.chunk_size - self.chunk_overlap
|
|
63
|
-
) # Step forward, considering overlap
|
|
64
|
-
return chunks
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
chunker = Chunker()
|
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
import pytest
|
|
2
|
-
from datasets import Dataset
|
|
3
|
-
|
|
4
|
-
from haiku.rag.chunker import Chunker
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
@pytest.mark.asyncio
|
|
8
|
-
async def test_chunker(qa_corpus: Dataset):
|
|
9
|
-
chunker = Chunker()
|
|
10
|
-
doc = qa_corpus[0]["document_extracted"]
|
|
11
|
-
chunks = await Chunker().chunk(doc)
|
|
12
|
-
|
|
13
|
-
# Ensure that the text is split into multiple chunks
|
|
14
|
-
assert len(chunks) > 1
|
|
15
|
-
|
|
16
|
-
# Ensure that each chunk corresponds to roughly Config.CHUNK_SIZE tokens
|
|
17
|
-
for chunk in chunks[:-1]:
|
|
18
|
-
encoded_tokens = Chunker.encoder.encode(chunk, disallowed_special=())
|
|
19
|
-
assert len(encoded_tokens) <= Chunker().chunk_size
|
|
20
|
-
assert len(encoded_tokens) > Chunker().chunk_size * 0.9
|
|
21
|
-
|
|
22
|
-
# Ensure that the last chunk is less than Config.CHUNK_SIZE tokens
|
|
23
|
-
assert (
|
|
24
|
-
len(Chunker.encoder.encode(chunks[-1], disallowed_special=()))
|
|
25
|
-
< Chunker().chunk_size
|
|
26
|
-
)
|
|
27
|
-
|
|
28
|
-
# Test overlap between consecutive chunks
|
|
29
|
-
for i in range(len(chunks) - 1):
|
|
30
|
-
current_chunk = chunks[i]
|
|
31
|
-
next_chunk = chunks[i + 1]
|
|
32
|
-
|
|
33
|
-
current_tokens = Chunker.encoder.encode(current_chunk, disallowed_special=())
|
|
34
|
-
next_tokens = Chunker.encoder.encode(next_chunk, disallowed_special=())
|
|
35
|
-
|
|
36
|
-
overlap_size = min(chunker.chunk_overlap, len(current_tokens))
|
|
37
|
-
current_overlap_tokens = current_tokens[-overlap_size:]
|
|
38
|
-
next_overlap_tokens = next_tokens[:overlap_size]
|
|
39
|
-
|
|
40
|
-
# The overlapping tokens should be identical
|
|
41
|
-
assert current_overlap_tokens == next_overlap_tokens
|
|
42
|
-
assert len(current_overlap_tokens) == min(
|
|
43
|
-
chunker.chunk_overlap, len(current_tokens)
|
|
44
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|