haiku.rag 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic. Click here for more details.
- haiku/rag/chunker.py +24 -31
- haiku/rag/config.py +0 -1
- haiku/rag/reader.py +2 -1
- haiku/rag/store/repositories/settings.py +0 -1
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.0.dist-info}/METADATA +1 -1
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.0.dist-info}/RECORD +9 -9
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.0.dist-info}/WHEEL +0 -0
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/chunker.py
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
|
+
from io import BytesIO
|
|
1
2
|
from typing import ClassVar
|
|
2
3
|
|
|
3
4
|
import tiktoken
|
|
5
|
+
from docling.chunking import HybridChunker # type: ignore
|
|
6
|
+
from docling.document_converter import DocumentConverter
|
|
7
|
+
from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
|
|
8
|
+
from docling_core.types.io import DocumentStream
|
|
4
9
|
|
|
5
10
|
from haiku.rag.config import Config
|
|
6
11
|
|
|
@@ -8,9 +13,11 @@ from haiku.rag.config import Config
|
|
|
8
13
|
class Chunker:
|
|
9
14
|
"""A class that chunks text into smaller pieces for embedding and retrieval.
|
|
10
15
|
|
|
16
|
+
Uses docling's structure-aware chunking to create semantically meaningful chunks
|
|
17
|
+
that respect document boundaries.
|
|
18
|
+
|
|
11
19
|
Args:
|
|
12
20
|
chunk_size: The maximum size of a chunk in tokens.
|
|
13
|
-
chunk_overlap: The number of tokens of overlap between chunks.
|
|
14
21
|
"""
|
|
15
22
|
|
|
16
23
|
encoder: ClassVar[tiktoken.Encoding] = tiktoken.encoding_for_model("gpt-4o")
|
|
@@ -18,50 +25,36 @@ class Chunker:
|
|
|
18
25
|
def __init__(
|
|
19
26
|
self,
|
|
20
27
|
chunk_size: int = Config.CHUNK_SIZE,
|
|
21
|
-
chunk_overlap: int = Config.CHUNK_OVERLAP,
|
|
22
28
|
):
|
|
23
29
|
self.chunk_size = chunk_size
|
|
24
|
-
|
|
30
|
+
tokenizer = OpenAITokenizer(
|
|
31
|
+
tokenizer=tiktoken.encoding_for_model("gpt-4o"), max_tokens=chunk_size
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
self.chunker = HybridChunker(tokenizer=tokenizer) # type: ignore
|
|
25
35
|
|
|
26
36
|
async def chunk(self, text: str) -> list[str]:
|
|
27
|
-
"""Split the text into chunks
|
|
37
|
+
"""Split the text into chunks using docling's structure-aware chunking.
|
|
28
38
|
|
|
29
39
|
Args:
|
|
30
40
|
text: The text to be split into chunks.
|
|
31
41
|
|
|
32
42
|
Returns:
|
|
33
|
-
A list of text chunks with
|
|
43
|
+
A list of text chunks with semantic boundaries.
|
|
34
44
|
"""
|
|
35
45
|
if not text:
|
|
36
46
|
return []
|
|
37
47
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
i = 0
|
|
45
|
-
split_id_counter = 0
|
|
46
|
-
while i < len(encoded_tokens):
|
|
47
|
-
# Overlap
|
|
48
|
-
start_i = i
|
|
49
|
-
end_i = min(i + self.chunk_size, len(encoded_tokens))
|
|
50
|
-
|
|
51
|
-
chunk_tokens = encoded_tokens[start_i:end_i]
|
|
52
|
-
chunk_text = self.encoder.decode(chunk_tokens)
|
|
53
|
-
|
|
54
|
-
chunks.append(chunk_text)
|
|
55
|
-
split_id_counter += 1
|
|
56
|
-
|
|
57
|
-
# Exit loop if this was the last possible chunk
|
|
58
|
-
if end_i == len(encoded_tokens):
|
|
59
|
-
break
|
|
48
|
+
# Convert to docling document
|
|
49
|
+
bytes_io = BytesIO(text.encode("utf-8"))
|
|
50
|
+
doc_stream = DocumentStream(name="text.md", stream=bytes_io)
|
|
51
|
+
converter = DocumentConverter()
|
|
52
|
+
result = converter.convert(doc_stream)
|
|
53
|
+
doc = result.document
|
|
60
54
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
return chunks
|
|
55
|
+
# Chunk using docling's hybrid chunker
|
|
56
|
+
chunks = list(self.chunker.chunk(doc))
|
|
57
|
+
return [self.chunker.contextualize(chunk) for chunk in chunks]
|
|
65
58
|
|
|
66
59
|
|
|
67
60
|
chunker = Chunker()
|
haiku/rag/config.py
CHANGED
haiku/rag/reader.py
CHANGED
{haiku_rag-0.4.3.dist-info → haiku_rag-0.5.0.dist-info}/RECORD
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
2
|
haiku/rag/app.py,sha256=FpLVyP1-zAq_XPmU8CPVLkuIAeuhBOGvMqhYS8RbN40,7649
|
|
3
|
-
haiku/rag/chunker.py,sha256=
|
|
3
|
+
haiku/rag/chunker.py,sha256=P2slbmoABygYRlqjOGzPBEOYsBZNTnNpE6bnW_dkVOE,1850
|
|
4
4
|
haiku/rag/cli.py,sha256=k7EhLkvTncxsdh5TYrg8BHLYh_lfyzupsWGj1dEEdqY,5992
|
|
5
5
|
haiku/rag/client.py,sha256=MZNIpMm6MS3P6vjLqiCztT2dBOM7-bZOosX5IpbHJbI,12724
|
|
6
|
-
haiku/rag/config.py,sha256=
|
|
6
|
+
haiku/rag/config.py,sha256=GXTWC3vasBMaWju-yh8Es3CidBz1ftqRH6E5PHpgsSQ,1634
|
|
7
7
|
haiku/rag/logging.py,sha256=zTTGpGq5tPdcd7RpCbd9EGw1IZlQDbYkrCg9t9pqRc4,580
|
|
8
8
|
haiku/rag/mcp.py,sha256=tMN6fNX7ZtAER1R6DL1GkC9HZozTC4HzuQs199p7icI,4551
|
|
9
9
|
haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
|
|
10
|
-
haiku/rag/reader.py,sha256=
|
|
10
|
+
haiku/rag/reader.py,sha256=s5dinZ-WffioiRH7OWZtE2v7FHRPd1PkqpPYsXtwqtc,2927
|
|
11
11
|
haiku/rag/utils.py,sha256=Ez_tvNlRO_D8c2CBZ83Hs9Gmzcqdq4cmw_V5GBdKy_8,2214
|
|
12
12
|
haiku/rag/embeddings/__init__.py,sha256=yFBlxS0jBiVHl_rWz5kb43t6Ha132U1ZGdlIPfhzPdg,1491
|
|
13
13
|
haiku/rag/embeddings/base.py,sha256=NTQvuzbZPu0LBo5wAu3qGyJ4xXUaRAt1fjBO0ygWn_Y,465
|
|
@@ -33,11 +33,11 @@ haiku/rag/store/repositories/__init__.py,sha256=uIBhxjQh-4o3O-ck8b7BQ58qXQTuJdPv
|
|
|
33
33
|
haiku/rag/store/repositories/base.py,sha256=cm3VyQXhtxvRfk1uJHpA0fDSxMpYN-mjQmRiDiLsQ68,1008
|
|
34
34
|
haiku/rag/store/repositories/chunk.py,sha256=UyvHhKb1ESZePoTp2GneAARdfKoocEdfPOwgWPPQ0v8,16878
|
|
35
35
|
haiku/rag/store/repositories/document.py,sha256=fXIWevJaOe6x2cK4u9cQxiEGD0ntKQb9y3VRqklQypE,7920
|
|
36
|
-
haiku/rag/store/repositories/settings.py,sha256=
|
|
36
|
+
haiku/rag/store/repositories/settings.py,sha256=qZLXvLsErnCWL0nBQQNfRnatHzCKhtUDLvUK9k-W_fU,2463
|
|
37
37
|
haiku/rag/store/upgrades/__init__.py,sha256=kKS1YWT_P-CYKhKtokOLTIFNKf9jlfjFFr8lyIMeogM,100
|
|
38
38
|
haiku/rag/store/upgrades/v0_3_4.py,sha256=GLogKZdZ40NX1vBHKdOJju7fFzNUCHoEnjSZg17Hm2U,663
|
|
39
|
-
haiku_rag-0.
|
|
40
|
-
haiku_rag-0.
|
|
41
|
-
haiku_rag-0.
|
|
42
|
-
haiku_rag-0.
|
|
43
|
-
haiku_rag-0.
|
|
39
|
+
haiku_rag-0.5.0.dist-info/METADATA,sha256=Z29lOzGgaD2PJ6OxZc53QuMzFdosEZCdm7HZYOUNN3M,4198
|
|
40
|
+
haiku_rag-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
41
|
+
haiku_rag-0.5.0.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
|
|
42
|
+
haiku_rag-0.5.0.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
|
|
43
|
+
haiku_rag-0.5.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|