haiku.rag 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of haiku.rag might be problematic.
- haiku/rag/chunker.py +19 -35
- haiku/rag/cli.py +14 -11
- haiku/rag/client.py +89 -19
- haiku/rag/config.py +1 -2
- haiku/rag/reader.py +13 -7
- haiku/rag/store/repositories/chunk.py +5 -3
- haiku/rag/store/repositories/document.py +29 -7
- haiku/rag/store/repositories/settings.py +0 -1
- haiku/rag/utils.py +21 -0
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/METADATA +2 -2
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/RECORD +14 -14
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/WHEEL +0 -0
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/licenses/LICENSE +0 -0
haiku/rag/chunker.py
CHANGED
@@ -1,6 +1,9 @@
 from typing import ClassVar
 
 import tiktoken
+from docling.chunking import HybridChunker  # type: ignore
+from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
+from docling_core.types.doc.document import DoclingDocument
 
 from haiku.rag.config import Config
 
@@ -8,9 +11,11 @@ from haiku.rag.config import Config
 class Chunker:
     """A class that chunks text into smaller pieces for embedding and retrieval.
 
+    Uses docling's structure-aware chunking to create semantically meaningful chunks
+    that respect document boundaries.
+
     Args:
         chunk_size: The maximum size of a chunk in tokens.
-        chunk_overlap: The number of tokens of overlap between chunks.
     """
 
     encoder: ClassVar[tiktoken.Encoding] = tiktoken.encoding_for_model("gpt-4o")
@@ -18,50 +23,29 @@ class Chunker:
     def __init__(
         self,
         chunk_size: int = Config.CHUNK_SIZE,
-        chunk_overlap: int = Config.CHUNK_OVERLAP,
     ):
         self.chunk_size = chunk_size
-
+        tokenizer = OpenAITokenizer(
+            tokenizer=tiktoken.encoding_for_model("gpt-4o"), max_tokens=chunk_size
+        )
+
+        self.chunker = HybridChunker(tokenizer=tokenizer)  # type: ignore
 
-    async def chunk(self,
-        """Split the
+    async def chunk(self, document: DoclingDocument) -> list[str]:
+        """Split the document into chunks using docling's structure-aware chunking.
 
         Args:
-
+            document: The DoclingDocument to be split into chunks.
 
         Returns:
-            A list of text chunks with
+            A list of text chunks with semantic boundaries.
         """
-        if
+        if document is None:
             return []
 
-
-
-
-            return [text]
-
-        chunks = []
-        i = 0
-        split_id_counter = 0
-        while i < len(encoded_tokens):
-            # Overlap
-            start_i = i
-            end_i = min(i + self.chunk_size, len(encoded_tokens))
-
-            chunk_tokens = encoded_tokens[start_i:end_i]
-            chunk_text = self.encoder.decode(chunk_tokens)
-
-            chunks.append(chunk_text)
-            split_id_counter += 1
-
-            # Exit loop if this was the last possible chunk
-            if end_i == len(encoded_tokens):
-                break
-
-            i += (
-                self.chunk_size - self.chunk_overlap
-            )  # Step forward, considering overlap
-        return chunks
+        # Chunk using docling's hybrid chunker
+        chunks = list(self.chunker.chunk(document))
+        return [self.chunker.contextualize(chunk) for chunk in chunks]
 
 
 chunker = Chunker()
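
A minimal usage sketch of the new chunking flow, based only on the signatures visible in this diff (assumes docling and its OpenAI tokenizer extra are installed; the text is illustrative):

import asyncio

from haiku.rag.chunker import Chunker
from haiku.rag.utils import text_to_docling_document


async def main() -> None:
    # Build a DoclingDocument from markdown text, then chunk it with docling's
    # structure-aware HybridChunker; the old fixed-size tiktoken window and its
    # chunk_overlap parameter are gone.
    document = text_to_docling_document("# Title\n\nA paragraph to index.")
    chunker = Chunker(chunk_size=256)
    chunks = await chunker.chunk(document)
    for chunk in chunks:
        print(chunk)


asyncio.run(main())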
haiku/rag/cli.py
CHANGED
@@ -1,4 +1,5 @@
 import asyncio
+import warnings
 from importlib.metadata import version
 from pathlib import Path
 
@@ -9,12 +10,14 @@ from haiku.rag.app import HaikuRAGApp
 from haiku.rag.config import Config
 from haiku.rag.utils import is_up_to_date
 
+if not Config.ENV == "development":
+    warnings.filterwarnings("ignore")
+
 cli = typer.Typer(
     context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True
 )
 
 console = Console()
-event_loop = asyncio.get_event_loop()
 
 
 async def check_version():
@@ -46,7 +49,7 @@ def main(
 ):
     """haiku.rag CLI - SQLite-based RAG system"""
     # Run version check before any command
-
+    asyncio.run(check_version())
 
 
 @cli.command("list", help="List all stored documents")
@@ -58,7 +61,7 @@ def list_documents(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.list_documents())
 
 
 @cli.command("add", help="Add a document from text input")
@@ -73,7 +76,7 @@ def add_document_text(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.add_document_from_text(text=text))
 
 
 @cli.command("add-src", help="Add a document from a file path or URL")
@@ -88,7 +91,7 @@ def add_document_src(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.add_document_from_source(file_path=file_path))
 
 
 @cli.command("get", help="Get and display a document by its ID")
@@ -103,7 +106,7 @@ def get_document(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.get_document(doc_id=doc_id))
 
 
 @cli.command("delete", help="Delete a document by its ID")
@@ -118,7 +121,7 @@ def delete_document(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.delete_document(doc_id=doc_id))
 
 
 @cli.command("search", help="Search for documents by a query")
@@ -144,7 +147,7 @@ def search(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.search(query=query, limit=limit, k=k))
 
 
 @cli.command("ask", help="Ask a question using the QA agent")
@@ -159,7 +162,7 @@ def ask(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.ask(question=question))
 
 
 @cli.command("settings", help="Display current configuration settings")
@@ -180,7 +183,7 @@ def rebuild(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-
+    asyncio.run(app.rebuild())
 
 
 @cli.command(
@@ -216,7 +219,7 @@ def serve(
     elif sse:
         transport = "sse"
 
-
+    asyncio.run(app.serve(transport=transport))
 
 
 if __name__ == "__main__":
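
The module-level event loop is gone; every command now wraps its coroutine in asyncio.run(). A generic sketch of the pattern (not the module's exact code):

import asyncio


async def check_version() -> None:
    print("checking for a newer release...")


# Old pattern: cache a loop at import time and reuse it. Calling
# asyncio.get_event_loop() with no running loop is deprecated in
# recent Python versions.
#
#   event_loop = asyncio.get_event_loop()
#   event_loop.run_until_complete(check_version())

# New pattern: asyncio.run() creates a fresh event loop, runs the
# coroutine to completion, and closes the loop when done.
asyncio.run(check_version())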
haiku/rag/client.py
CHANGED
@@ -16,6 +16,7 @@ from haiku.rag.store.models.chunk import Chunk
 from haiku.rag.store.models.document import Document
 from haiku.rag.store.repositories.chunk import ChunkRepository
 from haiku.rag.store.repositories.document import DocumentRepository
+from haiku.rag.utils import text_to_docling_document
 
 
 class HaikuRAG:
@@ -49,6 +50,24 @@ class HaikuRAG:
         self.close()
         return False
 
+    async def _create_document_with_docling(
+        self,
+        docling_document,
+        uri: str | None = None,
+        metadata: dict | None = None,
+        chunks: list[Chunk] | None = None,
+    ) -> Document:
+        """Create a new document from DoclingDocument."""
+        content = docling_document.export_to_markdown()
+        document = Document(
+            content=content,
+            uri=uri,
+            metadata=metadata or {},
+        )
+        return await self.document_repository._create_with_docling(
+            document, docling_document, chunks
+        )
+
     async def create_document(
         self,
         content: str,
@@ -67,12 +86,17 @@ class HaikuRAG:
         Returns:
             The created Document instance.
         """
+        # Convert content to DoclingDocument for processing
+        docling_document = text_to_docling_document(content)
+
         document = Document(
             content=content,
             uri=uri,
             metadata=metadata or {},
         )
-        return await self.document_repository.
+        return await self.document_repository._create_with_docling(
+            document, docling_document, chunks
+        )
 
     async def create_document_from_source(
         self, source: str | Path, metadata: dict = {}
@@ -101,16 +125,19 @@ class HaikuRAG:
         parsed_url = urlparse(source_str)
         if parsed_url.scheme in ("http", "https"):
             return await self._create_or_update_document_from_url(source_str, metadata)
-
-
-
+        elif parsed_url.scheme == "file":
+            # Handle file:// URI by converting to path
+            source_path = Path(parsed_url.path)
+        else:
+            # Handle as regular file path
+            source_path = Path(source) if isinstance(source, str) else source
         if source_path.suffix.lower() not in FileReader.extensions:
             raise ValueError(f"Unsupported file extension: {source_path.suffix}")
 
         if not source_path.exists():
             raise ValueError(f"File does not exist: {source_path}")
 
-        uri = source_path.as_uri()
+        uri = source_path.absolute().as_uri()
        md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()
 
        # Check if document already exists
@@ -119,7 +146,7 @@ class HaikuRAG:
             # MD5 unchanged, return existing document
             return existing_doc
 
-
+        docling_document = FileReader.parse_file(source_path)
 
         # Get content type from file extension
         content_type, _ = mimetypes.guess_type(str(source_path))
@@ -131,13 +158,15 @@ class HaikuRAG:
 
         if existing_doc:
             # Update existing document
-            existing_doc.content =
+            existing_doc.content = docling_document.export_to_markdown()
             existing_doc.metadata = metadata
-            return await self.
+            return await self.document_repository._update_with_docling(
+                existing_doc, docling_document
+            )
         else:
-            # Create new document
-            return await self.
-
+            # Create new document using DoclingDocument
+            return await self._create_document_with_docling(
+                docling_document=docling_document, uri=uri, metadata=metadata
             )
 
     async def _create_or_update_document_from_url(
@@ -193,18 +222,20 @@ class HaikuRAG:
             temp_path = Path(temp_file.name)
 
             # Parse the content using FileReader
-
+            docling_document = FileReader.parse_file(temp_path)
 
             # Merge metadata with contentType and md5
             metadata.update({"contentType": content_type, "md5": md5_hash})
 
             if existing_doc:
-                existing_doc.content =
+                existing_doc.content = docling_document.export_to_markdown()
                 existing_doc.metadata = metadata
-                return await self.
+                return await self.document_repository._update_with_docling(
+                    existing_doc, docling_document
+                )
             else:
-                return await self.
-
+                return await self._create_document_with_docling(
+                    docling_document=docling_document, uri=url, metadata=metadata
                 )
 
     def _get_extension_from_content_type_or_url(
@@ -262,7 +293,12 @@ class HaikuRAG:
 
     async def update_document(self, document: Document) -> Document:
         """Update an existing document."""
-
+        # Convert content to DoclingDocument
+        docling_document = text_to_docling_document(document.content)
+
+        return await self.document_repository._update_with_docling(
+            document, docling_document
+        )
 
     async def delete_document(self, document_id: int) -> bool:
         """Delete a document by its ID."""
@@ -328,6 +364,13 @@ class HaikuRAG:
     async def rebuild_database(self) -> AsyncGenerator[int, None]:
         """Rebuild the database by deleting all chunks and re-indexing all documents.
 
+        For documents with URIs:
+        - Deletes the document and re-adds it from source if source exists
+        - Skips documents where source no longer exists
+
+        For documents without URIs:
+        - Re-creates chunks from existing content
+
         Yields:
             int: The ID of the document currently being processed
         """
@@ -343,9 +386,36 @@ class HaikuRAG:
         documents = await self.list_documents()
 
         for doc in documents:
-
+            assert doc.id is not None, "Document ID should not be None"
+            if doc.uri:
+                # Document has a URI - delete and try to re-add from source
+                try:
+                    # Delete the old document first
+                    await self.delete_document(doc.id)
+
+                    # Try to re-create from source (this creates the document with chunks)
+                    new_doc = await self.create_document_from_source(
+                        doc.uri, doc.metadata or {}
+                    )
+
+                    assert new_doc.id is not None, "New document ID should not be None"
+                    yield new_doc.id
+
+                except (FileNotFoundError, ValueError, OSError) as e:
+                    # Source doesn't exist or can't be accessed - document already deleted, skip
+                    print(f"Skipping document with URI {doc.uri}: {e}")
+                    continue
+                except Exception as e:
+                    # Unexpected error - log it and skip
+                    print(
+                        f"Unexpected error processing document with URI {doc.uri}: {e}"
+                    )
+                    continue
+            else:
+                # Document without URI - re-create chunks from existing content
+                docling_document = text_to_docling_document(doc.content)
                 await self.chunk_repository.create_chunks_for_document(
-                    doc.id,
+                    doc.id, docling_document, commit=False
                 )
                 yield doc.id
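
A sketch of the new source handling from the caller's side. The constructor argument is an assumption (it is not shown in this diff); the dedup behavior follows the MD5 check above:

import asyncio
from pathlib import Path

from haiku.rag.client import HaikuRAG


async def main() -> None:
    client = HaikuRAG("haiku.db")  # assumed constructor; adjust to your setup
    try:
        # Plain paths and file:// URIs now normalize to the same absolute URI,
        # and unchanged files are deduplicated by MD5, so the second call
        # should return the existing document instead of re-ingesting it.
        doc1 = await client.create_document_from_source("docs/notes.md")
        uri = Path("docs/notes.md").absolute().as_uri()
        doc2 = await client.create_document_from_source(uri)
        print(doc1.id, doc2.id)
    finally:
        client.close()


asyncio.run(main())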
haiku/rag/config.py
CHANGED
@@ -10,7 +10,7 @@ load_dotenv()
 
 
 class AppConfig(BaseModel):
-    ENV: str = "
+    ENV: str = "production"
 
     DEFAULT_DATA_DIR: Path = get_default_data_dir()
     MONITOR_DIRECTORIES: list[Path] = []
@@ -27,7 +27,6 @@ class AppConfig(BaseModel):
     QA_MODEL: str = "qwen3"
 
     CHUNK_SIZE: int = 256
-    CHUNK_OVERLAP: int = 32
 
     OLLAMA_BASE_URL: str = "http://localhost:11434"
 
haiku/rag/reader.py
CHANGED
@@ -2,6 +2,9 @@ from pathlib import Path
 from typing import ClassVar
 
 from docling.document_converter import DocumentConverter
+from docling_core.types.doc.document import DoclingDocument
+
+from haiku.rag.utils import text_to_docling_document
 
 
 class FileReader:
@@ -16,7 +19,8 @@ class FileReader:
         ".jpeg",
         ".jpg",
         ".md",
-        ".pdf
+        ".pdf",
+        ".png",
         ".pptx",
         ".tiff",
         ".xlsx",
@@ -83,7 +87,7 @@ class FileReader:
     extensions: ClassVar[list[str]] = docling_extensions + text_extensions
 
     @staticmethod
-    def parse_file(path: Path) ->
+    def parse_file(path: Path) -> DoclingDocument:
         try:
             file_extension = path.suffix.lower()
 
@@ -91,7 +95,7 @@ class FileReader:
                 # Use docling for complex document formats
                 converter = DocumentConverter()
                 result = converter.convert(path)
-                return result.document
+                return result.document
             elif file_extension in FileReader.text_extensions:
                 # Read plain text files directly
                 content = path.read_text(encoding="utf-8")
@@ -99,11 +103,13 @@ class FileReader:
                 # Wrap code files (but not plain txt) in markdown code blocks for better presentation
                 if file_extension in FileReader.code_markdown_identifier:
                     language = FileReader.code_markdown_identifier[file_extension]
-
+                    content = f"```{language}\n{content}\n```"
 
-
+                # Convert text to DoclingDocument by wrapping as markdown
+                return text_to_docling_document(content, name=f"{path.stem}.md")
             else:
-                # Fallback: try to read as text
-
+                # Fallback: try to read as text and convert to DoclingDocument
+                content = path.read_text(encoding="utf-8")
+                return text_to_docling_document(content, name=f"{path.stem}.md")
         except Exception:
             raise ValueError(f"Failed to parse file: {path}")
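
With parse_file now returning a DoclingDocument in every branch, callers get one type regardless of input format; a brief sketch (file name illustrative):

from pathlib import Path

from haiku.rag.reader import FileReader

# Rich formats go through docling's DocumentConverter; code and plain-text
# files are wrapped as markdown and converted via text_to_docling_document.
doc = FileReader.parse_file(Path("report.pdf"))
print(doc.export_to_markdown()[:200])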
haiku/rag/store/repositories/chunk.py
CHANGED
@@ -1,6 +1,8 @@
 import json
 import re
 
+from docling_core.types.doc.document import DoclingDocument
+
 from haiku.rag.chunker import chunker
 from haiku.rag.embeddings import get_embedder
 from haiku.rag.store.models.chunk import Chunk
@@ -197,11 +199,11 @@ class ChunkRepository(BaseRepository[Chunk]):
     ]
 
     async def create_chunks_for_document(
-        self, document_id: int,
+        self, document_id: int, document: DoclingDocument, commit: bool = True
     ) -> list[Chunk]:
-        """Create chunks and embeddings for a document."""
+        """Create chunks and embeddings for a document from DoclingDocument."""
         # Chunk the document content
-        chunk_texts = await chunker.chunk(
+        chunk_texts = await chunker.chunk(document)
         created_chunks = []
 
         # Create chunks with embeddings using the create method
haiku/rag/store/repositories/document.py
CHANGED
@@ -1,8 +1,11 @@
 import json
 from typing import TYPE_CHECKING
 
+from docling_core.types.doc.document import DoclingDocument
+
 from haiku.rag.store.models.document import Document
 from haiku.rag.store.repositories.base import BaseRepository
+from haiku.rag.utils import text_to_docling_document
 
 if TYPE_CHECKING:
     from haiku.rag.store.models.chunk import Chunk
@@ -20,8 +23,11 @@ class DocumentRepository(BaseRepository[Document]):
         chunk_repository = ChunkRepository(store)
         self.chunk_repository = chunk_repository
 
-    async def
-        self,
+    async def _create_with_docling(
+        self,
+        entity: Document,
+        docling_document: DoclingDocument,
+        chunks: list["Chunk"] | None = None,
     ) -> Document:
         """Create a document with its chunks and embeddings."""
         if self.store._connection is None:
@@ -62,9 +68,9 @@ class DocumentRepository(BaseRepository[Document]):
                 chunk.metadata["order"] = order
                 await self.chunk_repository.create(chunk, commit=False)
             else:
-                # Create chunks and embeddings using
+                # Create chunks and embeddings using DoclingDocument
                 await self.chunk_repository.create_chunks_for_document(
-                    document_id,
+                    document_id, docling_document, commit=False
                 )
 
             cursor.execute("COMMIT")
@@ -74,6 +80,13 @@ class DocumentRepository(BaseRepository[Document]):
             cursor.execute("ROLLBACK")
             raise
 
+    async def create(self, entity: Document) -> Document:
+        """Create a document with its chunks and embeddings."""
+        # Convert content to DoclingDocument
+        docling_document = text_to_docling_document(entity.content)
+
+        return await self._create_with_docling(entity, docling_document)
+
     async def get_by_id(self, entity_id: int) -> Document | None:
         """Get a document by its ID."""
         if self.store._connection is None:
@@ -134,7 +147,9 @@ class DocumentRepository(BaseRepository[Document]):
             updated_at=updated_at,
         )
 
-    async def
+    async def _update_with_docling(
+        self, entity: Document, docling_document: DoclingDocument
+    ) -> Document:
         """Update an existing document and regenerate its chunks and embeddings."""
         if self.store._connection is None:
             raise ValueError("Store connection is not available")
@@ -163,10 +178,10 @@ class DocumentRepository(BaseRepository[Document]):
             },
         )
 
-        # Delete existing chunks and regenerate using
+        # Delete existing chunks and regenerate using DoclingDocument
         await self.chunk_repository.delete_by_document_id(entity.id, commit=False)
         await self.chunk_repository.create_chunks_for_document(
-            entity.id,
+            entity.id, docling_document, commit=False
         )
 
         cursor.execute("COMMIT")
@@ -176,6 +191,13 @@ class DocumentRepository(BaseRepository[Document]):
             cursor.execute("ROLLBACK")
             raise
 
+    async def update(self, entity: Document) -> Document:
+        """Update an existing document and regenerate its chunks and embeddings."""
+        # Convert content to DoclingDocument
+        docling_document = text_to_docling_document(entity.content)
+
+        return await self._update_with_docling(entity, docling_document)
+
     async def delete(self, entity_id: int) -> bool:
         """Delete a document and all its associated chunks and embeddings."""
         # Delete chunks and embeddings first
haiku/rag/utils.py
CHANGED
@@ -1,8 +1,12 @@
 import sys
 from importlib import metadata
+from io import BytesIO
 from pathlib import Path
 
 import httpx
+from docling.document_converter import DocumentConverter
+from docling_core.types.doc.document import DoclingDocument
+from docling_core.types.io import DocumentStream
 from packaging.version import Version, parse
 
 
@@ -77,3 +81,20 @@ async def is_up_to_date() -> tuple[bool, Version, Version]:
         # If no network connection, do not raise alarms.
         pypi_version = running_version
     return running_version >= pypi_version, running_version, pypi_version
+
+
+def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocument:
+    """Convert text content to a DoclingDocument.
+
+    Args:
+        text: The text content to convert.
+        name: The name to use for the document stream (defaults to "content.md").
+
+    Returns:
+        A DoclingDocument created from the text content.
+    """
+    bytes_io = BytesIO(text.encode("utf-8"))
+    doc_stream = DocumentStream(name=name, stream=bytes_io)
+    converter = DocumentConverter()
+    result = converter.convert(doc_stream)
+    return result.document
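
This helper is the bridge used throughout the release wherever raw text has to enter the docling pipeline; a quick sketch:

from haiku.rag.utils import text_to_docling_document

# The default stream name ends in ".md", so the text is parsed as markdown
# and headings and lists survive into the DoclingDocument structure.
doc = text_to_docling_document("# Heading\n\n- one\n- two")
print(doc.export_to_markdown())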
{haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haiku.rag
-Version: 0.4.3
+Version: 0.5.1
 Summary: Retrieval Augmented Generation (RAG) with SQLite
 Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
 License: MIT
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Typing :: Typed
-Requires-Python: >=3.
+Requires-Python: >=3.11
 Requires-Dist: docling>=2.15.0
 Requires-Dist: fastmcp>=2.8.1
 Requires-Dist: httpx>=0.28.1
{haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 haiku/rag/app.py,sha256=FpLVyP1-zAq_XPmU8CPVLkuIAeuhBOGvMqhYS8RbN40,7649
-haiku/rag/chunker.py,sha256=
-haiku/rag/cli.py,sha256=
-haiku/rag/client.py,sha256=
-haiku/rag/config.py,sha256=
+haiku/rag/chunker.py,sha256=PVe6ysv8UlacUd4Zb3_8RFWIaWDXnzBAy2VDJ4TaUsE,1555
+haiku/rag/cli.py,sha256=rk4uUwN_FdMC-rai9_R2sgXXMI3TIWKRtdWWHg_WoWM,5865
+haiku/rag/client.py,sha256=pFcrPkQo1h1zJ76jts-72goP_kGVtnJNfLuoT8qpsb8,15795
+haiku/rag/config.py,sha256=8mlQ8gYFxxq1q9gi9tjY9StjqhfhiHkO1FvS4b0et0E,1633
 haiku/rag/logging.py,sha256=zTTGpGq5tPdcd7RpCbd9EGw1IZlQDbYkrCg9t9pqRc4,580
 haiku/rag/mcp.py,sha256=tMN6fNX7ZtAER1R6DL1GkC9HZozTC4HzuQs199p7icI,4551
 haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
-haiku/rag/reader.py,sha256=
-haiku/rag/utils.py,sha256=
+haiku/rag/reader.py,sha256=qkPTMJuQ_o4sK-8zpDl9WFYe_MJ7aL_gUw6rczIpW-g,3274
+haiku/rag/utils.py,sha256=g-uNTG60iBLgkeHHuah6eVZEkX3NFLs-LZU1YnzJzLQ,2967
 haiku/rag/embeddings/__init__.py,sha256=yFBlxS0jBiVHl_rWz5kb43t6Ha132U1ZGdlIPfhzPdg,1491
 haiku/rag/embeddings/base.py,sha256=NTQvuzbZPu0LBo5wAu3qGyJ4xXUaRAt1fjBO0ygWn_Y,465
 haiku/rag/embeddings/ollama.py,sha256=y6-lp0XpbnyIjoOEdtSzMdEVkU5glOwnWQ1FkpUZnpI,370
@@ -31,13 +31,13 @@ haiku/rag/store/models/chunk.py,sha256=9-vIxW75-kMTelIhgVIMd_WhP-Drc1q65vjaWMP8w
 haiku/rag/store/models/document.py,sha256=TVXVY-nQs-1vCORQEs9rA7zOtndeGC4dgCoujLAS054,396
 haiku/rag/store/repositories/__init__.py,sha256=uIBhxjQh-4o3O-ck8b7BQ58qXQTuJdPvrDIHVhY5T1A,263
 haiku/rag/store/repositories/base.py,sha256=cm3VyQXhtxvRfk1uJHpA0fDSxMpYN-mjQmRiDiLsQ68,1008
-haiku/rag/store/repositories/chunk.py,sha256=
-haiku/rag/store/repositories/document.py,sha256=
-haiku/rag/store/repositories/settings.py,sha256=
+haiku/rag/store/repositories/chunk.py,sha256=DIIdpHVemvxZOPHOLBL7pJGWY4VyNrUiQSWPWt24BYo,16974
+haiku/rag/store/repositories/document.py,sha256=ki8LiDukwU1469Yw51i0rQFvBzUQeYkFYWs3Ly83akc,8815
+haiku/rag/store/repositories/settings.py,sha256=qZLXvLsErnCWL0nBQQNfRnatHzCKhtUDLvUK9k-W_fU,2463
 haiku/rag/store/upgrades/__init__.py,sha256=kKS1YWT_P-CYKhKtokOLTIFNKf9jlfjFFr8lyIMeogM,100
 haiku/rag/store/upgrades/v0_3_4.py,sha256=GLogKZdZ40NX1vBHKdOJju7fFzNUCHoEnjSZg17Hm2U,663
-haiku_rag-0.
-haiku_rag-0.
-haiku_rag-0.
-haiku_rag-0.
-haiku_rag-0.
+haiku_rag-0.5.1.dist-info/METADATA,sha256=X4r-1CBCTef3_T9HWPgCHi5XumqOSF4tlHfUpxO533E,4198
+haiku_rag-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+haiku_rag-0.5.1.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
+haiku_rag-0.5.1.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
+haiku_rag-0.5.1.dist-info/RECORD,,
{haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/WHEEL
File without changes
{haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/entry_points.txt
File without changes
{haiku_rag-0.4.3.dist-info → haiku_rag-0.5.1.dist-info}/licenses/LICENSE
File without changes