haiku.rag 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag might be problematic. Click here for more details.

haiku/rag/app.py CHANGED
@@ -32,9 +32,9 @@ class HaikuRAGApp:
32
32
  f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
33
33
  )
34
34
 
35
- async def add_document_from_source(self, file_path: Path):
35
+ async def add_document_from_source(self, source: str):
36
36
  async with HaikuRAG(db_path=self.db_path) as self.client:
37
- doc = await self.client.create_document_from_source(file_path)
37
+ doc = await self.client.create_document_from_source(source)
38
38
  self._rich_print_document(doc, truncate=True)
39
39
  self.console.print(
40
40
  f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
haiku/rag/chunker.py CHANGED
@@ -1,11 +1,9 @@
1
- from io import BytesIO
2
1
  from typing import ClassVar
3
2
 
4
3
  import tiktoken
5
4
  from docling.chunking import HybridChunker # type: ignore
6
- from docling.document_converter import DocumentConverter
7
5
  from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
8
- from docling_core.types.io import DocumentStream
6
+ from docling_core.types.doc.document import DoclingDocument
9
7
 
10
8
  from haiku.rag.config import Config
11
9
 
@@ -33,27 +31,20 @@ class Chunker:
33
31
 
34
32
  self.chunker = HybridChunker(tokenizer=tokenizer) # type: ignore
35
33
 
36
- async def chunk(self, text: str) -> list[str]:
37
- """Split the text into chunks using docling's structure-aware chunking.
34
+ async def chunk(self, document: DoclingDocument) -> list[str]:
35
+ """Split the document into chunks using docling's structure-aware chunking.
38
36
 
39
37
  Args:
40
- text: The text to be split into chunks.
38
+ document: The DoclingDocument to be split into chunks.
41
39
 
42
40
  Returns:
43
41
  A list of text chunks with semantic boundaries.
44
42
  """
45
- if not text:
43
+ if document is None:
46
44
  return []
47
45
 
48
- # Convert to docling document
49
- bytes_io = BytesIO(text.encode("utf-8"))
50
- doc_stream = DocumentStream(name="text.md", stream=bytes_io)
51
- converter = DocumentConverter()
52
- result = converter.convert(doc_stream)
53
- doc = result.document
54
-
55
46
  # Chunk using docling's hybrid chunker
56
- chunks = list(self.chunker.chunk(doc))
47
+ chunks = list(self.chunker.chunk(document))
57
48
  return [self.chunker.contextualize(chunk) for chunk in chunks]
58
49
 
59
50
 
haiku/rag/cli.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import asyncio
2
+ import warnings
2
3
  from importlib.metadata import version
3
4
  from pathlib import Path
4
5
 
@@ -9,12 +10,14 @@ from haiku.rag.app import HaikuRAGApp
9
10
  from haiku.rag.config import Config
10
11
  from haiku.rag.utils import is_up_to_date
11
12
 
13
+ if not Config.ENV == "development":
14
+ warnings.filterwarnings("ignore")
15
+
12
16
  cli = typer.Typer(
13
17
  context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True
14
18
  )
15
19
 
16
20
  console = Console()
17
- event_loop = asyncio.get_event_loop()
18
21
 
19
22
 
20
23
  async def check_version():
@@ -46,7 +49,7 @@ def main(
46
49
  ):
47
50
  """haiku.rag CLI - SQLite-based RAG system"""
48
51
  # Run version check before any command
49
- event_loop.run_until_complete(check_version())
52
+ asyncio.run(check_version())
50
53
 
51
54
 
52
55
  @cli.command("list", help="List all stored documents")
@@ -58,7 +61,7 @@ def list_documents(
58
61
  ),
59
62
  ):
60
63
  app = HaikuRAGApp(db_path=db)
61
- event_loop.run_until_complete(app.list_documents())
64
+ asyncio.run(app.list_documents())
62
65
 
63
66
 
64
67
  @cli.command("add", help="Add a document from text input")
@@ -73,12 +76,12 @@ def add_document_text(
73
76
  ),
74
77
  ):
75
78
  app = HaikuRAGApp(db_path=db)
76
- event_loop.run_until_complete(app.add_document_from_text(text=text))
79
+ asyncio.run(app.add_document_from_text(text=text))
77
80
 
78
81
 
79
82
  @cli.command("add-src", help="Add a document from a file path or URL")
80
83
  def add_document_src(
81
- file_path: Path = typer.Argument(
84
+ source: str = typer.Argument(
82
85
  help="The file path or URL of the document to add",
83
86
  ),
84
87
  db: Path = typer.Option(
@@ -88,7 +91,7 @@ def add_document_src(
88
91
  ),
89
92
  ):
90
93
  app = HaikuRAGApp(db_path=db)
91
- event_loop.run_until_complete(app.add_document_from_source(file_path=file_path))
94
+ asyncio.run(app.add_document_from_source(source=source))
92
95
 
93
96
 
94
97
  @cli.command("get", help="Get and display a document by its ID")
@@ -103,7 +106,7 @@ def get_document(
103
106
  ),
104
107
  ):
105
108
  app = HaikuRAGApp(db_path=db)
106
- event_loop.run_until_complete(app.get_document(doc_id=doc_id))
109
+ asyncio.run(app.get_document(doc_id=doc_id))
107
110
 
108
111
 
109
112
  @cli.command("delete", help="Delete a document by its ID")
@@ -118,7 +121,7 @@ def delete_document(
118
121
  ),
119
122
  ):
120
123
  app = HaikuRAGApp(db_path=db)
121
- event_loop.run_until_complete(app.delete_document(doc_id=doc_id))
124
+ asyncio.run(app.delete_document(doc_id=doc_id))
122
125
 
123
126
 
124
127
  @cli.command("search", help="Search for documents by a query")
@@ -144,7 +147,7 @@ def search(
144
147
  ),
145
148
  ):
146
149
  app = HaikuRAGApp(db_path=db)
147
- event_loop.run_until_complete(app.search(query=query, limit=limit, k=k))
150
+ asyncio.run(app.search(query=query, limit=limit, k=k))
148
151
 
149
152
 
150
153
  @cli.command("ask", help="Ask a question using the QA agent")
@@ -159,7 +162,7 @@ def ask(
159
162
  ),
160
163
  ):
161
164
  app = HaikuRAGApp(db_path=db)
162
- event_loop.run_until_complete(app.ask(question=question))
165
+ asyncio.run(app.ask(question=question))
163
166
 
164
167
 
165
168
  @cli.command("settings", help="Display current configuration settings")
@@ -180,7 +183,7 @@ def rebuild(
180
183
  ),
181
184
  ):
182
185
  app = HaikuRAGApp(db_path=db)
183
- event_loop.run_until_complete(app.rebuild())
186
+ asyncio.run(app.rebuild())
184
187
 
185
188
 
186
189
  @cli.command(
@@ -216,7 +219,7 @@ def serve(
216
219
  elif sse:
217
220
  transport = "sse"
218
221
 
219
- event_loop.run_until_complete(app.serve(transport=transport))
222
+ asyncio.run(app.serve(transport=transport))
220
223
 
221
224
 
222
225
  if __name__ == "__main__":
haiku/rag/client.py CHANGED
@@ -16,6 +16,7 @@ from haiku.rag.store.models.chunk import Chunk
16
16
  from haiku.rag.store.models.document import Document
17
17
  from haiku.rag.store.repositories.chunk import ChunkRepository
18
18
  from haiku.rag.store.repositories.document import DocumentRepository
19
+ from haiku.rag.utils import text_to_docling_document
19
20
 
20
21
 
21
22
  class HaikuRAG:
@@ -49,6 +50,24 @@ class HaikuRAG:
49
50
  self.close()
50
51
  return False
51
52
 
53
+ async def _create_document_with_docling(
54
+ self,
55
+ docling_document,
56
+ uri: str | None = None,
57
+ metadata: dict | None = None,
58
+ chunks: list[Chunk] | None = None,
59
+ ) -> Document:
60
+ """Create a new document from DoclingDocument."""
61
+ content = docling_document.export_to_markdown()
62
+ document = Document(
63
+ content=content,
64
+ uri=uri,
65
+ metadata=metadata or {},
66
+ )
67
+ return await self.document_repository._create_with_docling(
68
+ document, docling_document, chunks
69
+ )
70
+
52
71
  async def create_document(
53
72
  self,
54
73
  content: str,
@@ -67,12 +86,17 @@ class HaikuRAG:
67
86
  Returns:
68
87
  The created Document instance.
69
88
  """
89
+ # Convert content to DoclingDocument for processing
90
+ docling_document = text_to_docling_document(content)
91
+
70
92
  document = Document(
71
93
  content=content,
72
94
  uri=uri,
73
95
  metadata=metadata or {},
74
96
  )
75
- return await self.document_repository.create(document, chunks)
97
+ return await self.document_repository._create_with_docling(
98
+ document, docling_document, chunks
99
+ )
76
100
 
77
101
  async def create_document_from_source(
78
102
  self, source: str | Path, metadata: dict = {}
@@ -101,16 +125,19 @@ class HaikuRAG:
101
125
  parsed_url = urlparse(source_str)
102
126
  if parsed_url.scheme in ("http", "https"):
103
127
  return await self._create_or_update_document_from_url(source_str, metadata)
104
-
105
- # Handle as file path
106
- source_path = Path(source) if isinstance(source, str) else source
128
+ elif parsed_url.scheme == "file":
129
+ # Handle file:// URI by converting to path
130
+ source_path = Path(parsed_url.path)
131
+ else:
132
+ # Handle as regular file path
133
+ source_path = Path(source) if isinstance(source, str) else source
107
134
  if source_path.suffix.lower() not in FileReader.extensions:
108
135
  raise ValueError(f"Unsupported file extension: {source_path.suffix}")
109
136
 
110
137
  if not source_path.exists():
111
138
  raise ValueError(f"File does not exist: {source_path}")
112
139
 
113
- uri = source_path.as_uri()
140
+ uri = source_path.absolute().as_uri()
114
141
  md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()
115
142
 
116
143
  # Check if document already exists
@@ -119,7 +146,7 @@ class HaikuRAG:
119
146
  # MD5 unchanged, return existing document
120
147
  return existing_doc
121
148
 
122
- content = FileReader.parse_file(source_path)
149
+ docling_document = FileReader.parse_file(source_path)
123
150
 
124
151
  # Get content type from file extension
125
152
  content_type, _ = mimetypes.guess_type(str(source_path))
@@ -131,13 +158,15 @@ class HaikuRAG:
131
158
 
132
159
  if existing_doc:
133
160
  # Update existing document
134
- existing_doc.content = content
161
+ existing_doc.content = docling_document.export_to_markdown()
135
162
  existing_doc.metadata = metadata
136
- return await self.update_document(existing_doc)
163
+ return await self.document_repository._update_with_docling(
164
+ existing_doc, docling_document
165
+ )
137
166
  else:
138
- # Create new document
139
- return await self.create_document(
140
- content=content, uri=uri, metadata=metadata
167
+ # Create new document using DoclingDocument
168
+ return await self._create_document_with_docling(
169
+ docling_document=docling_document, uri=uri, metadata=metadata
141
170
  )
142
171
 
143
172
  async def _create_or_update_document_from_url(
@@ -193,18 +222,20 @@ class HaikuRAG:
193
222
  temp_path = Path(temp_file.name)
194
223
 
195
224
  # Parse the content using FileReader
196
- content = FileReader.parse_file(temp_path)
225
+ docling_document = FileReader.parse_file(temp_path)
197
226
 
198
227
  # Merge metadata with contentType and md5
199
228
  metadata.update({"contentType": content_type, "md5": md5_hash})
200
229
 
201
230
  if existing_doc:
202
- existing_doc.content = content
231
+ existing_doc.content = docling_document.export_to_markdown()
203
232
  existing_doc.metadata = metadata
204
- return await self.update_document(existing_doc)
233
+ return await self.document_repository._update_with_docling(
234
+ existing_doc, docling_document
235
+ )
205
236
  else:
206
- return await self.create_document(
207
- content=content, uri=url, metadata=metadata
237
+ return await self._create_document_with_docling(
238
+ docling_document=docling_document, uri=url, metadata=metadata
208
239
  )
209
240
 
210
241
  def _get_extension_from_content_type_or_url(
@@ -262,7 +293,12 @@ class HaikuRAG:
262
293
 
263
294
  async def update_document(self, document: Document) -> Document:
264
295
  """Update an existing document."""
265
- return await self.document_repository.update(document)
296
+ # Convert content to DoclingDocument
297
+ docling_document = text_to_docling_document(document.content)
298
+
299
+ return await self.document_repository._update_with_docling(
300
+ document, docling_document
301
+ )
266
302
 
267
303
  async def delete_document(self, document_id: int) -> bool:
268
304
  """Delete a document by its ID."""
@@ -283,7 +319,7 @@ class HaikuRAG:
283
319
  return await self.document_repository.list_all(limit=limit, offset=offset)
284
320
 
285
321
  async def search(
286
- self, query: str, limit: int = 5, k: int = 60, rerank=Config.RERANK
322
+ self, query: str, limit: int = 5, k: int = 60
287
323
  ) -> list[tuple[Chunk, float]]:
288
324
  """Search for relevant chunks using hybrid search (vector similarity + full-text search) with reranking.
289
325
 
@@ -295,8 +331,10 @@ class HaikuRAG:
295
331
  Returns:
296
332
  List of (chunk, score) tuples ordered by relevance.
297
333
  """
334
+ # Get reranker if available
335
+ reranker = get_reranker()
298
336
 
299
- if not rerank:
337
+ if reranker is None:
300
338
  return await self.chunk_repository.search_chunks_hybrid(query, limit, k)
301
339
 
302
340
  # Get more initial results (3X) for reranking
@@ -304,7 +342,6 @@ class HaikuRAG:
304
342
  query, limit * 3, k
305
343
  )
306
344
  # Apply reranking
307
- reranker = get_reranker()
308
345
  chunks = [chunk for chunk, _ in search_results]
309
346
  reranked_results = await reranker.rerank(query, chunks, top_n=limit)
310
347
 
@@ -328,6 +365,13 @@ class HaikuRAG:
328
365
  async def rebuild_database(self) -> AsyncGenerator[int, None]:
329
366
  """Rebuild the database by deleting all chunks and re-indexing all documents.
330
367
 
368
+ For documents with URIs:
369
+ - Deletes the document and re-adds it from source if source exists
370
+ - Skips documents where source no longer exists
371
+
372
+ For documents without URIs:
373
+ - Re-creates chunks from existing content
374
+
331
375
  Yields:
332
376
  int: The ID of the document currently being processed
333
377
  """
@@ -343,9 +387,36 @@ class HaikuRAG:
343
387
  documents = await self.list_documents()
344
388
 
345
389
  for doc in documents:
346
- if doc.id is not None:
390
+ assert doc.id is not None, "Document ID should not be None"
391
+ if doc.uri:
392
+ # Document has a URI - delete and try to re-add from source
393
+ try:
394
+ # Delete the old document first
395
+ await self.delete_document(doc.id)
396
+
397
+ # Try to re-create from source (this creates the document with chunks)
398
+ new_doc = await self.create_document_from_source(
399
+ doc.uri, doc.metadata or {}
400
+ )
401
+
402
+ assert new_doc.id is not None, "New document ID should not be None"
403
+ yield new_doc.id
404
+
405
+ except (FileNotFoundError, ValueError, OSError) as e:
406
+ # Source doesn't exist or can't be accessed - document already deleted, skip
407
+ print(f"Skipping document with URI {doc.uri}: {e}")
408
+ continue
409
+ except Exception as e:
410
+ # Unexpected error - log it and skip
411
+ print(
412
+ f"Unexpected error processing document with URI {doc.uri}: {e}"
413
+ )
414
+ continue
415
+ else:
416
+ # Document without URI - re-create chunks from existing content
417
+ docling_document = text_to_docling_document(doc.content)
347
418
  await self.chunk_repository.create_chunks_for_document(
348
- doc.id, doc.content, commit=False
419
+ doc.id, docling_document, commit=False
349
420
  )
350
421
  yield doc.id
351
422
 
haiku/rag/config.py CHANGED
@@ -10,7 +10,7 @@ load_dotenv()
10
10
 
11
11
 
12
12
  class AppConfig(BaseModel):
13
- ENV: str = "development"
13
+ ENV: str = "production"
14
14
 
15
15
  DEFAULT_DATA_DIR: Path = get_default_data_dir()
16
16
  MONITOR_DIRECTORIES: list[Path] = []
@@ -19,9 +19,8 @@ class AppConfig(BaseModel):
19
19
  EMBEDDINGS_MODEL: str = "mxbai-embed-large"
20
20
  EMBEDDINGS_VECTOR_DIM: int = 1024
21
21
 
22
- RERANK: bool = True
23
- RERANK_PROVIDER: str = "mxbai"
24
- RERANK_MODEL: str = "mixedbread-ai/mxbai-rerank-base-v2"
22
+ RERANK_PROVIDER: str = "ollama"
23
+ RERANK_MODEL: str = "qwen3"
25
24
 
26
25
  QA_PROVIDER: str = "ollama"
27
26
  QA_MODEL: str = "qwen3"
haiku/rag/reader.py CHANGED
@@ -2,6 +2,9 @@ from pathlib import Path
2
2
  from typing import ClassVar
3
3
 
4
4
  from docling.document_converter import DocumentConverter
5
+ from docling_core.types.doc.document import DoclingDocument
6
+
7
+ from haiku.rag.utils import text_to_docling_document
5
8
 
6
9
 
7
10
  class FileReader:
@@ -84,7 +87,7 @@ class FileReader:
84
87
  extensions: ClassVar[list[str]] = docling_extensions + text_extensions
85
88
 
86
89
  @staticmethod
87
- def parse_file(path: Path) -> str:
90
+ def parse_file(path: Path) -> DoclingDocument:
88
91
  try:
89
92
  file_extension = path.suffix.lower()
90
93
 
@@ -92,7 +95,7 @@ class FileReader:
92
95
  # Use docling for complex document formats
93
96
  converter = DocumentConverter()
94
97
  result = converter.convert(path)
95
- return result.document.export_to_markdown()
98
+ return result.document
96
99
  elif file_extension in FileReader.text_extensions:
97
100
  # Read plain text files directly
98
101
  content = path.read_text(encoding="utf-8")
@@ -100,11 +103,13 @@ class FileReader:
100
103
  # Wrap code files (but not plain txt) in markdown code blocks for better presentation
101
104
  if file_extension in FileReader.code_markdown_identifier:
102
105
  language = FileReader.code_markdown_identifier[file_extension]
103
- return f"```{language}\n{content}\n```"
106
+ content = f"```{language}\n{content}\n```"
104
107
 
105
- return content
108
+ # Convert text to DoclingDocument by wrapping as markdown
109
+ return text_to_docling_document(content, name=f"{path.stem}.md")
106
110
  else:
107
- # Fallback: try to read as text
108
- return path.read_text(encoding="utf-8")
111
+ # Fallback: try to read as text and convert to DoclingDocument
112
+ content = path.read_text(encoding="utf-8")
113
+ return text_to_docling_document(content, name=f"{path.stem}.md")
109
114
  except Exception:
110
115
  raise ValueError(f"Failed to parse file: {path}")
haiku/rag/reranking/__init__.py CHANGED
@@ -1,37 +1,40 @@
1
1
  from haiku.rag.config import Config
2
2
  from haiku.rag.reranking.base import RerankerBase
3
3
 
4
- try:
5
- from haiku.rag.reranking.cohere import CohereReranker
6
- except ImportError:
7
- pass
8
-
9
4
  _reranker: RerankerBase | None = None
10
5
 
11
6
 
12
- def get_reranker() -> RerankerBase:
7
+ def get_reranker() -> RerankerBase | None:
13
8
  """
14
9
  Factory function to get the appropriate reranker based on the configuration.
10
+ Returns None if reranking is disabled.
15
11
  """
16
12
  global _reranker
17
13
  if _reranker is not None:
18
14
  return _reranker
15
+
19
16
  if Config.RERANK_PROVIDER == "mxbai":
20
- from haiku.rag.reranking.mxbai import MxBAIReranker
17
+ try:
18
+ from haiku.rag.reranking.mxbai import MxBAIReranker
21
19
 
22
- _reranker = MxBAIReranker()
23
- return _reranker
20
+ _reranker = MxBAIReranker()
21
+ return _reranker
22
+ except ImportError:
23
+ return None
24
24
 
25
25
  if Config.RERANK_PROVIDER == "cohere":
26
26
  try:
27
27
  from haiku.rag.reranking.cohere import CohereReranker
28
+
29
+ _reranker = CohereReranker()
30
+ return _reranker
28
31
  except ImportError:
29
- raise ImportError(
30
- "Cohere reranker requires the 'cohere' package. "
31
- "Please install haiku.rag with the 'cohere' extra:"
32
- "uv pip install haiku.rag[cohere]"
33
- )
34
- _reranker = CohereReranker()
32
+ return None
33
+
34
+ if Config.RERANK_PROVIDER == "ollama":
35
+ from haiku.rag.reranking.ollama import OllamaReranker
36
+
37
+ _reranker = OllamaReranker()
35
38
  return _reranker
36
39
 
37
- raise ValueError(f"Unsupported reranker provider: {Config.RERANK_PROVIDER}")
40
+ return None
haiku/rag/reranking/ollama.py ADDED
@@ -0,0 +1,84 @@
1
+ import json
2
+
3
+ from ollama import AsyncClient
4
+ from pydantic import BaseModel
5
+
6
+ from haiku.rag.config import Config
7
+ from haiku.rag.reranking.base import RerankerBase
8
+ from haiku.rag.store.models.chunk import Chunk
9
+
10
+ OLLAMA_OPTIONS = {"temperature": 0.0, "seed": 42, "num_ctx": 16384}
11
+
12
+
13
+ class RerankResult(BaseModel):
14
+ """Individual rerank result with index and relevance score."""
15
+
16
+ index: int
17
+ relevance_score: float
18
+
19
+
20
+ class RerankResponse(BaseModel):
21
+ """Response from the reranking model containing ranked results."""
22
+
23
+ results: list[RerankResult]
24
+
25
+
26
+ class OllamaReranker(RerankerBase):
27
+ def __init__(self, model: str = Config.RERANK_MODEL):
28
+ self._model = model
29
+ self._client = AsyncClient(host=Config.OLLAMA_BASE_URL)
30
+
31
+ async def rerank(
32
+ self, query: str, chunks: list[Chunk], top_n: int = 10
33
+ ) -> list[tuple[Chunk, float]]:
34
+ if not chunks:
35
+ return []
36
+
37
+ documents = []
38
+ for i, chunk in enumerate(chunks):
39
+ documents.append({"index": i, "content": chunk.content})
40
+
41
+ # Create the prompt for reranking
42
+ system_prompt = """You are a document reranking assistant. Given a query and a list of document chunks, you must rank them by relevance to the query.
43
+
44
+ Return your response as a JSON object with a "results" array. Each result should have:
45
+ - "index": the original index of the document (integer)
46
+ - "relevance_score": a score between 0.0 and 1.0 indicating relevance (float, where 1.0 is most relevant)
47
+
48
+ Only return the top documents up to the requested limit, ordered by decreasing relevance score."""
49
+
50
+ documents_text = ""
51
+ for doc in documents:
52
+ documents_text += f"Index {doc['index']}: {doc['content']}\n\n"
53
+
54
+ user_prompt = f"""Query: {query}
55
+
56
+ Documents to rerank:
57
+ {documents_text.strip()}
58
+
59
+ Please rank these documents by relevance to the query and return the top {top_n} results as JSON."""
60
+
61
+ messages = [
62
+ {"role": "system", "content": system_prompt},
63
+ {"role": "user", "content": user_prompt},
64
+ ]
65
+
66
+ try:
67
+ response = await self._client.chat(
68
+ model=self._model,
69
+ messages=messages,
70
+ format=RerankResponse.model_json_schema(),
71
+ options=OLLAMA_OPTIONS,
72
+ )
73
+
74
+ content = response["message"]["content"]
75
+
76
+ parsed_response = RerankResponse.model_validate(json.loads(content))
77
+ return [
78
+ (chunks[result.index], result.relevance_score)
79
+ for result in parsed_response.results[:top_n]
80
+ ]
81
+
82
+ except Exception:
83
+ # Fallback: return chunks in original order with same score
84
+ return [(chunks[i], 1.0) for i in range(min(top_n, len(chunks)))]
haiku/rag/store/repositories/chunk.py CHANGED
@@ -1,6 +1,8 @@
1
1
  import json
2
2
  import re
3
3
 
4
+ from docling_core.types.doc.document import DoclingDocument
5
+
4
6
  from haiku.rag.chunker import chunker
5
7
  from haiku.rag.embeddings import get_embedder
6
8
  from haiku.rag.store.models.chunk import Chunk
@@ -197,11 +199,11 @@ class ChunkRepository(BaseRepository[Chunk]):
197
199
  ]
198
200
 
199
201
  async def create_chunks_for_document(
200
- self, document_id: int, content: str, commit: bool = True
202
+ self, document_id: int, document: DoclingDocument, commit: bool = True
201
203
  ) -> list[Chunk]:
202
- """Create chunks and embeddings for a document."""
204
+ """Create chunks and embeddings for a document from DoclingDocument."""
203
205
  # Chunk the document content
204
- chunk_texts = await chunker.chunk(content)
206
+ chunk_texts = await chunker.chunk(document)
205
207
  created_chunks = []
206
208
 
207
209
  # Create chunks with embeddings using the create method
haiku/rag/store/repositories/document.py CHANGED
@@ -1,8 +1,11 @@
1
1
  import json
2
2
  from typing import TYPE_CHECKING
3
3
 
4
+ from docling_core.types.doc.document import DoclingDocument
5
+
4
6
  from haiku.rag.store.models.document import Document
5
7
  from haiku.rag.store.repositories.base import BaseRepository
8
+ from haiku.rag.utils import text_to_docling_document
6
9
 
7
10
  if TYPE_CHECKING:
8
11
  from haiku.rag.store.models.chunk import Chunk
@@ -20,8 +23,11 @@ class DocumentRepository(BaseRepository[Document]):
20
23
  chunk_repository = ChunkRepository(store)
21
24
  self.chunk_repository = chunk_repository
22
25
 
23
- async def create(
24
- self, entity: Document, chunks: list["Chunk"] | None = None
26
+ async def _create_with_docling(
27
+ self,
28
+ entity: Document,
29
+ docling_document: DoclingDocument,
30
+ chunks: list["Chunk"] | None = None,
25
31
  ) -> Document:
26
32
  """Create a document with its chunks and embeddings."""
27
33
  if self.store._connection is None:
@@ -62,9 +68,9 @@ class DocumentRepository(BaseRepository[Document]):
62
68
  chunk.metadata["order"] = order
63
69
  await self.chunk_repository.create(chunk, commit=False)
64
70
  else:
65
- # Create chunks and embeddings using ChunkRepository
71
+ # Create chunks and embeddings using DoclingDocument
66
72
  await self.chunk_repository.create_chunks_for_document(
67
- document_id, entity.content, commit=False
73
+ document_id, docling_document, commit=False
68
74
  )
69
75
 
70
76
  cursor.execute("COMMIT")
@@ -74,6 +80,13 @@ class DocumentRepository(BaseRepository[Document]):
74
80
  cursor.execute("ROLLBACK")
75
81
  raise
76
82
 
83
+ async def create(self, entity: Document) -> Document:
84
+ """Create a document with its chunks and embeddings."""
85
+ # Convert content to DoclingDocument
86
+ docling_document = text_to_docling_document(entity.content)
87
+
88
+ return await self._create_with_docling(entity, docling_document)
89
+
77
90
  async def get_by_id(self, entity_id: int) -> Document | None:
78
91
  """Get a document by its ID."""
79
92
  if self.store._connection is None:
@@ -134,7 +147,9 @@ class DocumentRepository(BaseRepository[Document]):
134
147
  updated_at=updated_at,
135
148
  )
136
149
 
137
- async def update(self, entity: Document) -> Document:
150
+ async def _update_with_docling(
151
+ self, entity: Document, docling_document: DoclingDocument
152
+ ) -> Document:
138
153
  """Update an existing document and regenerate its chunks and embeddings."""
139
154
  if self.store._connection is None:
140
155
  raise ValueError("Store connection is not available")
@@ -163,10 +178,10 @@ class DocumentRepository(BaseRepository[Document]):
163
178
  },
164
179
  )
165
180
 
166
- # Delete existing chunks and regenerate using ChunkRepository
181
+ # Delete existing chunks and regenerate using DoclingDocument
167
182
  await self.chunk_repository.delete_by_document_id(entity.id, commit=False)
168
183
  await self.chunk_repository.create_chunks_for_document(
169
- entity.id, entity.content, commit=False
184
+ entity.id, docling_document, commit=False
170
185
  )
171
186
 
172
187
  cursor.execute("COMMIT")
@@ -176,6 +191,13 @@ class DocumentRepository(BaseRepository[Document]):
176
191
  cursor.execute("ROLLBACK")
177
192
  raise
178
193
 
194
+ async def update(self, entity: Document) -> Document:
195
+ """Update an existing document and regenerate its chunks and embeddings."""
196
+ # Convert content to DoclingDocument
197
+ docling_document = text_to_docling_document(entity.content)
198
+
199
+ return await self._update_with_docling(entity, docling_document)
200
+
179
201
  async def delete(self, entity_id: int) -> bool:
180
202
  """Delete a document and all its associated chunks and embeddings."""
181
203
  # Delete chunks and embeddings first
haiku/rag/utils.py CHANGED
@@ -1,8 +1,12 @@
1
1
  import sys
2
2
  from importlib import metadata
3
+ from io import BytesIO
3
4
  from pathlib import Path
4
5
 
5
6
  import httpx
7
+ from docling.document_converter import DocumentConverter
8
+ from docling_core.types.doc.document import DoclingDocument
9
+ from docling_core.types.io import DocumentStream
6
10
  from packaging.version import Version, parse
7
11
 
8
12
 
@@ -77,3 +81,20 @@ async def is_up_to_date() -> tuple[bool, Version, Version]:
77
81
  # If no network connection, do not raise alarms.
78
82
  pypi_version = running_version
79
83
  return running_version >= pypi_version, running_version, pypi_version
84
+
85
+
86
+ def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocument:
87
+ """Convert text content to a DoclingDocument.
88
+
89
+ Args:
90
+ text: The text content to convert.
91
+ name: The name to use for the document stream (defaults to "content.md").
92
+
93
+ Returns:
94
+ A DoclingDocument created from the text content.
95
+ """
96
+ bytes_io = BytesIO(text.encode("utf-8"))
97
+ doc_stream = DocumentStream(name=name, stream=bytes_io)
98
+ converter = DocumentConverter()
99
+ result = converter.convert(doc_stream)
100
+ return result.document
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haiku.rag
3
- Version: 0.5.0
3
+ Version: 0.5.2
4
4
  Summary: Retrieval Augmented Generation (RAG) with SQLite
5
5
  Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
6
6
  License: MIT
@@ -17,12 +17,11 @@ Classifier: Programming Language :: Python :: 3.10
17
17
  Classifier: Programming Language :: Python :: 3.11
18
18
  Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Typing :: Typed
20
- Requires-Python: >=3.10
20
+ Requires-Python: >=3.11
21
21
  Requires-Dist: docling>=2.15.0
22
22
  Requires-Dist: fastmcp>=2.8.1
23
23
  Requires-Dist: httpx>=0.28.1
24
- Requires-Dist: mxbai-rerank>=0.1.6
25
- Requires-Dist: ollama>=0.5.1
24
+ Requires-Dist: ollama>=0.5.3
26
25
  Requires-Dist: pydantic>=2.11.7
27
26
  Requires-Dist: python-dotenv>=1.1.0
28
27
  Requires-Dist: rich>=14.0.0
@@ -34,6 +33,8 @@ Provides-Extra: anthropic
34
33
  Requires-Dist: anthropic>=0.56.0; extra == 'anthropic'
35
34
  Provides-Extra: cohere
36
35
  Requires-Dist: cohere>=5.16.1; extra == 'cohere'
36
+ Provides-Extra: mxbai
37
+ Requires-Dist: mxbai-rerank>=0.1.6; extra == 'mxbai'
37
38
  Provides-Extra: openai
38
39
  Requires-Dist: openai>=1.0.0; extra == 'openai'
39
40
  Provides-Extra: voyageai
@@ -1,14 +1,14 @@
1
1
  haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- haiku/rag/app.py,sha256=FpLVyP1-zAq_XPmU8CPVLkuIAeuhBOGvMqhYS8RbN40,7649
3
- haiku/rag/chunker.py,sha256=P2slbmoABygYRlqjOGzPBEOYsBZNTnNpE6bnW_dkVOE,1850
4
- haiku/rag/cli.py,sha256=k7EhLkvTncxsdh5TYrg8BHLYh_lfyzupsWGj1dEEdqY,5992
5
- haiku/rag/client.py,sha256=MZNIpMm6MS3P6vjLqiCztT2dBOM7-bZOosX5IpbHJbI,12724
6
- haiku/rag/config.py,sha256=GXTWC3vasBMaWju-yh8Es3CidBz1ftqRH6E5PHpgsSQ,1634
2
+ haiku/rag/app.py,sha256=kuvULOIdgwqzJMaKtb1znStc1YAqB1-RkZ0fwdg6TBk,7642
3
+ haiku/rag/chunker.py,sha256=PVe6ysv8UlacUd4Zb3_8RFWIaWDXnzBAy2VDJ4TaUsE,1555
4
+ haiku/rag/cli.py,sha256=5CcWcBQ47KCZ1wl7DpLzMXtgJZ1nz5Hci8AYp72oXEI,5855
5
+ haiku/rag/client.py,sha256=K51l_orUc3BeKGTHX4xC7YClY9M4Eijpac5Hg1_q6LE,15815
6
+ haiku/rag/config.py,sha256=jiy5vg-YbYa7yY-834Dd9omFtfMBXQBYXmHRJXMPjrs,1581
7
7
  haiku/rag/logging.py,sha256=zTTGpGq5tPdcd7RpCbd9EGw1IZlQDbYkrCg9t9pqRc4,580
8
8
  haiku/rag/mcp.py,sha256=tMN6fNX7ZtAER1R6DL1GkC9HZozTC4HzuQs199p7icI,4551
9
9
  haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
10
- haiku/rag/reader.py,sha256=s5dinZ-WffioiRH7OWZtE2v7FHRPd1PkqpPYsXtwqtc,2927
11
- haiku/rag/utils.py,sha256=Ez_tvNlRO_D8c2CBZ83Hs9Gmzcqdq4cmw_V5GBdKy_8,2214
10
+ haiku/rag/reader.py,sha256=qkPTMJuQ_o4sK-8zpDl9WFYe_MJ7aL_gUw6rczIpW-g,3274
11
+ haiku/rag/utils.py,sha256=g-uNTG60iBLgkeHHuah6eVZEkX3NFLs-LZU1YnzJzLQ,2967
12
12
  haiku/rag/embeddings/__init__.py,sha256=yFBlxS0jBiVHl_rWz5kb43t6Ha132U1ZGdlIPfhzPdg,1491
13
13
  haiku/rag/embeddings/base.py,sha256=NTQvuzbZPu0LBo5wAu3qGyJ4xXUaRAt1fjBO0ygWn_Y,465
14
14
  haiku/rag/embeddings/ollama.py,sha256=y6-lp0XpbnyIjoOEdtSzMdEVkU5glOwnWQ1FkpUZnpI,370
@@ -20,10 +20,11 @@ haiku/rag/qa/base.py,sha256=4ZTM_l5FAZ9cA0f8NeqRJiUAmjatwCTmSoclFw0gTFQ,1349
20
20
  haiku/rag/qa/ollama.py,sha256=EGUi4urSx9nrnsr5j-qHVDVOnvRTbSMKUbMvXEMIcxM,2381
21
21
  haiku/rag/qa/openai.py,sha256=dF32sGgVt8mZi5oVxByaeECs9NqLjvDiZnnpJBsrHm8,3968
22
22
  haiku/rag/qa/prompts.py,sha256=8uYMxHzbzI9vo2FPkCSSNTh_RNL96WkBbUWPCMBlLpo,1315
23
- haiku/rag/reranking/__init__.py,sha256=DsPCdU94wRzDCYl6hz2DySOMWwOvNxKviqKAUfyykK8,1118
23
+ haiku/rag/reranking/__init__.py,sha256=fwC3pauteJwh9Ulm2270QvwAdwr4NMr4RUEuolC-wKU,1063
24
24
  haiku/rag/reranking/base.py,sha256=LM9yUSSJ414UgBZhFTgxGprlRqzfTe4I1vgjricz2JY,405
25
25
  haiku/rag/reranking/cohere.py,sha256=1iTdiaa8vvb6oHVB2qpWzUOVkyfUcimVSZp6Qr4aq4c,1049
26
26
  haiku/rag/reranking/mxbai.py,sha256=46sVTsTIkzIX9THgM3u8HaEmgY7evvEyB-N54JTHvK8,867
27
+ haiku/rag/reranking/ollama.py,sha256=tCrLlNNDBCZu7J3to1gvBq-sOvN1flYEA7E3H3Jq0mU,2790
27
28
  haiku/rag/store/__init__.py,sha256=hq0W0DAC7ysqhWSP2M2uHX8cbG6kbr-sWHxhq6qQcY0,103
28
29
  haiku/rag/store/engine.py,sha256=cOMBToLilI1Di1qQrFzGLqtRMsuvtiX0Q5RNIEzQy9w,6232
29
30
  haiku/rag/store/models/__init__.py,sha256=s0E72zneGlowvZrFWaNxHYjOAUjgWdLxzdYsnvNRVlY,88
@@ -31,13 +32,13 @@ haiku/rag/store/models/chunk.py,sha256=9-vIxW75-kMTelIhgVIMd_WhP-Drc1q65vjaWMP8w
31
32
  haiku/rag/store/models/document.py,sha256=TVXVY-nQs-1vCORQEs9rA7zOtndeGC4dgCoujLAS054,396
32
33
  haiku/rag/store/repositories/__init__.py,sha256=uIBhxjQh-4o3O-ck8b7BQ58qXQTuJdPvrDIHVhY5T1A,263
33
34
  haiku/rag/store/repositories/base.py,sha256=cm3VyQXhtxvRfk1uJHpA0fDSxMpYN-mjQmRiDiLsQ68,1008
34
- haiku/rag/store/repositories/chunk.py,sha256=UyvHhKb1ESZePoTp2GneAARdfKoocEdfPOwgWPPQ0v8,16878
35
- haiku/rag/store/repositories/document.py,sha256=fXIWevJaOe6x2cK4u9cQxiEGD0ntKQb9y3VRqklQypE,7920
35
+ haiku/rag/store/repositories/chunk.py,sha256=DIIdpHVemvxZOPHOLBL7pJGWY4VyNrUiQSWPWt24BYo,16974
36
+ haiku/rag/store/repositories/document.py,sha256=ki8LiDukwU1469Yw51i0rQFvBzUQeYkFYWs3Ly83akc,8815
36
37
  haiku/rag/store/repositories/settings.py,sha256=qZLXvLsErnCWL0nBQQNfRnatHzCKhtUDLvUK9k-W_fU,2463
37
38
  haiku/rag/store/upgrades/__init__.py,sha256=kKS1YWT_P-CYKhKtokOLTIFNKf9jlfjFFr8lyIMeogM,100
38
39
  haiku/rag/store/upgrades/v0_3_4.py,sha256=GLogKZdZ40NX1vBHKdOJju7fFzNUCHoEnjSZg17Hm2U,663
39
- haiku_rag-0.5.0.dist-info/METADATA,sha256=Z29lOzGgaD2PJ6OxZc53QuMzFdosEZCdm7HZYOUNN3M,4198
40
- haiku_rag-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
- haiku_rag-0.5.0.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
42
- haiku_rag-0.5.0.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
43
- haiku_rag-0.5.0.dist-info/RECORD,,
40
+ haiku_rag-0.5.2.dist-info/METADATA,sha256=b91HARmgPKSy_4LIhna9EoacKb9I_f-cRRTgHqaG-S8,4238
41
+ haiku_rag-0.5.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
42
+ haiku_rag-0.5.2.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
43
+ haiku_rag-0.5.2.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
44
+ haiku_rag-0.5.2.dist-info/RECORD,,