haiku.rag 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of haiku.rag might be problematic. Click here for more details.

haiku/rag/chunker.py CHANGED
@@ -1,6 +1,9 @@
1
1
  from typing import ClassVar
2
2
 
3
3
  import tiktoken
4
+ from docling.chunking import HybridChunker # type: ignore
5
+ from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
6
+ from docling_core.types.doc.document import DoclingDocument
4
7
 
5
8
  from haiku.rag.config import Config
6
9
 
@@ -8,9 +11,11 @@ from haiku.rag.config import Config
8
11
  class Chunker:
9
12
  """A class that chunks text into smaller pieces for embedding and retrieval.
10
13
 
14
+ Uses docling's structure-aware chunking to create semantically meaningful chunks
15
+ that respect document boundaries.
16
+
11
17
  Args:
12
18
  chunk_size: The maximum size of a chunk in tokens.
13
- chunk_overlap: The number of tokens of overlap between chunks.
14
19
  """
15
20
 
16
21
  encoder: ClassVar[tiktoken.Encoding] = tiktoken.encoding_for_model("gpt-4o")
@@ -18,50 +23,29 @@ class Chunker:
18
23
  def __init__(
19
24
  self,
20
25
  chunk_size: int = Config.CHUNK_SIZE,
21
- chunk_overlap: int = Config.CHUNK_OVERLAP,
22
26
  ):
23
27
  self.chunk_size = chunk_size
24
- self.chunk_overlap = chunk_overlap
28
+ tokenizer = OpenAITokenizer(
29
+ tokenizer=tiktoken.encoding_for_model("gpt-4o"), max_tokens=chunk_size
30
+ )
31
+
32
+ self.chunker = HybridChunker(tokenizer=tokenizer) # type: ignore
25
33
 
26
- async def chunk(self, text: str) -> list[str]:
27
- """Split the text into chunks based on token boundaries.
34
+ async def chunk(self, document: DoclingDocument) -> list[str]:
35
+ """Split the document into chunks using docling's structure-aware chunking.
28
36
 
29
37
  Args:
30
- text: The text to be split into chunks.
38
+ document: The DoclingDocument to be split into chunks.
31
39
 
32
40
  Returns:
33
- A list of text chunks with token-based boundaries and overlap.
41
+ A list of text chunks with semantic boundaries.
34
42
  """
35
- if not text:
43
+ if document is None:
36
44
  return []
37
45
 
38
- encoded_tokens = self.encoder.encode(text, disallowed_special=())
39
-
40
- if self.chunk_size > len(encoded_tokens):
41
- return [text]
42
-
43
- chunks = []
44
- i = 0
45
- split_id_counter = 0
46
- while i < len(encoded_tokens):
47
- # Overlap
48
- start_i = i
49
- end_i = min(i + self.chunk_size, len(encoded_tokens))
50
-
51
- chunk_tokens = encoded_tokens[start_i:end_i]
52
- chunk_text = self.encoder.decode(chunk_tokens)
53
-
54
- chunks.append(chunk_text)
55
- split_id_counter += 1
56
-
57
- # Exit loop if this was the last possible chunk
58
- if end_i == len(encoded_tokens):
59
- break
60
-
61
- i += (
62
- self.chunk_size - self.chunk_overlap
63
- ) # Step forward, considering overlap
64
- return chunks
46
+ # Chunk using docling's hybrid chunker
47
+ chunks = list(self.chunker.chunk(document))
48
+ return [self.chunker.contextualize(chunk) for chunk in chunks]
65
49
 
66
50
 
67
51
  chunker = Chunker()
haiku/rag/cli.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import asyncio
2
+ import warnings
2
3
  from importlib.metadata import version
3
4
  from pathlib import Path
4
5
 
@@ -9,12 +10,14 @@ from haiku.rag.app import HaikuRAGApp
9
10
  from haiku.rag.config import Config
10
11
  from haiku.rag.utils import is_up_to_date
11
12
 
13
+ if not Config.ENV == "development":
14
+ warnings.filterwarnings("ignore")
15
+
12
16
  cli = typer.Typer(
13
17
  context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True
14
18
  )
15
19
 
16
20
  console = Console()
17
- event_loop = asyncio.get_event_loop()
18
21
 
19
22
 
20
23
  async def check_version():
@@ -46,7 +49,7 @@ def main(
46
49
  ):
47
50
  """haiku.rag CLI - SQLite-based RAG system"""
48
51
  # Run version check before any command
49
- event_loop.run_until_complete(check_version())
52
+ asyncio.run(check_version())
50
53
 
51
54
 
52
55
  @cli.command("list", help="List all stored documents")
@@ -58,7 +61,7 @@ def list_documents(
58
61
  ),
59
62
  ):
60
63
  app = HaikuRAGApp(db_path=db)
61
- event_loop.run_until_complete(app.list_documents())
64
+ asyncio.run(app.list_documents())
62
65
 
63
66
 
64
67
  @cli.command("add", help="Add a document from text input")
@@ -73,7 +76,7 @@ def add_document_text(
73
76
  ),
74
77
  ):
75
78
  app = HaikuRAGApp(db_path=db)
76
- event_loop.run_until_complete(app.add_document_from_text(text=text))
79
+ asyncio.run(app.add_document_from_text(text=text))
77
80
 
78
81
 
79
82
  @cli.command("add-src", help="Add a document from a file path or URL")
@@ -88,7 +91,7 @@ def add_document_src(
88
91
  ),
89
92
  ):
90
93
  app = HaikuRAGApp(db_path=db)
91
- event_loop.run_until_complete(app.add_document_from_source(file_path=file_path))
94
+ asyncio.run(app.add_document_from_source(file_path=file_path))
92
95
 
93
96
 
94
97
  @cli.command("get", help="Get and display a document by its ID")
@@ -103,7 +106,7 @@ def get_document(
103
106
  ),
104
107
  ):
105
108
  app = HaikuRAGApp(db_path=db)
106
- event_loop.run_until_complete(app.get_document(doc_id=doc_id))
109
+ asyncio.run(app.get_document(doc_id=doc_id))
107
110
 
108
111
 
109
112
  @cli.command("delete", help="Delete a document by its ID")
@@ -118,7 +121,7 @@ def delete_document(
118
121
  ),
119
122
  ):
120
123
  app = HaikuRAGApp(db_path=db)
121
- event_loop.run_until_complete(app.delete_document(doc_id=doc_id))
124
+ asyncio.run(app.delete_document(doc_id=doc_id))
122
125
 
123
126
 
124
127
  @cli.command("search", help="Search for documents by a query")
@@ -144,7 +147,7 @@ def search(
144
147
  ),
145
148
  ):
146
149
  app = HaikuRAGApp(db_path=db)
147
- event_loop.run_until_complete(app.search(query=query, limit=limit, k=k))
150
+ asyncio.run(app.search(query=query, limit=limit, k=k))
148
151
 
149
152
 
150
153
  @cli.command("ask", help="Ask a question using the QA agent")
@@ -159,7 +162,7 @@ def ask(
159
162
  ),
160
163
  ):
161
164
  app = HaikuRAGApp(db_path=db)
162
- event_loop.run_until_complete(app.ask(question=question))
165
+ asyncio.run(app.ask(question=question))
163
166
 
164
167
 
165
168
  @cli.command("settings", help="Display current configuration settings")
@@ -180,7 +183,7 @@ def rebuild(
180
183
  ),
181
184
  ):
182
185
  app = HaikuRAGApp(db_path=db)
183
- event_loop.run_until_complete(app.rebuild())
186
+ asyncio.run(app.rebuild())
184
187
 
185
188
 
186
189
  @cli.command(
@@ -216,7 +219,7 @@ def serve(
216
219
  elif sse:
217
220
  transport = "sse"
218
221
 
219
- event_loop.run_until_complete(app.serve(transport=transport))
222
+ asyncio.run(app.serve(transport=transport))
220
223
 
221
224
 
222
225
  if __name__ == "__main__":
haiku/rag/client.py CHANGED
@@ -16,6 +16,7 @@ from haiku.rag.store.models.chunk import Chunk
16
16
  from haiku.rag.store.models.document import Document
17
17
  from haiku.rag.store.repositories.chunk import ChunkRepository
18
18
  from haiku.rag.store.repositories.document import DocumentRepository
19
+ from haiku.rag.utils import text_to_docling_document
19
20
 
20
21
 
21
22
  class HaikuRAG:
@@ -49,6 +50,24 @@ class HaikuRAG:
49
50
  self.close()
50
51
  return False
51
52
 
53
+ async def _create_document_with_docling(
54
+ self,
55
+ docling_document,
56
+ uri: str | None = None,
57
+ metadata: dict | None = None,
58
+ chunks: list[Chunk] | None = None,
59
+ ) -> Document:
60
+ """Create a new document from DoclingDocument."""
61
+ content = docling_document.export_to_markdown()
62
+ document = Document(
63
+ content=content,
64
+ uri=uri,
65
+ metadata=metadata or {},
66
+ )
67
+ return await self.document_repository._create_with_docling(
68
+ document, docling_document, chunks
69
+ )
70
+
52
71
  async def create_document(
53
72
  self,
54
73
  content: str,
@@ -67,12 +86,17 @@ class HaikuRAG:
67
86
  Returns:
68
87
  The created Document instance.
69
88
  """
89
+ # Convert content to DoclingDocument for processing
90
+ docling_document = text_to_docling_document(content)
91
+
70
92
  document = Document(
71
93
  content=content,
72
94
  uri=uri,
73
95
  metadata=metadata or {},
74
96
  )
75
- return await self.document_repository.create(document, chunks)
97
+ return await self.document_repository._create_with_docling(
98
+ document, docling_document, chunks
99
+ )
76
100
 
77
101
  async def create_document_from_source(
78
102
  self, source: str | Path, metadata: dict = {}
@@ -101,16 +125,19 @@ class HaikuRAG:
101
125
  parsed_url = urlparse(source_str)
102
126
  if parsed_url.scheme in ("http", "https"):
103
127
  return await self._create_or_update_document_from_url(source_str, metadata)
104
-
105
- # Handle as file path
106
- source_path = Path(source) if isinstance(source, str) else source
128
+ elif parsed_url.scheme == "file":
129
+ # Handle file:// URI by converting to path
130
+ source_path = Path(parsed_url.path)
131
+ else:
132
+ # Handle as regular file path
133
+ source_path = Path(source) if isinstance(source, str) else source
107
134
  if source_path.suffix.lower() not in FileReader.extensions:
108
135
  raise ValueError(f"Unsupported file extension: {source_path.suffix}")
109
136
 
110
137
  if not source_path.exists():
111
138
  raise ValueError(f"File does not exist: {source_path}")
112
139
 
113
- uri = source_path.as_uri()
140
+ uri = source_path.absolute().as_uri()
114
141
  md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()
115
142
 
116
143
  # Check if document already exists
@@ -119,7 +146,7 @@ class HaikuRAG:
119
146
  # MD5 unchanged, return existing document
120
147
  return existing_doc
121
148
 
122
- content = FileReader.parse_file(source_path)
149
+ docling_document = FileReader.parse_file(source_path)
123
150
 
124
151
  # Get content type from file extension
125
152
  content_type, _ = mimetypes.guess_type(str(source_path))
@@ -131,13 +158,15 @@ class HaikuRAG:
131
158
 
132
159
  if existing_doc:
133
160
  # Update existing document
134
- existing_doc.content = content
161
+ existing_doc.content = docling_document.export_to_markdown()
135
162
  existing_doc.metadata = metadata
136
- return await self.update_document(existing_doc)
163
+ return await self.document_repository._update_with_docling(
164
+ existing_doc, docling_document
165
+ )
137
166
  else:
138
- # Create new document
139
- return await self.create_document(
140
- content=content, uri=uri, metadata=metadata
167
+ # Create new document using DoclingDocument
168
+ return await self._create_document_with_docling(
169
+ docling_document=docling_document, uri=uri, metadata=metadata
141
170
  )
142
171
 
143
172
  async def _create_or_update_document_from_url(
@@ -193,18 +222,20 @@ class HaikuRAG:
193
222
  temp_path = Path(temp_file.name)
194
223
 
195
224
  # Parse the content using FileReader
196
- content = FileReader.parse_file(temp_path)
225
+ docling_document = FileReader.parse_file(temp_path)
197
226
 
198
227
  # Merge metadata with contentType and md5
199
228
  metadata.update({"contentType": content_type, "md5": md5_hash})
200
229
 
201
230
  if existing_doc:
202
- existing_doc.content = content
231
+ existing_doc.content = docling_document.export_to_markdown()
203
232
  existing_doc.metadata = metadata
204
- return await self.update_document(existing_doc)
233
+ return await self.document_repository._update_with_docling(
234
+ existing_doc, docling_document
235
+ )
205
236
  else:
206
- return await self.create_document(
207
- content=content, uri=url, metadata=metadata
237
+ return await self._create_document_with_docling(
238
+ docling_document=docling_document, uri=url, metadata=metadata
208
239
  )
209
240
 
210
241
  def _get_extension_from_content_type_or_url(
@@ -262,7 +293,12 @@ class HaikuRAG:
262
293
 
263
294
  async def update_document(self, document: Document) -> Document:
264
295
  """Update an existing document."""
265
- return await self.document_repository.update(document)
296
+ # Convert content to DoclingDocument
297
+ docling_document = text_to_docling_document(document.content)
298
+
299
+ return await self.document_repository._update_with_docling(
300
+ document, docling_document
301
+ )
266
302
 
267
303
  async def delete_document(self, document_id: int) -> bool:
268
304
  """Delete a document by its ID."""
@@ -328,6 +364,13 @@ class HaikuRAG:
328
364
  async def rebuild_database(self) -> AsyncGenerator[int, None]:
329
365
  """Rebuild the database by deleting all chunks and re-indexing all documents.
330
366
 
367
+ For documents with URIs:
368
+ - Deletes the document and re-adds it from source if source exists
369
+ - Skips documents where source no longer exists
370
+
371
+ For documents without URIs:
372
+ - Re-creates chunks from existing content
373
+
331
374
  Yields:
332
375
  int: The ID of the document currently being processed
333
376
  """
@@ -343,9 +386,36 @@ class HaikuRAG:
343
386
  documents = await self.list_documents()
344
387
 
345
388
  for doc in documents:
346
- if doc.id is not None:
389
+ assert doc.id is not None, "Document ID should not be None"
390
+ if doc.uri:
391
+ # Document has a URI - delete and try to re-add from source
392
+ try:
393
+ # Delete the old document first
394
+ await self.delete_document(doc.id)
395
+
396
+ # Try to re-create from source (this creates the document with chunks)
397
+ new_doc = await self.create_document_from_source(
398
+ doc.uri, doc.metadata or {}
399
+ )
400
+
401
+ assert new_doc.id is not None, "New document ID should not be None"
402
+ yield new_doc.id
403
+
404
+ except (FileNotFoundError, ValueError, OSError) as e:
405
+ # Source doesn't exist or can't be accessed - document already deleted, skip
406
+ print(f"Skipping document with URI {doc.uri}: {e}")
407
+ continue
408
+ except Exception as e:
409
+ # Unexpected error - log it and skip
410
+ print(
411
+ f"Unexpected error processing document with URI {doc.uri}: {e}"
412
+ )
413
+ continue
414
+ else:
415
+ # Document without URI - re-create chunks from existing content
416
+ docling_document = text_to_docling_document(doc.content)
347
417
  await self.chunk_repository.create_chunks_for_document(
348
- doc.id, doc.content, commit=False
418
+ doc.id, docling_document, commit=False
349
419
  )
350
420
  yield doc.id
351
421
 
haiku/rag/config.py CHANGED
@@ -10,7 +10,7 @@ load_dotenv()
10
10
 
11
11
 
12
12
  class AppConfig(BaseModel):
13
- ENV: str = "development"
13
+ ENV: str = "production"
14
14
 
15
15
  DEFAULT_DATA_DIR: Path = get_default_data_dir()
16
16
  MONITOR_DIRECTORIES: list[Path] = []
@@ -27,7 +27,6 @@ class AppConfig(BaseModel):
27
27
  QA_MODEL: str = "qwen3"
28
28
 
29
29
  CHUNK_SIZE: int = 256
30
- CHUNK_OVERLAP: int = 32
31
30
 
32
31
  OLLAMA_BASE_URL: str = "http://localhost:11434"
33
32
 
haiku/rag/reader.py CHANGED
@@ -2,6 +2,9 @@ from pathlib import Path
2
2
  from typing import ClassVar
3
3
 
4
4
  from docling.document_converter import DocumentConverter
5
+ from docling_core.types.doc.document import DoclingDocument
6
+
7
+ from haiku.rag.utils import text_to_docling_document
5
8
 
6
9
 
7
10
  class FileReader:
@@ -16,7 +19,8 @@ class FileReader:
16
19
  ".jpeg",
17
20
  ".jpg",
18
21
  ".md",
19
- ".pdf.png",
22
+ ".pdf",
23
+ ".png",
20
24
  ".pptx",
21
25
  ".tiff",
22
26
  ".xlsx",
@@ -83,7 +87,7 @@ class FileReader:
83
87
  extensions: ClassVar[list[str]] = docling_extensions + text_extensions
84
88
 
85
89
  @staticmethod
86
- def parse_file(path: Path) -> str:
90
+ def parse_file(path: Path) -> DoclingDocument:
87
91
  try:
88
92
  file_extension = path.suffix.lower()
89
93
 
@@ -91,7 +95,7 @@ class FileReader:
91
95
  # Use docling for complex document formats
92
96
  converter = DocumentConverter()
93
97
  result = converter.convert(path)
94
- return result.document.export_to_markdown()
98
+ return result.document
95
99
  elif file_extension in FileReader.text_extensions:
96
100
  # Read plain text files directly
97
101
  content = path.read_text(encoding="utf-8")
@@ -99,11 +103,13 @@ class FileReader:
99
103
  # Wrap code files (but not plain txt) in markdown code blocks for better presentation
100
104
  if file_extension in FileReader.code_markdown_identifier:
101
105
  language = FileReader.code_markdown_identifier[file_extension]
102
- return f"```{language}\n{content}\n```"
106
+ content = f"```{language}\n{content}\n```"
103
107
 
104
- return content
108
+ # Convert text to DoclingDocument by wrapping as markdown
109
+ return text_to_docling_document(content, name=f"{path.stem}.md")
105
110
  else:
106
- # Fallback: try to read as text
107
- return path.read_text(encoding="utf-8")
111
+ # Fallback: try to read as text and convert to DoclingDocument
112
+ content = path.read_text(encoding="utf-8")
113
+ return text_to_docling_document(content, name=f"{path.stem}.md")
108
114
  except Exception:
109
115
  raise ValueError(f"Failed to parse file: {path}")
haiku/rag/store/repositories/chunk.py CHANGED
@@ -1,6 +1,8 @@
1
1
  import json
2
2
  import re
3
3
 
4
+ from docling_core.types.doc.document import DoclingDocument
5
+
4
6
  from haiku.rag.chunker import chunker
5
7
  from haiku.rag.embeddings import get_embedder
6
8
  from haiku.rag.store.models.chunk import Chunk
@@ -197,11 +199,11 @@ class ChunkRepository(BaseRepository[Chunk]):
197
199
  ]
198
200
 
199
201
  async def create_chunks_for_document(
200
- self, document_id: int, content: str, commit: bool = True
202
+ self, document_id: int, document: DoclingDocument, commit: bool = True
201
203
  ) -> list[Chunk]:
202
- """Create chunks and embeddings for a document."""
204
+ """Create chunks and embeddings for a document from DoclingDocument."""
203
205
  # Chunk the document content
204
- chunk_texts = await chunker.chunk(content)
206
+ chunk_texts = await chunker.chunk(document)
205
207
  created_chunks = []
206
208
 
207
209
  # Create chunks with embeddings using the create method
haiku/rag/store/repositories/document.py CHANGED
@@ -1,8 +1,11 @@
1
1
  import json
2
2
  from typing import TYPE_CHECKING
3
3
 
4
+ from docling_core.types.doc.document import DoclingDocument
5
+
4
6
  from haiku.rag.store.models.document import Document
5
7
  from haiku.rag.store.repositories.base import BaseRepository
8
+ from haiku.rag.utils import text_to_docling_document
6
9
 
7
10
  if TYPE_CHECKING:
8
11
  from haiku.rag.store.models.chunk import Chunk
@@ -20,8 +23,11 @@ class DocumentRepository(BaseRepository[Document]):
20
23
  chunk_repository = ChunkRepository(store)
21
24
  self.chunk_repository = chunk_repository
22
25
 
23
- async def create(
24
- self, entity: Document, chunks: list["Chunk"] | None = None
26
+ async def _create_with_docling(
27
+ self,
28
+ entity: Document,
29
+ docling_document: DoclingDocument,
30
+ chunks: list["Chunk"] | None = None,
25
31
  ) -> Document:
26
32
  """Create a document with its chunks and embeddings."""
27
33
  if self.store._connection is None:
@@ -62,9 +68,9 @@ class DocumentRepository(BaseRepository[Document]):
62
68
  chunk.metadata["order"] = order
63
69
  await self.chunk_repository.create(chunk, commit=False)
64
70
  else:
65
- # Create chunks and embeddings using ChunkRepository
71
+ # Create chunks and embeddings using DoclingDocument
66
72
  await self.chunk_repository.create_chunks_for_document(
67
- document_id, entity.content, commit=False
73
+ document_id, docling_document, commit=False
68
74
  )
69
75
 
70
76
  cursor.execute("COMMIT")
@@ -74,6 +80,13 @@ class DocumentRepository(BaseRepository[Document]):
74
80
  cursor.execute("ROLLBACK")
75
81
  raise
76
82
 
83
+ async def create(self, entity: Document) -> Document:
84
+ """Create a document with its chunks and embeddings."""
85
+ # Convert content to DoclingDocument
86
+ docling_document = text_to_docling_document(entity.content)
87
+
88
+ return await self._create_with_docling(entity, docling_document)
89
+
77
90
  async def get_by_id(self, entity_id: int) -> Document | None:
78
91
  """Get a document by its ID."""
79
92
  if self.store._connection is None:
@@ -134,7 +147,9 @@ class DocumentRepository(BaseRepository[Document]):
134
147
  updated_at=updated_at,
135
148
  )
136
149
 
137
- async def update(self, entity: Document) -> Document:
150
+ async def _update_with_docling(
151
+ self, entity: Document, docling_document: DoclingDocument
152
+ ) -> Document:
138
153
  """Update an existing document and regenerate its chunks and embeddings."""
139
154
  if self.store._connection is None:
140
155
  raise ValueError("Store connection is not available")
@@ -163,10 +178,10 @@ class DocumentRepository(BaseRepository[Document]):
163
178
  },
164
179
  )
165
180
 
166
- # Delete existing chunks and regenerate using ChunkRepository
181
+ # Delete existing chunks and regenerate using DoclingDocument
167
182
  await self.chunk_repository.delete_by_document_id(entity.id, commit=False)
168
183
  await self.chunk_repository.create_chunks_for_document(
169
- entity.id, entity.content, commit=False
184
+ entity.id, docling_document, commit=False
170
185
  )
171
186
 
172
187
  cursor.execute("COMMIT")
@@ -176,6 +191,13 @@ class DocumentRepository(BaseRepository[Document]):
176
191
  cursor.execute("ROLLBACK")
177
192
  raise
178
193
 
194
+ async def update(self, entity: Document) -> Document:
195
+ """Update an existing document and regenerate its chunks and embeddings."""
196
+ # Convert content to DoclingDocument
197
+ docling_document = text_to_docling_document(entity.content)
198
+
199
+ return await self._update_with_docling(entity, docling_document)
200
+
179
201
  async def delete(self, entity_id: int) -> bool:
180
202
  """Delete a document and all its associated chunks and embeddings."""
181
203
  # Delete chunks and embeddings first
haiku/rag/store/repositories/settings.py CHANGED
@@ -63,7 +63,6 @@ class SettingsRepository:
63
63
  "EMBEDDINGS_MODEL",
64
64
  "EMBEDDINGS_VECTOR_DIM",
65
65
  "CHUNK_SIZE",
66
- "CHUNK_OVERLAP",
67
66
  ]
68
67
 
69
68
  errors = []
haiku/rag/utils.py CHANGED
@@ -1,8 +1,12 @@
1
1
  import sys
2
2
  from importlib import metadata
3
+ from io import BytesIO
3
4
  from pathlib import Path
4
5
 
5
6
  import httpx
7
+ from docling.document_converter import DocumentConverter
8
+ from docling_core.types.doc.document import DoclingDocument
9
+ from docling_core.types.io import DocumentStream
6
10
  from packaging.version import Version, parse
7
11
 
8
12
 
@@ -77,3 +81,20 @@ async def is_up_to_date() -> tuple[bool, Version, Version]:
77
81
  # If no network connection, do not raise alarms.
78
82
  pypi_version = running_version
79
83
  return running_version >= pypi_version, running_version, pypi_version
84
+
85
+
86
+ def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocument:
87
+ """Convert text content to a DoclingDocument.
88
+
89
+ Args:
90
+ text: The text content to convert.
91
+ name: The name to use for the document stream (defaults to "content.md").
92
+
93
+ Returns:
94
+ A DoclingDocument created from the text content.
95
+ """
96
+ bytes_io = BytesIO(text.encode("utf-8"))
97
+ doc_stream = DocumentStream(name=name, stream=bytes_io)
98
+ converter = DocumentConverter()
99
+ result = converter.convert(doc_stream)
100
+ return result.document
haiku_rag-0.5.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: haiku.rag
3
- Version: 0.4.3
3
+ Version: 0.5.1
4
4
  Summary: Retrieval Augmented Generation (RAG) with SQLite
5
5
  Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
6
6
  License: MIT
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.10
17
17
  Classifier: Programming Language :: Python :: 3.11
18
18
  Classifier: Programming Language :: Python :: 3.12
19
19
  Classifier: Typing :: Typed
20
- Requires-Python: >=3.10
20
+ Requires-Python: >=3.11
21
21
  Requires-Dist: docling>=2.15.0
22
22
  Requires-Dist: fastmcp>=2.8.1
23
23
  Requires-Dist: httpx>=0.28.1
haiku_rag-0.5.1.dist-info/RECORD CHANGED
@@ -1,14 +1,14 @@
1
1
  haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  haiku/rag/app.py,sha256=FpLVyP1-zAq_XPmU8CPVLkuIAeuhBOGvMqhYS8RbN40,7649
3
- haiku/rag/chunker.py,sha256=MbCtP66OfTFoIBvqmVT9T9c87fozsYYzAQzJJEfPBVI,1812
4
- haiku/rag/cli.py,sha256=k7EhLkvTncxsdh5TYrg8BHLYh_lfyzupsWGj1dEEdqY,5992
5
- haiku/rag/client.py,sha256=MZNIpMm6MS3P6vjLqiCztT2dBOM7-bZOosX5IpbHJbI,12724
6
- haiku/rag/config.py,sha256=_Ss54kmfxVAJupExLKaYjYUlFxJgb7hEEdbG4-isapY,1662
3
+ haiku/rag/chunker.py,sha256=PVe6ysv8UlacUd4Zb3_8RFWIaWDXnzBAy2VDJ4TaUsE,1555
4
+ haiku/rag/cli.py,sha256=rk4uUwN_FdMC-rai9_R2sgXXMI3TIWKRtdWWHg_WoWM,5865
5
+ haiku/rag/client.py,sha256=pFcrPkQo1h1zJ76jts-72goP_kGVtnJNfLuoT8qpsb8,15795
6
+ haiku/rag/config.py,sha256=8mlQ8gYFxxq1q9gi9tjY9StjqhfhiHkO1FvS4b0et0E,1633
7
7
  haiku/rag/logging.py,sha256=zTTGpGq5tPdcd7RpCbd9EGw1IZlQDbYkrCg9t9pqRc4,580
8
8
  haiku/rag/mcp.py,sha256=tMN6fNX7ZtAER1R6DL1GkC9HZozTC4HzuQs199p7icI,4551
9
9
  haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
10
- haiku/rag/reader.py,sha256=dLz3yyc5r8dzdqCc2VViC3fADpScw4lxXueKiu-cI7c,2915
11
- haiku/rag/utils.py,sha256=Ez_tvNlRO_D8c2CBZ83Hs9Gmzcqdq4cmw_V5GBdKy_8,2214
10
+ haiku/rag/reader.py,sha256=qkPTMJuQ_o4sK-8zpDl9WFYe_MJ7aL_gUw6rczIpW-g,3274
11
+ haiku/rag/utils.py,sha256=g-uNTG60iBLgkeHHuah6eVZEkX3NFLs-LZU1YnzJzLQ,2967
12
12
  haiku/rag/embeddings/__init__.py,sha256=yFBlxS0jBiVHl_rWz5kb43t6Ha132U1ZGdlIPfhzPdg,1491
13
13
  haiku/rag/embeddings/base.py,sha256=NTQvuzbZPu0LBo5wAu3qGyJ4xXUaRAt1fjBO0ygWn_Y,465
14
14
  haiku/rag/embeddings/ollama.py,sha256=y6-lp0XpbnyIjoOEdtSzMdEVkU5glOwnWQ1FkpUZnpI,370
@@ -31,13 +31,13 @@ haiku/rag/store/models/chunk.py,sha256=9-vIxW75-kMTelIhgVIMd_WhP-Drc1q65vjaWMP8w
31
31
  haiku/rag/store/models/document.py,sha256=TVXVY-nQs-1vCORQEs9rA7zOtndeGC4dgCoujLAS054,396
32
32
  haiku/rag/store/repositories/__init__.py,sha256=uIBhxjQh-4o3O-ck8b7BQ58qXQTuJdPvrDIHVhY5T1A,263
33
33
  haiku/rag/store/repositories/base.py,sha256=cm3VyQXhtxvRfk1uJHpA0fDSxMpYN-mjQmRiDiLsQ68,1008
34
- haiku/rag/store/repositories/chunk.py,sha256=UyvHhKb1ESZePoTp2GneAARdfKoocEdfPOwgWPPQ0v8,16878
35
- haiku/rag/store/repositories/document.py,sha256=fXIWevJaOe6x2cK4u9cQxiEGD0ntKQb9y3VRqklQypE,7920
36
- haiku/rag/store/repositories/settings.py,sha256=dme3_ulQdQvyF9daavSjAd-SjZ5hh0MJoxP7iXgap-A,2492
34
+ haiku/rag/store/repositories/chunk.py,sha256=DIIdpHVemvxZOPHOLBL7pJGWY4VyNrUiQSWPWt24BYo,16974
35
+ haiku/rag/store/repositories/document.py,sha256=ki8LiDukwU1469Yw51i0rQFvBzUQeYkFYWs3Ly83akc,8815
36
+ haiku/rag/store/repositories/settings.py,sha256=qZLXvLsErnCWL0nBQQNfRnatHzCKhtUDLvUK9k-W_fU,2463
37
37
  haiku/rag/store/upgrades/__init__.py,sha256=kKS1YWT_P-CYKhKtokOLTIFNKf9jlfjFFr8lyIMeogM,100
38
38
  haiku/rag/store/upgrades/v0_3_4.py,sha256=GLogKZdZ40NX1vBHKdOJju7fFzNUCHoEnjSZg17Hm2U,663
39
- haiku_rag-0.4.3.dist-info/METADATA,sha256=T2ZHdGL_zd1eSfEjFolh3R_zJpuWmUhKsnNkYLKtT7E,4198
40
- haiku_rag-0.4.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
- haiku_rag-0.4.3.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
42
- haiku_rag-0.4.3.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
43
- haiku_rag-0.4.3.dist-info/RECORD,,
39
+ haiku_rag-0.5.1.dist-info/METADATA,sha256=X4r-1CBCTef3_T9HWPgCHi5XumqOSF4tlHfUpxO533E,4198
40
+ haiku_rag-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
+ haiku_rag-0.5.1.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
42
+ haiku_rag-0.5.1.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
43
+ haiku_rag-0.5.1.dist-info/RECORD,,