PyPI - haiku.rag - Versions diffs - 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

haiku.rag: 0.5.0-py3-none-any.whl → 0.5.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of haiku.rag might be problematic. Click here for more details.

Files changed (16) hide show

haiku/rag/app.py +2 -2
haiku/rag/chunker.py +6 -15
haiku/rag/cli.py +15 -12
haiku/rag/client.py +93 -22
haiku/rag/config.py +3 -4
haiku/rag/reader.py +11 -6
haiku/rag/reranking/__init__.py +19 -16
haiku/rag/reranking/ollama.py +84 -0
haiku/rag/store/repositories/chunk.py +5 -3
haiku/rag/store/repositories/document.py +29 -7
haiku/rag/utils.py +21 -0
{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/METADATA +5 -4
{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/RECORD +16 -15
{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/WHEEL +0 -0
{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/entry_points.txt +0 -0
{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/licenses/LICENSE +0 -0

haiku/rag/app.py CHANGED Viewed

@@ -32,9 +32,9 @@ class HaikuRAGApp:
                 f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
             )
-    async def add_document_from_source(self, file_path: Path):
+    async def add_document_from_source(self, source: str):
         async with HaikuRAG(db_path=self.db_path) as self.client:
-            doc = await self.client.create_document_from_source(file_path)
+            doc = await self.client.create_document_from_source(source)
             self._rich_print_document(doc, truncate=True)
             self.console.print(
                 f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"

haiku/rag/chunker.py CHANGED Viewed

@@ -1,11 +1,9 @@
-from io import BytesIO
 from typing import ClassVar
 import tiktoken
 from docling.chunking import HybridChunker  # type: ignore
-from docling.document_converter import DocumentConverter
 from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer
-from docling_core.types.io import DocumentStream
+from docling_core.types.doc.document import DoclingDocument
 from haiku.rag.config import Config
@@ -33,27 +31,20 @@ class Chunker:
         self.chunker = HybridChunker(tokenizer=tokenizer)  # type: ignore
-    async def chunk(self, text: str) -> list[str]:
-        """Split the text into chunks using docling's structure-aware chunking.
+    async def chunk(self, document: DoclingDocument) -> list[str]:
+        """Split the document into chunks using docling's structure-aware chunking.
         Args:
-            text: The text to be split into chunks.
+            document: The DoclingDocument to be split into chunks.
         Returns:
             A list of text chunks with semantic boundaries.
         """
-        if not text:
+        if document is None:
             return []
-        # Convert to docling document
-        bytes_io = BytesIO(text.encode("utf-8"))
-        doc_stream = DocumentStream(name="text.md", stream=bytes_io)
-        converter = DocumentConverter()
-        result = converter.convert(doc_stream)
-        doc = result.document
         # Chunk using docling's hybrid chunker
-        chunks = list(self.chunker.chunk(doc))
+        chunks = list(self.chunker.chunk(document))
         return [self.chunker.contextualize(chunk) for chunk in chunks]

haiku/rag/cli.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import asyncio
+import warnings
 from importlib.metadata import version
 from pathlib import Path
@@ -9,12 +10,14 @@ from haiku.rag.app import HaikuRAGApp
 from haiku.rag.config import Config
 from haiku.rag.utils import is_up_to_date
+if not Config.ENV == "development":
+    warnings.filterwarnings("ignore")
 cli = typer.Typer(
     context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True
 )
 console = Console()
-event_loop = asyncio.get_event_loop()
 async def check_version():
@@ -46,7 +49,7 @@ def main(
 ):
     """haiku.rag CLI - SQLite-based RAG system"""
     # Run version check before any command
-    event_loop.run_until_complete(check_version())
+    asyncio.run(check_version())
 @cli.command("list", help="List all stored documents")
@@ -58,7 +61,7 @@ def list_documents(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-    event_loop.run_until_complete(app.list_documents())
+    asyncio.run(app.list_documents())
 @cli.command("add", help="Add a document from text input")
@@ -73,12 +76,12 @@ def add_document_text(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-    event_loop.run_until_complete(app.add_document_from_text(text=text))
+    asyncio.run(app.add_document_from_text(text=text))
 @cli.command("add-src", help="Add a document from a file path or URL")
 def add_document_src(
-    file_path: Path = typer.Argument(
+    source: str = typer.Argument(
         help="The file path or URL of the document to add",
     ),
     db: Path = typer.Option(
@@ -88,7 +91,7 @@ def add_document_src(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-    event_loop.run_until_complete(app.add_document_from_source(file_path=file_path))
+    asyncio.run(app.add_document_from_source(source=source))
 @cli.command("get", help="Get and display a document by its ID")
@@ -103,7 +106,7 @@ def get_document(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-    event_loop.run_until_complete(app.get_document(doc_id=doc_id))
+    asyncio.run(app.get_document(doc_id=doc_id))
 @cli.command("delete", help="Delete a document by its ID")
@@ -118,7 +121,7 @@ def delete_document(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-    event_loop.run_until_complete(app.delete_document(doc_id=doc_id))
+    asyncio.run(app.delete_document(doc_id=doc_id))
 @cli.command("search", help="Search for documents by a query")
@@ -144,7 +147,7 @@ def search(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-    event_loop.run_until_complete(app.search(query=query, limit=limit, k=k))
+    asyncio.run(app.search(query=query, limit=limit, k=k))
 @cli.command("ask", help="Ask a question using the QA agent")
@@ -159,7 +162,7 @@ def ask(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-    event_loop.run_until_complete(app.ask(question=question))
+    asyncio.run(app.ask(question=question))
 @cli.command("settings", help="Display current configuration settings")
@@ -180,7 +183,7 @@ def rebuild(
     ),
 ):
     app = HaikuRAGApp(db_path=db)
-    event_loop.run_until_complete(app.rebuild())
+    asyncio.run(app.rebuild())
 @cli.command(
@@ -216,7 +219,7 @@ def serve(
     elif sse:
         transport = "sse"
-    event_loop.run_until_complete(app.serve(transport=transport))
+    asyncio.run(app.serve(transport=transport))
 if __name__ == "__main__":

haiku/rag/client.py CHANGED Viewed

@@ -16,6 +16,7 @@ from haiku.rag.store.models.chunk import Chunk
 from haiku.rag.store.models.document import Document
 from haiku.rag.store.repositories.chunk import ChunkRepository
 from haiku.rag.store.repositories.document import DocumentRepository
+from haiku.rag.utils import text_to_docling_document
 class HaikuRAG:
@@ -49,6 +50,24 @@ class HaikuRAG:
         self.close()
         return False
+    async def _create_document_with_docling(
+        self,
+        docling_document,
+        uri: str | None = None,
+        metadata: dict | None = None,
+        chunks: list[Chunk] | None = None,
+    ) -> Document:
+        """Create a new document from DoclingDocument."""
+        content = docling_document.export_to_markdown()
+        document = Document(
+            content=content,
+            uri=uri,
+            metadata=metadata or {},
+        )
+        return await self.document_repository._create_with_docling(
+            document, docling_document, chunks
+        )
     async def create_document(
         self,
         content: str,
@@ -67,12 +86,17 @@ class HaikuRAG:
         Returns:
             The created Document instance.
         """
+        # Convert content to DoclingDocument for processing
+        docling_document = text_to_docling_document(content)
         document = Document(
             content=content,
             uri=uri,
             metadata=metadata or {},
         )
-        return await self.document_repository.create(document, chunks)
+        return await self.document_repository._create_with_docling(
+            document, docling_document, chunks
+        )
     async def create_document_from_source(
         self, source: str | Path, metadata: dict = {}
@@ -101,16 +125,19 @@ class HaikuRAG:
         parsed_url = urlparse(source_str)
         if parsed_url.scheme in ("http", "https"):
             return await self._create_or_update_document_from_url(source_str, metadata)
-        # Handle as file path
-        source_path = Path(source) if isinstance(source, str) else source
+        elif parsed_url.scheme == "file":
+            # Handle file:// URI by converting to path
+            source_path = Path(parsed_url.path)
+        else:
+            # Handle as regular file path
+            source_path = Path(source) if isinstance(source, str) else source
         if source_path.suffix.lower() not in FileReader.extensions:
             raise ValueError(f"Unsupported file extension: {source_path.suffix}")
         if not source_path.exists():
             raise ValueError(f"File does not exist: {source_path}")
-        uri = source_path.as_uri()
+        uri = source_path.absolute().as_uri()
         md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()
         # Check if document already exists
@@ -119,7 +146,7 @@ class HaikuRAG:
             # MD5 unchanged, return existing document
             return existing_doc
-        content = FileReader.parse_file(source_path)
+        docling_document = FileReader.parse_file(source_path)
         # Get content type from file extension
         content_type, _ = mimetypes.guess_type(str(source_path))
@@ -131,13 +158,15 @@ class HaikuRAG:
         if existing_doc:
             # Update existing document
-            existing_doc.content = content
+            existing_doc.content = docling_document.export_to_markdown()
             existing_doc.metadata = metadata
-            return await self.update_document(existing_doc)
+            return await self.document_repository._update_with_docling(
+                existing_doc, docling_document
+            )
         else:
-            # Create new document
-            return await self.create_document(
-                content=content, uri=uri, metadata=metadata
+            # Create new document using DoclingDocument
+            return await self._create_document_with_docling(
+                docling_document=docling_document, uri=uri, metadata=metadata
             )
     async def _create_or_update_document_from_url(
@@ -193,18 +222,20 @@ class HaikuRAG:
                 temp_path = Path(temp_file.name)
                 # Parse the content using FileReader
-                content = FileReader.parse_file(temp_path)
+                docling_document = FileReader.parse_file(temp_path)
             # Merge metadata with contentType and md5
             metadata.update({"contentType": content_type, "md5": md5_hash})
             if existing_doc:
-                existing_doc.content = content
+                existing_doc.content = docling_document.export_to_markdown()
                 existing_doc.metadata = metadata
-                return await self.update_document(existing_doc)
+                return await self.document_repository._update_with_docling(
+                    existing_doc, docling_document
+                )
             else:
-                return await self.create_document(
-                    content=content, uri=url, metadata=metadata
+                return await self._create_document_with_docling(
+                    docling_document=docling_document, uri=url, metadata=metadata
                 )
     def _get_extension_from_content_type_or_url(
@@ -262,7 +293,12 @@ class HaikuRAG:
     async def update_document(self, document: Document) -> Document:
         """Update an existing document."""
-        return await self.document_repository.update(document)
+        # Convert content to DoclingDocument
+        docling_document = text_to_docling_document(document.content)
+        return await self.document_repository._update_with_docling(
+            document, docling_document
+        )
     async def delete_document(self, document_id: int) -> bool:
         """Delete a document by its ID."""
@@ -283,7 +319,7 @@ class HaikuRAG:
         return await self.document_repository.list_all(limit=limit, offset=offset)
     async def search(
-        self, query: str, limit: int = 5, k: int = 60, rerank=Config.RERANK
+        self, query: str, limit: int = 5, k: int = 60
     ) -> list[tuple[Chunk, float]]:
         """Search for relevant chunks using hybrid search (vector similarity + full-text search) with reranking.
@@ -295,8 +331,10 @@ class HaikuRAG:
         Returns:
             List of (chunk, score) tuples ordered by relevance.
         """
+        # Get reranker if available
+        reranker = get_reranker()
-        if not rerank:
+        if reranker is None:
             return await self.chunk_repository.search_chunks_hybrid(query, limit, k)
         # Get more initial results (3X) for reranking
@@ -304,7 +342,6 @@ class HaikuRAG:
             query, limit * 3, k
         )
         # Apply reranking
-        reranker = get_reranker()
         chunks = [chunk for chunk, _ in search_results]
         reranked_results = await reranker.rerank(query, chunks, top_n=limit)
@@ -328,6 +365,13 @@ class HaikuRAG:
     async def rebuild_database(self) -> AsyncGenerator[int, None]:
         """Rebuild the database by deleting all chunks and re-indexing all documents.
+        For documents with URIs:
+        - Deletes the document and re-adds it from source if source exists
+        - Skips documents where source no longer exists
+        For documents without URIs:
+        - Re-creates chunks from existing content
         Yields:
             int: The ID of the document currently being processed
         """
@@ -343,9 +387,36 @@ class HaikuRAG:
         documents = await self.list_documents()
         for doc in documents:
-            if doc.id is not None:
+            assert doc.id is not None, "Document ID should not be None"
+            if doc.uri:
+                # Document has a URI - delete and try to re-add from source
+                try:
+                    # Delete the old document first
+                    await self.delete_document(doc.id)
+                    # Try to re-create from source (this creates the document with chunks)
+                    new_doc = await self.create_document_from_source(
+                        doc.uri, doc.metadata or {}
+                    )
+                    assert new_doc.id is not None, "New document ID should not be None"
+                    yield new_doc.id
+                except (FileNotFoundError, ValueError, OSError) as e:
+                    # Source doesn't exist or can't be accessed - document already deleted, skip
+                    print(f"Skipping document with URI {doc.uri}: {e}")
+                    continue
+                except Exception as e:
+                    # Unexpected error - log it and skip
+                    print(
+                        f"Unexpected error processing document with URI {doc.uri}: {e}"
+                    )
+                    continue
+            else:
+                # Document without URI - re-create chunks from existing content
+                docling_document = text_to_docling_document(doc.content)
                 await self.chunk_repository.create_chunks_for_document(
-                    doc.id, doc.content, commit=False
+                    doc.id, docling_document, commit=False
                 )
                 yield doc.id

haiku/rag/config.py CHANGED Viewed

@@ -10,7 +10,7 @@ load_dotenv()
 class AppConfig(BaseModel):
-    ENV: str = "development"
+    ENV: str = "production"
     DEFAULT_DATA_DIR: Path = get_default_data_dir()
     MONITOR_DIRECTORIES: list[Path] = []
@@ -19,9 +19,8 @@ class AppConfig(BaseModel):
     EMBEDDINGS_MODEL: str = "mxbai-embed-large"
     EMBEDDINGS_VECTOR_DIM: int = 1024
-    RERANK: bool = True
-    RERANK_PROVIDER: str = "mxbai"
-    RERANK_MODEL: str = "mixedbread-ai/mxbai-rerank-base-v2"
+    RERANK_PROVIDER: str = "ollama"
+    RERANK_MODEL: str = "qwen3"
     QA_PROVIDER: str = "ollama"
     QA_MODEL: str = "qwen3"

haiku/rag/reader.py CHANGED Viewed

@@ -2,6 +2,9 @@ from pathlib import Path
 from typing import ClassVar
 from docling.document_converter import DocumentConverter
+from docling_core.types.doc.document import DoclingDocument
+from haiku.rag.utils import text_to_docling_document
 class FileReader:
@@ -84,7 +87,7 @@ class FileReader:
     extensions: ClassVar[list[str]] = docling_extensions + text_extensions
     @staticmethod
-    def parse_file(path: Path) -> str:
+    def parse_file(path: Path) -> DoclingDocument:
         try:
             file_extension = path.suffix.lower()
@@ -92,7 +95,7 @@ class FileReader:
                 # Use docling for complex document formats
                 converter = DocumentConverter()
                 result = converter.convert(path)
-                return result.document.export_to_markdown()
+                return result.document
             elif file_extension in FileReader.text_extensions:
                 # Read plain text files directly
                 content = path.read_text(encoding="utf-8")
@@ -100,11 +103,13 @@ class FileReader:
                 # Wrap code files (but not plain txt) in markdown code blocks for better presentation
                 if file_extension in FileReader.code_markdown_identifier:
                     language = FileReader.code_markdown_identifier[file_extension]
-                    return f"```{language}\n{content}\n```"
+                    content = f"```{language}\n{content}\n```"
-                return content
+                # Convert text to DoclingDocument by wrapping as markdown
+                return text_to_docling_document(content, name=f"{path.stem}.md")
             else:
-                # Fallback: try to read as text
-                return path.read_text(encoding="utf-8")
+                # Fallback: try to read as text and convert to DoclingDocument
+                content = path.read_text(encoding="utf-8")
+                return text_to_docling_document(content, name=f"{path.stem}.md")
         except Exception:
             raise ValueError(f"Failed to parse file: {path}")

haiku/rag/reranking/__init__.py CHANGED Viewed

@@ -1,37 +1,40 @@
 from haiku.rag.config import Config
 from haiku.rag.reranking.base import RerankerBase
-try:
-    from haiku.rag.reranking.cohere import CohereReranker
-except ImportError:
-    pass
 _reranker: RerankerBase | None = None
-def get_reranker() -> RerankerBase:
+def get_reranker() -> RerankerBase | None:
     """
     Factory function to get the appropriate reranker based on the configuration.
+    Returns None if reranking is disabled.
     """
     global _reranker
     if _reranker is not None:
         return _reranker
     if Config.RERANK_PROVIDER == "mxbai":
-        from haiku.rag.reranking.mxbai import MxBAIReranker
+        try:
+            from haiku.rag.reranking.mxbai import MxBAIReranker
-        _reranker = MxBAIReranker()
-        return _reranker
+            _reranker = MxBAIReranker()
+            return _reranker
+        except ImportError:
+            return None
     if Config.RERANK_PROVIDER == "cohere":
         try:
             from haiku.rag.reranking.cohere import CohereReranker
+            _reranker = CohereReranker()
+            return _reranker
         except ImportError:
-            raise ImportError(
-                "Cohere reranker requires the 'cohere' package. "
-                "Please install haiku.rag with the 'cohere' extra:"
-                "uv pip install haiku.rag[cohere]"
-            )
-        _reranker = CohereReranker()
+            return None
+    if Config.RERANK_PROVIDER == "ollama":
+        from haiku.rag.reranking.ollama import OllamaReranker
+        _reranker = OllamaReranker()
         return _reranker
-    raise ValueError(f"Unsupported reranker provider: {Config.RERANK_PROVIDER}")
+    return None

haiku/rag/reranking/ollama.py ADDED Viewed

@@ -0,0 +1,84 @@
+import json
+from ollama import AsyncClient
+from pydantic import BaseModel
+from haiku.rag.config import Config
+from haiku.rag.reranking.base import RerankerBase
+from haiku.rag.store.models.chunk import Chunk
+OLLAMA_OPTIONS = {"temperature": 0.0, "seed": 42, "num_ctx": 16384}
+class RerankResult(BaseModel):
+    """Individual rerank result with index and relevance score."""
+    index: int
+    relevance_score: float
+class RerankResponse(BaseModel):
+    """Response from the reranking model containing ranked results."""
+    results: list[RerankResult]
+class OllamaReranker(RerankerBase):
+    def __init__(self, model: str = Config.RERANK_MODEL):
+        self._model = model
+        self._client = AsyncClient(host=Config.OLLAMA_BASE_URL)
+    async def rerank(
+        self, query: str, chunks: list[Chunk], top_n: int = 10
+    ) -> list[tuple[Chunk, float]]:
+        if not chunks:
+            return []
+        documents = []
+        for i, chunk in enumerate(chunks):
+            documents.append({"index": i, "content": chunk.content})
+        # Create the prompt for reranking
+        system_prompt = """You are a document reranking assistant. Given a query and a list of document chunks, you must rank them by relevance to the query.
+Return your response as a JSON object with a "results" array. Each result should have:
+- "index": the original index of the document (integer)
+- "relevance_score": a score between 0.0 and 1.0 indicating relevance (float, where 1.0 is most relevant)
+Only return the top documents up to the requested limit, ordered by decreasing relevance score."""
+        documents_text = ""
+        for doc in documents:
+            documents_text += f"Index {doc['index']}: {doc['content']}\n\n"
+        user_prompt = f"""Query: {query}
+Documents to rerank:
+{documents_text.strip()}
+Please rank these documents by relevance to the query and return the top {top_n} results as JSON."""
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt},
+        ]
+        try:
+            response = await self._client.chat(
+                model=self._model,
+                messages=messages,
+                format=RerankResponse.model_json_schema(),
+                options=OLLAMA_OPTIONS,
+            )
+            content = response["message"]["content"]
+            parsed_response = RerankResponse.model_validate(json.loads(content))
+            return [
+                (chunks[result.index], result.relevance_score)
+                for result in parsed_response.results[:top_n]
+            ]
+        except Exception:
+            # Fallback: return chunks in original order with same score
+            return [(chunks[i], 1.0) for i in range(min(top_n, len(chunks)))]

haiku/rag/store/repositories/chunk.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import json
 import re
+from docling_core.types.doc.document import DoclingDocument
 from haiku.rag.chunker import chunker
 from haiku.rag.embeddings import get_embedder
 from haiku.rag.store.models.chunk import Chunk
@@ -197,11 +199,11 @@ class ChunkRepository(BaseRepository[Chunk]):
         ]
     async def create_chunks_for_document(
-        self, document_id: int, content: str, commit: bool = True
+        self, document_id: int, document: DoclingDocument, commit: bool = True
     ) -> list[Chunk]:
-        """Create chunks and embeddings for a document."""
+        """Create chunks and embeddings for a document from DoclingDocument."""
         # Chunk the document content
-        chunk_texts = await chunker.chunk(content)
+        chunk_texts = await chunker.chunk(document)
         created_chunks = []
         # Create chunks with embeddings using the create method

haiku/rag/store/repositories/document.py CHANGED Viewed

@@ -1,8 +1,11 @@
 import json
 from typing import TYPE_CHECKING
+from docling_core.types.doc.document import DoclingDocument
 from haiku.rag.store.models.document import Document
 from haiku.rag.store.repositories.base import BaseRepository
+from haiku.rag.utils import text_to_docling_document
 if TYPE_CHECKING:
     from haiku.rag.store.models.chunk import Chunk
@@ -20,8 +23,11 @@ class DocumentRepository(BaseRepository[Document]):
             chunk_repository = ChunkRepository(store)
         self.chunk_repository = chunk_repository
-    async def create(
-        self, entity: Document, chunks: list["Chunk"] | None = None
+    async def _create_with_docling(
+        self,
+        entity: Document,
+        docling_document: DoclingDocument,
+        chunks: list["Chunk"] | None = None,
     ) -> Document:
         """Create a document with its chunks and embeddings."""
         if self.store._connection is None:
@@ -62,9 +68,9 @@ class DocumentRepository(BaseRepository[Document]):
                     chunk.metadata["order"] = order
                     await self.chunk_repository.create(chunk, commit=False)
             else:
-                # Create chunks and embeddings using ChunkRepository
+                # Create chunks and embeddings using DoclingDocument
                 await self.chunk_repository.create_chunks_for_document(
-                    document_id, entity.content, commit=False
+                    document_id, docling_document, commit=False
                 )
             cursor.execute("COMMIT")
@@ -74,6 +80,13 @@ class DocumentRepository(BaseRepository[Document]):
             cursor.execute("ROLLBACK")
             raise
+    async def create(self, entity: Document) -> Document:
+        """Create a document with its chunks and embeddings."""
+        # Convert content to DoclingDocument
+        docling_document = text_to_docling_document(entity.content)
+        return await self._create_with_docling(entity, docling_document)
     async def get_by_id(self, entity_id: int) -> Document | None:
         """Get a document by its ID."""
         if self.store._connection is None:
@@ -134,7 +147,9 @@ class DocumentRepository(BaseRepository[Document]):
             updated_at=updated_at,
         )
-    async def update(self, entity: Document) -> Document:
+    async def _update_with_docling(
+        self, entity: Document, docling_document: DoclingDocument
+    ) -> Document:
         """Update an existing document and regenerate its chunks and embeddings."""
         if self.store._connection is None:
             raise ValueError("Store connection is not available")
@@ -163,10 +178,10 @@ class DocumentRepository(BaseRepository[Document]):
                 },
             )
-            # Delete existing chunks and regenerate using ChunkRepository
+            # Delete existing chunks and regenerate using DoclingDocument
             await self.chunk_repository.delete_by_document_id(entity.id, commit=False)
             await self.chunk_repository.create_chunks_for_document(
-                entity.id, entity.content, commit=False
+                entity.id, docling_document, commit=False
             )
             cursor.execute("COMMIT")
@@ -176,6 +191,13 @@ class DocumentRepository(BaseRepository[Document]):
             cursor.execute("ROLLBACK")
             raise
+    async def update(self, entity: Document) -> Document:
+        """Update an existing document and regenerate its chunks and embeddings."""
+        # Convert content to DoclingDocument
+        docling_document = text_to_docling_document(entity.content)
+        return await self._update_with_docling(entity, docling_document)
     async def delete(self, entity_id: int) -> bool:
         """Delete a document and all its associated chunks and embeddings."""
         # Delete chunks and embeddings first

haiku/rag/utils.py CHANGED Viewed

@@ -1,8 +1,12 @@
 import sys
 from importlib import metadata
+from io import BytesIO
 from pathlib import Path
 import httpx
+from docling.document_converter import DocumentConverter
+from docling_core.types.doc.document import DoclingDocument
+from docling_core.types.io import DocumentStream
 from packaging.version import Version, parse
@@ -77,3 +81,20 @@ async def is_up_to_date() -> tuple[bool, Version, Version]:
             # If no network connection, do not raise alarms.
             pypi_version = running_version
     return running_version >= pypi_version, running_version, pypi_version
+def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocument:
+    """Convert text content to a DoclingDocument.
+    Args:
+        text: The text content to convert.
+        name: The name to use for the document stream (defaults to "content.md").
+    Returns:
+        A DoclingDocument created from the text content.
+    """
+    bytes_io = BytesIO(text.encode("utf-8"))
+    doc_stream = DocumentStream(name=name, stream=bytes_io)
+    converter = DocumentConverter()
+    result = converter.convert(doc_stream)
+    return result.document

{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haiku.rag
-Version: 0.5.0
+Version: 0.5.2
 Summary: Retrieval Augmented Generation (RAG) with SQLite
 Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
 License: MIT
@@ -17,12 +17,11 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Typing :: Typed
-Requires-Python: >=3.10
+Requires-Python: >=3.11
 Requires-Dist: docling>=2.15.0
 Requires-Dist: fastmcp>=2.8.1
 Requires-Dist: httpx>=0.28.1
-Requires-Dist: mxbai-rerank>=0.1.6
-Requires-Dist: ollama>=0.5.1
+Requires-Dist: ollama>=0.5.3
 Requires-Dist: pydantic>=2.11.7
 Requires-Dist: python-dotenv>=1.1.0
 Requires-Dist: rich>=14.0.0
@@ -34,6 +33,8 @@ Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.56.0; extra == 'anthropic'
 Provides-Extra: cohere
 Requires-Dist: cohere>=5.16.1; extra == 'cohere'
+Provides-Extra: mxbai
+Requires-Dist: mxbai-rerank>=0.1.6; extra == 'mxbai'
 Provides-Extra: openai
 Requires-Dist: openai>=1.0.0; extra == 'openai'
 Provides-Extra: voyageai

{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/RECORD RENAMED Viewed

@@ -1,14 +1,14 @@
 haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-haiku/rag/app.py,sha256=FpLVyP1-zAq_XPmU8CPVLkuIAeuhBOGvMqhYS8RbN40,7649
-haiku/rag/chunker.py,sha256=P2slbmoABygYRlqjOGzPBEOYsBZNTnNpE6bnW_dkVOE,1850
-haiku/rag/cli.py,sha256=k7EhLkvTncxsdh5TYrg8BHLYh_lfyzupsWGj1dEEdqY,5992
-haiku/rag/client.py,sha256=MZNIpMm6MS3P6vjLqiCztT2dBOM7-bZOosX5IpbHJbI,12724
-haiku/rag/config.py,sha256=GXTWC3vasBMaWju-yh8Es3CidBz1ftqRH6E5PHpgsSQ,1634
+haiku/rag/app.py,sha256=kuvULOIdgwqzJMaKtb1znStc1YAqB1-RkZ0fwdg6TBk,7642
+haiku/rag/chunker.py,sha256=PVe6ysv8UlacUd4Zb3_8RFWIaWDXnzBAy2VDJ4TaUsE,1555
+haiku/rag/cli.py,sha256=5CcWcBQ47KCZ1wl7DpLzMXtgJZ1nz5Hci8AYp72oXEI,5855
+haiku/rag/client.py,sha256=K51l_orUc3BeKGTHX4xC7YClY9M4Eijpac5Hg1_q6LE,15815
+haiku/rag/config.py,sha256=jiy5vg-YbYa7yY-834Dd9omFtfMBXQBYXmHRJXMPjrs,1581
 haiku/rag/logging.py,sha256=zTTGpGq5tPdcd7RpCbd9EGw1IZlQDbYkrCg9t9pqRc4,580
 haiku/rag/mcp.py,sha256=tMN6fNX7ZtAER1R6DL1GkC9HZozTC4HzuQs199p7icI,4551
 haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
-haiku/rag/reader.py,sha256=s5dinZ-WffioiRH7OWZtE2v7FHRPd1PkqpPYsXtwqtc,2927
-haiku/rag/utils.py,sha256=Ez_tvNlRO_D8c2CBZ83Hs9Gmzcqdq4cmw_V5GBdKy_8,2214
+haiku/rag/reader.py,sha256=qkPTMJuQ_o4sK-8zpDl9WFYe_MJ7aL_gUw6rczIpW-g,3274
+haiku/rag/utils.py,sha256=g-uNTG60iBLgkeHHuah6eVZEkX3NFLs-LZU1YnzJzLQ,2967
 haiku/rag/embeddings/__init__.py,sha256=yFBlxS0jBiVHl_rWz5kb43t6Ha132U1ZGdlIPfhzPdg,1491
 haiku/rag/embeddings/base.py,sha256=NTQvuzbZPu0LBo5wAu3qGyJ4xXUaRAt1fjBO0ygWn_Y,465
 haiku/rag/embeddings/ollama.py,sha256=y6-lp0XpbnyIjoOEdtSzMdEVkU5glOwnWQ1FkpUZnpI,370
@@ -20,10 +20,11 @@ haiku/rag/qa/base.py,sha256=4ZTM_l5FAZ9cA0f8NeqRJiUAmjatwCTmSoclFw0gTFQ,1349
 haiku/rag/qa/ollama.py,sha256=EGUi4urSx9nrnsr5j-qHVDVOnvRTbSMKUbMvXEMIcxM,2381
 haiku/rag/qa/openai.py,sha256=dF32sGgVt8mZi5oVxByaeECs9NqLjvDiZnnpJBsrHm8,3968
 haiku/rag/qa/prompts.py,sha256=8uYMxHzbzI9vo2FPkCSSNTh_RNL96WkBbUWPCMBlLpo,1315
-haiku/rag/reranking/__init__.py,sha256=DsPCdU94wRzDCYl6hz2DySOMWwOvNxKviqKAUfyykK8,1118
+haiku/rag/reranking/__init__.py,sha256=fwC3pauteJwh9Ulm2270QvwAdwr4NMr4RUEuolC-wKU,1063
 haiku/rag/reranking/base.py,sha256=LM9yUSSJ414UgBZhFTgxGprlRqzfTe4I1vgjricz2JY,405
 haiku/rag/reranking/cohere.py,sha256=1iTdiaa8vvb6oHVB2qpWzUOVkyfUcimVSZp6Qr4aq4c,1049
 haiku/rag/reranking/mxbai.py,sha256=46sVTsTIkzIX9THgM3u8HaEmgY7evvEyB-N54JTHvK8,867
+haiku/rag/reranking/ollama.py,sha256=tCrLlNNDBCZu7J3to1gvBq-sOvN1flYEA7E3H3Jq0mU,2790
 haiku/rag/store/__init__.py,sha256=hq0W0DAC7ysqhWSP2M2uHX8cbG6kbr-sWHxhq6qQcY0,103
 haiku/rag/store/engine.py,sha256=cOMBToLilI1Di1qQrFzGLqtRMsuvtiX0Q5RNIEzQy9w,6232
 haiku/rag/store/models/__init__.py,sha256=s0E72zneGlowvZrFWaNxHYjOAUjgWdLxzdYsnvNRVlY,88
@@ -31,13 +32,13 @@ haiku/rag/store/models/chunk.py,sha256=9-vIxW75-kMTelIhgVIMd_WhP-Drc1q65vjaWMP8w
 haiku/rag/store/models/document.py,sha256=TVXVY-nQs-1vCORQEs9rA7zOtndeGC4dgCoujLAS054,396
 haiku/rag/store/repositories/__init__.py,sha256=uIBhxjQh-4o3O-ck8b7BQ58qXQTuJdPvrDIHVhY5T1A,263
 haiku/rag/store/repositories/base.py,sha256=cm3VyQXhtxvRfk1uJHpA0fDSxMpYN-mjQmRiDiLsQ68,1008
-haiku/rag/store/repositories/chunk.py,sha256=UyvHhKb1ESZePoTp2GneAARdfKoocEdfPOwgWPPQ0v8,16878
-haiku/rag/store/repositories/document.py,sha256=fXIWevJaOe6x2cK4u9cQxiEGD0ntKQb9y3VRqklQypE,7920
+haiku/rag/store/repositories/chunk.py,sha256=DIIdpHVemvxZOPHOLBL7pJGWY4VyNrUiQSWPWt24BYo,16974
+haiku/rag/store/repositories/document.py,sha256=ki8LiDukwU1469Yw51i0rQFvBzUQeYkFYWs3Ly83akc,8815
 haiku/rag/store/repositories/settings.py,sha256=qZLXvLsErnCWL0nBQQNfRnatHzCKhtUDLvUK9k-W_fU,2463
 haiku/rag/store/upgrades/__init__.py,sha256=kKS1YWT_P-CYKhKtokOLTIFNKf9jlfjFFr8lyIMeogM,100
 haiku/rag/store/upgrades/v0_3_4.py,sha256=GLogKZdZ40NX1vBHKdOJju7fFzNUCHoEnjSZg17Hm2U,663
-haiku_rag-0.5.0.dist-info/METADATA,sha256=Z29lOzGgaD2PJ6OxZc53QuMzFdosEZCdm7HZYOUNN3M,4198
-haiku_rag-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-haiku_rag-0.5.0.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
-haiku_rag-0.5.0.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
-haiku_rag-0.5.0.dist-info/RECORD,,
+haiku_rag-0.5.2.dist-info/METADATA,sha256=b91HARmgPKSy_4LIhna9EoacKb9I_f-cRRTgHqaG-S8,4238
+haiku_rag-0.5.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+haiku_rag-0.5.2.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
+haiku_rag-0.5.2.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
+haiku_rag-0.5.2.dist-info/RECORD,,

{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{haiku_rag-0.5.0.dist-info → haiku_rag-0.5.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

haiku.rag 0.5.0__py3-none-any.whl → 0.5.2__py3-none-any.whl

Potentially problematic release.

haiku.rag 0.5.0py3-none-any.whl → 0.5.2py3-none-any.whl