PyPI - haiku.rag-slim - Versions diffs - 0.16.1__tar.gz → 0.17.1__tar.gz - Mend

haiku.rag-slim 0.16.1.tar.gz → 0.17.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of haiku.rag-slim might be problematic. Click here for more details.

Files changed (81) hide show

{haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haiku.rag-slim
-Version: 0.16.1
+Version: 0.17.1
 Summary: Agentic Retrieval Augmented Generation (RAG) with LanceDB - Minimal dependencies
 Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
 License: MIT
@@ -26,7 +26,6 @@ Requires-Dist: pydantic>=2.12.3
 Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.3
 Requires-Dist: rich>=14.2.0
-Requires-Dist: tiktoken>=0.12.0
 Requires-Dist: typer<0.20.0,>=0.19.2
 Requires-Dist: watchfiles>=1.1.1
 Provides-Extra: anthropic

haiku_rag_slim-0.17.1/haiku/rag/chunkers/__init__.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Document chunker abstraction for haiku.rag."""
+from haiku.rag.chunkers.base import DocumentChunker
+from haiku.rag.config import AppConfig, Config
+__all__ = ["DocumentChunker", "get_chunker"]
+def get_chunker(config: AppConfig = Config) -> DocumentChunker:
+    """Get a document chunker instance based on configuration.
+    Args:
+        config: Configuration to use. Defaults to global Config.
+    Returns:
+        DocumentChunker instance configured according to the config.
+    Raises:
+        ValueError: If the chunker provider is not recognized.
+    """
+    if config.processing.chunker == "docling-local":
+        from haiku.rag.chunkers.docling_local import DoclingLocalChunker
+        return DoclingLocalChunker(config)
+    if config.processing.chunker == "docling-serve":
+        from haiku.rag.chunkers.docling_serve import DoclingServeChunker
+        return DoclingServeChunker(config)
+    raise ValueError(f"Unsupported chunker: {config.processing.chunker}")

haiku_rag_slim-0.17.1/haiku/rag/chunkers/base.py ADDED Viewed

@@ -0,0 +1,28 @@
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DoclingDocument
+class DocumentChunker(ABC):
+    """Abstract base class for document chunkers.
+    Document chunkers split DoclingDocuments into smaller text chunks suitable
+    for embedding and retrieval, respecting document structure and semantic boundaries.
+    """
+    @abstractmethod
+    async def chunk(self, document: "DoclingDocument") -> list[str]:
+        """Split a document into chunks.
+        Args:
+            document: The DoclingDocument to chunk.
+        Returns:
+            List of text chunks with semantic boundaries preserved.
+        Raises:
+            ValueError: If chunking fails.
+        """
+        pass

haiku_rag_slim-0.17.1/haiku/rag/chunkers/docling_local.py ADDED Viewed

@@ -0,0 +1,110 @@
+from typing import TYPE_CHECKING
+from haiku.rag.chunkers.base import DocumentChunker
+from haiku.rag.config import AppConfig, Config
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DoclingDocument
+def _create_markdown_serializer_provider(use_markdown_tables: bool = True):
+    """Create a markdown serializer provider with configurable table rendering.
+    This function creates a custom serializer provider that extends ChunkingSerializerProvider
+    from docling-core. It's implemented as a factory function to avoid importing
+    docling-core at module level.
+    Args:
+        use_markdown_tables: If True, use MarkdownTableSerializer for rendering tables as
+            markdown. If False, use default TripletTableSerializer for narrative format.
+    """
+    from docling_core.transforms.chunker.hierarchical_chunker import (
+        ChunkingDocSerializer,
+        ChunkingSerializerProvider,
+    )
+    from docling_core.transforms.serializer.markdown import MarkdownTableSerializer
+    class MDTableSerializerProvider(ChunkingSerializerProvider):
+        """Serializer provider for markdown table output."""
+        def __init__(self, use_markdown_tables: bool = True):
+            self.use_markdown_tables = use_markdown_tables
+        def get_serializer(self, doc):
+            if self.use_markdown_tables:
+                return ChunkingDocSerializer(
+                    doc=doc,
+                    table_serializer=MarkdownTableSerializer(),
+                )
+            else:
+                # Use default ChunkingDocSerializer (TripletTableSerializer)
+                return ChunkingDocSerializer(doc=doc)
+    return MDTableSerializerProvider(use_markdown_tables=use_markdown_tables)
+class DoclingLocalChunker(DocumentChunker):
+    """Local document chunker using docling's chunkers.
+    Supports both hybrid (structure-aware) and hierarchical chunking strategies.
+    Chunking is performed locally using the HuggingFace tokenizer specified in
+    configuration.
+    Args:
+        config: Application configuration.
+    """
+    def __init__(self, config: AppConfig = Config):
+        from docling_core.transforms.chunker.hierarchical_chunker import (
+            HierarchicalChunker,
+        )
+        from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+        from docling_core.transforms.chunker.tokenizer.huggingface import (
+            HuggingFaceTokenizer,
+        )
+        from transformers import AutoTokenizer
+        self.config = config
+        self.chunk_size = config.processing.chunk_size
+        self.chunker_type = config.processing.chunker_type
+        self.tokenizer_name = config.processing.chunking_tokenizer
+        if self.chunker_type == "hybrid":
+            hf_tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)
+            tokenizer = HuggingFaceTokenizer(
+                tokenizer=hf_tokenizer, max_tokens=self.chunk_size
+            )
+            serializer_provider = _create_markdown_serializer_provider(
+                use_markdown_tables=config.processing.chunking_use_markdown_tables
+            )
+            self.chunker = HybridChunker(
+                tokenizer=tokenizer,
+                merge_peers=config.processing.chunking_merge_peers,
+                serializer_provider=serializer_provider,
+            )
+        elif self.chunker_type == "hierarchical":
+            serializer_provider = _create_markdown_serializer_provider(
+                use_markdown_tables=config.processing.chunking_use_markdown_tables
+            )
+            self.chunker = HierarchicalChunker(serializer_provider=serializer_provider)
+        else:
+            raise ValueError(
+                f"Unsupported chunker_type: {self.chunker_type}. "
+                "Must be 'hybrid' or 'hierarchical'."
+            )
+    async def chunk(self, document: "DoclingDocument") -> list[str]:
+        """Split the document into chunks using docling's structure-aware chunking.
+        Args:
+            document: The DoclingDocument to be split into chunks.
+        Returns:
+            A list of text chunks with semantic boundaries.
+        """
+        if document is None:
+            return []
+        # Chunk using docling's hybrid chunker
+        chunks = list(self.chunker.chunk(document))
+        return [self.chunker.contextualize(chunk) for chunk in chunks]

haiku_rag_slim-0.17.1/haiku/rag/chunkers/docling_serve.py ADDED Viewed

@@ -0,0 +1,111 @@
+from io import BytesIO
+from typing import TYPE_CHECKING
+import requests
+from haiku.rag.chunkers.base import DocumentChunker
+from haiku.rag.config import AppConfig, Config
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DoclingDocument
+class DoclingServeChunker(DocumentChunker):
+    """Remote document chunker using docling-serve API.
+    Sends DoclingDocument JSON to docling-serve for chunking. Supports both hybrid
+    and hierarchical chunking strategies via remote API.
+    Args:
+        config: Application configuration containing docling-serve settings.
+    """
+    def __init__(self, config: AppConfig = Config):
+        self.config = config
+        self.base_url = config.providers.docling_serve.base_url.rstrip("/")
+        self.api_key = config.providers.docling_serve.api_key
+        self.timeout = config.providers.docling_serve.timeout
+        self.chunker_type = config.processing.chunker_type
+    async def chunk(self, document: "DoclingDocument") -> list[str]:
+        """Split the document into chunks via docling-serve.
+        Exports the DoclingDocument to JSON and sends it to docling-serve's chunking
+        endpoint. The API will chunk the document and return the text chunks.
+        Args:
+            document: The DoclingDocument to be split into chunks.
+        Returns:
+            A list of text chunks with semantic boundaries.
+        Raises:
+            ValueError: If chunking fails or service is unavailable.
+        """
+        if document is None:
+            return []
+        try:
+            # Determine endpoint based on chunker_type
+            if self.chunker_type == "hierarchical":
+                url = f"{self.base_url}/v1/chunk/hierarchical/file"
+            else:
+                url = f"{self.base_url}/v1/chunk/hybrid/file"
+            # Export document to JSON
+            doc_json = document.model_dump_json()
+            doc_bytes = doc_json.encode("utf-8")
+            # Prepare multipart request with DoclingDocument JSON
+            files = {"files": ("document.json", BytesIO(doc_bytes), "application/json")}
+            # Build form data with chunking parameters
+            data = {
+                "chunking_max_tokens": str(self.config.processing.chunk_size),
+                "chunking_tokenizer": self.config.processing.chunking_tokenizer,
+                "chunking_merge_peers": str(
+                    self.config.processing.chunking_merge_peers
+                ).lower(),
+                "chunking_use_markdown_tables": str(
+                    self.config.processing.chunking_use_markdown_tables
+                ).lower(),
+            }
+            headers = {}
+            if self.api_key:
+                headers["X-Api-Key"] = self.api_key
+            response = requests.post(
+                url,
+                files=files,
+                data=data,
+                headers=headers,
+                timeout=self.timeout,
+            )
+            response.raise_for_status()
+            result = response.json()
+            # Extract text from chunks
+            chunks = result.get("chunks", [])
+            return [chunk["text"] for chunk in chunks]
+        except requests.exceptions.ConnectionError as e:
+            raise ValueError(
+                f"Could not connect to docling-serve at {self.base_url}. "
+                f"Ensure the service is running and accessible. Error: {e}"
+            )
+        except requests.exceptions.Timeout as e:
+            raise ValueError(
+                f"Request to docling-serve timed out after {self.timeout}s. "
+                f"Consider increasing the timeout in configuration. Error: {e}"
+            )
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 401:
+                raise ValueError(
+                    "Authentication failed. Check your API key configuration."
+                )
+            raise ValueError(f"HTTP error from docling-serve: {e}")
+        except Exception as e:
+            raise ValueError(f"Failed to chunk via docling-serve: {e}")

{haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/client.py RENAMED Viewed

@@ -9,6 +9,7 @@ from urllib.parse import urlparse
 import httpx
 from haiku.rag.config import AppConfig, Config
+from haiku.rag.converters import get_converter
 from haiku.rag.reranking import get_reranker
 from haiku.rag.store.engine import Store
 from haiku.rag.store.models.chunk import Chunk
@@ -111,10 +112,9 @@ class HaikuRAG:
         # Only create docling_document if we need to generate chunks
         if chunks is None:
-            # Lazy import to avoid loading docling
-            from haiku.rag.utils import text_to_docling_document
-            docling_document = text_to_docling_document(content)
+            # Use converter to convert text
+            converter = get_converter(self._config)
+            docling_document = converter.convert_text(content)
         else:
             # Chunks already provided, no conversion needed
             docling_document = None
@@ -201,12 +201,10 @@ class HaikuRAG:
         Raises:
             ValueError: If the file cannot be parsed or doesn't exist
         """
-        # Lazy import to avoid loading docling
-        from haiku.rag.reader import FileReader
         metadata = metadata or {}
-        if source_path.suffix.lower() not in FileReader.extensions:
+        converter = get_converter(self._config)
+        if source_path.suffix.lower() not in converter.supported_extensions:
             raise ValueError(f"Unsupported file extension: {source_path.suffix}")
         if not source_path.exists():
@@ -242,7 +240,8 @@ class HaikuRAG:
             return existing_doc
         # Parse file only when content changed or new document
-        docling_document = FileReader.parse_file(source_path)
+        converter = get_converter(self._config)
+        docling_document = converter.convert_file(source_path)
         if existing_doc:
             # Update existing document
@@ -283,11 +282,11 @@ class HaikuRAG:
             ValueError: If the content cannot be parsed
             httpx.RequestError: If URL request fails
         """
-        # Lazy import to avoid loading docling
-        from haiku.rag.reader import FileReader
         metadata = metadata or {}
+        converter = get_converter(self._config)
+        supported_extensions = converter.supported_extensions
         async with httpx.AsyncClient() as client:
             response = await client.get(url)
             response.raise_for_status()
@@ -320,7 +319,7 @@ class HaikuRAG:
                 url, content_type
             )
-            if file_extension not in FileReader.extensions:
+            if file_extension not in supported_extensions:
                 raise ValueError(
                     f"Unsupported content type/extension: {content_type}/{file_extension}"
                 )
@@ -333,8 +332,8 @@ class HaikuRAG:
                 temp_file.flush()  # Ensure content is written to disk
                 temp_path = Path(temp_file.name)
-                # Parse the content using FileReader
-                docling_document = FileReader.parse_file(temp_path)
+                # Parse the content using converter
+                docling_document = converter.convert_file(temp_path)
             # Merge metadata with contentType and md5
             metadata.update({"contentType": content_type, "md5": md5_hash})
@@ -410,11 +409,9 @@ class HaikuRAG:
     async def update_document(self, document: Document) -> Document:
         """Update an existing document."""
-        # Lazy import to avoid loading docling
-        from haiku.rag.utils import text_to_docling_document
         # Convert content to DoclingDocument
-        docling_document = text_to_docling_document(document.content)
+        converter = get_converter(self._config)
+        docling_document = converter.convert_text(document.content)
         return await self.document_repository._update_and_rechunk(
             document, docling_document
@@ -469,8 +466,8 @@ class HaikuRAG:
             # No reranking - return direct search results
             return await self.chunk_repository.search(query, limit, search_type, filter)
-        # Get more initial results (3X) for reranking
-        search_limit = limit * 3
+        # Get more initial results (10X) for reranking
+        search_limit = limit * 10
         search_results = await self.chunk_repository.search(
             query, search_limit, search_type, filter
         )
@@ -646,12 +643,11 @@ class HaikuRAG:
         Yields:
             int: The ID of the document currently being processed
         """
-        # Lazy import to avoid loading docling
-        from haiku.rag.utils import text_to_docling_document
         await self.chunk_repository.delete_all()
         self.store.recreate_embeddings_table()
+        converter = get_converter(self._config)
         # Update settings to current config
         settings_repo = SettingsRepository(self.store)
         settings_repo.save_current_settings()
@@ -703,14 +699,14 @@ class HaikuRAG:
                     logger.warning(
                         "Source missing for %s, re-embedding from content", doc.uri
                     )
-                    docling_document = text_to_docling_document(doc.content)
+                    docling_document = converter.convert_text(doc.content)
                     await self.chunk_repository.create_chunks_for_document(
                         doc.id, docling_document
                     )
                     yield doc.id
             else:
                 # Document without URI - re-create chunks from existing content
-                docling_document = text_to_docling_document(doc.content)
+                docling_document = converter.convert_text(doc.content)
                 await self.chunk_repository.create_chunks_for_document(
                     doc.id, docling_document
                 )

{haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/config/__init__.py RENAMED Viewed

@@ -8,6 +8,7 @@ from haiku.rag.config.loader import (
 from haiku.rag.config.models import (
     AGUIConfig,
     AppConfig,
+    ConversionOptions,
     EmbeddingsConfig,
     LanceDBConfig,
     MonitorConfig,
@@ -25,6 +26,7 @@ __all__ = [
     "Config",
     "AGUIConfig",
     "AppConfig",
+    "ConversionOptions",
     "StorageConfig",
     "MonitorConfig",
     "LanceDBConfig",

{haiku_rag_slim-0.16.1 → haiku_rag_slim-0.17.1}/haiku/rag/config/models.py RENAMED Viewed

@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import Literal
 from pydantic import BaseModel, Field
@@ -50,10 +51,34 @@ class ResearchConfig(BaseModel):
     max_concurrency: int = 1
+class ConversionOptions(BaseModel):
+    """Options for document conversion."""
+    # Stored on ProcessingConfig.conversion_options. The code that consumes these
+    # values is not visible here.
+    # NOTE(review): presumably forwarded to the docling converter backends — confirm.
+    # OCR options
+    do_ocr: bool = True
+    force_ocr: bool = False
+    # Empty by default.
+    # NOTE(review): presumably "let the OCR engine choose its languages" — confirm.
+    ocr_lang: list[str] = []
+    # Table options
+    do_table_structure: bool = True
+    # Restricted by the Literal type to "fast" or "accurate".
+    table_mode: Literal["fast", "accurate"] = "accurate"
+    table_cell_matching: bool = True
+    # Image options
+    images_scale: float = 2.0
 class ProcessingConfig(BaseModel):
+    # Settings controlling how documents are converted and chunked.
+    # Token limit per chunk: used as max_tokens for the local hybrid chunker's
+    # tokenizer and sent as "chunking_max_tokens" to docling-serve.
     chunk_size: int = 256
     context_chunk_radius: int = 0
     markdown_preprocessor: str = ""
+    # Converter backend name; get_converter() accepts "docling-local" or
+    # "docling-serve" and raises ValueError for anything else.
+    converter: str = "docling-local"
+    # Chunker backend name; get_chunker() accepts the same two values.
+    chunker: str = "docling-local"
+    # Chunking strategy: "hybrid" or "hierarchical". The local chunker rejects other
+    # values; the docling-serve chunker treats any non-"hierarchical" value as hybrid.
+    chunker_type: str = "hybrid"
+    # HuggingFace tokenizer name: loaded with AutoTokenizer.from_pretrained() by the
+    # local hybrid chunker and sent as "chunking_tokenizer" to docling-serve.
+    chunking_tokenizer: str = "Qwen/Qwen3-Embedding-0.6B"
+    # Passed as merge_peers to the local HybridChunker and as a form field to docling-serve.
+    chunking_merge_peers: bool = True
+    # True: tables are serialized with MarkdownTableSerializer; False: the
+    # chunking serializer's default table serializer is used.
+    chunking_use_markdown_tables: bool = False
+    # A fresh ConversionOptions with default values is created when not supplied.
+    conversion_options: ConversionOptions = Field(default_factory=ConversionOptions)
 class OllamaConfig(BaseModel):
@@ -71,9 +96,16 @@ class VLLMConfig(BaseModel):
     research_base_url: str = ""
+class DoclingServeConfig(BaseModel):
+    # Connection settings for a docling-serve instance.
+    # Root URL of the service; the chunker strips trailing slashes before appending paths.
+    base_url: str = "http://localhost:5001"
+    # Sent as the "X-Api-Key" header when non-empty; no header is sent when empty.
+    api_key: str = ""
+    # Request timeout passed to the HTTP call; reported in seconds in error messages.
+    timeout: int = 300
 class ProvidersConfig(BaseModel):
+    # Per-provider settings. Each section is built from its model's defaults
+    # when it is not supplied.
     ollama: OllamaConfig = Field(default_factory=OllamaConfig)
     vllm: VLLMConfig = Field(default_factory=VLLMConfig)
+    # docling-serve connection settings, read by the docling-serve chunker.
+    docling_serve: DoclingServeConfig = Field(default_factory=DoclingServeConfig)
 class AGUIConfig(BaseModel):

haiku_rag_slim-0.17.1/haiku/rag/converters/__init__.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Document converter abstraction for haiku.rag."""
+from haiku.rag.config import AppConfig, Config
+from haiku.rag.converters.base import DocumentConverter
+__all__ = ["DocumentConverter", "get_converter"]
+def get_converter(config: AppConfig = Config) -> DocumentConverter:
+    """Get a document converter instance based on configuration.
+    Args:
+        config: Configuration to use. Defaults to global Config.
+    Returns:
+        DocumentConverter instance configured according to the config.
+    Raises:
+        ValueError: If the converter provider is not recognized.
+    """
+    if config.processing.converter == "docling-local":
+        from haiku.rag.converters.docling_local import DoclingLocalConverter
+        return DoclingLocalConverter(config)
+    if config.processing.converter == "docling-serve":
+        from haiku.rag.converters.docling_serve import DoclingServeConverter
+        return DoclingServeConverter(config)
+    raise ValueError(f"Unsupported converter provider: {config.processing.converter}")

haiku_rag_slim-0.17.1/haiku/rag/converters/base.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""Base class for document converters."""
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DoclingDocument
+class DocumentConverter(ABC):
+    """Abstract base class for document converters.
+    Document converters are responsible for converting various document formats
+    (PDF, DOCX, HTML, etc.) into DoclingDocument format for further processing.
+    """
+    @property
+    @abstractmethod
+    def supported_extensions(self) -> list[str]:
+        """Return list of file extensions supported by this converter.
+        Returns:
+            List of file extensions (including the dot, e.g., [".pdf", ".docx"]).
+        """
+        pass
+    @abstractmethod
+    def convert_file(self, path: Path) -> "DoclingDocument":
+        """Convert a file to DoclingDocument format.
+        Args:
+            path: Path to the file to convert.
+        Returns:
+            DoclingDocument representation of the file.
+        Raises:
+            ValueError: If the file cannot be converted.
+        """
+        pass
+    @abstractmethod
+    def convert_text(self, text: str, name: str = "content.md") -> "DoclingDocument":
+        """Convert text content to DoclingDocument format.
+        Args:
+            text: The text content to convert.
+            name: The name to use for the document (defaults to "content.md").
+        Returns:
+            DoclingDocument representation of the text.
+        Raises:
+            ValueError: If the text cannot be converted.
+        """
+        pass

haiku.rag-slim 0.16.1__tar.gz → 0.17.1__tar.gz

Potentially problematic release.

haiku.rag-slim 0.16.1.tar.gz → 0.17.1.tar.gz