haiku.rag-slim 0.16.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of haiku.rag-slim has been flagged as possibly problematic.
- haiku/rag/app.py +430 -72
- haiku/rag/chunkers/__init__.py +31 -0
- haiku/rag/chunkers/base.py +31 -0
- haiku/rag/chunkers/docling_local.py +164 -0
- haiku/rag/chunkers/docling_serve.py +179 -0
- haiku/rag/cli.py +207 -24
- haiku/rag/cli_chat.py +489 -0
- haiku/rag/client.py +1251 -266
- haiku/rag/config/__init__.py +16 -10
- haiku/rag/config/loader.py +5 -44
- haiku/rag/config/models.py +126 -17
- haiku/rag/converters/__init__.py +31 -0
- haiku/rag/converters/base.py +63 -0
- haiku/rag/converters/docling_local.py +193 -0
- haiku/rag/converters/docling_serve.py +229 -0
- haiku/rag/converters/text_utils.py +237 -0
- haiku/rag/embeddings/__init__.py +123 -24
- haiku/rag/embeddings/voyageai.py +175 -20
- haiku/rag/graph/__init__.py +0 -11
- haiku/rag/graph/agui/__init__.py +8 -2
- haiku/rag/graph/agui/cli_renderer.py +1 -1
- haiku/rag/graph/agui/emitter.py +219 -31
- haiku/rag/graph/agui/server.py +20 -62
- haiku/rag/graph/agui/stream.py +1 -2
- haiku/rag/graph/research/__init__.py +5 -2
- haiku/rag/graph/research/dependencies.py +12 -126
- haiku/rag/graph/research/graph.py +390 -135
- haiku/rag/graph/research/models.py +91 -112
- haiku/rag/graph/research/prompts.py +99 -91
- haiku/rag/graph/research/state.py +35 -27
- haiku/rag/inspector/__init__.py +8 -0
- haiku/rag/inspector/app.py +259 -0
- haiku/rag/inspector/widgets/__init__.py +6 -0
- haiku/rag/inspector/widgets/chunk_list.py +100 -0
- haiku/rag/inspector/widgets/context_modal.py +89 -0
- haiku/rag/inspector/widgets/detail_view.py +130 -0
- haiku/rag/inspector/widgets/document_list.py +75 -0
- haiku/rag/inspector/widgets/info_modal.py +209 -0
- haiku/rag/inspector/widgets/search_modal.py +183 -0
- haiku/rag/inspector/widgets/visual_modal.py +126 -0
- haiku/rag/mcp.py +106 -102
- haiku/rag/monitor.py +33 -9
- haiku/rag/providers/__init__.py +5 -0
- haiku/rag/providers/docling_serve.py +108 -0
- haiku/rag/qa/__init__.py +12 -10
- haiku/rag/qa/agent.py +43 -61
- haiku/rag/qa/prompts.py +35 -57
- haiku/rag/reranking/__init__.py +9 -6
- haiku/rag/reranking/base.py +1 -1
- haiku/rag/reranking/cohere.py +5 -4
- haiku/rag/reranking/mxbai.py +5 -2
- haiku/rag/reranking/vllm.py +3 -4
- haiku/rag/reranking/zeroentropy.py +6 -5
- haiku/rag/store/__init__.py +2 -1
- haiku/rag/store/engine.py +242 -42
- haiku/rag/store/exceptions.py +4 -0
- haiku/rag/store/models/__init__.py +8 -2
- haiku/rag/store/models/chunk.py +190 -0
- haiku/rag/store/models/document.py +46 -0
- haiku/rag/store/repositories/chunk.py +141 -121
- haiku/rag/store/repositories/document.py +25 -84
- haiku/rag/store/repositories/settings.py +11 -14
- haiku/rag/store/upgrades/__init__.py +19 -3
- haiku/rag/store/upgrades/v0_10_1.py +1 -1
- haiku/rag/store/upgrades/v0_19_6.py +65 -0
- haiku/rag/store/upgrades/v0_20_0.py +68 -0
- haiku/rag/store/upgrades/v0_23_1.py +100 -0
- haiku/rag/store/upgrades/v0_9_3.py +3 -3
- haiku/rag/utils.py +371 -146
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/METADATA +15 -12
- haiku_rag_slim-0.24.0.dist-info/RECORD +78 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/WHEEL +1 -1
- haiku/rag/chunker.py +0 -65
- haiku/rag/embeddings/base.py +0 -25
- haiku/rag/embeddings/ollama.py +0 -28
- haiku/rag/embeddings/openai.py +0 -26
- haiku/rag/embeddings/vllm.py +0 -29
- haiku/rag/graph/agui/events.py +0 -254
- haiku/rag/graph/common/__init__.py +0 -5
- haiku/rag/graph/common/models.py +0 -42
- haiku/rag/graph/common/nodes.py +0 -265
- haiku/rag/graph/common/prompts.py +0 -46
- haiku/rag/graph/common/utils.py +0 -44
- haiku/rag/graph/deep_qa/__init__.py +0 -1
- haiku/rag/graph/deep_qa/dependencies.py +0 -27
- haiku/rag/graph/deep_qa/graph.py +0 -243
- haiku/rag/graph/deep_qa/models.py +0 -20
- haiku/rag/graph/deep_qa/prompts.py +0 -59
- haiku/rag/graph/deep_qa/state.py +0 -56
- haiku/rag/graph/research/common.py +0 -87
- haiku/rag/reader.py +0 -135
- haiku_rag_slim-0.16.0.dist-info/RECORD +0 -71
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/entry_points.txt +0 -0
- {haiku_rag_slim-0.16.0.dist-info → haiku_rag_slim-0.24.0.dist-info}/licenses/LICENSE +0 -0
haiku/rag/store/models/__init__.py CHANGED
@@ -1,4 +1,10 @@
-from .chunk import Chunk
+from .chunk import BoundingBox, Chunk, ChunkMetadata, SearchResult
 from .document import Document
 
-__all__ = [
+__all__ = [
+    "BoundingBox",
+    "Chunk",
+    "ChunkMetadata",
+    "Document",
+    "SearchResult",
+]
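With the expanded `__all__`, the new provenance-aware types can be imported straight from the models package; a trivial sketch:

```python
# The models package now re-exports the new chunk/document types shown above.
from haiku.rag.store.models import (
    BoundingBox,
    Chunk,
    ChunkMetadata,
    Document,
    SearchResult,
)
```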
haiku/rag/store/models/chunk.py CHANGED
@@ -1,5 +1,92 @@
+from typing import TYPE_CHECKING
+
 from pydantic import BaseModel
 
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DocItem, DoclingDocument
+
+
+class BoundingBox(BaseModel):
+    """Bounding box coordinates for visual grounding."""
+
+    page_no: int
+    left: float
+    top: float
+    right: float
+    bottom: float
+
+
+class ChunkMetadata(BaseModel):
+    """
+    Structured metadata for a chunk, including DoclingDocument references.
+
+    Attributes:
+        doc_item_refs: JSON pointer references to DocItems in the parent DoclingDocument
+            (e.g., ["#/texts/5", "#/texts/6", "#/tables/0"])
+        headings: Section heading hierarchy for this chunk
+            (e.g., ["Chapter 1", "Section 1.1"])
+        labels: Semantic labels for each doc_item (e.g., ["paragraph", "table"])
+        page_numbers: Page numbers where the chunk content appears
+    """
+
+    doc_item_refs: list[str] = []
+    headings: list[str] | None = None
+    labels: list[str] = []
+    page_numbers: list[int] = []
+
+    def resolve_doc_items(self, docling_document: "DoclingDocument") -> list["DocItem"]:
+        """Resolve doc_item_refs to actual DocItem objects.
+
+        Args:
+            docling_document: The parent DoclingDocument containing the items.
+
+        Returns:
+            List of resolved DocItem objects. Items that fail to resolve are skipped.
+        """
+        from docling_core.types.doc.document import RefItem
+
+        doc_items = []
+        for ref in self.doc_item_refs:
+            try:
+                ref_item = RefItem.model_validate({"$ref": ref})
+                doc_item = ref_item.resolve(docling_document)
+                doc_items.append(doc_item)
+            except Exception:
+                # Graceful degradation: skip refs that can't be resolved
+                continue
+        return doc_items
+
+    def resolve_bounding_boxes(
+        self, docling_document: "DoclingDocument"
+    ) -> list[BoundingBox]:
+        """Resolve doc_item_refs to bounding boxes for visual grounding.
+
+        Args:
+            docling_document: The parent DoclingDocument containing the items.
+
+        Returns:
+            List of BoundingBox objects from resolved DocItems' provenance.
+        """
+        bounding_boxes = []
+        for doc_item in self.resolve_doc_items(docling_document):
+            prov = getattr(doc_item, "prov", None)
+            if not prov:
+                continue
+            for prov_item in prov:
+                bbox = getattr(prov_item, "bbox", None)
+                if bbox is None:
+                    continue
+                bounding_boxes.append(
+                    BoundingBox(
+                        page_no=prov_item.page_no,
+                        left=bbox.l,
+                        top=bbox.t,
+                        right=bbox.r,
+                        bottom=bbox.b,
+                    )
+                )
+        return bounding_boxes
+
 
 class Chunk(BaseModel):
     """
@@ -15,3 +102,106 @@ class Chunk(BaseModel):
     document_title: str | None = None
     document_meta: dict = {}
     embedding: list[float] | None = None
+
+    def get_chunk_metadata(self) -> ChunkMetadata:
+        """Parse metadata dict into structured ChunkMetadata."""
+        return ChunkMetadata.model_validate(self.metadata)
+
+
+class SearchResult(BaseModel):
+    """Search result with optional provenance information for citations."""
+
+    content: str
+    score: float
+    chunk_id: str | None = None
+    document_id: str | None = None
+    document_uri: str | None = None
+    document_title: str | None = None
+    doc_item_refs: list[str] = []
+    page_numbers: list[int] = []
+    headings: list[str] | None = None
+    labels: list[str] = []
+
+    @classmethod
+    def from_chunk(
+        cls,
+        chunk: "Chunk",
+        score: float,
+    ) -> "SearchResult":
+        """Create from a Chunk."""
+        meta = chunk.get_chunk_metadata()
+        return cls(
+            content=chunk.content,
+            score=score,
+            chunk_id=chunk.id,
+            document_id=chunk.document_id,
+            document_uri=chunk.document_uri,
+            document_title=chunk.document_title,
+            doc_item_refs=meta.doc_item_refs,
+            page_numbers=meta.page_numbers,
+            headings=meta.headings,
+            labels=meta.labels,
+        )
+
+    def format_for_agent(self) -> str:
+        """Format this search result for inclusion in agent context.
+
+        Produces a structured format with metadata that helps LLMs understand
+        the source and nature of the content.
+        """
+        parts = [f"[{self.chunk_id}] (score: {self.score:.2f})"]
+
+        # Document source info
+        source_parts = []
+        if self.document_title:
+            source_parts.append(f'"{self.document_title}"')
+        if self.headings:
+            source_parts.append(" > ".join(self.headings))
+        if source_parts:
+            parts.append(f"Source: {' > '.join(source_parts)}")
+
+        # Content type (use primary label if available)
+        if self.labels:
+            primary_label = self._get_primary_label()
+            if primary_label:
+                parts.append(f"Type: {primary_label}")
+
+        # The actual content
+        parts.append(f"Content:\n{self.content}")
+
+        return "\n".join(parts)
+
+    def _get_primary_label(self) -> str | None:
+        """Get the most significant label for display.
+
+        Prioritizes structural labels over text labels.
+        """
+        if not self.labels:
+            return None
+
+        # Priority order: structural > contextual > text
+        priority = {
+            "table": 1,
+            "code": 2,
+            "form": 3,
+            "key_value_region": 4,
+            "list_item": 5,
+            "formula": 6,
+            "chart": 7,
+            "picture": 8,
+            "caption": 9,
+            "footnote": 10,
+            "section_header": 11,
+            "title": 12,
+        }
+
+        # Find highest priority label
+        best_label = None
+        best_priority = float("inf")
+        for label in self.labels:
+            if label in priority and priority[label] < best_priority:
+                best_label = label
+                best_priority = priority[label]
+
+        # Return best structural/special label, or first label if all are text
+        return best_label if best_label else self.labels[0]
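The hunk above gives `Chunk` a structured-metadata accessor and adds `SearchResult`, which packages chunk provenance for agent prompts. A minimal, hypothetical usage sketch (the field values are invented, and it assumes `Chunk` accepts the keyword arguments the repository code below uses when constructing it):

```python
# Hypothetical example of the new ChunkMetadata / SearchResult flow (values invented).
from haiku.rag.store.models.chunk import Chunk, SearchResult

chunk = Chunk(
    document_id="doc-123",
    content="Revenue grew 12% year over year.",
    order=0,
    metadata={
        "doc_item_refs": ["#/texts/5", "#/tables/0"],
        "headings": ["Annual Report", "Financials"],
        "labels": ["paragraph", "table"],
        "page_numbers": [3],
    },
    document_title="ACME 2024 Annual Report",
)

meta = chunk.get_chunk_metadata()  # raw metadata dict -> structured ChunkMetadata
assert meta.page_numbers == [3]

result = SearchResult.from_chunk(chunk, score=0.87)
print(result.format_for_agent())
# [None] (score: 0.87)              <- chunk_id is unset in this sketch
# Source: "ACME 2024 Annual Report" > Annual Report > Financials
# Type: table
# Content:
# Revenue grew 12% year over year.
```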
haiku/rag/store/models/document.py CHANGED
@@ -1,7 +1,32 @@
 from datetime import datetime
+from typing import TYPE_CHECKING
 
+from cachetools import LRUCache
 from pydantic import BaseModel, Field
 
+if TYPE_CHECKING:
+    from docling_core.types.doc.document import DoclingDocument
+
+
+_docling_document_cache: LRUCache[str, "DoclingDocument"] = LRUCache(maxsize=100)
+
+
+def _get_cached_docling_document(document_id: str, json_str: str) -> "DoclingDocument":
+    """Get or parse DoclingDocument with LRU caching by document ID."""
+    if document_id in _docling_document_cache:
+        return _docling_document_cache[document_id]
+
+    from docling_core.types.doc.document import DoclingDocument
+
+    doc = DoclingDocument.model_validate_json(json_str)
+    _docling_document_cache[document_id] = doc
+    return doc
+
+
+def invalidate_docling_document_cache(document_id: str) -> None:
+    """Remove a document from the DoclingDocument cache."""
+    _docling_document_cache.pop(document_id, None)
+
 
 class Document(BaseModel):
     """
@@ -13,5 +38,26 @@ class Document(BaseModel):
     uri: str | None = None
     title: str | None = None
     metadata: dict = {}
+    docling_document_json: str | None = None
+    docling_version: str | None = None
     created_at: datetime = Field(default_factory=datetime.now)
     updated_at: datetime = Field(default_factory=datetime.now)
+
+    def get_docling_document(self) -> "DoclingDocument | None":
+        """Parse and return the stored DoclingDocument.
+
+        Uses LRU cache (keyed by document ID) to avoid repeated parsing.
+
+        Returns:
+            The parsed DoclingDocument, or None if not stored or no ID.
+        """
+        if self.docling_document_json is None:
+            return None
+
+        # No caching for documents without ID
+        if self.id is None:
+            from docling_core.types.doc.document import DoclingDocument
+
+            return DoclingDocument.model_validate_json(self.docling_document_json)
+
+        return _get_cached_docling_document(self.id, self.docling_document_json)
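The document model now stores the serialized DoclingDocument and, for documents with an ID, parses it at most once per process. A short illustration-only sketch of that behaviour (the helper function name is invented):

```python
from haiku.rag.store.models.document import (
    Document,
    invalidate_docling_document_cache,
)


def demo_docling_cache(doc: Document) -> None:
    """Illustration only: exercise the parse-once cache added above."""
    parsed = doc.get_docling_document()  # parses docling_document_json on first use
    again = doc.get_docling_document()   # for documents with an id, served from the cache

    if doc.id is not None and parsed is not None:
        assert parsed is again  # same object, parsed only once

        # After the stored JSON is replaced (e.g. the document is re-converted),
        # the stale cached parse has to be dropped explicitly:
        invalidate_docling_document_cache(doc.id)
```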
haiku/rag/store/repositories/chunk.py CHANGED
@@ -1,21 +1,20 @@
-import inspect
 import json
 import logging
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 from uuid import uuid4
 
 if TYPE_CHECKING:
     import pandas as pd
-    from lancedb.query import
+    from lancedb.query import (
+        LanceHybridQueryBuilder,
+        LanceQueryBuilder,
+        LanceVectorQueryBuilder,
+    )
 
 from lancedb.rerankers import RRFReranker
 
 from haiku.rag.store.engine import DocumentRecord, Store
 from haiku.rag.store.models.chunk import Chunk
-from haiku.rag.utils import load_callable
-
-if TYPE_CHECKING:
-    from docling_core.types.doc.document import DoclingDocument
 
 logger = logging.getLogger(__name__)
 
@@ -28,43 +27,87 @@ class ChunkRepository:
         self.embedder = store.embedder
 
     def _ensure_fts_index(self) -> None:
-        """Ensure FTS index exists on the
+        """Ensure FTS index exists on the content_fts column."""
         try:
             self.store.chunks_table.create_fts_index(
-                "
+                "content_fts", replace=True, with_position=True, remove_stop_words=False
             )
         except Exception as e:
             # Log the error but don't fail - FTS might already exist
             logger.debug(f"FTS index creation skipped: {e}")
 
-
-    """
-
+    def _contextualize_content(self, chunk: Chunk) -> str:
+        """Generate contextualized content for FTS by prepending headings."""
+        meta = chunk.get_chunk_metadata()
+        if meta.headings:
+            return "\n".join(meta.headings) + "\n" + chunk.content
+        return chunk.content
 
-
+    async def create(self, entity: Chunk | list[Chunk]) -> Chunk | list[Chunk]:
+        """Create one or more chunks in the database.
 
-
-
-
-
-
-
-
-
-            id=chunk_id,
-            document_id=entity.document_id,
-            content=entity.content,
-            metadata=json.dumps(
-                {k: v for k, v in entity.metadata.items() if k != "order"}
-            ),
-            order=order_val,
-            vector=embedding,
-        )
+        Chunks must have embeddings set before calling this method.
+        Use client._ensure_chunks_embedded() to embed chunks if needed.
+        """
+        self.store._assert_writable()
+        # Handle single chunk
+        if isinstance(entity, Chunk):
+            assert entity.document_id, "Chunk must have a document_id to be created"
+            assert entity.embedding is not None, "Chunk must have an embedding"
 
-
+            chunk_id = str(uuid4())
 
-
-
+            chunk_record = self.store.ChunkRecord(
+                id=chunk_id,
+                document_id=entity.document_id,
+                content=entity.content,
+                content_fts=self._contextualize_content(entity),
+                metadata=json.dumps(
+                    {k: v for k, v in entity.metadata.items() if k != "order"}
+                ),
+                order=int(entity.order),
+                vector=entity.embedding,
+            )
+
+            self.store.chunks_table.add([chunk_record])
+
+            entity.id = chunk_id
+            return entity
+
+        # Handle batch of chunks
+        chunks = entity
+        if not chunks:
+            return []
+
+        # Validate all chunks have document_id and embedding
+        for chunk in chunks:
+            assert chunk.document_id, "All chunks must have a document_id to be created"
+            assert chunk.embedding is not None, "All chunks must have embeddings"
+
+        # Prepare all chunk records
+        chunk_records = []
+        for chunk in chunks:
+            chunk_id = str(uuid4())
+
+            assert chunk.document_id is not None
+            chunk_record = self.store.ChunkRecord(
+                id=chunk_id,
+                document_id=chunk.document_id,
+                content=chunk.content,
+                content_fts=self._contextualize_content(chunk),
+                metadata=json.dumps(
+                    {k: v for k, v in chunk.metadata.items() if k != "order"}
+                ),
+                order=int(chunk.order),
+                vector=chunk.embedding,
+            )
+            chunk_records.append(chunk_record)
+            chunk.id = chunk_id
+
+        # Single batch insert for all chunks
+        self.store.chunks_table.add(chunk_records)
+
+        return chunks
 
     async def get_by_id(self, entity_id: str) -> Chunk | None:
         """Get a chunk by its ID."""
@@ -89,28 +132,32 @@ class ChunkRepository:
         )
 
     async def update(self, entity: Chunk) -> Chunk:
-        """Update an existing chunk.
-        assert entity.id, "Chunk ID is required for update"
+        """Update an existing chunk.
 
-        embedding
-
+        Chunk must have embedding set before calling this method.
+        """
+        self.store._assert_writable()
+        assert entity.id, "Chunk ID is required for update"
+        assert entity.embedding is not None, "Chunk must have an embedding"
 
         self.store.chunks_table.update(
             where=f"id = '{entity.id}'",
             values={
                 "document_id": entity.document_id,
                 "content": entity.content,
+                "content_fts": self._contextualize_content(entity),
                 "metadata": json.dumps(
                     {k: v for k, v in entity.metadata.items() if k != "order"}
                 ),
-                "order":
-                "vector": embedding,
+                "order": int(entity.order),
+                "vector": entity.embedding,
             },
         )
         return entity
 
     async def delete(self, entity_id: str) -> bool:
         """Delete a chunk by its ID."""
+        self.store._assert_writable()
         chunk = await self.get_by_id(entity_id)
         if chunk is None:
             return False
@@ -145,86 +192,22 @@ class ChunkRepository:
         )
         return chunks
 
-    async def create_chunks_for_document(
-        self, document_id: str, document: "DoclingDocument"
-    ) -> list[Chunk]:
-        """Create chunks and embeddings for a document from DoclingDocument."""
-        # Lazy imports to avoid loading docling during module import
-        from haiku.rag.chunker import chunker
-        from haiku.rag.utils import text_to_docling_document
-
-        # Optionally preprocess markdown before chunking
-        processed_document = document
-        preprocessor_path = self.store._config.processing.markdown_preprocessor
-        if preprocessor_path:
-            try:
-                pre_fn = load_callable(preprocessor_path)
-                markdown = document.export_to_markdown()
-                result = pre_fn(markdown)
-                if inspect.isawaitable(result):
-                    result = await result  # type: ignore[assignment]
-                processed_markdown = result
-                if not isinstance(processed_markdown, str):
-                    raise ValueError("Preprocessor must return a markdown string")
-                processed_document = text_to_docling_document(
-                    processed_markdown, name="content.md"
-                )
-            except Exception as e:
-                logger.error(
-                    f"Failed to apply MARKDOWN_PREPROCESSOR '{preprocessor_path}': {e}. Proceeding without preprocessing."
-                )
-                raise e
-
-        chunk_texts = await chunker.chunk(processed_document)
-
-        embeddings = await self.embedder.embed(chunk_texts)
-
-        # Prepare all chunk records for batch insertion
-        chunk_records = []
-        created_chunks = []
-
-        for order, (chunk_text, embedding) in enumerate(zip(chunk_texts, embeddings)):
-            chunk_id = str(uuid4())
-
-            chunk_record = self.store.ChunkRecord(
-                id=chunk_id,
-                document_id=document_id,
-                content=chunk_text,
-                metadata=json.dumps({}),
-                order=order,
-                vector=embedding,
-            )
-            chunk_records.append(chunk_record)
-
-            chunk = Chunk(
-                id=chunk_id,
-                document_id=document_id,
-                content=chunk_text,
-                metadata={},
-                order=order,
-            )
-            created_chunks.append(chunk)
-
-        # Batch insert all chunks at once
-        if chunk_records:
-            self.store.chunks_table.add(chunk_records)
-
-        return created_chunks
-
     async def delete_all(self) -> None:
         """Delete all chunks from the database."""
+        self.store._assert_writable()
         # Drop and recreate table to clear all data
         self.store.db.drop_table("chunks")
         self.store.chunks_table = self.store.db.create_table(
             "chunks", schema=self.store.ChunkRecord
         )
-        # Create FTS index on
+        # Create FTS index on content_fts (contextualized content) for better search
        self.store.chunks_table.create_fts_index(
-            "
+            "content_fts", replace=True, with_position=True, remove_stop_words=False
         )
 
     async def delete_by_document_id(self, document_id: str) -> bool:
         """Delete all chunks for a document."""
+        self.store._assert_writable()
         chunks = await self.get_by_document_id(document_id)
 
         if not chunks:
@@ -272,25 +255,34 @@ class ChunkRepository:
 
         # Prepare search query based on search type
         if search_type == "vector":
-            query_embedding = await self.embedder.
-
-
+            query_embedding = await self.embedder.embed_query(query)
+            vector_query = cast(
+                "LanceVectorQueryBuilder",
+                self.store.chunks_table.search(
+                    query_embedding, query_type="vector", vector_column_name="vector"
+                ),
+            )
+            results = vector_query.refine_factor(
+                self.store._config.search.vector_refine_factor
             )
 
         elif search_type == "fts":
             results = self.store.chunks_table.search(query, query_type="fts")
 
        else: # hybrid (default)
-            query_embedding = await self.embedder.
+            query_embedding = await self.embedder.embed_query(query)
             # Create RRF reranker
             reranker = RRFReranker()
             # Perform native hybrid search with RRF reranking
-
+            hybrid_query = cast(
+                "LanceHybridQueryBuilder",
                 self.store.chunks_table.search(query_type="hybrid")
                 .vector(query_embedding)
-                .text(query)
-                .rerank(reranker)
+                .text(query),
             )
+            results = hybrid_query.refine_factor(
+                self.store._config.search.vector_refine_factor
+            ).rerank(reranker)
 
         # Apply filtering if needed (common for all search types)
         if filtered_doc_ids is not None:
@@ -304,13 +296,30 @@ class ChunkRepository:
         results = results.limit(limit)
         return await self._process_search_results(results)
 
-    async def get_by_document_id(
-
-
-
-
-
-
+    async def get_by_document_id(
+        self,
+        document_id: str,
+        limit: int | None = None,
+        offset: int | None = None,
+    ) -> list[Chunk]:
+        """Get chunks for a specific document with optional pagination.
+
+        Args:
+            document_id: The document ID to get chunks for.
+            limit: Maximum number of chunks to return. None for all.
+            offset: Number of chunks to skip. None for no offset.
+
+        Returns:
+            List of chunks ordered by their order field.
+        """
+        query = self.store.chunks_table.search().where(f"document_id = '{document_id}'")
+
+        if offset is not None:
+            query = query.offset(offset)
+        if limit is not None:
+            query = query.limit(limit)
+
+        results = list(query.to_pydantic(self.store.ChunkRecord))
 
         # Get document info
         doc_results = list(
@@ -343,6 +352,16 @@ class ChunkRepository:
         chunks.sort(key=lambda c: c.order)
         return chunks
 
+    async def count_by_document_id(self, document_id: str) -> int:
+        """Count the number of chunks for a specific document."""
+        df = (
+            self.store.chunks_table.search()
+            .select(["id"])
+            .where(f"document_id = '{document_id}'")
+            .to_pandas()
+        )
+        return len(df)
+
     async def get_adjacent_chunks(self, chunk: Chunk, num_adjacent: int) -> list[Chunk]:
         """Get adjacent chunks before and after the given chunk within the same document."""
         assert chunk.document_id, "Document id is required for adjacent chunk finding"
@@ -400,6 +419,7 @@ class ChunkRepository:
                 id=str(row["id"]),
                 document_id=str(row["document_id"]),
                 content=str(row["content"]),
+                content_fts=str(row.get("content_fts", "")),
                 metadata=str(row["metadata"]),
                 order=int(row["order"]) if "order" in row else 0,
             )