PyPI - haiku.rag-slim - Versions diffs - 0.17.1__tar.gz → 0.21.0__tar.gz - Mend

haiku.rag-slim 0.17.1__tar.gz → 0.21.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of haiku.rag-slim might be problematic. Click here for more details.

Files changed (110) hide show

{haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/.gitignore RENAMED Viewed

@@ -5,6 +5,7 @@ build/
 dist/
 wheels/
 *.egg-info
+**/.DS_Store
 # Virtual environments
 .venv

{haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/PKG-INFO RENAMED Viewed

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: haiku.rag-slim
-Version: 0.17.1
-Summary: Agentic Retrieval Augmented Generation (RAG) with LanceDB - Minimal dependencies
+Version: 0.21.0
+Summary: Opinionated agentic RAG powered by LanceDB, Pydantic AI, and Docling - Minimal dependencies
 Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
 License: MIT
 License-File: LICENSE
@@ -17,12 +17,12 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Typing :: Typed
 Requires-Python: >=3.12
-Requires-Dist: docling-core==2.50.1
+Requires-Dist: docling-core==2.54.0
 Requires-Dist: httpx>=0.28.1
-Requires-Dist: lancedb==0.25.2
+Requires-Dist: lancedb==0.25.3
 Requires-Dist: pathspec>=0.12.1
-Requires-Dist: pydantic-ai-slim[ag-ui,fastmcp,logfire,openai]==1.17.0
-Requires-Dist: pydantic>=2.12.3
+Requires-Dist: pydantic-ai-slim[ag-ui,fastmcp,logfire,openai]==1.27.0
+Requires-Dist: pydantic>=2.12.5
 Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.3
 Requires-Dist: rich>=14.2.0
@@ -33,13 +33,17 @@ Requires-Dist: pydantic-ai-slim[anthropic]; extra == 'anthropic'
 Provides-Extra: bedrock
 Requires-Dist: pydantic-ai-slim[bedrock]; extra == 'bedrock'
 Provides-Extra: cohere
-Requires-Dist: cohere>=5.0.0; extra == 'cohere'
+Requires-Dist: cohere>=5.20.0; extra == 'cohere'
 Provides-Extra: docling
-Requires-Dist: docling==2.61.1; extra == 'docling'
+Requires-Dist: docling==2.64.0; extra == 'docling'
+Requires-Dist: opencv-python-headless>=4.11.0.86; extra == 'docling'
 Provides-Extra: google
 Requires-Dist: pydantic-ai-slim[google]; extra == 'google'
 Provides-Extra: groq
 Requires-Dist: pydantic-ai-slim[groq]; extra == 'groq'
+Provides-Extra: inspector
+Requires-Dist: textual-image>=0.8.4; extra == 'inspector'
+Requires-Dist: textual>=6.0.0; extra == 'inspector'
 Provides-Extra: mistral
 Requires-Dist: pydantic-ai-slim[mistral]; extra == 'mistral'
 Provides-Extra: mxbai
@@ -49,12 +53,12 @@ Requires-Dist: pydantic-ai-slim[vertexai]; extra == 'vertexai'
 Provides-Extra: voyageai
 Requires-Dist: voyageai>=0.3.5; extra == 'voyageai'
 Provides-Extra: zeroentropy
-Requires-Dist: zeroentropy>=0.1.0a6; extra == 'zeroentropy'
+Requires-Dist: zeroentropy>=0.1.0a7; extra == 'zeroentropy'
 Description-Content-Type: text/markdown
 # haiku.rag-slim
-Retrieval-Augmented Generation (RAG) library built on LanceDB - Core package with minimal dependencies.
+Opinionated agentic RAG powered by LanceDB, Pydantic AI, and Docling - Core package with minimal dependencies.
 `haiku.rag-slim` is the core package for users who want to install only the dependencies they need. Document processing (docling), and reranker support are all optional extras.

{haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/README.md RENAMED Viewed

@@ -1,6 +1,6 @@
 # haiku.rag-slim
-Retrieval-Augmented Generation (RAG) library built on LanceDB - Core package with minimal dependencies.
+Opinionated agentic RAG powered by LanceDB, Pydantic AI, and Docling - Core package with minimal dependencies.
 `haiku.rag-slim` is the core package for users who want to install only the dependencies they need. Document processing (docling), and reranker support are all optional extras.

{haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/app.py RENAMED Viewed

@@ -3,12 +3,21 @@ import json
 import logging
 from importlib.metadata import version as pkg_version
 from pathlib import Path
+from typing import TYPE_CHECKING
 from rich.console import Console
 from rich.markdown import Markdown
-from rich.progress import Progress
-from haiku.rag.client import HaikuRAG
+from rich.progress import (
+    BarColumn,
+    DownloadColumn,
+    Progress,
+    SpinnerColumn,
+    TaskID,
+    TextColumn,
+    TransferSpeedColumn,
+)
+from haiku.rag.client import HaikuRAG, RebuildMode
 from haiku.rag.config import AppConfig, Config
 from haiku.rag.graph.agui import AGUIConsoleRenderer, stream_graph
 from haiku.rag.graph.research.dependencies import ResearchContext
@@ -16,9 +25,12 @@ from haiku.rag.graph.research.graph import build_research_graph
 from haiku.rag.graph.research.state import ResearchDeps, ResearchState
 from haiku.rag.mcp import create_mcp_server
 from haiku.rag.monitor import FileWatcher
-from haiku.rag.store.models.chunk import Chunk
 from haiku.rag.store.models.document import Document
+if TYPE_CHECKING:
+    from haiku.rag.store.models import SearchResult
+from haiku.rag.utils import format_bytes, format_citations_rich
 logger = logging.getLogger(__name__)
@@ -28,6 +40,21 @@ class HaikuRAGApp:
         self.config = config
         self.console = Console()
+    async def init(self):
+        """Initialize a new database."""
+        if self.db_path.exists():
+            self.console.print(
+                f"[yellow]Database already exists at {self.db_path}[/yellow]"
+            )
+            return
+        # Create the database
+        client = HaikuRAG(db_path=self.db_path, config=self.config, create=True)
+        client.close()
+        self.console.print(
+            f"[bold green]Database initialized at {self.db_path}[/bold green]"
+        )
     async def info(self):
         """Display read-only information about the database without modifying it."""
@@ -64,7 +91,13 @@ class HaikuRAGApp:
         except Exception:
             docling_version = "unknown"
-        # Read settings (if present) to find stored haiku.rag version and embedding config
+        # Get comprehensive table statistics (this also runs migrations)
+        from haiku.rag.store.engine import Store
+        store = Store(self.db_path, config=self.config, skip_validation=True)
+        table_stats = store.get_stats()
+        # Read settings after Store init (migrations have run)
         stored_version = "unknown"
         embed_provider: str | None = None
         embed_model: str | None = None
@@ -79,14 +112,22 @@ class HaikuRAGApp:
                 data = json.loads(raw) if isinstance(raw, str) else (raw or {})
                 stored_version = str(data.get("version", stored_version))
                 embeddings = data.get("embeddings", {})
-                embed_provider = embeddings.get("provider")
-                embed_model = embeddings.get("model")
-                vector_dim = embeddings.get("vector_dim")
+                embed_model_obj = embeddings.get("model", {})
+                embed_provider = embed_model_obj.get("provider")
+                embed_model = embed_model_obj.get("name")
+                vector_dim = embed_model_obj.get("vector_dim")
+        store.close()
+        num_docs = table_stats["documents"].get("num_rows", 0)
+        doc_bytes = table_stats["documents"].get("total_bytes", 0)
-        num_docs = 0
-        if "documents" in table_names:
-            docs_tbl = db.open_table("documents")
-            num_docs = int(docs_tbl.count_rows())  # type: ignore[attr-defined]
+        num_chunks = table_stats["chunks"].get("num_rows", 0)
+        chunk_bytes = table_stats["chunks"].get("total_bytes", 0)
+        has_vector_index = table_stats["chunks"].get("has_vector_index", False)
+        num_indexed_rows = table_stats["chunks"].get("num_indexed_rows", 0)
+        num_unindexed_rows = table_stats["chunks"].get("num_unindexed_rows", 0)
         # Table versions per table (direct API)
         doc_versions = (
@@ -116,8 +157,43 @@ class HaikuRAGApp:
                 "  [repr.attrib_name]embeddings[/repr.attrib_name]: unknown"
             )
         self.console.print(
-            f"  [repr.attrib_name]documents[/repr.attrib_name]: {num_docs}"
+            f"  [repr.attrib_name]documents[/repr.attrib_name]: {num_docs} "
+            f"({format_bytes(doc_bytes)})"
+        )
+        self.console.print(
+            f"  [repr.attrib_name]chunks[/repr.attrib_name]: {num_chunks} "
+            f"({format_bytes(chunk_bytes)})"
         )
+        # Vector index information
+        if has_vector_index:
+            self.console.print(
+                "  [repr.attrib_name]vector index[/repr.attrib_name]: ✓ exists"
+            )
+            self.console.print(
+                f"  [repr.attrib_name]indexed chunks[/repr.attrib_name]: {num_indexed_rows}"
+            )
+            if num_unindexed_rows > 0:
+                self.console.print(
+                    f"  [repr.attrib_name]unindexed chunks[/repr.attrib_name]: [yellow]{num_unindexed_rows}[/yellow] "
+                    "(consider running: haiku-rag create-index)"
+                )
+            else:
+                self.console.print(
+                    f"  [repr.attrib_name]unindexed chunks[/repr.attrib_name]: {num_unindexed_rows}"
+                )
+        else:
+            if num_chunks >= 256:
+                self.console.print(
+                    "  [repr.attrib_name]vector index[/repr.attrib_name]: [yellow]✗ not created[/yellow] "
+                    "(run: haiku-rag create-index)"
+                )
+            else:
+                self.console.print(
+                    f"  [repr.attrib_name]vector index[/repr.attrib_name]: ✗ not created "
+                    f"(need {256 - num_chunks} more chunks)"
+                )
         self.console.print(
             f"  [repr.attrib_name]versions (documents)[/repr.attrib_name]: {doc_versions}"
         )
@@ -137,9 +213,7 @@ class HaikuRAGApp:
         )
     async def list_documents(self, filter: str | None = None):
-        async with HaikuRAG(
-            db_path=self.db_path, config=self.config, allow_create=False
-        ) as self.client:
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
             documents = await self.client.list_documents(filter=filter)
             for doc in documents:
                 self._rich_print_document(doc, truncate=True)
@@ -172,9 +246,7 @@ class HaikuRAGApp:
                 )
     async def get_document(self, doc_id: str):
-        async with HaikuRAG(
-            db_path=self.db_path, config=self.config, allow_create=False
-        ) as self.client:
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
             doc = await self.client.get_document_by_id(doc_id)
             if doc is None:
                 self.console.print(f"[red]Document with id {doc_id} not found.[/red]")
@@ -193,16 +265,48 @@ class HaikuRAGApp:
                     f"[yellow]Document with id {doc_id} not found.[/yellow]"
                 )
-    async def search(self, query: str, limit: int = 5, filter: str | None = None):
-        async with HaikuRAG(
-            db_path=self.db_path, config=self.config, allow_create=False
-        ) as self.client:
+    async def search(
+        self, query: str, limit: int | None = None, filter: str | None = None
+    ):
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
             results = await self.client.search(query, limit=limit, filter=filter)
             if not results:
                 self.console.print("[yellow]No results found.[/yellow]")
                 return
-            for chunk, score in results:
-                self._rich_print_search_result(chunk, score)
+            for result in results:
+                self._rich_print_search_result(result)
+    async def visualize_chunk(self, chunk_id: str):
+        """Display visual grounding images for a chunk."""
+        from textual_image.renderable import Image as RichImage
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
+            chunk = await self.client.chunk_repository.get_by_id(chunk_id)
+            if not chunk:
+                self.console.print(f"[red]Chunk with id {chunk_id} not found.[/red]")
+                return
+            images = await self.client.visualize_chunk(chunk)
+            if not images:
+                self.console.print(
+                    "[yellow]No visual grounding available for this chunk.[/yellow]"
+                )
+                self.console.print(
+                    "This may be because the document was converted without page images."
+                )
+                return
+            self.console.print(f"[bold]Visual grounding for chunk {chunk_id}[/bold]")
+            if chunk.document_uri:
+                self.console.print(
+                    f"[repr.attrib_name]document[/repr.attrib_name]: {chunk.document_uri}"
+                )
+            for i, img in enumerate(images):
+                self.console.print(
+                    f"\n[bold cyan]Page {i + 1}/{len(images)}[/bold cyan]"
+                )
+                self.console.print(RichImage(img))
     async def ask(
         self,
@@ -210,6 +314,7 @@ class HaikuRAGApp:
         cite: bool = False,
         deep: bool = False,
         verbose: bool = False,
+        filter: str | None = None,
     ):
         """Ask a question using the RAG system.
@@ -218,57 +323,78 @@ class HaikuRAGApp:
             cite: Include citations in the answer
             deep: Use deep QA mode (multi-step reasoning)
             verbose: Show verbose output
+            filter: SQL WHERE clause to filter documents
         """
-        async with HaikuRAG(
-            db_path=self.db_path, config=self.config, allow_create=False
-        ) as self.client:
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
             try:
+                citations = []
                 if deep:
-                    from haiku.rag.graph.deep_qa.dependencies import DeepQAContext
-                    from haiku.rag.graph.deep_qa.graph import build_deep_qa_graph
-                    from haiku.rag.graph.deep_qa.state import DeepQADeps, DeepQAState
-                    graph = build_deep_qa_graph(config=self.config)
-                    context = DeepQAContext(
-                        original_question=question, use_citations=cite
+                    from haiku.rag.graph.research.models import ResearchReport
+                    graph = build_research_graph(config=self.config)
+                    context = ResearchContext(original_question=question)
+                    state = ResearchState.from_config(
+                        context=context,
+                        config=self.config,
+                        max_iterations=2,
+                        confidence_threshold=0.0,
                     )
-                    state = DeepQAState.from_config(context=context, config=self.config)
-                    deps = DeepQADeps(client=self.client)
+                    state.search_filter = filter
+                    deps = ResearchDeps(client=self.client)
                     if verbose:
-                        # Use AG-UI renderer to process and display events
-                        from haiku.rag.graph.agui import AGUIConsoleRenderer
                         renderer = AGUIConsoleRenderer(self.console)
                         result_dict = await renderer.render(
                             stream_graph(graph, state, deps)
                         )
-                        # Result should be a dict with 'answer' key
-                        answer = result_dict.get("answer", "") if result_dict else ""
+                        report = (
+                            ResearchReport.model_validate(result_dict)
+                            if result_dict
+                            else None
+                        )
                     else:
-                        # Run without rendering events, just get the result
-                        result = await graph.run(state=state, deps=deps)
-                        answer = result.answer
+                        report = await graph.run(state=state, deps=deps)
+                    self.console.print(f"[bold blue]Question:[/bold blue] {question}")
+                    self.console.print()
+                    if report:
+                        self.console.print("[bold green]Answer:[/bold green]")
+                        self.console.print(Markdown(report.executive_summary))
+                        if report.main_findings:
+                            self.console.print()
+                            self.console.print("[bold cyan]Key Findings:[/bold cyan]")
+                            for finding in report.main_findings:
+                                self.console.print(f"• {finding}")
+                        if report.sources_summary:
+                            self.console.print()
+                            self.console.print("[bold cyan]Sources:[/bold cyan]")
+                            self.console.print(report.sources_summary)
+                    else:
+                        self.console.print("[yellow]No answer generated.[/yellow]")
                 else:
-                    answer = await self.client.ask(question, cite=cite)
+                    answer, citations = await self.client.ask(question, filter=filter)
-                self.console.print(f"[bold blue]Question:[/bold blue] {question}")
-                self.console.print()
-                self.console.print("[bold green]Answer:[/bold green]")
-                self.console.print(Markdown(answer))
+                    self.console.print(f"[bold blue]Question:[/bold blue] {question}")
+                    self.console.print()
+                    self.console.print("[bold green]Answer:[/bold green]")
+                    self.console.print(Markdown(answer))
+                    if cite and citations:
+                        for renderable in format_citations_rich(citations):
+                            self.console.print(renderable)
             except Exception as e:
                 self.console.print(f"[red]Error: {e}[/red]")
-    async def research(self, question: str, verbose: bool = False):
+    async def research(
+        self, question: str, verbose: bool = False, filter: str | None = None
+    ):
         """Run research via the pydantic-graph pipeline.
         Args:
             question: The research question
             verbose: Show AG-UI event stream during execution
+            filter: SQL WHERE clause to filter documents
         """
-        async with HaikuRAG(
-            db_path=self.db_path, config=self.config, allow_create=False
-        ) as client:
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as client:
             try:
                 self.console.print("[bold cyan]Starting research[/bold cyan]")
                 self.console.print(f"[bold blue]Question:[/bold blue] {question}")
@@ -277,6 +403,7 @@ class HaikuRAGApp:
                 graph = build_research_graph(config=self.config)
                 context = ResearchContext(original_question=question)
                 state = ResearchState.from_config(context=context, config=self.config)
+                state.search_filter = filter
                 deps = ResearchDeps(client=client)
                 if verbose:
@@ -356,7 +483,7 @@ class HaikuRAGApp:
             except Exception as e:
                 self.console.print(f"[red]Error during research: {e}[/red]")
-    async def rebuild(self):
+    async def rebuild(self, mode: RebuildMode = RebuildMode.FULL):
         async with HaikuRAG(
             db_path=self.db_path, config=self.config, skip_validation=True
         ) as client:
@@ -370,12 +497,18 @@ class HaikuRAGApp:
                     )
                     return
+                mode_desc = {
+                    RebuildMode.FULL: "full rebuild",
+                    RebuildMode.RECHUNK: "rechunk",
+                    RebuildMode.EMBED_ONLY: "embed only",
+                }[mode]
                 self.console.print(
-                    f"[bold cyan]Rebuilding database with {total_docs} documents...[/bold cyan]"
+                    f"[bold cyan]Rebuilding database ({mode_desc}) with {total_docs} documents...[/bold cyan]"
                 )
                 with Progress() as progress:
                     task = progress.add_task("Rebuilding...", total=total_docs)
-                    async for _ in client.rebuild_database():
+                    async for _ in client.rebuild_database(mode=mode):
                         progress.update(task, advance=1)
                 self.console.print(
@@ -397,6 +530,96 @@ class HaikuRAGApp:
         except Exception as e:
             self.console.print(f"[red]Error during vacuum: {e}[/red]")
+    async def create_index(self):
+        """Create vector index on the chunks table."""
+        try:
+            async with HaikuRAG(
+                db_path=self.db_path, config=self.config, skip_validation=True
+            ) as client:
+                row_count = client.store.chunks_table.count_rows()
+                self.console.print(f"Chunks in database: {row_count}")
+                if row_count < 256:
+                    self.console.print(
+                        f"[yellow]Warning: Need at least 256 chunks to create an index (have {row_count})[/yellow]"
+                    )
+                    return
+                # Check if index already exists
+                indices = client.store.chunks_table.list_indices()
+                has_vector_index = any("vector" in str(idx).lower() for idx in indices)
+                if has_vector_index:
+                    self.console.print(
+                        "[yellow]Rebuilding existing vector index...[/yellow]"
+                    )
+                else:
+                    self.console.print("[bold]Creating vector index...[/bold]")
+                client.store._ensure_vector_index()
+                self.console.print(
+                    "[bold green]Vector index created successfully.[/bold green]"
+                )
+        except Exception as e:
+            self.console.print(f"[red]Error creating index: {e}[/red]")
+    async def download_models(self):
+        """Download Docling, HuggingFace tokenizer, and Ollama models per config."""
+        from haiku.rag.client import HaikuRAG
+        client = HaikuRAG(db_path=None, config=self.config)
+        progress: Progress | None = None
+        task_id: TaskID | None = None
+        current_model = ""
+        current_digest = ""
+        async for event in client.download_models():
+            if event.status == "start":
+                self.console.print(
+                    f"[bold blue]Downloading {event.model}...[/bold blue]"
+                )
+            elif event.status == "done":
+                if progress:
+                    progress.stop()
+                    progress = None
+                    task_id = None
+                self.console.print(f"[green]✓[/green] {event.model}")
+                current_model = ""
+                current_digest = ""
+            elif event.status == "pulling":
+                self.console.print(f"[bold blue]Pulling {event.model}...[/bold blue]")
+                current_model = event.model
+                progress = Progress(
+                    SpinnerColumn(),
+                    TextColumn("[progress.description]{task.description}"),
+                    BarColumn(),
+                    DownloadColumn(),
+                    TransferSpeedColumn(),
+                    console=self.console,
+                    transient=True,
+                    auto_refresh=False,
+                )
+                progress.start()
+                task_id = progress.add_task(event.model, total=None)
+            elif event.status == "downloading" and progress and task_id is not None:
+                if event.digest != current_digest:
+                    current_digest = event.digest
+                    short_digest = event.digest[:19] if event.digest else ""
+                    progress.update(
+                        task_id,
+                        description=f"{current_model} ({short_digest})",
+                        total=event.total,
+                        completed=0,
+                    )
+                progress.update(task_id, completed=event.completed, refresh=True)
+            elif progress and task_id is not None:
+                progress.update(
+                    task_id,
+                    description=f"{current_model}: {event.status}",
+                    refresh=True,
+                )
     def show_settings(self):
         """Display current configuration settings."""
         self.console.print("[bold]haiku.rag configuration[/bold]")
@@ -447,22 +670,27 @@ class HaikuRAGApp:
         self.console.print(content)
         self.console.rule()
-    def _rich_print_search_result(self, chunk: Chunk, score: float):
-        """Format a search result chunk for display."""
-        content = Markdown(chunk.content)
+    def _rich_print_search_result(self, result: "SearchResult"):
+        """Format a search result for display."""
+        content = Markdown(result.content)
         self.console.print(
-            f"[repr.attrib_name]document_id[/repr.attrib_name]: {chunk.document_id} "
-            f"[repr.attrib_name]score[/repr.attrib_name]: {score:.4f}"
+            f"[repr.attrib_name]document_id[/repr.attrib_name]: {result.document_id} "
+            f"[repr.attrib_name]chunk_id[/repr.attrib_name]: {result.chunk_id} "
+            f"[repr.attrib_name]score[/repr.attrib_name]: {result.score:.4f}"
         )
-        if chunk.document_uri:
-            self.console.print("[repr.attrib_name]document uri[/repr.attrib_name]:")
-            self.console.print(chunk.document_uri)
-        if chunk.document_title:
+        if result.document_uri:
+            self.console.print(
+                f"[repr.attrib_name]document uri[/repr.attrib_name]: {result.document_uri}"
+            )
+        if result.document_title:
             self.console.print("[repr.attrib_name]document title[/repr.attrib_name]:")
-            self.console.print(chunk.document_title)
-        if chunk.document_meta:
-            self.console.print("[repr.attrib_name]document meta[/repr.attrib_name]:")
-            self.console.print(chunk.document_meta)
+            self.console.print(result.document_title)
+        if result.page_numbers:
+            self.console.print("[repr.attrib_name]pages[/repr.attrib_name]:")
+            self.console.print(", ".join(str(p) for p in result.page_numbers))
+        if result.headings:
+            self.console.print("[repr.attrib_name]headings[/repr.attrib_name]:")
+            self.console.print(" > ".join(result.headings))
         self.console.print("[repr.attrib_name]content[/repr.attrib_name]:")
         self.console.print(content)
         self.console.rule()

{haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/chunkers/base.py RENAMED Viewed

@@ -4,6 +4,8 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from docling_core.types.doc.document import DoclingDocument
+    from haiku.rag.store.models.chunk import Chunk
 class DocumentChunker(ABC):
     """Abstract base class for document chunkers.
@@ -13,14 +15,15 @@ class DocumentChunker(ABC):
     """
     @abstractmethod
-    async def chunk(self, document: "DoclingDocument") -> list[str]:
-        """Split a document into chunks.
+    async def chunk(self, document: "DoclingDocument") -> list["Chunk"]:
+        """Split a document into chunks with metadata.
         Args:
             document: The DoclingDocument to chunk.
         Returns:
-            List of text chunks with semantic boundaries preserved.
+            List of Chunk with content and structured metadata in the metadata dict
+            (doc_item_refs, headings, labels, page_numbers).
         Raises:
             ValueError: If chunking fails.

haiku.rag-slim 0.17.1__tar.gz → 0.21.0__tar.gz

Potentially problematic release.

haiku.rag-slim 0.17.1__tar.gz → 0.21.0__tar.gz