PyPI - haiku.rag - Versions diffs - 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl - Mend

haiku.rag 0.10.0 (py3-none-any.whl) → 0.10.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of haiku.rag might be problematic. Click here for more details.

Files changed (23)

haiku/rag/app.py +14 -5
haiku/rag/cli.py +55 -30
haiku/rag/client.py +63 -21
haiku/rag/config.py +4 -0
haiku/rag/mcp.py +18 -6
haiku/rag/qa/agent.py +4 -2
haiku/rag/qa/prompts.py +2 -2
haiku/rag/research/models.py +2 -2
haiku/rag/research/nodes/search.py +3 -1
haiku/rag/research/prompts.py +4 -3
haiku/rag/store/engine.py +14 -0
haiku/rag/store/models/chunk.py +1 -0
haiku/rag/store/models/document.py +1 -0
haiku/rag/store/repositories/chunk.py +4 -0
haiku/rag/store/repositories/document.py +3 -0
haiku/rag/store/upgrades/__init__.py +2 -0
haiku/rag/store/upgrades/v0_10_1.py +64 -0
haiku/rag/utils.py +8 -5
{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/METADATA +1 -1
{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/RECORD +23 -22
{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/WHEEL +0 -0
{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/entry_points.txt +0 -0
{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/licenses/LICENSE +0 -0

haiku/rag/app.py CHANGED Viewed

@@ -39,9 +39,9 @@ class HaikuRAGApp:
                 f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
             )
-    async def add_document_from_source(self, source: str):
+    async def add_document_from_source(self, source: str, title: str | None = None):
         async with HaikuRAG(db_path=self.db_path) as self.client:
-            doc = await self.client.create_document_from_source(source)
+            doc = await self.client.create_document_from_source(source, title=title)
             self._rich_print_document(doc, truncate=True)
             self.console.print(
                 f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
@@ -252,8 +252,16 @@ class HaikuRAGApp:
             content = Markdown(content)
         else:
             content = Markdown(doc.content)
+        title_part = (
+            f" [repr.attrib_name]title[/repr.attrib_name]: {doc.title}"
+            if doc.title
+            else ""
+        )
         self.console.print(
-            f"[repr.attrib_name]id[/repr.attrib_name]: {doc.id} [repr.attrib_name]uri[/repr.attrib_name]: {doc.uri} [repr.attrib_name]meta[/repr.attrib_name]: {doc.metadata}"
+            f"[repr.attrib_name]id[/repr.attrib_name]: {doc.id} "
+            f"[repr.attrib_name]uri[/repr.attrib_name]: {doc.uri}"
+            + title_part
+            + f" [repr.attrib_name]meta[/repr.attrib_name]: {doc.metadata}"
         )
         self.console.print(
             f"[repr.attrib_name]created at[/repr.attrib_name]: {doc.created_at} [repr.attrib_name]updated at[/repr.attrib_name]: {doc.updated_at}"
@@ -272,6 +280,9 @@ class HaikuRAGApp:
         if chunk.document_uri:
             self.console.print("[repr.attrib_name]document uri[/repr.attrib_name]:")
             self.console.print(chunk.document_uri)
+        if chunk.document_title:
+            self.console.print("[repr.attrib_name]document title[/repr.attrib_name]:")
+            self.console.print(chunk.document_title)
         if chunk.document_meta:
             self.console.print("[repr.attrib_name]document meta[/repr.attrib_name]:")
             self.console.print(chunk.document_meta)
@@ -289,8 +300,6 @@ class HaikuRAGApp:
             try:
                 if transport == "stdio":
                     await server.run_stdio_async()
-                elif transport == "sse":
-                    await server.run_sse_async()
                 else:
                     await server.run_http_async(transport="streamable-http")
             except KeyboardInterrupt:

haiku/rag/cli.py CHANGED Viewed

@@ -3,28 +3,16 @@ import warnings
 from importlib.metadata import version
 from pathlib import Path
-import logfire
 import typer
-from rich.console import Console
-from haiku.rag.app import HaikuRAGApp
 from haiku.rag.config import Config
 from haiku.rag.logging import configure_cli_logging
-from haiku.rag.migration import migrate_sqlite_to_lancedb
 from haiku.rag.utils import is_up_to_date
-if Config.ENV == "development":
-    logfire.configure(send_to_logfire="if-token-present")
-    logfire.instrument_pydantic_ai()
-else:
-    warnings.filterwarnings("ignore")
 cli = typer.Typer(
     context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True
 )
-console = Console()
 def complete_document_ids(ctx: typer.Context, incomplete: str):
     """Autocomplete document IDs from the selected DB."""
@@ -89,16 +77,16 @@ async def check_version():
     """Check if haiku.rag is up to date and show warning if not."""
     up_to_date, current_version, latest_version = await is_up_to_date()
     if not up_to_date:
-        console.print(
-            f"[yellow]Warning: haiku.rag is outdated. Current: {current_version}, Latest: {latest_version}[/yellow]"
+        typer.echo(
+            f"Warning: haiku.rag is outdated. Current: {current_version}, Latest: {latest_version}",
         )
-        console.print("[yellow]Please update.[/yellow]")
+        typer.echo("Please update.")
 def version_callback(value: bool):
     if value:
         v = version("haiku.rag")
-        console.print(f"haiku.rag version {v}")
+        typer.echo(f"haiku.rag version {v}")
         raise typer.Exit()
@@ -113,10 +101,26 @@ def main(
     ),
 ):
     """haiku.rag CLI - Vector database RAG system"""
-    # Ensure only haiku.rag logs are emitted in CLI context
-    configure_cli_logging()
+    # Configure logging minimally for CLI context
+    if Config.ENV == "development":
+        # Lazy import logfire only in development
+        try:
+            import logfire  # type: ignore
+            logfire.configure(send_to_logfire="if-token-present")
+            logfire.instrument_pydantic_ai()
+        except Exception:
+            pass
+    else:
+        configure_cli_logging()
+        warnings.filterwarnings("ignore")
     # Run version check before any command
-    asyncio.run(check_version())
+    try:
+        asyncio.run(check_version())
+    except Exception:
+        # Do not block CLI on version check issues
+        pass
 @cli.command("list", help="List all stored documents")
@@ -127,6 +131,8 @@ def list_documents(
         help="Path to the LanceDB database file",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     asyncio.run(app.list_documents())
@@ -142,6 +148,8 @@ def add_document_text(
         help="Path to the LanceDB database file",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     asyncio.run(app.add_document_from_text(text=text))
@@ -152,14 +160,21 @@ def add_document_src(
         help="The file path or URL of the document to add",
         autocompletion=complete_local_paths,
     ),
+    title: str | None = typer.Option(
+        None,
+        "--title",
+        help="Optional human-readable title to store with the document",
+    ),
     db: Path = typer.Option(
         Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
         "--db",
         help="Path to the LanceDB database file",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
-    asyncio.run(app.add_document_from_source(source=source))
+    asyncio.run(app.add_document_from_source(source=source, title=title))
 @cli.command("get", help="Get and display a document by its ID")
@@ -174,6 +189,8 @@ def get_document(
         help="Path to the LanceDB database file",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     asyncio.run(app.get_document(doc_id=doc_id))
@@ -190,6 +207,8 @@ def delete_document(
         help="Path to the LanceDB database file",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     asyncio.run(app.delete_document(doc_id=doc_id))
@@ -215,6 +234,8 @@ def search(
         help="Path to the LanceDB database file",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     asyncio.run(app.search(query=query, limit=limit))
@@ -235,6 +256,8 @@ def ask(
         help="Include citations in the response",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     asyncio.run(app.ask(question=question, cite=cite))
@@ -271,6 +294,8 @@ def research(
         help="Show verbose progress output",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     asyncio.run(
         app.research(
@@ -285,6 +310,8 @@ def research(
 @cli.command("settings", help="Display current configuration settings")
 def settings():
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=Path())  # Don't need actual DB for settings
     app.show_settings()
@@ -300,6 +327,8 @@ def rebuild(
         help="Path to the LanceDB database file",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     asyncio.run(app.rebuild())
@@ -312,6 +341,8 @@ def vacuum(
         help="Path to the LanceDB database file",
     ),
 ):
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     asyncio.run(app.vacuum())
@@ -330,24 +361,15 @@ def serve(
         "--stdio",
         help="Run MCP server on stdio Transport",
     ),
-    sse: bool = typer.Option(
-        False,
-        "--sse",
-        help="Run MCP server on SSE transport",
-    ),
 ) -> None:
     """Start the MCP server."""
-    if stdio and sse:
-        console.print("[red]Error: Cannot use both --stdio and --http options[/red]")
-        raise typer.Exit(1)
+    from haiku.rag.app import HaikuRAGApp
     app = HaikuRAGApp(db_path=db)
     transport = None
     if stdio:
         transport = "stdio"
-    elif sse:
-        transport = "sse"
     asyncio.run(app.serve(transport=transport))
@@ -361,6 +383,9 @@ def migrate(
     # Generate LanceDB path in same parent directory
     lancedb_path = sqlite_path.parent / (sqlite_path.stem + ".lancedb")
+    # Lazy import to avoid heavy deps on simple invocations
+    from haiku.rag.migration import migrate_sqlite_to_lancedb
     success = asyncio.run(migrate_sqlite_to_lancedb(sqlite_path, lancedb_path))
     if not success:

haiku/rag/client.py CHANGED Viewed

@@ -33,8 +33,6 @@ class HaikuRAG:
             db_path: Path to the database file.
             skip_validation: Whether to skip configuration validation on database load.
         """
-        if not db_path.parent.exists():
-            Path.mkdir(db_path.parent, parents=True)
         self.store = Store(db_path, skip_validation=skip_validation)
         self.document_repository = DocumentRepository(self.store)
         self.chunk_repository = ChunkRepository(self.store)
@@ -52,6 +50,7 @@ class HaikuRAG:
         self,
         docling_document,
         uri: str | None = None,
+        title: str | None = None,
         metadata: dict | None = None,
         chunks: list[Chunk] | None = None,
     ) -> Document:
@@ -60,6 +59,7 @@ class HaikuRAG:
         document = Document(
             content=content,
             uri=uri,
+            title=title,
             metadata=metadata or {},
         )
         return await self.document_repository._create_with_docling(
@@ -70,6 +70,7 @@ class HaikuRAG:
         self,
         content: str,
         uri: str | None = None,
+        title: str | None = None,
         metadata: dict | None = None,
         chunks: list[Chunk] | None = None,
     ) -> Document:
@@ -90,6 +91,7 @@ class HaikuRAG:
         document = Document(
             content=content,
             uri=uri,
+            title=title,
             metadata=metadata or {},
         )
         return await self.document_repository._create_with_docling(
@@ -97,7 +99,7 @@ class HaikuRAG:
         )
     async def create_document_from_source(
-        self, source: str | Path, metadata: dict = {}
+        self, source: str | Path, title: str | None = None, metadata: dict | None = None
     ) -> Document:
         """Create or update a document from a file path or URL.
@@ -118,11 +120,16 @@ class HaikuRAG:
             httpx.RequestError: If URL request fails
         """
+        # Normalize metadata
+        metadata = metadata or {}
         # Check if it's a URL
         source_str = str(source)
         parsed_url = urlparse(source_str)
         if parsed_url.scheme in ("http", "https"):
-            return await self._create_or_update_document_from_url(source_str, metadata)
+            return await self._create_or_update_document_from_url(
+                source_str, title=title, metadata=metadata
+            )
         elif parsed_url.scheme == "file":
             # Handle file:// URI by converting to path
             source_path = Path(parsed_url.path)
@@ -138,37 +145,51 @@ class HaikuRAG:
         uri = source_path.absolute().as_uri()
         md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()
+        # Get content type from file extension (do before early return)
+        content_type, _ = mimetypes.guess_type(str(source_path))
+        if not content_type:
+            content_type = "application/octet-stream"
+        # Merge metadata with contentType and md5
+        metadata.update({"contentType": content_type, "md5": md5_hash})
         # Check if document already exists
         existing_doc = await self.get_document_by_uri(uri)
         if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
-            # MD5 unchanged, return existing document
+            # MD5 unchanged; update title/metadata if provided
+            updated = False
+            if title is not None and title != existing_doc.title:
+                existing_doc.title = title
+                updated = True
+            if metadata:
+                existing_doc.metadata = {**(existing_doc.metadata or {}), **metadata}
+                updated = True
+            if updated:
+                return await self.document_repository.update(existing_doc)
             return existing_doc
+        # Parse file only when content changed or new document
         docling_document = FileReader.parse_file(source_path)
-        # Get content type from file extension
-        content_type, _ = mimetypes.guess_type(str(source_path))
-        if not content_type:
-            content_type = "application/octet-stream"
-        # Merge metadata with contentType and md5
-        metadata.update({"contentType": content_type, "md5": md5_hash})
         if existing_doc:
             # Update existing document
             existing_doc.content = docling_document.export_to_markdown()
             existing_doc.metadata = metadata
+            if title is not None:
+                existing_doc.title = title
             return await self.document_repository._update_with_docling(
                 existing_doc, docling_document
             )
         else:
             # Create new document using DoclingDocument
             return await self._create_document_with_docling(
-                docling_document=docling_document, uri=uri, metadata=metadata
+                docling_document=docling_document,
+                uri=uri,
+                title=title,
+                metadata=metadata,
             )
     async def _create_or_update_document_from_url(
-        self, url: str, metadata: dict = {}
+        self, url: str, title: str | None = None, metadata: dict | None = None
     ) -> Document:
         """Create or update a document from a URL by downloading and parsing the content.
@@ -188,20 +209,35 @@ class HaikuRAG:
             ValueError: If the content cannot be parsed
             httpx.RequestError: If URL request fails
         """
+        metadata = metadata or {}
         async with httpx.AsyncClient() as client:
             response = await client.get(url)
             response.raise_for_status()
             md5_hash = hashlib.md5(response.content).hexdigest()
+            # Get content type early (used for potential no-op update)
+            content_type = response.headers.get("content-type", "").lower()
             # Check if document already exists
             existing_doc = await self.get_document_by_uri(url)
             if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
-                # MD5 unchanged, return existing document
+                # MD5 unchanged; update title/metadata if provided
+                updated = False
+                if title is not None and title != existing_doc.title:
+                    existing_doc.title = title
+                    updated = True
+                metadata.update({"contentType": content_type, "md5": md5_hash})
+                if metadata:
+                    existing_doc.metadata = {
+                        **(existing_doc.metadata or {}),
+                        **metadata,
+                    }
+                    updated = True
+                if updated:
+                    return await self.document_repository.update(existing_doc)
                 return existing_doc
-            # Get content type to determine file extension
-            content_type = response.headers.get("content-type", "").lower()
             file_extension = self._get_extension_from_content_type_or_url(
                 url, content_type
             )
@@ -228,12 +264,17 @@ class HaikuRAG:
             if existing_doc:
                 existing_doc.content = docling_document.export_to_markdown()
                 existing_doc.metadata = metadata
+                if title is not None:
+                    existing_doc.title = title
                 return await self.document_repository._update_with_docling(
                     existing_doc, docling_document
                 )
             else:
                 return await self._create_document_with_docling(
-                    docling_document=docling_document, uri=url, metadata=metadata
+                    docling_document=docling_document,
+                    uri=url,
+                    title=title,
+                    metadata=metadata,
                 )
     def _get_extension_from_content_type_or_url(
@@ -418,6 +459,7 @@ class HaikuRAG:
                     content="".join(combined_content_parts),
                     metadata=original_chunk.metadata,
                     document_uri=original_chunk.document_uri,
+                    document_title=original_chunk.document_title,
                     document_meta=original_chunk.document_meta,
                 )
@@ -524,7 +566,7 @@ class HaikuRAG:
                     # Try to re-create from source (this creates the document with chunks)
                     new_doc = await self.create_document_from_source(
-                        doc.uri, doc.metadata or {}
+                        source=doc.uri, metadata=doc.metadata or {}
                     )
                     assert new_doc.id is not None, "New document ID should not be None"

haiku/rag/config.py CHANGED Viewed

@@ -53,6 +53,10 @@ class AppConfig(BaseModel):
     ANTHROPIC_API_KEY: str = ""
     COHERE_API_KEY: str = ""
+    # If true, refuse to auto-create a new LanceDB database or tables
+    # and error out when the database does not already exist.
+    DISABLE_DB_AUTOCREATE: bool = False
     @field_validator("MONITOR_DIRECTORIES", mode="before")
     @classmethod
     def parse_monitor_directories(cls, v):

haiku/rag/mcp.py CHANGED Viewed

@@ -17,6 +17,7 @@ class DocumentResult(BaseModel):
     id: str | None
     content: str
     uri: str | None = None
+    title: str | None = None
     metadata: dict[str, Any] = {}
     created_at: str
     updated_at: str
@@ -28,13 +29,15 @@ def create_mcp_server(db_path: Path) -> FastMCP:
     @mcp.tool()
     async def add_document_from_file(
-        file_path: str, metadata: dict[str, Any] | None = None
+        file_path: str,
+        metadata: dict[str, Any] | None = None,
+        title: str | None = None,
     ) -> str | None:
         """Add a document to the RAG system from a file path."""
         try:
             async with HaikuRAG(db_path) as rag:
                 document = await rag.create_document_from_source(
-                    Path(file_path), metadata or {}
+                    Path(file_path), title=title, metadata=metadata or {}
                 )
                 return document.id
         except Exception:
@@ -42,24 +45,31 @@ def create_mcp_server(db_path: Path) -> FastMCP:
     @mcp.tool()
     async def add_document_from_url(
-        url: str, metadata: dict[str, Any] | None = None
+        url: str, metadata: dict[str, Any] | None = None, title: str | None = None
     ) -> str | None:
         """Add a document to the RAG system from a URL."""
         try:
             async with HaikuRAG(db_path) as rag:
-                document = await rag.create_document_from_source(url, metadata or {})
+                document = await rag.create_document_from_source(
+                    url, title=title, metadata=metadata or {}
+                )
                 return document.id
         except Exception:
             return None
     @mcp.tool()
     async def add_document_from_text(
-        content: str, uri: str | None = None, metadata: dict[str, Any] | None = None
+        content: str,
+        uri: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        title: str | None = None,
     ) -> str | None:
         """Add a document to the RAG system from text content."""
         try:
             async with HaikuRAG(db_path) as rag:
-                document = await rag.create_document(content, uri, metadata or {})
+                document = await rag.create_document(
+                    content, uri, title=title, metadata=metadata or {}
+                )
                 return document.id
         except Exception:
             return None
@@ -102,6 +112,7 @@ def create_mcp_server(db_path: Path) -> FastMCP:
                     id=document.id,
                     content=document.content,
                     uri=document.uri,
+                    title=document.title,
                     metadata=document.metadata,
                     created_at=str(document.created_at),
                     updated_at=str(document.updated_at),
@@ -123,6 +134,7 @@ def create_mcp_server(db_path: Path) -> FastMCP:
                         id=doc.id,
                         content=doc.content,
                         uri=doc.uri,
+                        title=doc.title,
                         metadata=doc.metadata,
                         created_at=str(doc.created_at),
                         updated_at=str(doc.updated_at),

haiku/rag/qa/agent.py CHANGED Viewed

@@ -12,7 +12,9 @@ from haiku.rag.qa.prompts import QA_SYSTEM_PROMPT, QA_SYSTEM_PROMPT_WITH_CITATIO
 class SearchResult(BaseModel):
     content: str = Field(description="The document text content")
     score: float = Field(description="Relevance score (higher is more relevant)")
-    document_uri: str = Field(description="Source URI/path of the document")
+    document_uri: str = Field(
+        description="Source title (if available) or URI/path of the document"
+    )
 class Dependencies(BaseModel):
@@ -59,7 +61,7 @@ class QuestionAnswerAgent:
                 SearchResult(
                     content=chunk.content,
                     score=score,
-                    document_uri=chunk.document_uri or "",
+                    document_uri=(chunk.document_title or chunk.document_uri or ""),
                 )
                 for chunk, score in expanded_results
             ]

haiku/rag/qa/prompts.py CHANGED Viewed

@@ -44,9 +44,9 @@ Guidelines:
 Citation Format:
 After your answer, include a "Citations:" section that lists:
-- The document URI from each search result used
+- The document title (if available) or URI from each search result used
 - A brief excerpt (first 50-100 characters) of the content that supported your answer
-- Format: "Citations:\n- [document_uri]: [content_excerpt]..."
+- Format: "Citations:\n- [document title or URI]: [content_excerpt]..."
 Example response format:
 [Your answer here]

haiku/rag/research/models.py CHANGED Viewed

@@ -19,8 +19,8 @@ class SearchAnswer(BaseModel):
     )
     sources: list[str] = Field(
         description=(
-            "Document URIs corresponding to the snippets actually used in the"
-            " answer (one URI per snippet; omit if none)"
+            "Document titles (if available) or URIs corresponding to the"
+            " snippets actually used in the answer (one per snippet; omit if none)"
         ),
         default_factory=list,
     )

haiku/rag/research/nodes/search.py CHANGED Viewed

@@ -59,7 +59,9 @@ class SearchDispatchNode(BaseNode[ResearchState, ResearchDeps, ResearchReport]):
                     {
                         "text": chunk.content,
                         "score": score,
-                        "document_uri": (chunk.document_uri or ""),
+                        "document_uri": (
+                            chunk.document_title or chunk.document_uri or ""
+                        ),
                     }
                     for chunk, score in expanded
                 ]

haiku/rag/research/prompts.py CHANGED Viewed

@@ -27,13 +27,14 @@ Tasks:
 Tool usage:
 - Always call search_and_answer before drafting any answer.
 - The tool returns snippets with verbatim `text`, a relevance `score`, and the
-  originating `document_uri`.
+  originating document identifier (document title if available, otherwise URI).
 - You may call the tool multiple times to refine or broaden context, but do not
   exceed 3 total calls. Favor precision over volume.
 - Use scores to prioritize evidence, but include only the minimal subset of
   snippet texts (verbatim) in SearchAnswer.context (typically 1‑4).
-- Set SearchAnswer.sources to the corresponding document_uris for the snippets
-  you used (one URI per snippet; same order as context). Context must be text‑only.
+- Set SearchAnswer.sources to the corresponding document identifiers for the
+  snippets you used (title if available, otherwise URI; one per snippet; same
+  order as context). Context must be text‑only.
 - If no relevant information is found, clearly say so and return an empty
   context list and sources list.

haiku/rag/store/engine.py CHANGED Viewed

@@ -19,6 +19,7 @@ class DocumentRecord(LanceModel):
     id: str = Field(default_factory=lambda: str(uuid4()))
     content: str
     uri: str | None = None
+    title: str | None = None
     metadata: str = Field(default="{}")
     created_at: str = Field(default_factory=lambda: "")
     updated_at: str = Field(default_factory=lambda: "")
@@ -54,6 +55,19 @@ class Store:
         # Create the ChunkRecord model with the correct vector dimension
         self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)
+        # Local filesystem handling for DB directory
+        if not self._has_cloud_config():
+            if Config.DISABLE_DB_AUTOCREATE:
+                # LanceDB uses a directory path for local databases; enforce presence
+                if not db_path.exists():
+                    raise FileNotFoundError(
+                        f"LanceDB path does not exist: {db_path}. Auto-creation is disabled."
+                    )
+            else:
+                # Ensure parent directories exist when autocreation allowed
+                if not db_path.parent.exists():
+                    Path.mkdir(db_path.parent, parents=True)
         # Connect to LanceDB
         self.db = self._connect_to_lancedb(db_path)

haiku/rag/store/models/chunk.py CHANGED Viewed

@@ -12,5 +12,6 @@ class Chunk(BaseModel):
     metadata: dict = {}
     order: int = 0
     document_uri: str | None = None
+    document_title: str | None = None
     document_meta: dict = {}
     embedding: list[float] | None = None

haiku/rag/store/models/document.py CHANGED Viewed

@@ -11,6 +11,7 @@ class Document(BaseModel):
     id: str | None = None
     content: str
     uri: str | None = None
+    title: str | None = None
     metadata: dict = {}
     created_at: datetime = Field(default_factory=datetime.now)
     updated_at: datetime = Field(default_factory=datetime.now)

haiku/rag/store/repositories/chunk.py CHANGED Viewed

@@ -317,6 +317,7 @@ class ChunkRepository:
         )
         doc_uri = doc_results[0].uri if doc_results else None
+        doc_title = doc_results[0].title if doc_results else None
         doc_meta = doc_results[0].metadata if doc_results else "{}"
         chunks: list[Chunk] = []
@@ -330,6 +331,7 @@ class ChunkRepository:
                     metadata=md,
                     order=rec.order,
                     document_uri=doc_uri,
+                    document_title=doc_title,
                     document_meta=json.loads(doc_meta),
                 )
             )
@@ -398,6 +400,7 @@ class ChunkRepository:
             # Get document info from pre-fetched map
             doc = documents_map.get(chunk_record.document_id)
             doc_uri = doc.uri if doc else None
+            doc_title = doc.title if doc else None
             doc_meta = doc.metadata if doc else "{}"
             md = json.loads(chunk_record.metadata)
@@ -409,6 +412,7 @@ class ChunkRepository:
                 metadata=md,
                 order=chunk_record.order,
                 document_uri=doc_uri,
+                document_title=doc_title,
                 document_meta=json.loads(doc_meta),
             )

haiku/rag/store/repositories/document.py CHANGED Viewed

@@ -34,6 +34,7 @@ class DocumentRepository:
             id=record.id,
             content=record.content,
             uri=record.uri,
+            title=record.title,
             metadata=json.loads(record.metadata),
             created_at=datetime.fromisoformat(record.created_at)
             if record.created_at
@@ -56,6 +57,7 @@ class DocumentRepository:
             id=doc_id,
             content=entity.content,
             uri=entity.uri,
+            title=entity.title,
             metadata=json.dumps(entity.metadata),
             created_at=now,
             updated_at=now,
@@ -97,6 +99,7 @@ class DocumentRepository:
             values={
                 "content": entity.content,
                 "uri": entity.uri,
+                "title": entity.title,
                 "metadata": json.dumps(entity.metadata),
                 "updated_at": now,
             },

haiku/rag/store/upgrades/__init__.py CHANGED Viewed

@@ -55,6 +55,8 @@ def run_pending_upgrades(store: Store, from_version: str, to_version: str) -> No
 from .v0_9_3 import upgrade_fts_phrase as upgrade_0_9_3_fts  # noqa: E402
 from .v0_9_3 import upgrade_order as upgrade_0_9_3_order  # noqa: E402
+from .v0_10_1 import upgrade_add_title as upgrade_0_10_1_add_title  # noqa: E402
 upgrades.append(upgrade_0_9_3_order)
 upgrades.append(upgrade_0_9_3_fts)
+upgrades.append(upgrade_0_10_1_add_title)

haiku/rag/store/upgrades/v0_10_1.py ADDED Viewed

@@ -0,0 +1,64 @@
+import json
+from lancedb.pydantic import LanceModel
+from pydantic import Field
+from haiku.rag.store.engine import Store
+from haiku.rag.store.upgrades import Upgrade
+def _apply_add_document_title(store: Store) -> None:
+    """Add a nullable 'title' column to the documents table.
+
+    The table is rebuilt: existing rows are read, the table is dropped and
+    recreated with the new schema, and the rows are reinserted with
+    ``title=None``. Rows are read and converted to the new record type
+    *before* the drop, so a failure in either step aborts the upgrade while
+    the original table is still intact.
+
+    Args:
+        store: Store whose ``documents`` table is migrated. Its
+            ``documents_table`` attribute is rebound to the recreated table.
+
+    Raises:
+        RuntimeError: If the existing rows cannot be read.
+    """
+    class DocumentRecordV2(LanceModel):
+        id: str
+        content: str
+        uri: str | None = None
+        title: str | None = None
+        metadata: str = Field(default="{}")
+        created_at: str = Field(default_factory=lambda: "")
+        updated_at: str = Field(default_factory=lambda: "")
+    # Read existing rows using Arrow for schema-agnostic access. A failure
+    # here must not be swallowed: continuing with an empty list would drop
+    # the table and silently discard every stored document.
+    try:
+        rows = store.documents_table.search().to_arrow().to_pylist()
+    except Exception as exc:
+        raise RuntimeError(
+            "Could not read existing documents; aborting the 'title' upgrade "
+            "before the documents table is dropped"
+        ) from exc
+    # Build the new records up front so any validation error surfaces before
+    # the destructive drop below.
+    backfilled = []
+    for row in rows:
+        raw_meta = row.get("metadata")
+        backfilled.append(
+            DocumentRecordV2(
+                id=row.get("id"),
+                # Null cells come back as None under an existing key, so a
+                # .get() default would never apply; coerce explicitly.
+                content=row.get("content") or "",
+                uri=row.get("uri"),
+                title=None,
+                metadata=(
+                    raw_meta
+                    if isinstance(raw_meta, str)
+                    else json.dumps(raw_meta or {})
+                ),
+                created_at=row.get("created_at") or "",
+                updated_at=row.get("updated_at") or "",
+            )
+        )
+    # Drop and recreate documents table with the new schema. The drop stays
+    # best-effort; if the old table is still present, create_table is left
+    # to report the conflict.
+    try:
+        store.db.drop_table("documents")
+    except Exception:
+        pass
+    store.documents_table = store.db.create_table("documents", schema=DocumentRecordV2)
+    # Reinsert previous rows with title=None
+    if backfilled:
+        store.documents_table.add(backfilled)
+# Upgrade descriptor for version 0.10.1: ties the documents-table 'title'
+# migration function to its version and a human-readable description.
+upgrade_add_title = Upgrade(
+    version="0.10.1",
+    apply=_apply_add_document_title,
+    description="Add nullable 'title' column to documents table",
+)

haiku/rag/utils.py CHANGED Viewed

@@ -9,10 +9,6 @@ from io import BytesIO
 from pathlib import Path
 from types import ModuleType
-import httpx
-from docling.document_converter import DocumentConverter
-from docling_core.types.doc.document import DoclingDocument
-from docling_core.types.io import DocumentStream
 from packaging.version import Version, parse
@@ -82,6 +78,9 @@ async def is_up_to_date() -> tuple[bool, Version, Version]:
         the running version and the latest version.
     """
+    # Lazy import to avoid pulling httpx (and its deps) on module import
+    import httpx
     async with httpx.AsyncClient() as client:
         running_version = parse(metadata.version("haiku.rag"))
         try:
@@ -94,7 +93,7 @@ async def is_up_to_date() -> tuple[bool, Version, Version]:
     return running_version >= pypi_version, running_version, pypi_version
-def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocument:
+def text_to_docling_document(text: str, name: str = "content.md"):
     """Convert text content to a DoclingDocument.
     Args:
@@ -104,6 +103,10 @@ def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocu
     Returns:
         A DoclingDocument created from the text content.
     """
+    # Lazy import docling deps to keep import-time light
+    from docling.document_converter import DocumentConverter  # type: ignore
+    from docling_core.types.io import DocumentStream  # type: ignore
     bytes_io = BytesIO(text.encode("utf-8"))
     doc_stream = DocumentStream(name=name, stream=bytes_io)
     converter = DocumentConverter()

{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: haiku.rag
-Version: 0.10.0
+Version: 0.10.1
 Summary: Agentic Retrieval Augmented Generation (RAG) with LanceDB
 Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
 License: MIT

{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/RECORD RENAMED Viewed

@@ -1,15 +1,15 @@
 haiku/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-haiku/rag/app.py,sha256=m5agkPrJhbzEbdC01CU_GR2Gj4voFuAGmxR7DS2K9is,12934
+haiku/rag/app.py,sha256=06nsdjrljPNqZew4gsLFIA0BSwv-CxPovJaAYSFzm-w,13265
 haiku/rag/chunker.py,sha256=PVe6ysv8UlacUd4Zb3_8RFWIaWDXnzBAy2VDJ4TaUsE,1555
-haiku/rag/cli.py,sha256=oXEQoRTlzrrJ9hC27_Dht9ElBb9q_wTEESnXdNy3eW8,10257
-haiku/rag/client.py,sha256=QgJQu7g7JjAzWN6R10NeDqpFf89Dml_LiWce4QRHLHc,21177
-haiku/rag/config.py,sha256=SPEIv2IElZmZh4Wsp8gk7ViRW5ZzD-UGmIqRAXscDdI,2134
+haiku/rag/cli.py,sha256=4aYUXPr54q-7U2-cySNpSnW1ntvD0Jck6oj8vvA6IoI,10830
+haiku/rag/client.py,sha256=iUaa6YUac3CXFniIm8DsaaNsiyHsi4cp8-fPhF5XuVU,22925
+haiku/rag/config.py,sha256=SEV2OzaKavYwHZ0LmRzBj-0dbI6YFIRuNiTw9el7SO0,2307
 haiku/rag/logging.py,sha256=dm65AwADpcQsH5OAPtRA-4hsw0w5DK-sGOvzYkj6jzw,1720
-haiku/rag/mcp.py,sha256=bR9Y-Nz-hvjiql20Y0KE0hwNGwyjmPGX8K9d-qmXptY,4683
+haiku/rag/mcp.py,sha256=H7XibtSNUviFeaJVsXzHiRqUm0nJCpA7A1QHuBv6SKQ,5057
 haiku/rag/migration.py,sha256=M--KnSF3lxgKjxmokb4vuzGH-pV8eg0C_8e7jvPqW8Y,11058
 haiku/rag/monitor.py,sha256=r386nkhdlsU8UECwIuVwnrSlgMk3vNIuUZGNIzkZuec,2770
 haiku/rag/reader.py,sha256=qkPTMJuQ_o4sK-8zpDl9WFYe_MJ7aL_gUw6rczIpW-g,3274
-haiku/rag/utils.py,sha256=aiuPu_rrfpyIvJJq0o5boUIIvCdNzdpKwAIPYYn3iG8,4965
+haiku/rag/utils.py,sha256=hKH8bBBbAVYlLFBOAcErvX-4cuWIaPTbrAFeeLN1HdM,5062
 haiku/rag/embeddings/__init__.py,sha256=44IfDITGIFTflGT6UEmiYOwpWFVbYv5smLY59D0YeCs,1419
 haiku/rag/embeddings/base.py,sha256=BnSviKrlzjv3L0sZJs_T-pxfawd-bcTak-rsX-D2f3A,497
 haiku/rag/embeddings/ollama.py,sha256=LuLlHH6RGoO9_gFCIlbmesuXOj017gTw6z-p8Ez0CfE,595
@@ -17,8 +17,8 @@ haiku/rag/embeddings/openai.py,sha256=fIFCk-jpUtaW0xsnrQnJ824O0UCjaGG2sgvBzREhil
 haiku/rag/embeddings/vllm.py,sha256=vhaUnCn6VMkfSluLhWKtSV-sekFaPsp4pKo2N7-SBCY,626
 haiku/rag/embeddings/voyageai.py,sha256=UW-MW4tJKnPB6Fs2P7A3yt-ZeRm46H9npckchSriPX8,661
 haiku/rag/qa/__init__.py,sha256=Sl7Kzrg9CuBOcMF01wc1NtQhUNWjJI0MhIHfCWrb8V4,434
-haiku/rag/qa/agent.py,sha256=f4Keh-ESgctNbTg96QL95HYjINVLOcxa8t8crx92MMk,3081
-haiku/rag/qa/prompts.py,sha256=LhRfDtO8Pb06lpr4PpwEaKUYItZ5OiIkeqcCogcssHY,3347
+haiku/rag/qa/agent.py,sha256=rtUkEmnD8lMHIxpPPVY6TdmF4aSlZnLjad5eDefrlBw,3145
+haiku/rag/qa/prompts.py,sha256=Lqwn3m4zCsu_CJiC4s9cLsuPNbb9nq6j2PqEF3lw1eA,3380
 haiku/rag/reranking/__init__.py,sha256=IRXHs4qPu6VbGJQpzSwhgtVWWumURH_vEoVFE-extlo,894
 haiku/rag/reranking/base.py,sha256=LM9yUSSJ414UgBZhFTgxGprlRqzfTe4I1vgjricz2JY,405
 haiku/rag/reranking/cohere.py,sha256=1iTdiaa8vvb6oHVB2qpWzUOVkyfUcimVSZp6Qr4aq4c,1049
@@ -28,26 +28,27 @@ haiku/rag/research/__init__.py,sha256=t4JAmIXcKaWqvpFGX5yaehsNrfblskEMn-4mDmdKn9
 haiku/rag/research/common.py,sha256=EUnsA6VZ3-WMweXESuUYezH1ALit8N38064bsZFqtBE,1688
 haiku/rag/research/dependencies.py,sha256=ZiSQdV6jHti4DuUp4WCaJL73TqYDr5vC8ppB34M2cNg,1639
 haiku/rag/research/graph.py,sha256=m3vDP1nPXWzfS7VeTQzmTOk-lFpoaTvKHvRIF2mbxvs,798
-haiku/rag/research/models.py,sha256=klE2qGF5fom5gJRQzQUbnoGYaXusNKeJ9veeXoYDD5Q,2308
-haiku/rag/research/prompts.py,sha256=v_DZNaKk88CDEF8qt9c-puO6QF-NyBQKnl_mO1pMauY,5013
+haiku/rag/research/models.py,sha256=Q92oxBNq3qp3DyUzTim9YGDOBtGzXH25K_mmfLAA7Y8,2329
+haiku/rag/research/prompts.py,sha256=0_EMA5CS7O37QhKJM7OCDdrdgMcoF2DgehBHR4L7xmk,5103
 haiku/rag/research/state.py,sha256=vFwO8c2JmwwfkELE5Mwjt9Oat-bHn5tayf31MIG2SRs,623
 haiku/rag/research/nodes/evaluate.py,sha256=Cp2J-jXYZothiQV3zRZFaCsBLaUU0Tm_-ri-hlgQQII,2897
 haiku/rag/research/nodes/plan.py,sha256=9AkTls01Q3zTLKGgIgSCX9X4VYC8IWjEWii8A_f77YQ,2439
-haiku/rag/research/nodes/search.py,sha256=lHgDCCL7hQdpQeMK-HVzsF_hH_pIv44xxSIiv1JuvYo,3513
+haiku/rag/research/nodes/search.py,sha256=2ioc5Ba3ciq2zpFxgzoGkZOvVsJ1TBX9zseURLDJpBg,3591
 haiku/rag/research/nodes/synthesize.py,sha256=4acKduqWnE11ML7elUksKLozxzWJTkBLSJ2li_YMxgY,1736
 haiku/rag/store/__init__.py,sha256=hq0W0DAC7ysqhWSP2M2uHX8cbG6kbr-sWHxhq6qQcY0,103
-haiku/rag/store/engine.py,sha256=-3MZJYft2XTWaLuyKha8DKhWQeU5E5CBeskXXF5fXso,9555
+haiku/rag/store/engine.py,sha256=BceAeTpDgV92B1A3GVcjsTwlD-c0cZPPvGiXW2Gola0,10215
 haiku/rag/store/models/__init__.py,sha256=s0E72zneGlowvZrFWaNxHYjOAUjgWdLxzdYsnvNRVlY,88
-haiku/rag/store/models/chunk.py,sha256=Ww_hj3DMwJLNM33l1GvIP84yzDFc6cxfiWcotUfWSYg,383
-haiku/rag/store/models/document.py,sha256=zSSpt6pyrMJAIXGQvIcqojcqUzwZnhp3WxVokaWxNRc,396
+haiku/rag/store/models/chunk.py,sha256=3EuZav4QekJIeHBCub48EM8SjNX8HEJ6wVDXGot4PEQ,421
+haiku/rag/store/models/document.py,sha256=cZXy_jEti-hnhq7FKhuhCfd99ccY9fIHMLovB_Thbb8,425
 haiku/rag/store/repositories/__init__.py,sha256=Olv5dLfBQINRV3HrsfUpjzkZ7Qm7goEYyMNykgo_DaY,291
-haiku/rag/store/repositories/chunk.py,sha256=O2SEhQy3ZptWjwwpxS-L8KNq2tEqEBqheHfLw-M_FqA,15012
-haiku/rag/store/repositories/document.py,sha256=m11SamQoGYs5ODfmarJGU1yIcqtgmnba-5bGOPQuYrI,7773
+haiku/rag/store/repositories/chunk.py,sha256=UfajEWf5VmMuSozGRDlWBjJNR0ngvOVFDrp6_augzBg,15217
+haiku/rag/store/repositories/document.py,sha256=C9GbIl8sa2-Djaml4hlaPTtjV2HwHaz_Wzs35sdbdhg,7876
 haiku/rag/store/repositories/settings.py,sha256=7XMBMavU8zRgdBoQzQg0Obfa7UKjuVnBugidTC6sEW0,5548
-haiku/rag/store/upgrades/__init__.py,sha256=gDOxiq3wdZPr3JoenjNYxx0cpgZJhbaFKNX2fzXRq1Q,1852
+haiku/rag/store/upgrades/__init__.py,sha256=RQ8A6rEXBASLb5PD9vdDnEas_m_GgRzzdVu4B88Snqc,1975
+haiku/rag/store/upgrades/v0_10_1.py,sha256=qNGnxj6hoHaHJ1rKTiALfw0c9NQOi0KAK-VZCD_073A,1959
 haiku/rag/store/upgrades/v0_9_3.py,sha256=NrjNilQSgDtFWRbL3ZUtzQzJ8tf9u0dDRJtnDFwwbdw,3322
-haiku_rag-0.10.0.dist-info/METADATA,sha256=QLc8BBJ4WCNEvseyYpWNfkuUfmdxGywD6Jtn0OTsrc0,5879
-haiku_rag-0.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-haiku_rag-0.10.0.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
-haiku_rag-0.10.0.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
-haiku_rag-0.10.0.dist-info/RECORD,,
+haiku_rag-0.10.1.dist-info/METADATA,sha256=Bu1Nmz3AoD_EquvCvsbcJjGXmFsGDEwqnfaYIBgOLqQ,5879
+haiku_rag-0.10.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+haiku_rag-0.10.1.dist-info/entry_points.txt,sha256=G1U3nAkNd5YDYd4v0tuYFbriz0i-JheCsFuT9kIoGCI,48
+haiku_rag-0.10.1.dist-info/licenses/LICENSE,sha256=eXZrWjSk9PwYFNK9yUczl3oPl95Z4V9UXH7bPN46iPo,1065
+haiku_rag-0.10.1.dist-info/RECORD,,

{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{haiku_rag-0.10.0.dist-info → haiku_rag-0.10.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

haiku.rag 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

Potentially problematic release.

haiku.rag 0.10.0py3-none-any.whl → 0.10.1py3-none-any.whl