haiku.rag 0.10.1__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Potentially problematic release.


This version of haiku.rag might be problematic; see the release's advisory page for more details.

haiku/rag/app.py CHANGED
@@ -1,4 +1,6 @@
1
1
  import asyncio
2
+ import json
3
+ from importlib.metadata import version as pkg_version
2
4
  from pathlib import Path
3
5
 
4
6
  from rich.console import Console
@@ -16,6 +18,7 @@ from haiku.rag.research.graph import (
16
18
  ResearchState,
17
19
  build_research_graph,
18
20
  )
21
+ from haiku.rag.research.stream import stream_research_graph
19
22
  from haiku.rag.store.models.chunk import Chunk
20
23
  from haiku.rag.store.models.document import Document
21
24
 
@@ -25,26 +28,141 @@ class HaikuRAGApp:
25
28
  self.db_path = db_path
26
29
  self.console = Console()
27
30
 
31
+ async def info(self):
32
+ """Display read-only information about the database without modifying it."""
33
+
34
+ import lancedb
35
+
36
+ # Basic: show path
37
+ self.console.print("[bold]haiku.rag database info[/bold]")
38
+ self.console.print(
39
+ f" [repr.attrib_name]path[/repr.attrib_name]: {self.db_path}"
40
+ )
41
+
42
+ if not self.db_path.exists():
43
+ self.console.print("[red]Database path does not exist.[/red]")
44
+ return
45
+
46
+ # Connect without going through Store to avoid upgrades/validation writes
47
+ try:
48
+ db = lancedb.connect(self.db_path)
49
+ table_names = set(db.table_names())
50
+ except Exception as e:
51
+ self.console.print(f"[red]Failed to open database: {e}[/red]")
52
+ return
53
+
54
+ try:
55
+ ldb_version = pkg_version("lancedb")
56
+ except Exception:
57
+ ldb_version = "unknown"
58
+ try:
59
+ hr_version = pkg_version("haiku.rag")
60
+ except Exception:
61
+ hr_version = "unknown"
62
+ try:
63
+ docling_version = pkg_version("docling")
64
+ except Exception:
65
+ docling_version = "unknown"
66
+
67
+ # Read settings (if present) to find stored haiku.rag version and embedding config
68
+ stored_version = "unknown"
69
+ embed_provider: str | None = None
70
+ embed_model: str | None = None
71
+ vector_dim: int | None = None
72
+
73
+ if "settings" in table_names:
74
+ settings_tbl = db.open_table("settings")
75
+ arrow = settings_tbl.search().where("id = 'settings'").limit(1).to_arrow()
76
+ rows = arrow.to_pylist() if arrow is not None else []
77
+ if rows:
78
+ raw = rows[0].get("settings") or "{}"
79
+ data = json.loads(raw) if isinstance(raw, str) else (raw or {})
80
+ stored_version = str(data.get("version", stored_version))
81
+ embed_provider = data.get("EMBEDDINGS_PROVIDER")
82
+ embed_model = data.get("EMBEDDINGS_MODEL")
83
+ vector_dim = (
84
+ int(data.get("EMBEDDINGS_VECTOR_DIM")) # pyright: ignore[reportArgumentType]
85
+ if data.get("EMBEDDINGS_VECTOR_DIM") is not None
86
+ else None
87
+ )
88
+
89
+ num_docs = 0
90
+ if "documents" in table_names:
91
+ docs_tbl = db.open_table("documents")
92
+ num_docs = int(docs_tbl.count_rows()) # type: ignore[attr-defined]
93
+
94
+ # Table versions per table (direct API)
95
+ doc_versions = (
96
+ len(list(db.open_table("documents").list_versions()))
97
+ if "documents" in table_names
98
+ else 0
99
+ )
100
+ chunk_versions = (
101
+ len(list(db.open_table("chunks").list_versions()))
102
+ if "chunks" in table_names
103
+ else 0
104
+ )
105
+
106
+ self.console.print(
107
+ f" [repr.attrib_name]haiku.rag version (db)[/repr.attrib_name]: {stored_version}"
108
+ )
109
+ if embed_provider or embed_model or vector_dim:
110
+ provider_part = embed_provider or "unknown"
111
+ model_part = embed_model or "unknown"
112
+ dim_part = f"{vector_dim}" if vector_dim is not None else "unknown"
113
+ self.console.print(
114
+ " [repr.attrib_name]embeddings[/repr.attrib_name]: "
115
+ f"{provider_part}/{model_part} (dim: {dim_part})"
116
+ )
117
+ else:
118
+ self.console.print(
119
+ " [repr.attrib_name]embeddings[/repr.attrib_name]: unknown"
120
+ )
121
+ self.console.print(
122
+ f" [repr.attrib_name]documents[/repr.attrib_name]: {num_docs}"
123
+ )
124
+ self.console.print(
125
+ f" [repr.attrib_name]versions (documents)[/repr.attrib_name]: {doc_versions}"
126
+ )
127
+ self.console.print(
128
+ f" [repr.attrib_name]versions (chunks)[/repr.attrib_name]: {chunk_versions}"
129
+ )
130
+ self.console.rule()
131
+ self.console.print("[bold]Versions[/bold]")
132
+ self.console.print(
133
+ f" [repr.attrib_name]haiku.rag[/repr.attrib_name]: {hr_version}"
134
+ )
135
+ self.console.print(
136
+ f" [repr.attrib_name]lancedb[/repr.attrib_name]: {ldb_version}"
137
+ )
138
+ self.console.print(
139
+ f" [repr.attrib_name]docling[/repr.attrib_name]: {docling_version}"
140
+ )
141
+
28
142
  async def list_documents(self):
29
143
  async with HaikuRAG(db_path=self.db_path) as self.client:
30
144
  documents = await self.client.list_documents()
31
145
  for doc in documents:
32
146
  self._rich_print_document(doc, truncate=True)
33
147
 
34
- async def add_document_from_text(self, text: str):
148
+ async def add_document_from_text(self, text: str, metadata: dict | None = None):
35
149
  async with HaikuRAG(db_path=self.db_path) as self.client:
36
- doc = await self.client.create_document(text)
150
+ doc = await self.client.create_document(text, metadata=metadata)
37
151
  self._rich_print_document(doc, truncate=True)
38
152
  self.console.print(
39
- f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
153
+ f"[bold green]Document {doc.id} added successfully.[/bold green]"
40
154
  )
41
155
 
42
- async def add_document_from_source(self, source: str, title: str | None = None):
156
+ async def add_document_from_source(
157
+ self, source: str, title: str | None = None, metadata: dict | None = None
158
+ ):
43
159
  async with HaikuRAG(db_path=self.db_path) as self.client:
44
- doc = await self.client.create_document_from_source(source, title=title)
160
+ doc = await self.client.create_document_from_source(
161
+ source, title=title, metadata=metadata
162
+ )
45
163
  self._rich_print_document(doc, truncate=True)
46
164
  self.console.print(
47
- f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
165
+ f"[bold green]Document {doc.id} added successfully.[/bold green]"
48
166
  )
49
167
 
50
168
  async def get_document(self, doc_id: str):
@@ -59,7 +177,9 @@ class HaikuRAGApp:
59
177
  async with HaikuRAG(db_path=self.db_path) as self.client:
60
178
  deleted = await self.client.delete_document(doc_id)
61
179
  if deleted:
62
- self.console.print(f"[b]Document {doc_id} deleted successfully.[/b]")
180
+ self.console.print(
181
+ f"[bold green]Document {doc_id} deleted successfully.[/bold green]"
182
+ )
63
183
  else:
64
184
  self.console.print(
65
185
  f"[yellow]Document with id {doc_id} not found.[/yellow]"
@@ -69,7 +189,7 @@ class HaikuRAGApp:
69
189
  async with HaikuRAG(db_path=self.db_path) as self.client:
70
190
  results = await self.client.search(query, limit=limit)
71
191
  if not results:
72
- self.console.print("[red]No results found.[/red]")
192
+ self.console.print("[yellow]No results found.[/yellow]")
73
193
  return
74
194
  for chunk, score in results:
75
195
  self._rich_print_search_result(chunk, score)
@@ -102,9 +222,9 @@ class HaikuRAGApp:
102
222
  self.console.print()
103
223
 
104
224
  graph = build_research_graph()
225
+ context = ResearchContext(original_question=question)
105
226
  state = ResearchState(
106
- question=question,
107
- context=ResearchContext(original_question=question),
227
+ context=context,
108
228
  max_iterations=max_iterations,
109
229
  confidence_threshold=confidence_threshold,
110
230
  max_concurrency=max_concurrency,
@@ -117,22 +237,20 @@ class HaikuRAGApp:
117
237
  provider=Config.RESEARCH_PROVIDER or Config.QA_PROVIDER,
118
238
  model=Config.RESEARCH_MODEL or Config.QA_MODEL,
119
239
  )
120
- # Prefer graph.run; fall back to iter if unavailable
121
240
  report = None
122
- try:
123
- result = await graph.run(start, state=state, deps=deps)
124
- report = result.output
125
- except Exception:
126
- from pydantic_graph import End
127
-
128
- async with graph.iter(start, state=state, deps=deps) as run:
129
- node = run.next_node
130
- while not isinstance(node, End):
131
- node = await run.next(node)
132
- if run.result:
133
- report = run.result.output
241
+ async for event in stream_research_graph(graph, start, state, deps):
242
+ if event.type == "report":
243
+ report = event.report
244
+ break
245
+ if event.type == "error":
246
+ self.console.print(
247
+ f"[red]Error during research: {event.message}[/red]"
248
+ )
249
+ return
250
+
134
251
  if report is None:
135
- raise RuntimeError("Graph did not produce a report")
252
+ self.console.print("[red]Research did not produce a report.[/red]")
253
+ return
136
254
 
137
255
  # Display the report
138
256
  self.console.print("[bold green]Research Report[/bold green]")
@@ -202,14 +320,16 @@ class HaikuRAGApp:
202
320
  return
203
321
 
204
322
  self.console.print(
205
- f"[b]Rebuilding database with {total_docs} documents...[/b]"
323
+ f"[bold cyan]Rebuilding database with {total_docs} documents...[/bold cyan]"
206
324
  )
207
325
  with Progress() as progress:
208
326
  task = progress.add_task("Rebuilding...", total=total_docs)
209
327
  async for _ in client.rebuild_database():
210
328
  progress.update(task, advance=1)
211
329
 
212
- self.console.print("[b]Database rebuild completed successfully.[/b]")
330
+ self.console.print(
331
+ "[bold green]Database rebuild completed successfully.[/bold green]"
332
+ )
213
333
  except Exception as e:
214
334
  self.console.print(f"[red]Error rebuilding database: {e}[/red]")
215
335
 
@@ -218,7 +338,9 @@ class HaikuRAGApp:
218
338
  try:
219
339
  async with HaikuRAG(db_path=self.db_path, skip_validation=True) as client:
220
340
  await client.vacuum()
221
- self.console.print("[b]Vacuum completed successfully.[/b]")
341
+ self.console.print(
342
+ "[bold green]Vacuum completed successfully.[/bold green]"
343
+ )
222
344
  except Exception as e:
223
345
  self.console.print(f"[red]Error during vacuum: {e}[/red]")
224
346
 
@@ -240,7 +362,9 @@ class HaikuRAGApp:
240
362
  else:
241
363
  display_value = field_value
242
364
 
243
- self.console.print(f" [cyan]{field_name}[/cyan]: {display_value}")
365
+ self.console.print(
366
+ f" [repr.attrib_name]{field_name}[/repr.attrib_name]: {display_value}"
367
+ )
244
368
 
245
369
  def _rich_print_document(self, doc: Document, truncate: bool = False):
246
370
  """Format a document for display."""
haiku/rag/cli.py CHANGED
@@ -1,7 +1,9 @@
1
1
  import asyncio
2
+ import json
2
3
  import warnings
3
4
  from importlib.metadata import version
4
5
  from pathlib import Path
6
+ from typing import Any
5
7
 
6
8
  import typer
7
9
 
@@ -137,11 +139,41 @@ def list_documents(
137
139
  asyncio.run(app.list_documents())
138
140
 
139
141
 
142
+ def _parse_meta_options(meta: list[str] | None) -> dict[str, Any]:
143
+ """Parse repeated --meta KEY=VALUE options into a dictionary.
144
+
145
+ Raises a Typer error if any entry is malformed.
146
+ """
147
+ result: dict[str, Any] = {}
148
+ if not meta:
149
+ return result
150
+ for item in meta:
151
+ if "=" not in item:
152
+ raise typer.BadParameter("--meta must be in KEY=VALUE format")
153
+ key, value = item.split("=", 1)
154
+ if not key:
155
+ raise typer.BadParameter("--meta key cannot be empty")
156
+ # Best-effort JSON coercion: numbers, booleans, null, arrays/objects
157
+ try:
158
+ parsed = json.loads(value)
159
+ result[key] = parsed
160
+ except Exception:
161
+ # Leave as string if not valid JSON literal
162
+ result[key] = value
163
+ return result
164
+
165
+
140
166
  @cli.command("add", help="Add a document from text input")
141
167
  def add_document_text(
142
168
  text: str = typer.Argument(
143
169
  help="The text content of the document to add",
144
170
  ),
171
+ meta: list[str] | None = typer.Option(
172
+ None,
173
+ "--meta",
174
+ help="Metadata entries as KEY=VALUE (repeatable)",
175
+ metavar="KEY=VALUE",
176
+ ),
145
177
  db: Path = typer.Option(
146
178
  Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
147
179
  "--db",
@@ -151,7 +183,8 @@ def add_document_text(
151
183
  from haiku.rag.app import HaikuRAGApp
152
184
 
153
185
  app = HaikuRAGApp(db_path=db)
154
- asyncio.run(app.add_document_from_text(text=text))
186
+ metadata = _parse_meta_options(meta)
187
+ asyncio.run(app.add_document_from_text(text=text, metadata=metadata or None))
155
188
 
156
189
 
157
190
  @cli.command("add-src", help="Add a document from a file path or URL")
@@ -165,6 +198,12 @@ def add_document_src(
165
198
  "--title",
166
199
  help="Optional human-readable title to store with the document",
167
200
  ),
201
+ meta: list[str] | None = typer.Option(
202
+ None,
203
+ "--meta",
204
+ help="Metadata entries as KEY=VALUE (repeatable)",
205
+ metavar="KEY=VALUE",
206
+ ),
168
207
  db: Path = typer.Option(
169
208
  Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
170
209
  "--db",
@@ -174,7 +213,12 @@ def add_document_src(
174
213
  from haiku.rag.app import HaikuRAGApp
175
214
 
176
215
  app = HaikuRAGApp(db_path=db)
177
- asyncio.run(app.add_document_from_source(source=source, title=title))
216
+ metadata = _parse_meta_options(meta)
217
+ asyncio.run(
218
+ app.add_document_from_source(
219
+ source=source, title=title, metadata=metadata or None
220
+ )
221
+ )
178
222
 
179
223
 
180
224
  @cli.command("get", help="Get and display a document by its ID")
@@ -347,6 +391,32 @@ def vacuum(
347
391
  asyncio.run(app.vacuum())
348
392
 
349
393
 
394
+ @cli.command("info", help="Show read-only database info (no upgrades or writes)")
395
+ def info(
396
+ db: Path = typer.Option(
397
+ Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
398
+ "--db",
399
+ help="Path to the LanceDB database file",
400
+ ),
401
+ ):
402
+ from haiku.rag.app import HaikuRAGApp
403
+
404
+ app = HaikuRAGApp(db_path=db)
405
+ asyncio.run(app.info())
406
+
407
+
408
+ @cli.command("download-models", help="Download Docling and Ollama models per config")
409
+ def download_models_cmd():
410
+ from haiku.rag.utils import prefetch_models
411
+
412
+ try:
413
+ prefetch_models()
414
+ typer.echo("Models downloaded successfully.")
415
+ except Exception as e:
416
+ typer.echo(f"Error downloading models: {e}")
417
+ raise typer.Exit(1)
418
+
419
+
350
420
  @cli.command(
351
421
  "serve", help="Start the haiku.rag MCP server (by default in streamable HTTP mode)"
352
422
  )
haiku/rag/migration.py CHANGED
@@ -51,7 +51,7 @@ class SQLiteToLanceDBMigrator:
51
51
 
52
52
  sqlite_conn.enable_load_extension(True)
53
53
  sqlite_vec.load(sqlite_conn)
54
- self.console.print("[blue]Loaded sqlite-vec extension[/blue]")
54
+ self.console.print("[cyan]Loaded sqlite-vec extension[/cyan]")
55
55
  except Exception as e:
56
56
  self.console.print(
57
57
  f"[yellow]Warning: Could not load sqlite-vec extension: {e}[/yellow]"
@@ -92,7 +92,7 @@ class SQLiteToLanceDBMigrator:
92
92
  sqlite_conn.close()
93
93
 
94
94
  # Optimize and cleanup using centralized vacuum
95
- self.console.print("[blue]Optimizing LanceDB...[/blue]")
95
+ self.console.print("[cyan]Optimizing LanceDB...[/cyan]")
96
96
  try:
97
97
  lance_store.vacuum()
98
98
  self.console.print("[green]✅ Optimization completed[/green]")
@@ -6,6 +6,11 @@ from haiku.rag.research.graph import (
6
6
  build_research_graph,
7
7
  )
8
8
  from haiku.rag.research.models import EvaluationResult, ResearchReport, SearchAnswer
9
+ from haiku.rag.research.stream import (
10
+ ResearchStateSnapshot,
11
+ ResearchStreamEvent,
12
+ stream_research_graph,
13
+ )
9
14
 
10
15
  __all__ = [
11
16
  "ResearchDependencies",
@@ -17,4 +22,7 @@ __all__ = [
17
22
  "ResearchState",
18
23
  "PlanNode",
19
24
  "build_research_graph",
25
+ "stream_research_graph",
26
+ "ResearchStreamEvent",
27
+ "ResearchStateSnapshot",
20
28
  ]
@@ -1,4 +1,4 @@
1
- from typing import Any
1
+ from typing import TYPE_CHECKING, Any
2
2
 
3
3
  from pydantic_ai import format_as_xml
4
4
  from pydantic_ai.models.openai import OpenAIChatModel
@@ -7,6 +7,10 @@ from pydantic_ai.providers.openai import OpenAIProvider
7
7
 
8
8
  from haiku.rag.config import Config
9
9
  from haiku.rag.research.dependencies import ResearchContext
10
+ from haiku.rag.research.models import InsightAnalysis
11
+
12
+ if TYPE_CHECKING: # pragma: no cover
13
+ from haiku.rag.research.state import ResearchDeps, ResearchState
10
14
 
11
15
 
12
16
  def get_model(provider: str, model: str) -> Any:
@@ -27,9 +31,8 @@ def get_model(provider: str, model: str) -> Any:
27
31
  return f"{provider}:{model}"
28
32
 
29
33
 
30
- def log(console, msg: str) -> None:
31
- if console:
32
- console.print(msg)
34
+ def log(deps: "ResearchDeps", state: "ResearchState", msg: str) -> None:
35
+ deps.emit_log(msg, state)
33
36
 
34
37
 
35
38
  def format_context_for_prompt(context: ResearchContext) -> str:
@@ -47,7 +50,69 @@ def format_context_for_prompt(context: ResearchContext) -> str:
47
50
  }
48
51
  for qa in context.qa_responses
49
52
  ],
50
- "insights": context.insights,
51
- "gaps": context.gaps,
53
+ "insights": [
54
+ {
55
+ "id": insight.id,
56
+ "summary": insight.summary,
57
+ "status": insight.status.value,
58
+ "supporting_sources": insight.supporting_sources,
59
+ "originating_questions": insight.originating_questions,
60
+ "notes": insight.notes,
61
+ }
62
+ for insight in context.insights
63
+ ],
64
+ "gaps": [
65
+ {
66
+ "id": gap.id,
67
+ "description": gap.description,
68
+ "severity": gap.severity.value,
69
+ "blocking": gap.blocking,
70
+ "resolved": gap.resolved,
71
+ "resolved_by": gap.resolved_by,
72
+ "supporting_sources": gap.supporting_sources,
73
+ "notes": gap.notes,
74
+ }
75
+ for gap in context.gaps
76
+ ],
52
77
  }
53
78
  return format_as_xml(context_data, root_tag="research_context")
79
+
80
+
81
+ def format_analysis_for_prompt(
82
+ analysis: InsightAnalysis | None,
83
+ ) -> str:
84
+ """Format the latest insight analysis as XML for prompts."""
85
+
86
+ if analysis is None:
87
+ return "<latest_analysis />"
88
+
89
+ data = {
90
+ "commentary": analysis.commentary,
91
+ "highlights": [
92
+ {
93
+ "id": insight.id,
94
+ "summary": insight.summary,
95
+ "status": insight.status.value,
96
+ "supporting_sources": insight.supporting_sources,
97
+ "originating_questions": insight.originating_questions,
98
+ "notes": insight.notes,
99
+ }
100
+ for insight in analysis.highlights
101
+ ],
102
+ "gap_assessments": [
103
+ {
104
+ "id": gap.id,
105
+ "description": gap.description,
106
+ "severity": gap.severity.value,
107
+ "blocking": gap.blocking,
108
+ "resolved": gap.resolved,
109
+ "resolved_by": gap.resolved_by,
110
+ "supporting_sources": gap.supporting_sources,
111
+ "notes": gap.notes,
112
+ }
113
+ for gap in analysis.gap_assessments
114
+ ],
115
+ "resolved_gaps": analysis.resolved_gaps,
116
+ "new_questions": analysis.new_questions,
117
+ }
118
+ return format_as_xml(data, root_tag="latest_analysis")