haiku.rag-slim 0.17.1__tar.gz → 0.21.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/.gitignore +1 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/PKG-INFO +14 -10
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/README.md +1 -1
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/app.py +298 -70
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/chunkers/base.py +6 -3
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/chunkers/docling_local.py +63 -7
- haiku_rag_slim-0.21.0/haiku/rag/chunkers/docling_serve.py +203 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/cli.py +148 -23
- haiku_rag_slim-0.21.0/haiku/rag/cli_chat.py +489 -0
- haiku_rag_slim-0.21.0/haiku/rag/client.py +1699 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/config/__init__.py +14 -8
- haiku_rag_slim-0.21.0/haiku/rag/config/loader.py +55 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/config/models.py +66 -11
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/converters/base.py +9 -3
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/converters/docling_local.py +64 -61
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/converters/docling_serve.py +64 -43
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/converters/text_utils.py +60 -12
- haiku_rag_slim-0.21.0/haiku/rag/embeddings/__init__.py +119 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/embeddings/base.py +2 -2
- haiku_rag_slim-0.21.0/haiku/rag/embeddings/lm_studio.py +28 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/embeddings/vllm.py +1 -1
- haiku_rag_slim-0.21.0/haiku/rag/graph/__init__.py +15 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/graph/agui/__init__.py +8 -2
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/graph/agui/cli_renderer.py +1 -1
- haiku_rag_slim-0.21.0/haiku/rag/graph/agui/emitter.py +382 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/graph/agui/server.py +20 -62
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/graph/agui/stream.py +1 -2
- haiku_rag_slim-0.21.0/haiku/rag/graph/research/__init__.py +6 -0
- haiku_rag_slim-0.21.0/haiku/rag/graph/research/dependencies.py +37 -0
- haiku_rag_slim-0.21.0/haiku/rag/graph/research/graph.py +541 -0
- haiku_rag_slim-0.21.0/haiku/rag/graph/research/models.py +145 -0
- haiku_rag_slim-0.21.0/haiku/rag/graph/research/prompts.py +115 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/graph/research/state.py +35 -27
- haiku_rag_slim-0.21.0/haiku/rag/inspector/__init__.py +8 -0
- haiku_rag_slim-0.21.0/haiku/rag/inspector/app.py +243 -0
- haiku_rag_slim-0.21.0/haiku/rag/inspector/widgets/__init__.py +6 -0
- haiku_rag_slim-0.21.0/haiku/rag/inspector/widgets/chunk_list.py +100 -0
- haiku_rag_slim-0.21.0/haiku/rag/inspector/widgets/context_modal.py +89 -0
- haiku_rag_slim-0.21.0/haiku/rag/inspector/widgets/detail_view.py +130 -0
- haiku_rag_slim-0.21.0/haiku/rag/inspector/widgets/document_list.py +75 -0
- haiku_rag_slim-0.21.0/haiku/rag/inspector/widgets/info_modal.py +209 -0
- haiku_rag_slim-0.21.0/haiku/rag/inspector/widgets/search_modal.py +183 -0
- haiku_rag_slim-0.21.0/haiku/rag/inspector/widgets/visual_modal.py +126 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/mcp.py +25 -33
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/qa/__init__.py +2 -10
- haiku_rag_slim-0.21.0/haiku/rag/qa/agent.py +75 -0
- haiku_rag_slim-0.21.0/haiku/rag/qa/prompts.py +38 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/reranking/__init__.py +6 -6
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/reranking/base.py +1 -1
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/reranking/cohere.py +3 -2
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/reranking/mxbai.py +5 -2
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/reranking/vllm.py +1 -1
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/reranking/zeroentropy.py +3 -2
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/store/engine.py +110 -34
- haiku_rag_slim-0.21.0/haiku/rag/store/models/__init__.py +10 -0
- haiku_rag_slim-0.21.0/haiku/rag/store/models/chunk.py +207 -0
- haiku_rag_slim-0.21.0/haiku/rag/store/models/document.py +63 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/store/repositories/chunk.py +119 -118
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/store/repositories/document.py +21 -84
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/store/repositories/settings.py +10 -14
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/store/upgrades/__init__.py +15 -3
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/store/upgrades/v0_10_1.py +1 -1
- haiku_rag_slim-0.21.0/haiku/rag/store/upgrades/v0_19_6.py +65 -0
- haiku_rag_slim-0.21.0/haiku/rag/store/upgrades/v0_20_0.py +68 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/store/upgrades/v0_9_3.py +3 -3
- haiku_rag_slim-0.21.0/haiku/rag/utils.py +406 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/pyproject.toml +11 -9
- haiku_rag_slim-0.17.1/haiku/rag/chunkers/docling_serve.py +0 -111
- haiku_rag_slim-0.17.1/haiku/rag/client.py +0 -727
- haiku_rag_slim-0.17.1/haiku/rag/config/loader.py +0 -94
- haiku_rag_slim-0.17.1/haiku/rag/embeddings/__init__.py +0 -49
- haiku_rag_slim-0.17.1/haiku/rag/graph/__init__.py +0 -26
- haiku_rag_slim-0.17.1/haiku/rag/graph/agui/emitter.py +0 -197
- haiku_rag_slim-0.17.1/haiku/rag/graph/agui/events.py +0 -254
- haiku_rag_slim-0.17.1/haiku/rag/graph/common/__init__.py +0 -5
- haiku_rag_slim-0.17.1/haiku/rag/graph/common/models.py +0 -42
- haiku_rag_slim-0.17.1/haiku/rag/graph/common/nodes.py +0 -265
- haiku_rag_slim-0.17.1/haiku/rag/graph/common/prompts.py +0 -46
- haiku_rag_slim-0.17.1/haiku/rag/graph/common/utils.py +0 -44
- haiku_rag_slim-0.17.1/haiku/rag/graph/deep_qa/__init__.py +0 -1
- haiku_rag_slim-0.17.1/haiku/rag/graph/deep_qa/dependencies.py +0 -27
- haiku_rag_slim-0.17.1/haiku/rag/graph/deep_qa/graph.py +0 -243
- haiku_rag_slim-0.17.1/haiku/rag/graph/deep_qa/models.py +0 -20
- haiku_rag_slim-0.17.1/haiku/rag/graph/deep_qa/prompts.py +0 -59
- haiku_rag_slim-0.17.1/haiku/rag/graph/deep_qa/state.py +0 -56
- haiku_rag_slim-0.17.1/haiku/rag/graph/research/__init__.py +0 -3
- haiku_rag_slim-0.17.1/haiku/rag/graph/research/common.py +0 -87
- haiku_rag_slim-0.17.1/haiku/rag/graph/research/dependencies.py +0 -151
- haiku_rag_slim-0.17.1/haiku/rag/graph/research/graph.py +0 -295
- haiku_rag_slim-0.17.1/haiku/rag/graph/research/models.py +0 -166
- haiku_rag_slim-0.17.1/haiku/rag/graph/research/prompts.py +0 -107
- haiku_rag_slim-0.17.1/haiku/rag/qa/agent.py +0 -93
- haiku_rag_slim-0.17.1/haiku/rag/qa/prompts.py +0 -60
- haiku_rag_slim-0.17.1/haiku/rag/store/models/__init__.py +0 -4
- haiku_rag_slim-0.17.1/haiku/rag/store/models/chunk.py +0 -17
- haiku_rag_slim-0.17.1/haiku/rag/store/models/document.py +0 -17
- haiku_rag_slim-0.17.1/haiku/rag/utils.py +0 -148
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/LICENSE +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/__init__.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/chunkers/__init__.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/converters/__init__.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/embeddings/ollama.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/embeddings/openai.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/embeddings/voyageai.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/graph/agui/state.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/logging.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/monitor.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/store/__init__.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/store/repositories/__init__.py +0 -0
- {haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/test_agui_server.py +0 -0
{haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/PKG-INFO

@@ -1,7 +1,7 @@
 Metadata-Version: 2.4
 Name: haiku.rag-slim
-Version: 0.17.1
-Summary:
+Version: 0.21.0
+Summary: Opinionated agentic RAG powered by LanceDB, Pydantic AI, and Docling - Minimal dependencies
 Author-email: Yiorgis Gozadinos <ggozadinos@gmail.com>
 License: MIT
 License-File: LICENSE
@@ -17,12 +17,12 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Typing :: Typed
 Requires-Python: >=3.12
-Requires-Dist: docling-core==2.
+Requires-Dist: docling-core==2.54.0
 Requires-Dist: httpx>=0.28.1
-Requires-Dist: lancedb==0.25.
+Requires-Dist: lancedb==0.25.3
 Requires-Dist: pathspec>=0.12.1
-Requires-Dist: pydantic-ai-slim[ag-ui,fastmcp,logfire,openai]==1.
-Requires-Dist: pydantic>=2.12.
+Requires-Dist: pydantic-ai-slim[ag-ui,fastmcp,logfire,openai]==1.27.0
+Requires-Dist: pydantic>=2.12.5
 Requires-Dist: python-dotenv>=1.2.1
 Requires-Dist: pyyaml>=6.0.3
 Requires-Dist: rich>=14.2.0
@@ -33,13 +33,17 @@ Requires-Dist: pydantic-ai-slim[anthropic]; extra == 'anthropic'
 Provides-Extra: bedrock
 Requires-Dist: pydantic-ai-slim[bedrock]; extra == 'bedrock'
 Provides-Extra: cohere
-Requires-Dist: cohere>=5.
+Requires-Dist: cohere>=5.20.0; extra == 'cohere'
 Provides-Extra: docling
-Requires-Dist: docling==2.
+Requires-Dist: docling==2.64.0; extra == 'docling'
+Requires-Dist: opencv-python-headless>=4.11.0.86; extra == 'docling'
 Provides-Extra: google
 Requires-Dist: pydantic-ai-slim[google]; extra == 'google'
 Provides-Extra: groq
 Requires-Dist: pydantic-ai-slim[groq]; extra == 'groq'
+Provides-Extra: inspector
+Requires-Dist: textual-image>=0.8.4; extra == 'inspector'
+Requires-Dist: textual>=6.0.0; extra == 'inspector'
 Provides-Extra: mistral
 Requires-Dist: pydantic-ai-slim[mistral]; extra == 'mistral'
 Provides-Extra: mxbai
@@ -49,12 +53,12 @@ Requires-Dist: pydantic-ai-slim[vertexai]; extra == 'vertexai'
 Provides-Extra: voyageai
 Requires-Dist: voyageai>=0.3.5; extra == 'voyageai'
 Provides-Extra: zeroentropy
-Requires-Dist: zeroentropy>=0.1.
+Requires-Dist: zeroentropy>=0.1.0a7; extra == 'zeroentropy'
 Description-Content-Type: text/markdown
 
 # haiku.rag-slim
 
-
+Opinionated agentic RAG powered by LanceDB, Pydantic AI, and Docling - Core package with minimal dependencies.
 
 `haiku.rag-slim` is the core package for users who want to install only the dependencies they need. Document processing (docling), and reranker support are all optional extras.
 
{haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/README.md

@@ -1,6 +1,6 @@
 # haiku.rag-slim
 
-
+Opinionated agentic RAG powered by LanceDB, Pydantic AI, and Docling - Core package with minimal dependencies.
 
 `haiku.rag-slim` is the core package for users who want to install only the dependencies they need. Document processing (docling), and reranker support are all optional extras.
 
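The app.py diff below shows how the CLI drives the client in 0.21.0: `HaikuRAG` is opened as an async context manager, `search()` and `ask()` accept an optional SQL `filter`, and `ask()` now returns `(answer, citations)`. A small usage sketch based only on those calls; the database path, the query strings, and passing `Config` straight through are illustrative assumptions, not taken from the package:

```python
import asyncio

from haiku.rag.client import HaikuRAG
from haiku.rag.config import Config  # app.py imports Config from here; passing it as-is below is an assumption


async def main() -> None:
    # app.py opens the client the same way: async with HaikuRAG(db_path=..., config=...) as client
    async with HaikuRAG(db_path="./haiku-rag.db", config=Config) as client:
        # Search with an optional SQL WHERE filter, matching the call made by `haiku-rag search`.
        results = await client.search("vector index", limit=5, filter=None)
        for result in results:
            print(result.document_id, f"{result.score:.4f}")

        # Plain QA path: ask() returns (answer, citations) in 0.21.0.
        answer, citations = await client.ask("What does the inspector extra add?", filter=None)
        print(answer)


asyncio.run(main())
```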
{haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/app.py

@@ -3,12 +3,21 @@ import json
 import logging
 from importlib.metadata import version as pkg_version
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 from rich.console import Console
 from rich.markdown import Markdown
-from rich.progress import
-
-
+from rich.progress import (
+    BarColumn,
+    DownloadColumn,
+    Progress,
+    SpinnerColumn,
+    TaskID,
+    TextColumn,
+    TransferSpeedColumn,
+)
+
+from haiku.rag.client import HaikuRAG, RebuildMode
 from haiku.rag.config import AppConfig, Config
 from haiku.rag.graph.agui import AGUIConsoleRenderer, stream_graph
 from haiku.rag.graph.research.dependencies import ResearchContext
@@ -16,9 +25,12 @@ from haiku.rag.graph.research.graph import build_research_graph
 from haiku.rag.graph.research.state import ResearchDeps, ResearchState
 from haiku.rag.mcp import create_mcp_server
 from haiku.rag.monitor import FileWatcher
-from haiku.rag.store.models.chunk import Chunk
 from haiku.rag.store.models.document import Document
 
+if TYPE_CHECKING:
+    from haiku.rag.store.models import SearchResult
+from haiku.rag.utils import format_bytes, format_citations_rich
+
 logger = logging.getLogger(__name__)
 
 
@@ -28,6 +40,21 @@ class HaikuRAGApp:
         self.config = config
         self.console = Console()
 
+    async def init(self):
+        """Initialize a new database."""
+        if self.db_path.exists():
+            self.console.print(
+                f"[yellow]Database already exists at {self.db_path}[/yellow]"
+            )
+            return
+
+        # Create the database
+        client = HaikuRAG(db_path=self.db_path, config=self.config, create=True)
+        client.close()
+        self.console.print(
+            f"[bold green]Database initialized at {self.db_path}[/bold green]"
+        )
+
     async def info(self):
         """Display read-only information about the database without modifying it."""
 
@@ -64,7 +91,13 @@ class HaikuRAGApp:
         except Exception:
             docling_version = "unknown"
 
-        #
+        # Get comprehensive table statistics (this also runs migrations)
+        from haiku.rag.store.engine import Store
+
+        store = Store(self.db_path, config=self.config, skip_validation=True)
+        table_stats = store.get_stats()
+
+        # Read settings after Store init (migrations have run)
         stored_version = "unknown"
         embed_provider: str | None = None
         embed_model: str | None = None
@@ -79,14 +112,22 @@ class HaikuRAGApp:
             data = json.loads(raw) if isinstance(raw, str) else (raw or {})
             stored_version = str(data.get("version", stored_version))
             embeddings = data.get("embeddings", {})
-
-
-
+            embed_model_obj = embeddings.get("model", {})
+            embed_provider = embed_model_obj.get("provider")
+            embed_model = embed_model_obj.get("name")
+            vector_dim = embed_model_obj.get("vector_dim")
+
+        store.close()
+
+        num_docs = table_stats["documents"].get("num_rows", 0)
+        doc_bytes = table_stats["documents"].get("total_bytes", 0)
 
-
-
-
-
+        num_chunks = table_stats["chunks"].get("num_rows", 0)
+        chunk_bytes = table_stats["chunks"].get("total_bytes", 0)
+
+        has_vector_index = table_stats["chunks"].get("has_vector_index", False)
+        num_indexed_rows = table_stats["chunks"].get("num_indexed_rows", 0)
+        num_unindexed_rows = table_stats["chunks"].get("num_unindexed_rows", 0)
 
         # Table versions per table (direct API)
         doc_versions = (
@@ -116,8 +157,43 @@ class HaikuRAGApp:
             " [repr.attrib_name]embeddings[/repr.attrib_name]: unknown"
         )
         self.console.print(
-            f" [repr.attrib_name]documents[/repr.attrib_name]: {num_docs}"
+            f" [repr.attrib_name]documents[/repr.attrib_name]: {num_docs} "
+            f"({format_bytes(doc_bytes)})"
+        )
+        self.console.print(
+            f" [repr.attrib_name]chunks[/repr.attrib_name]: {num_chunks} "
+            f"({format_bytes(chunk_bytes)})"
         )
+
+        # Vector index information
+        if has_vector_index:
+            self.console.print(
+                " [repr.attrib_name]vector index[/repr.attrib_name]: ✓ exists"
+            )
+            self.console.print(
+                f" [repr.attrib_name]indexed chunks[/repr.attrib_name]: {num_indexed_rows}"
+            )
+            if num_unindexed_rows > 0:
+                self.console.print(
+                    f" [repr.attrib_name]unindexed chunks[/repr.attrib_name]: [yellow]{num_unindexed_rows}[/yellow] "
+                    "(consider running: haiku-rag create-index)"
+                )
+            else:
+                self.console.print(
+                    f" [repr.attrib_name]unindexed chunks[/repr.attrib_name]: {num_unindexed_rows}"
+                )
+        else:
+            if num_chunks >= 256:
+                self.console.print(
+                    " [repr.attrib_name]vector index[/repr.attrib_name]: [yellow]✗ not created[/yellow] "
+                    "(run: haiku-rag create-index)"
+                )
+            else:
+                self.console.print(
+                    f" [repr.attrib_name]vector index[/repr.attrib_name]: ✗ not created "
+                    f"(need {256 - num_chunks} more chunks)"
+                )
+
         self.console.print(
             f" [repr.attrib_name]versions (documents)[/repr.attrib_name]: {doc_versions}"
         )
@@ -137,9 +213,7 @@ class HaikuRAGApp:
         )
 
     async def list_documents(self, filter: str | None = None):
-        async with HaikuRAG(
-            db_path=self.db_path, config=self.config, allow_create=False
-        ) as self.client:
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
             documents = await self.client.list_documents(filter=filter)
             for doc in documents:
                 self._rich_print_document(doc, truncate=True)
@@ -172,9 +246,7 @@ class HaikuRAGApp:
         )
 
     async def get_document(self, doc_id: str):
-        async with HaikuRAG(
-            db_path=self.db_path, config=self.config, allow_create=False
-        ) as self.client:
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
             doc = await self.client.get_document_by_id(doc_id)
             if doc is None:
                 self.console.print(f"[red]Document with id {doc_id} not found.[/red]")
@@ -193,16 +265,48 @@ class HaikuRAGApp:
                 f"[yellow]Document with id {doc_id} not found.[/yellow]"
             )
 
-    async def search(
-
-
-        ) as self.client:
+    async def search(
+        self, query: str, limit: int | None = None, filter: str | None = None
+    ):
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
             results = await self.client.search(query, limit=limit, filter=filter)
             if not results:
                 self.console.print("[yellow]No results found.[/yellow]")
                 return
-            for
-                self._rich_print_search_result(
+            for result in results:
+                self._rich_print_search_result(result)
+
+    async def visualize_chunk(self, chunk_id: str):
+        """Display visual grounding images for a chunk."""
+        from textual_image.renderable import Image as RichImage
+
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
+            chunk = await self.client.chunk_repository.get_by_id(chunk_id)
+            if not chunk:
+                self.console.print(f"[red]Chunk with id {chunk_id} not found.[/red]")
+                return
+
+            images = await self.client.visualize_chunk(chunk)
+            if not images:
+                self.console.print(
+                    "[yellow]No visual grounding available for this chunk.[/yellow]"
+                )
+                self.console.print(
+                    "This may be because the document was converted without page images."
+                )
+                return
+
+            self.console.print(f"[bold]Visual grounding for chunk {chunk_id}[/bold]")
+            if chunk.document_uri:
+                self.console.print(
+                    f"[repr.attrib_name]document[/repr.attrib_name]: {chunk.document_uri}"
+                )
+
+            for i, img in enumerate(images):
+                self.console.print(
+                    f"\n[bold cyan]Page {i + 1}/{len(images)}[/bold cyan]"
+                )
+                self.console.print(RichImage(img))
 
     async def ask(
         self,
@@ -210,6 +314,7 @@ class HaikuRAGApp:
         cite: bool = False,
         deep: bool = False,
         verbose: bool = False,
+        filter: str | None = None,
     ):
         """Ask a question using the RAG system.
 
@@ -218,57 +323,78 @@ class HaikuRAGApp:
             cite: Include citations in the answer
             deep: Use deep QA mode (multi-step reasoning)
             verbose: Show verbose output
+            filter: SQL WHERE clause to filter documents
         """
-        async with HaikuRAG(
-            db_path=self.db_path, config=self.config, allow_create=False
-        ) as self.client:
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as self.client:
             try:
+                citations = []
                 if deep:
-                    from haiku.rag.graph.
-
-
-
-
-
-
+                    from haiku.rag.graph.research.models import ResearchReport
+
+                    graph = build_research_graph(config=self.config)
+                    context = ResearchContext(original_question=question)
+                    state = ResearchState.from_config(
+                        context=context,
+                        config=self.config,
+                        max_iterations=2,
+                        confidence_threshold=0.0,
                     )
-                    state =
-                    deps =
+                    state.search_filter = filter
+                    deps = ResearchDeps(client=self.client)
 
                     if verbose:
-                        # Use AG-UI renderer to process and display events
-                        from haiku.rag.graph.agui import AGUIConsoleRenderer
-
                         renderer = AGUIConsoleRenderer(self.console)
                         result_dict = await renderer.render(
                             stream_graph(graph, state, deps)
                         )
-
-
+                        report = (
+                            ResearchReport.model_validate(result_dict)
+                            if result_dict
+                            else None
+                        )
                     else:
-
-
-
+                        report = await graph.run(state=state, deps=deps)
+
+                    self.console.print(f"[bold blue]Question:[/bold blue] {question}")
+                    self.console.print()
+                    if report:
+                        self.console.print("[bold green]Answer:[/bold green]")
+                        self.console.print(Markdown(report.executive_summary))
+                        if report.main_findings:
+                            self.console.print()
+                            self.console.print("[bold cyan]Key Findings:[/bold cyan]")
+                            for finding in report.main_findings:
+                                self.console.print(f"• {finding}")
+                        if report.sources_summary:
+                            self.console.print()
+                            self.console.print("[bold cyan]Sources:[/bold cyan]")
+                            self.console.print(report.sources_summary)
+                    else:
+                        self.console.print("[yellow]No answer generated.[/yellow]")
                 else:
-                    answer = await self.client.ask(question,
+                    answer, citations = await self.client.ask(question, filter=filter)
 
-
-
-
-
+                    self.console.print(f"[bold blue]Question:[/bold blue] {question}")
+                    self.console.print()
+                    self.console.print("[bold green]Answer:[/bold green]")
+                    self.console.print(Markdown(answer))
+                    if cite and citations:
+                        for renderable in format_citations_rich(citations):
+                            self.console.print(renderable)
             except Exception as e:
                 self.console.print(f"[red]Error: {e}[/red]")
 
-    async def research(
+    async def research(
+        self, question: str, verbose: bool = False, filter: str | None = None
+    ):
         """Run research via the pydantic-graph pipeline.
 
         Args:
             question: The research question
            verbose: Show AG-UI event stream during execution
+            filter: SQL WHERE clause to filter documents
         """
-        async with HaikuRAG(
-            db_path=self.db_path, config=self.config, allow_create=False
-        ) as client:
+        async with HaikuRAG(db_path=self.db_path, config=self.config) as client:
            try:
                self.console.print("[bold cyan]Starting research[/bold cyan]")
                self.console.print(f"[bold blue]Question:[/bold blue] {question}")
@@ -277,6 +403,7 @@ class HaikuRAGApp:
                graph = build_research_graph(config=self.config)
                context = ResearchContext(original_question=question)
                state = ResearchState.from_config(context=context, config=self.config)
+                state.search_filter = filter
                deps = ResearchDeps(client=client)

                if verbose:
@@ -356,7 +483,7 @@ class HaikuRAGApp:
            except Exception as e:
                self.console.print(f"[red]Error during research: {e}[/red]")

-    async def rebuild(self):
+    async def rebuild(self, mode: RebuildMode = RebuildMode.FULL):
        async with HaikuRAG(
            db_path=self.db_path, config=self.config, skip_validation=True
        ) as client:
@@ -370,12 +497,18 @@ class HaikuRAGApp:
                )
                return

+            mode_desc = {
+                RebuildMode.FULL: "full rebuild",
+                RebuildMode.RECHUNK: "rechunk",
+                RebuildMode.EMBED_ONLY: "embed only",
+            }[mode]
+
            self.console.print(
-                f"[bold cyan]Rebuilding database with {total_docs} documents...[/bold cyan]"
+                f"[bold cyan]Rebuilding database ({mode_desc}) with {total_docs} documents...[/bold cyan]"
            )
            with Progress() as progress:
                task = progress.add_task("Rebuilding...", total=total_docs)
-                async for _ in client.rebuild_database():
+                async for _ in client.rebuild_database(mode=mode):
                    progress.update(task, advance=1)

            self.console.print(
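`RebuildMode` is imported from `haiku.rag.client` in the new app.py, and the `mode_desc` mapping above shows its members: FULL, RECHUNK, and EMBED_ONLY. A hypothetical sketch of such an enum and the dispatch the CLI performs; the string values and the per-mode behaviour comments are assumptions, not taken from the package:

```python
from enum import Enum


class RebuildMode(str, Enum):
    # Member names come from the diff above; the string values are assumed.
    FULL = "full"          # assumed: re-convert, re-chunk, and re-embed every document
    RECHUNK = "rechunk"    # assumed: keep converted documents, redo chunking and embeddings
    EMBED_ONLY = "embed"   # assumed: keep existing chunks, only recompute embeddings


def describe(mode: RebuildMode) -> str:
    """Mirror the mode_desc mapping used by HaikuRAGApp.rebuild()."""
    return {
        RebuildMode.FULL: "full rebuild",
        RebuildMode.RECHUNK: "rechunk",
        RebuildMode.EMBED_ONLY: "embed only",
    }[mode]


print(describe(RebuildMode.RECHUNK))  # -> "rechunk"
```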
@@ -397,6 +530,96 @@ class HaikuRAGApp:
        except Exception as e:
            self.console.print(f"[red]Error during vacuum: {e}[/red]")

+    async def create_index(self):
+        """Create vector index on the chunks table."""
+        try:
+            async with HaikuRAG(
+                db_path=self.db_path, config=self.config, skip_validation=True
+            ) as client:
+                row_count = client.store.chunks_table.count_rows()
+                self.console.print(f"Chunks in database: {row_count}")
+
+                if row_count < 256:
+                    self.console.print(
+                        f"[yellow]Warning: Need at least 256 chunks to create an index (have {row_count})[/yellow]"
+                    )
+                    return
+
+                # Check if index already exists
+                indices = client.store.chunks_table.list_indices()
+                has_vector_index = any("vector" in str(idx).lower() for idx in indices)
+
+                if has_vector_index:
+                    self.console.print(
+                        "[yellow]Rebuilding existing vector index...[/yellow]"
+                    )
+                else:
+                    self.console.print("[bold]Creating vector index...[/bold]")
+
+                client.store._ensure_vector_index()
+                self.console.print(
+                    "[bold green]Vector index created successfully.[/bold green]"
+                )
+        except Exception as e:
+            self.console.print(f"[red]Error creating index: {e}[/red]")
+
+    async def download_models(self):
+        """Download Docling, HuggingFace tokenizer, and Ollama models per config."""
+        from haiku.rag.client import HaikuRAG
+
+        client = HaikuRAG(db_path=None, config=self.config)
+
+        progress: Progress | None = None
+        task_id: TaskID | None = None
+        current_model = ""
+        current_digest = ""
+
+        async for event in client.download_models():
+            if event.status == "start":
+                self.console.print(
+                    f"[bold blue]Downloading {event.model}...[/bold blue]"
+                )
+            elif event.status == "done":
+                if progress:
+                    progress.stop()
+                    progress = None
+                    task_id = None
+                self.console.print(f"[green]✓[/green] {event.model}")
+                current_model = ""
+                current_digest = ""
+            elif event.status == "pulling":
+                self.console.print(f"[bold blue]Pulling {event.model}...[/bold blue]")
+                current_model = event.model
+                progress = Progress(
+                    SpinnerColumn(),
+                    TextColumn("[progress.description]{task.description}"),
+                    BarColumn(),
+                    DownloadColumn(),
+                    TransferSpeedColumn(),
+                    console=self.console,
+                    transient=True,
+                    auto_refresh=False,
+                )
+                progress.start()
+                task_id = progress.add_task(event.model, total=None)
+            elif event.status == "downloading" and progress and task_id is not None:
+                if event.digest != current_digest:
+                    current_digest = event.digest
+                    short_digest = event.digest[:19] if event.digest else ""
+                    progress.update(
+                        task_id,
+                        description=f"{current_model} ({short_digest})",
+                        total=event.total,
+                        completed=0,
+                    )
+                progress.update(task_id, completed=event.completed, refresh=True)
+            elif progress and task_id is not None:
+                progress.update(
+                    task_id,
+                    description=f"{current_model}: {event.status}",
+                    refresh=True,
+                )
+
    def show_settings(self):
        """Display current configuration settings."""
        self.console.print("[bold]haiku.rag configuration[/bold]")
@@ -447,22 +670,27 @@ class HaikuRAGApp:
        self.console.print(content)
        self.console.rule()

-    def _rich_print_search_result(self,
-        """Format a search result
-        content = Markdown(
+    def _rich_print_search_result(self, result: "SearchResult"):
+        """Format a search result for display."""
+        content = Markdown(result.content)
        self.console.print(
-            f"[repr.attrib_name]document_id[/repr.attrib_name]: {
-            f"[repr.attrib_name]
+            f"[repr.attrib_name]document_id[/repr.attrib_name]: {result.document_id} "
+            f"[repr.attrib_name]chunk_id[/repr.attrib_name]: {result.chunk_id} "
+            f"[repr.attrib_name]score[/repr.attrib_name]: {result.score:.4f}"
        )
-        if
-            self.console.print(
-
-
+        if result.document_uri:
+            self.console.print(
+                f"[repr.attrib_name]document uri[/repr.attrib_name]: {result.document_uri}"
+            )
+        if result.document_title:
            self.console.print("[repr.attrib_name]document title[/repr.attrib_name]:")
-            self.console.print(
-        if
-            self.console.print("[repr.attrib_name]
-            self.console.print(
+            self.console.print(result.document_title)
+        if result.page_numbers:
+            self.console.print("[repr.attrib_name]pages[/repr.attrib_name]:")
+            self.console.print(", ".join(str(p) for p in result.page_numbers))
+        if result.headings:
+            self.console.print("[repr.attrib_name]headings[/repr.attrib_name]:")
+            self.console.print(" > ".join(result.headings))
        self.console.print("[repr.attrib_name]content[/repr.attrib_name]:")
        self.console.print(content)
        self.console.rule()
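The rewritten `_rich_print_search_result` above treats its argument as a `SearchResult` imported from `haiku.rag.store.models`. The fields it reads suggest roughly the shape sketched below; this is inferred from that usage only, and the field types and dataclass form are assumptions (the real model may well be a Pydantic model):

```python
from dataclasses import dataclass, field


@dataclass
class SearchResult:
    # Fields inferred from how app.py reads the search result object.
    document_id: str
    chunk_id: str
    score: float
    content: str
    document_uri: str | None = None
    document_title: str | None = None
    page_numbers: list[int] = field(default_factory=list)
    headings: list[str] = field(default_factory=list)
```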
{haiku_rag_slim-0.17.1 → haiku_rag_slim-0.21.0}/haiku/rag/chunkers/base.py

@@ -4,6 +4,8 @@ from typing import TYPE_CHECKING
 if TYPE_CHECKING:
     from docling_core.types.doc.document import DoclingDocument
 
+    from haiku.rag.store.models.chunk import Chunk
+
 
 class DocumentChunker(ABC):
     """Abstract base class for document chunkers.
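This import supports the signature change in the next hunk: chunk() now returns a list of Chunk objects carrying structured metadata. As an illustration of coding against that updated interface, a hypothetical subclass might look like the sketch below; the Chunk constructor arguments and the metadata keys are assumptions inferred from the docstring in the hunk that follows, not copied from the package:

```python
from typing import TYPE_CHECKING

from haiku.rag.chunkers.base import DocumentChunker
from haiku.rag.store.models.chunk import Chunk

if TYPE_CHECKING:
    from docling_core.types.doc.document import DoclingDocument


class ParagraphChunker(DocumentChunker):
    """Hypothetical chunker: one Chunk per markdown paragraph."""

    async def chunk(self, document: "DoclingDocument") -> list[Chunk]:
        chunks: list[Chunk] = []
        # export_to_markdown() is part of docling-core's DoclingDocument API;
        # splitting on blank lines is a deliberate simplification for illustration.
        for block in document.export_to_markdown().split("\n\n"):
            if not block.strip():
                continue
            chunks.append(
                Chunk(
                    content=block,
                    # Metadata keys follow the base-class docstring; a real chunker
                    # would fill these from the docling items it visited.
                    metadata={
                        "doc_item_refs": [],
                        "headings": [],
                        "labels": [],
                        "page_numbers": [],
                    },
                )
            )
        return chunks
```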
@@ -13,14 +15,15 @@ class DocumentChunker(ABC):
     """
 
     @abstractmethod
-    async def chunk(self, document: "DoclingDocument") -> list[
-        """Split a document into chunks.
+    async def chunk(self, document: "DoclingDocument") -> list["Chunk"]:
+        """Split a document into chunks with metadata.
 
         Args:
             document: The DoclingDocument to chunk.
 
         Returns:
-            List of
+            List of Chunk with content and structured metadata in the metadata dict
+            (doc_item_refs, headings, labels, page_numbers).
 
         Raises:
             ValueError: If chunking fails.