haiku.rag 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of haiku.rag might be problematic. Click here for more details.
- haiku/rag/app.py +64 -18
- haiku/rag/cli.py +67 -30
- haiku/rag/client.py +63 -21
- haiku/rag/config.py +4 -0
- haiku/rag/mcp.py +18 -6
- haiku/rag/qa/agent.py +4 -2
- haiku/rag/qa/prompts.py +2 -2
- haiku/rag/reranking/mxbai.py +1 -1
- haiku/rag/research/__init__.py +10 -27
- haiku/rag/research/common.py +53 -0
- haiku/rag/research/dependencies.py +3 -25
- haiku/rag/research/graph.py +29 -0
- haiku/rag/research/models.py +70 -0
- haiku/rag/research/nodes/evaluate.py +80 -0
- haiku/rag/research/nodes/plan.py +63 -0
- haiku/rag/research/nodes/search.py +93 -0
- haiku/rag/research/nodes/synthesize.py +51 -0
- haiku/rag/research/prompts.py +98 -113
- haiku/rag/research/state.py +25 -0
- haiku/rag/store/engine.py +14 -0
- haiku/rag/store/models/chunk.py +1 -0
- haiku/rag/store/models/document.py +1 -0
- haiku/rag/store/repositories/chunk.py +4 -0
- haiku/rag/store/repositories/document.py +3 -0
- haiku/rag/store/upgrades/__init__.py +2 -0
- haiku/rag/store/upgrades/v0_10_1.py +64 -0
- haiku/rag/utils.py +8 -5
- {haiku_rag-0.9.3.dist-info → haiku_rag-0.10.1.dist-info}/METADATA +37 -1
- haiku_rag-0.10.1.dist-info/RECORD +54 -0
- haiku/rag/research/base.py +0 -130
- haiku/rag/research/evaluation_agent.py +0 -85
- haiku/rag/research/orchestrator.py +0 -170
- haiku/rag/research/presearch_agent.py +0 -39
- haiku/rag/research/search_agent.py +0 -69
- haiku/rag/research/synthesis_agent.py +0 -60
- haiku_rag-0.9.3.dist-info/RECORD +0 -51
- {haiku_rag-0.9.3.dist-info → haiku_rag-0.10.1.dist-info}/WHEEL +0 -0
- {haiku_rag-0.9.3.dist-info → haiku_rag-0.10.1.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.9.3.dist-info → haiku_rag-0.10.1.dist-info}/licenses/LICENSE +0 -0
haiku/rag/app.py
CHANGED
|
@@ -9,7 +9,13 @@ from haiku.rag.client import HaikuRAG
|
|
|
9
9
|
from haiku.rag.config import Config
|
|
10
10
|
from haiku.rag.mcp import create_mcp_server
|
|
11
11
|
from haiku.rag.monitor import FileWatcher
|
|
12
|
-
from haiku.rag.research.orchestrator import ResearchOrchestrator
|
|
12
|
+
from haiku.rag.research.dependencies import ResearchContext
|
|
13
|
+
from haiku.rag.research.graph import (
|
|
14
|
+
PlanNode,
|
|
15
|
+
ResearchDeps,
|
|
16
|
+
ResearchState,
|
|
17
|
+
build_research_graph,
|
|
18
|
+
)
|
|
13
19
|
from haiku.rag.store.models.chunk import Chunk
|
|
14
20
|
from haiku.rag.store.models.document import Document
|
|
15
21
|
|
|
@@ -33,9 +39,9 @@ class HaikuRAGApp:
|
|
|
33
39
|
f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
|
|
34
40
|
)
|
|
35
41
|
|
|
36
|
-
async def add_document_from_source(self, source: str):
|
|
42
|
+
async def add_document_from_source(self, source: str, title: str | None = None):
|
|
37
43
|
async with HaikuRAG(db_path=self.db_path) as self.client:
|
|
38
|
-
doc = await self.client.create_document_from_source(source)
|
|
44
|
+
doc = await self.client.create_document_from_source(source, title=title)
|
|
39
45
|
self._rich_print_document(doc, truncate=True)
|
|
40
46
|
self.console.print(
|
|
41
47
|
f"[b]Document with id [cyan]{doc.id}[/cyan] added successfully.[/b]"
|
|
@@ -80,28 +86,53 @@ class HaikuRAGApp:
|
|
|
80
86
|
self.console.print(f"[red]Error: {e}[/red]")
|
|
81
87
|
|
|
82
88
|
async def research(
|
|
83
|
-
self,
|
|
89
|
+
self,
|
|
90
|
+
question: str,
|
|
91
|
+
max_iterations: int = 3,
|
|
92
|
+
confidence_threshold: float = 0.8,
|
|
93
|
+
max_concurrency: int = 1,
|
|
94
|
+
verbose: bool = False,
|
|
84
95
|
):
|
|
85
|
-
"""Run
|
|
96
|
+
"""Run research via the pydantic-graph pipeline (default)."""
|
|
86
97
|
async with HaikuRAG(db_path=self.db_path) as client:
|
|
87
98
|
try:
|
|
88
|
-
# Create orchestrator with default config or fallback to QA
|
|
89
|
-
orchestrator = ResearchOrchestrator()
|
|
90
|
-
|
|
91
99
|
if verbose:
|
|
92
|
-
self.console.print(
|
|
93
|
-
f"[bold cyan]Starting research with {orchestrator.provider}:{orchestrator.model}[/bold cyan]"
|
|
94
|
-
)
|
|
100
|
+
self.console.print("[bold cyan]Starting research[/bold cyan]")
|
|
95
101
|
self.console.print(f"[bold blue]Question:[/bold blue] {question}")
|
|
96
102
|
self.console.print()
|
|
97
103
|
|
|
98
|
-
|
|
99
|
-
|
|
104
|
+
graph = build_research_graph()
|
|
105
|
+
state = ResearchState(
|
|
100
106
|
question=question,
|
|
101
|
-
|
|
107
|
+
context=ResearchContext(original_question=question),
|
|
102
108
|
max_iterations=max_iterations,
|
|
103
|
-
|
|
109
|
+
confidence_threshold=confidence_threshold,
|
|
110
|
+
max_concurrency=max_concurrency,
|
|
111
|
+
)
|
|
112
|
+
deps = ResearchDeps(
|
|
113
|
+
client=client, console=self.console if verbose else None
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
start = PlanNode(
|
|
117
|
+
provider=Config.RESEARCH_PROVIDER or Config.QA_PROVIDER,
|
|
118
|
+
model=Config.RESEARCH_MODEL or Config.QA_MODEL,
|
|
104
119
|
)
|
|
120
|
+
# Prefer graph.run; fall back to iter if unavailable
|
|
121
|
+
report = None
|
|
122
|
+
try:
|
|
123
|
+
result = await graph.run(start, state=state, deps=deps)
|
|
124
|
+
report = result.output
|
|
125
|
+
except Exception:
|
|
126
|
+
from pydantic_graph import End
|
|
127
|
+
|
|
128
|
+
async with graph.iter(start, state=state, deps=deps) as run:
|
|
129
|
+
node = run.next_node
|
|
130
|
+
while not isinstance(node, End):
|
|
131
|
+
node = await run.next(node)
|
|
132
|
+
if run.result:
|
|
133
|
+
report = run.result.output
|
|
134
|
+
if report is None:
|
|
135
|
+
raise RuntimeError("Graph did not produce a report")
|
|
105
136
|
|
|
106
137
|
# Display the report
|
|
107
138
|
self.console.print("[bold green]Research Report[/bold green]")
|
|
@@ -114,6 +145,12 @@ class HaikuRAGApp:
|
|
|
114
145
|
self.console.print(report.executive_summary)
|
|
115
146
|
self.console.print()
|
|
116
147
|
|
|
148
|
+
# Confidence (from last evaluation)
|
|
149
|
+
if state.last_eval:
|
|
150
|
+
conf = state.last_eval.confidence_score # type: ignore[attr-defined]
|
|
151
|
+
self.console.print(f"[bold cyan]Confidence:[/bold cyan] {conf:.1%}")
|
|
152
|
+
self.console.print()
|
|
153
|
+
|
|
117
154
|
# Main Findings
|
|
118
155
|
if report.main_findings:
|
|
119
156
|
self.console.print("[bold cyan]Main Findings:[/bold cyan]")
|
|
@@ -215,8 +252,16 @@ class HaikuRAGApp:
|
|
|
215
252
|
content = Markdown(content)
|
|
216
253
|
else:
|
|
217
254
|
content = Markdown(doc.content)
|
|
255
|
+
title_part = (
|
|
256
|
+
f" [repr.attrib_name]title[/repr.attrib_name]: {doc.title}"
|
|
257
|
+
if doc.title
|
|
258
|
+
else ""
|
|
259
|
+
)
|
|
218
260
|
self.console.print(
|
|
219
|
-
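f"[repr.attrib_name]id[/repr.attrib_name]: {doc.id} [repr.attrib_name]uri[/repr.attrib_name]: {doc.uri} [repr.attrib_name]meta[/repr.attrib_name]: {doc.metadata}"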
|
|
261
|
+
f"[repr.attrib_name]id[/repr.attrib_name]: {doc.id} "
|
|
262
|
+
f"[repr.attrib_name]uri[/repr.attrib_name]: {doc.uri}"
|
|
263
|
+
+ title_part
|
|
264
|
+
+ f" [repr.attrib_name]meta[/repr.attrib_name]: {doc.metadata}"
|
|
220
265
|
)
|
|
221
266
|
self.console.print(
|
|
222
267
|
f"[repr.attrib_name]created at[/repr.attrib_name]: {doc.created_at} [repr.attrib_name]updated at[/repr.attrib_name]: {doc.updated_at}"
|
|
@@ -235,6 +280,9 @@ class HaikuRAGApp:
|
|
|
235
280
|
if chunk.document_uri:
|
|
236
281
|
self.console.print("[repr.attrib_name]document uri[/repr.attrib_name]:")
|
|
237
282
|
self.console.print(chunk.document_uri)
|
|
283
|
+
if chunk.document_title:
|
|
284
|
+
self.console.print("[repr.attrib_name]document title[/repr.attrib_name]:")
|
|
285
|
+
self.console.print(chunk.document_title)
|
|
238
286
|
if chunk.document_meta:
|
|
239
287
|
self.console.print("[repr.attrib_name]document meta[/repr.attrib_name]:")
|
|
240
288
|
self.console.print(chunk.document_meta)
|
|
@@ -252,8 +300,6 @@ class HaikuRAGApp:
|
|
|
252
300
|
try:
|
|
253
301
|
if transport == "stdio":
|
|
254
302
|
await server.run_stdio_async()
|
|
255
|
-
elif transport == "sse":
|
|
256
|
-
await server.run_sse_async()
|
|
257
303
|
else:
|
|
258
304
|
await server.run_http_async(transport="streamable-http")
|
|
259
305
|
except KeyboardInterrupt:
|
haiku/rag/cli.py
CHANGED
|
@@ -3,28 +3,16 @@ import warnings
|
|
|
3
3
|
from importlib.metadata import version
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
|
-
import logfire
|
|
7
6
|
import typer
|
|
8
|
-
from rich.console import Console
|
|
9
7
|
|
|
10
|
-
from haiku.rag.app import HaikuRAGApp
|
|
11
8
|
from haiku.rag.config import Config
|
|
12
9
|
from haiku.rag.logging import configure_cli_logging
|
|
13
|
-
from haiku.rag.migration import migrate_sqlite_to_lancedb
|
|
14
10
|
from haiku.rag.utils import is_up_to_date
|
|
15
11
|
|
|
16
|
-
logfire.configure(send_to_logfire="if-token-present")
|
|
17
|
-
logfire.instrument_pydantic_ai()
|
|
18
|
-
|
|
19
|
-
if not Config.ENV == "development":
|
|
20
|
-
warnings.filterwarnings("ignore")
|
|
21
|
-
|
|
22
12
|
cli = typer.Typer(
|
|
23
13
|
context_settings={"help_option_names": ["-h", "--help"]}, no_args_is_help=True
|
|
24
14
|
)
|
|
25
15
|
|
|
26
|
-
console = Console()
|
|
27
|
-
|
|
28
16
|
|
|
29
17
|
def complete_document_ids(ctx: typer.Context, incomplete: str):
|
|
30
18
|
"""Autocomplete document IDs from the selected DB."""
|
|
@@ -89,16 +77,16 @@ async def check_version():
|
|
|
89
77
|
"""Check if haiku.rag is up to date and show warning if not."""
|
|
90
78
|
up_to_date, current_version, latest_version = await is_up_to_date()
|
|
91
79
|
if not up_to_date:
|
|
92
|
-
|
|
93
|
-
f"
|
|
80
|
+
typer.echo(
|
|
81
|
+
f"Warning: haiku.rag is outdated. Current: {current_version}, Latest: {latest_version}",
|
|
94
82
|
)
|
|
95
|
-
|
|
83
|
+
typer.echo("Please update.")
|
|
96
84
|
|
|
97
85
|
|
|
98
86
|
def version_callback(value: bool):
|
|
99
87
|
if value:
|
|
100
88
|
v = version("haiku.rag")
|
|
101
|
-
|
|
89
|
+
typer.echo(f"haiku.rag version {v}")
|
|
102
90
|
raise typer.Exit()
|
|
103
91
|
|
|
104
92
|
|
|
@@ -113,10 +101,26 @@ def main(
|
|
|
113
101
|
),
|
|
114
102
|
):
|
|
115
103
|
"""haiku.rag CLI - Vector database RAG system"""
|
|
116
|
-
#
|
|
117
|
-
|
|
104
|
+
# Configure logging minimally for CLI context
|
|
105
|
+
if Config.ENV == "development":
|
|
106
|
+
# Lazy import logfire only in development
|
|
107
|
+
try:
|
|
108
|
+
import logfire # type: ignore
|
|
109
|
+
|
|
110
|
+
logfire.configure(send_to_logfire="if-token-present")
|
|
111
|
+
logfire.instrument_pydantic_ai()
|
|
112
|
+
except Exception:
|
|
113
|
+
pass
|
|
114
|
+
else:
|
|
115
|
+
configure_cli_logging()
|
|
116
|
+
warnings.filterwarnings("ignore")
|
|
117
|
+
|
|
118
118
|
# Run version check before any command
|
|
119
|
-
|
|
119
|
+
try:
|
|
120
|
+
asyncio.run(check_version())
|
|
121
|
+
except Exception:
|
|
122
|
+
# Do not block CLI on version check issues
|
|
123
|
+
pass
|
|
120
124
|
|
|
121
125
|
|
|
122
126
|
@cli.command("list", help="List all stored documents")
|
|
@@ -127,6 +131,8 @@ def list_documents(
|
|
|
127
131
|
help="Path to the LanceDB database file",
|
|
128
132
|
),
|
|
129
133
|
):
|
|
134
|
+
from haiku.rag.app import HaikuRAGApp
|
|
135
|
+
|
|
130
136
|
app = HaikuRAGApp(db_path=db)
|
|
131
137
|
asyncio.run(app.list_documents())
|
|
132
138
|
|
|
@@ -142,6 +148,8 @@ def add_document_text(
|
|
|
142
148
|
help="Path to the LanceDB database file",
|
|
143
149
|
),
|
|
144
150
|
):
|
|
151
|
+
from haiku.rag.app import HaikuRAGApp
|
|
152
|
+
|
|
145
153
|
app = HaikuRAGApp(db_path=db)
|
|
146
154
|
asyncio.run(app.add_document_from_text(text=text))
|
|
147
155
|
|
|
@@ -152,14 +160,21 @@ def add_document_src(
|
|
|
152
160
|
help="The file path or URL of the document to add",
|
|
153
161
|
autocompletion=complete_local_paths,
|
|
154
162
|
),
|
|
163
|
+
title: str | None = typer.Option(
|
|
164
|
+
None,
|
|
165
|
+
"--title",
|
|
166
|
+
help="Optional human-readable title to store with the document",
|
|
167
|
+
),
|
|
155
168
|
db: Path = typer.Option(
|
|
156
169
|
Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
|
|
157
170
|
"--db",
|
|
158
171
|
help="Path to the LanceDB database file",
|
|
159
172
|
),
|
|
160
173
|
):
|
|
174
|
+
from haiku.rag.app import HaikuRAGApp
|
|
175
|
+
|
|
161
176
|
app = HaikuRAGApp(db_path=db)
|
|
162
|
-
asyncio.run(app.add_document_from_source(source=source))
|
|
177
|
+
asyncio.run(app.add_document_from_source(source=source, title=title))
|
|
163
178
|
|
|
164
179
|
|
|
165
180
|
@cli.command("get", help="Get and display a document by its ID")
|
|
@@ -174,6 +189,8 @@ def get_document(
|
|
|
174
189
|
help="Path to the LanceDB database file",
|
|
175
190
|
),
|
|
176
191
|
):
|
|
192
|
+
from haiku.rag.app import HaikuRAGApp
|
|
193
|
+
|
|
177
194
|
app = HaikuRAGApp(db_path=db)
|
|
178
195
|
asyncio.run(app.get_document(doc_id=doc_id))
|
|
179
196
|
|
|
@@ -190,6 +207,8 @@ def delete_document(
|
|
|
190
207
|
help="Path to the LanceDB database file",
|
|
191
208
|
),
|
|
192
209
|
):
|
|
210
|
+
from haiku.rag.app import HaikuRAGApp
|
|
211
|
+
|
|
193
212
|
app = HaikuRAGApp(db_path=db)
|
|
194
213
|
asyncio.run(app.delete_document(doc_id=doc_id))
|
|
195
214
|
|
|
@@ -215,6 +234,8 @@ def search(
|
|
|
215
234
|
help="Path to the LanceDB database file",
|
|
216
235
|
),
|
|
217
236
|
):
|
|
237
|
+
from haiku.rag.app import HaikuRAGApp
|
|
238
|
+
|
|
218
239
|
app = HaikuRAGApp(db_path=db)
|
|
219
240
|
asyncio.run(app.search(query=query, limit=limit))
|
|
220
241
|
|
|
@@ -235,6 +256,8 @@ def ask(
|
|
|
235
256
|
help="Include citations in the response",
|
|
236
257
|
),
|
|
237
258
|
):
|
|
259
|
+
from haiku.rag.app import HaikuRAGApp
|
|
260
|
+
|
|
238
261
|
app = HaikuRAGApp(db_path=db)
|
|
239
262
|
asyncio.run(app.ask(question=question, cite=cite))
|
|
240
263
|
|
|
@@ -250,6 +273,16 @@ def research(
|
|
|
250
273
|
"-n",
|
|
251
274
|
help="Maximum search/analyze iterations",
|
|
252
275
|
),
|
|
276
|
+
confidence_threshold: float = typer.Option(
|
|
277
|
+
0.8,
|
|
278
|
+
"--confidence-threshold",
|
|
279
|
+
help="Minimum confidence (0-1) to stop",
|
|
280
|
+
),
|
|
281
|
+
max_concurrency: int = typer.Option(
|
|
282
|
+
1,
|
|
283
|
+
"--max-concurrency",
|
|
284
|
+
help="Max concurrent searches per iteration (planned)",
|
|
285
|
+
),
|
|
253
286
|
db: Path = typer.Option(
|
|
254
287
|
Config.DEFAULT_DATA_DIR / "haiku.rag.lancedb",
|
|
255
288
|
"--db",
|
|
@@ -261,11 +294,15 @@ def research(
|
|
|
261
294
|
help="Show verbose progress output",
|
|
262
295
|
),
|
|
263
296
|
):
|
|
297
|
+
from haiku.rag.app import HaikuRAGApp
|
|
298
|
+
|
|
264
299
|
app = HaikuRAGApp(db_path=db)
|
|
265
300
|
asyncio.run(
|
|
266
301
|
app.research(
|
|
267
302
|
question=question,
|
|
268
303
|
max_iterations=max_iterations,
|
|
304
|
+
confidence_threshold=confidence_threshold,
|
|
305
|
+
max_concurrency=max_concurrency,
|
|
269
306
|
verbose=verbose,
|
|
270
307
|
)
|
|
271
308
|
)
|
|
@@ -273,6 +310,8 @@ def research(
|
|
|
273
310
|
|
|
274
311
|
@cli.command("settings", help="Display current configuration settings")
|
|
275
312
|
def settings():
|
|
313
|
+
from haiku.rag.app import HaikuRAGApp
|
|
314
|
+
|
|
276
315
|
app = HaikuRAGApp(db_path=Path()) # Don't need actual DB for settings
|
|
277
316
|
app.show_settings()
|
|
278
317
|
|
|
@@ -288,6 +327,8 @@ def rebuild(
|
|
|
288
327
|
help="Path to the LanceDB database file",
|
|
289
328
|
),
|
|
290
329
|
):
|
|
330
|
+
from haiku.rag.app import HaikuRAGApp
|
|
331
|
+
|
|
291
332
|
app = HaikuRAGApp(db_path=db)
|
|
292
333
|
asyncio.run(app.rebuild())
|
|
293
334
|
|
|
@@ -300,6 +341,8 @@ def vacuum(
|
|
|
300
341
|
help="Path to the LanceDB database file",
|
|
301
342
|
),
|
|
302
343
|
):
|
|
344
|
+
from haiku.rag.app import HaikuRAGApp
|
|
345
|
+
|
|
303
346
|
app = HaikuRAGApp(db_path=db)
|
|
304
347
|
asyncio.run(app.vacuum())
|
|
305
348
|
|
|
@@ -318,24 +361,15 @@ def serve(
|
|
|
318
361
|
"--stdio",
|
|
319
362
|
help="Run MCP server on stdio Transport",
|
|
320
363
|
),
|
|
321
|
-
sse: bool = typer.Option(
|
|
322
|
-
False,
|
|
323
|
-
"--sse",
|
|
324
|
-
help="Run MCP server on SSE transport",
|
|
325
|
-
),
|
|
326
364
|
) -> None:
|
|
327
365
|
"""Start the MCP server."""
|
|
328
|
-
|
|
329
|
-
console.print("[red]Error: Cannot use both --stdio and --http options[/red]")
|
|
330
|
-
raise typer.Exit(1)
|
|
366
|
+
from haiku.rag.app import HaikuRAGApp
|
|
331
367
|
|
|
332
368
|
app = HaikuRAGApp(db_path=db)
|
|
333
369
|
|
|
334
370
|
transport = None
|
|
335
371
|
if stdio:
|
|
336
372
|
transport = "stdio"
|
|
337
|
-
elif sse:
|
|
338
|
-
transport = "sse"
|
|
339
373
|
|
|
340
374
|
asyncio.run(app.serve(transport=transport))
|
|
341
375
|
|
|
@@ -349,6 +383,9 @@ def migrate(
|
|
|
349
383
|
# Generate LanceDB path in same parent directory
|
|
350
384
|
lancedb_path = sqlite_path.parent / (sqlite_path.stem + ".lancedb")
|
|
351
385
|
|
|
386
|
+
# Lazy import to avoid heavy deps on simple invocations
|
|
387
|
+
from haiku.rag.migration import migrate_sqlite_to_lancedb
|
|
388
|
+
|
|
352
389
|
success = asyncio.run(migrate_sqlite_to_lancedb(sqlite_path, lancedb_path))
|
|
353
390
|
|
|
354
391
|
if not success:
|
haiku/rag/client.py
CHANGED
|
@@ -33,8 +33,6 @@ class HaikuRAG:
|
|
|
33
33
|
db_path: Path to the database file.
|
|
34
34
|
skip_validation: Whether to skip configuration validation on database load.
|
|
35
35
|
"""
|
|
36
|
-
if not db_path.parent.exists():
|
|
37
|
-
Path.mkdir(db_path.parent, parents=True)
|
|
38
36
|
self.store = Store(db_path, skip_validation=skip_validation)
|
|
39
37
|
self.document_repository = DocumentRepository(self.store)
|
|
40
38
|
self.chunk_repository = ChunkRepository(self.store)
|
|
@@ -52,6 +50,7 @@ class HaikuRAG:
|
|
|
52
50
|
self,
|
|
53
51
|
docling_document,
|
|
54
52
|
uri: str | None = None,
|
|
53
|
+
title: str | None = None,
|
|
55
54
|
metadata: dict | None = None,
|
|
56
55
|
chunks: list[Chunk] | None = None,
|
|
57
56
|
) -> Document:
|
|
@@ -60,6 +59,7 @@ class HaikuRAG:
|
|
|
60
59
|
document = Document(
|
|
61
60
|
content=content,
|
|
62
61
|
uri=uri,
|
|
62
|
+
title=title,
|
|
63
63
|
metadata=metadata or {},
|
|
64
64
|
)
|
|
65
65
|
return await self.document_repository._create_with_docling(
|
|
@@ -70,6 +70,7 @@ class HaikuRAG:
|
|
|
70
70
|
self,
|
|
71
71
|
content: str,
|
|
72
72
|
uri: str | None = None,
|
|
73
|
+
title: str | None = None,
|
|
73
74
|
metadata: dict | None = None,
|
|
74
75
|
chunks: list[Chunk] | None = None,
|
|
75
76
|
) -> Document:
|
|
@@ -90,6 +91,7 @@ class HaikuRAG:
|
|
|
90
91
|
document = Document(
|
|
91
92
|
content=content,
|
|
92
93
|
uri=uri,
|
|
94
|
+
title=title,
|
|
93
95
|
metadata=metadata or {},
|
|
94
96
|
)
|
|
95
97
|
return await self.document_repository._create_with_docling(
|
|
@@ -97,7 +99,7 @@ class HaikuRAG:
|
|
|
97
99
|
)
|
|
98
100
|
|
|
99
101
|
async def create_document_from_source(
|
|
100
|
-
self, source: str | Path, metadata: dict = {}
|
|
102
|
+
self, source: str | Path, title: str | None = None, metadata: dict | None = None
|
|
101
103
|
) -> Document:
|
|
102
104
|
"""Create or update a document from a file path or URL.
|
|
103
105
|
|
|
@@ -118,11 +120,16 @@ class HaikuRAG:
|
|
|
118
120
|
httpx.RequestError: If URL request fails
|
|
119
121
|
"""
|
|
120
122
|
|
|
123
|
+
# Normalize metadata
|
|
124
|
+
metadata = metadata or {}
|
|
125
|
+
|
|
121
126
|
# Check if it's a URL
|
|
122
127
|
source_str = str(source)
|
|
123
128
|
parsed_url = urlparse(source_str)
|
|
124
129
|
if parsed_url.scheme in ("http", "https"):
|
|
125
|
-
return await self._create_or_update_document_from_url(
|
|
130
|
+
return await self._create_or_update_document_from_url(
|
|
131
|
+
source_str, title=title, metadata=metadata
|
|
132
|
+
)
|
|
126
133
|
elif parsed_url.scheme == "file":
|
|
127
134
|
# Handle file:// URI by converting to path
|
|
128
135
|
source_path = Path(parsed_url.path)
|
|
@@ -138,37 +145,51 @@ class HaikuRAG:
|
|
|
138
145
|
uri = source_path.absolute().as_uri()
|
|
139
146
|
md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()
|
|
140
147
|
|
|
148
|
+
# Get content type from file extension (do before early return)
|
|
149
|
+
content_type, _ = mimetypes.guess_type(str(source_path))
|
|
150
|
+
if not content_type:
|
|
151
|
+
content_type = "application/octet-stream"
|
|
152
|
+
# Merge metadata with contentType and md5
|
|
153
|
+
metadata.update({"contentType": content_type, "md5": md5_hash})
|
|
154
|
+
|
|
141
155
|
# Check if document already exists
|
|
142
156
|
existing_doc = await self.get_document_by_uri(uri)
|
|
143
157
|
if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
|
|
144
|
-
# MD5 unchanged
|
|
158
|
+
# MD5 unchanged; update title/metadata if provided
|
|
159
|
+
updated = False
|
|
160
|
+
if title is not None and title != existing_doc.title:
|
|
161
|
+
existing_doc.title = title
|
|
162
|
+
updated = True
|
|
163
|
+
if metadata:
|
|
164
|
+
existing_doc.metadata = {**(existing_doc.metadata or {}), **metadata}
|
|
165
|
+
updated = True
|
|
166
|
+
if updated:
|
|
167
|
+
return await self.document_repository.update(existing_doc)
|
|
145
168
|
return existing_doc
|
|
146
169
|
|
|
170
|
+
# Parse file only when content changed or new document
|
|
147
171
|
docling_document = FileReader.parse_file(source_path)
|
|
148
172
|
|
|
149
|
-
# Get content type from file extension
|
|
150
|
-
content_type, _ = mimetypes.guess_type(str(source_path))
|
|
151
|
-
if not content_type:
|
|
152
|
-
content_type = "application/octet-stream"
|
|
153
|
-
|
|
154
|
-
# Merge metadata with contentType and md5
|
|
155
|
-
metadata.update({"contentType": content_type, "md5": md5_hash})
|
|
156
|
-
|
|
157
173
|
if existing_doc:
|
|
158
174
|
# Update existing document
|
|
159
175
|
existing_doc.content = docling_document.export_to_markdown()
|
|
160
176
|
existing_doc.metadata = metadata
|
|
177
|
+
if title is not None:
|
|
178
|
+
existing_doc.title = title
|
|
161
179
|
return await self.document_repository._update_with_docling(
|
|
162
180
|
existing_doc, docling_document
|
|
163
181
|
)
|
|
164
182
|
else:
|
|
165
183
|
# Create new document using DoclingDocument
|
|
166
184
|
return await self._create_document_with_docling(
|
|
167
|
-
docling_document=docling_document,
|
|
185
|
+
docling_document=docling_document,
|
|
186
|
+
uri=uri,
|
|
187
|
+
title=title,
|
|
188
|
+
metadata=metadata,
|
|
168
189
|
)
|
|
169
190
|
|
|
170
191
|
async def _create_or_update_document_from_url(
|
|
171
|
-
self, url: str, metadata: dict = {}
|
|
192
|
+
self, url: str, title: str | None = None, metadata: dict | None = None
|
|
172
193
|
) -> Document:
|
|
173
194
|
"""Create or update a document from a URL by downloading and parsing the content.
|
|
174
195
|
|
|
@@ -188,20 +209,35 @@ class HaikuRAG:
|
|
|
188
209
|
ValueError: If the content cannot be parsed
|
|
189
210
|
httpx.RequestError: If URL request fails
|
|
190
211
|
"""
|
|
212
|
+
metadata = metadata or {}
|
|
213
|
+
|
|
191
214
|
async with httpx.AsyncClient() as client:
|
|
192
215
|
response = await client.get(url)
|
|
193
216
|
response.raise_for_status()
|
|
194
217
|
|
|
195
218
|
md5_hash = hashlib.md5(response.content).hexdigest()
|
|
196
219
|
|
|
220
|
+
# Get content type early (used for potential no-op update)
|
|
221
|
+
content_type = response.headers.get("content-type", "").lower()
|
|
222
|
+
|
|
197
223
|
# Check if document already exists
|
|
198
224
|
existing_doc = await self.get_document_by_uri(url)
|
|
199
225
|
if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
|
|
200
|
-
# MD5 unchanged
|
|
226
|
+
# MD5 unchanged; update title/metadata if provided
|
|
227
|
+
updated = False
|
|
228
|
+
if title is not None and title != existing_doc.title:
|
|
229
|
+
existing_doc.title = title
|
|
230
|
+
updated = True
|
|
231
|
+
metadata.update({"contentType": content_type, "md5": md5_hash})
|
|
232
|
+
if metadata:
|
|
233
|
+
existing_doc.metadata = {
|
|
234
|
+
**(existing_doc.metadata or {}),
|
|
235
|
+
**metadata,
|
|
236
|
+
}
|
|
237
|
+
updated = True
|
|
238
|
+
if updated:
|
|
239
|
+
return await self.document_repository.update(existing_doc)
|
|
201
240
|
return existing_doc
|
|
202
|
-
|
|
203
|
-
# Get content type to determine file extension
|
|
204
|
-
content_type = response.headers.get("content-type", "").lower()
|
|
205
241
|
file_extension = self._get_extension_from_content_type_or_url(
|
|
206
242
|
url, content_type
|
|
207
243
|
)
|
|
@@ -228,12 +264,17 @@ class HaikuRAG:
|
|
|
228
264
|
if existing_doc:
|
|
229
265
|
existing_doc.content = docling_document.export_to_markdown()
|
|
230
266
|
existing_doc.metadata = metadata
|
|
267
|
+
if title is not None:
|
|
268
|
+
existing_doc.title = title
|
|
231
269
|
return await self.document_repository._update_with_docling(
|
|
232
270
|
existing_doc, docling_document
|
|
233
271
|
)
|
|
234
272
|
else:
|
|
235
273
|
return await self._create_document_with_docling(
|
|
236
|
-
docling_document=docling_document,
|
|
274
|
+
docling_document=docling_document,
|
|
275
|
+
uri=url,
|
|
276
|
+
title=title,
|
|
277
|
+
metadata=metadata,
|
|
237
278
|
)
|
|
238
279
|
|
|
239
280
|
def _get_extension_from_content_type_or_url(
|
|
@@ -418,6 +459,7 @@ class HaikuRAG:
|
|
|
418
459
|
content="".join(combined_content_parts),
|
|
419
460
|
metadata=original_chunk.metadata,
|
|
420
461
|
document_uri=original_chunk.document_uri,
|
|
462
|
+
document_title=original_chunk.document_title,
|
|
421
463
|
document_meta=original_chunk.document_meta,
|
|
422
464
|
)
|
|
423
465
|
|
|
@@ -524,7 +566,7 @@ class HaikuRAG:
|
|
|
524
566
|
|
|
525
567
|
# Try to re-create from source (this creates the document with chunks)
|
|
526
568
|
new_doc = await self.create_document_from_source(
|
|
527
|
-
doc.uri, doc.metadata or {}
|
|
569
|
+
source=doc.uri, metadata=doc.metadata or {}
|
|
528
570
|
)
|
|
529
571
|
|
|
530
572
|
assert new_doc.id is not None, "New document ID should not be None"
|
haiku/rag/config.py
CHANGED
|
@@ -53,6 +53,10 @@ class AppConfig(BaseModel):
|
|
|
53
53
|
ANTHROPIC_API_KEY: str = ""
|
|
54
54
|
COHERE_API_KEY: str = ""
|
|
55
55
|
|
|
56
|
+
# If true, refuse to auto-create a new LanceDB database or tables
|
|
57
|
+
# and error out when the database does not already exist.
|
|
58
|
+
DISABLE_DB_AUTOCREATE: bool = False
|
|
59
|
+
|
|
56
60
|
@field_validator("MONITOR_DIRECTORIES", mode="before")
|
|
57
61
|
@classmethod
|
|
58
62
|
def parse_monitor_directories(cls, v):
|
haiku/rag/mcp.py
CHANGED
|
@@ -17,6 +17,7 @@ class DocumentResult(BaseModel):
|
|
|
17
17
|
id: str | None
|
|
18
18
|
content: str
|
|
19
19
|
uri: str | None = None
|
|
20
|
+
title: str | None = None
|
|
20
21
|
metadata: dict[str, Any] = {}
|
|
21
22
|
created_at: str
|
|
22
23
|
updated_at: str
|
|
@@ -28,13 +29,15 @@ def create_mcp_server(db_path: Path) -> FastMCP:
|
|
|
28
29
|
|
|
29
30
|
@mcp.tool()
|
|
30
31
|
async def add_document_from_file(
|
|
31
|
-
file_path: str, metadata: dict[str, Any] | None = None
|
|
32
|
+
file_path: str,
|
|
33
|
+
metadata: dict[str, Any] | None = None,
|
|
34
|
+
title: str | None = None,
|
|
32
35
|
) -> str | None:
|
|
33
36
|
"""Add a document to the RAG system from a file path."""
|
|
34
37
|
try:
|
|
35
38
|
async with HaikuRAG(db_path) as rag:
|
|
36
39
|
document = await rag.create_document_from_source(
|
|
37
|
-
Path(file_path), metadata or {}
|
|
40
|
+
Path(file_path), title=title, metadata=metadata or {}
|
|
38
41
|
)
|
|
39
42
|
return document.id
|
|
40
43
|
except Exception:
|
|
@@ -42,24 +45,31 @@ def create_mcp_server(db_path: Path) -> FastMCP:
|
|
|
42
45
|
|
|
43
46
|
@mcp.tool()
|
|
44
47
|
async def add_document_from_url(
|
|
45
|
-
url: str, metadata: dict[str, Any] | None = None
|
|
48
|
+
url: str, metadata: dict[str, Any] | None = None, title: str | None = None
|
|
46
49
|
) -> str | None:
|
|
47
50
|
"""Add a document to the RAG system from a URL."""
|
|
48
51
|
try:
|
|
49
52
|
async with HaikuRAG(db_path) as rag:
|
|
50
|
-
document = await rag.create_document_from_source(
|
|
53
|
+
document = await rag.create_document_from_source(
|
|
54
|
+
url, title=title, metadata=metadata or {}
|
|
55
|
+
)
|
|
51
56
|
return document.id
|
|
52
57
|
except Exception:
|
|
53
58
|
return None
|
|
54
59
|
|
|
55
60
|
@mcp.tool()
|
|
56
61
|
async def add_document_from_text(
|
|
57
|
-
content: str,
|
|
62
|
+
content: str,
|
|
63
|
+
uri: str | None = None,
|
|
64
|
+
metadata: dict[str, Any] | None = None,
|
|
65
|
+
title: str | None = None,
|
|
58
66
|
) -> str | None:
|
|
59
67
|
"""Add a document to the RAG system from text content."""
|
|
60
68
|
try:
|
|
61
69
|
async with HaikuRAG(db_path) as rag:
|
|
62
|
-
document = await rag.create_document(
|
|
70
|
+
document = await rag.create_document(
|
|
71
|
+
content, uri, title=title, metadata=metadata or {}
|
|
72
|
+
)
|
|
63
73
|
return document.id
|
|
64
74
|
except Exception:
|
|
65
75
|
return None
|
|
@@ -102,6 +112,7 @@ def create_mcp_server(db_path: Path) -> FastMCP:
|
|
|
102
112
|
id=document.id,
|
|
103
113
|
content=document.content,
|
|
104
114
|
uri=document.uri,
|
|
115
|
+
title=document.title,
|
|
105
116
|
metadata=document.metadata,
|
|
106
117
|
created_at=str(document.created_at),
|
|
107
118
|
updated_at=str(document.updated_at),
|
|
@@ -123,6 +134,7 @@ def create_mcp_server(db_path: Path) -> FastMCP:
|
|
|
123
134
|
id=doc.id,
|
|
124
135
|
content=doc.content,
|
|
125
136
|
uri=doc.uri,
|
|
137
|
+
title=doc.title,
|
|
126
138
|
metadata=doc.metadata,
|
|
127
139
|
created_at=str(doc.created_at),
|
|
128
140
|
updated_at=str(doc.updated_at),
|