haiku.rag 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff compares publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the packages exactly as they appear in those registries.
- haiku/rag/app.py +149 -15
- haiku/rag/cli.py +126 -31
- haiku/rag/client.py +63 -21
- haiku/rag/config.py +4 -0
- haiku/rag/mcp.py +18 -6
- haiku/rag/migration.py +2 -2
- haiku/rag/qa/agent.py +4 -2
- haiku/rag/qa/prompts.py +2 -2
- haiku/rag/research/models.py +2 -2
- haiku/rag/research/nodes/search.py +3 -1
- haiku/rag/research/prompts.py +4 -3
- haiku/rag/store/__init__.py +1 -1
- haiku/rag/store/engine.py +14 -0
- haiku/rag/store/models/__init__.py +1 -1
- haiku/rag/store/models/chunk.py +1 -0
- haiku/rag/store/models/document.py +1 -0
- haiku/rag/store/repositories/chunk.py +4 -0
- haiku/rag/store/repositories/document.py +3 -0
- haiku/rag/store/upgrades/__init__.py +2 -0
- haiku/rag/store/upgrades/v0_10_1.py +64 -0
- haiku/rag/utils.py +42 -5
- {haiku_rag-0.10.0.dist-info → haiku_rag-0.10.2.dist-info}/METADATA +3 -2
- {haiku_rag-0.10.0.dist-info → haiku_rag-0.10.2.dist-info}/RECORD +26 -25
- {haiku_rag-0.10.0.dist-info → haiku_rag-0.10.2.dist-info}/WHEEL +0 -0
- {haiku_rag-0.10.0.dist-info → haiku_rag-0.10.2.dist-info}/entry_points.txt +0 -0
- {haiku_rag-0.10.0.dist-info → haiku_rag-0.10.2.dist-info}/licenses/LICENSE +0 -0
haiku/rag/client.py
CHANGED
```diff
@@ -33,8 +33,6 @@ class HaikuRAG:
             db_path: Path to the database file.
             skip_validation: Whether to skip configuration validation on database load.
         """
-        if not db_path.parent.exists():
-            Path.mkdir(db_path.parent, parents=True)
         self.store = Store(db_path, skip_validation=skip_validation)
         self.document_repository = DocumentRepository(self.store)
         self.chunk_repository = ChunkRepository(self.store)
@@ -52,6 +50,7 @@ class HaikuRAG:
         self,
         docling_document,
         uri: str | None = None,
+        title: str | None = None,
         metadata: dict | None = None,
         chunks: list[Chunk] | None = None,
     ) -> Document:
@@ -60,6 +59,7 @@ class HaikuRAG:
         document = Document(
             content=content,
             uri=uri,
+            title=title,
             metadata=metadata or {},
         )
         return await self.document_repository._create_with_docling(
@@ -70,6 +70,7 @@ class HaikuRAG:
         self,
         content: str,
         uri: str | None = None,
+        title: str | None = None,
         metadata: dict | None = None,
         chunks: list[Chunk] | None = None,
     ) -> Document:
@@ -90,6 +91,7 @@ class HaikuRAG:
         document = Document(
             content=content,
             uri=uri,
+            title=title,
             metadata=metadata or {},
         )
         return await self.document_repository._create_with_docling(
@@ -97,7 +99,7 @@ class HaikuRAG:
         )

     async def create_document_from_source(
-        self, source: str | Path, metadata: dict =
+        self, source: str | Path, title: str | None = None, metadata: dict | None = None
     ) -> Document:
         """Create or update a document from a file path or URL.

@@ -118,11 +120,16 @@ class HaikuRAG:
             httpx.RequestError: If URL request fails
         """

+        # Normalize metadata
+        metadata = metadata or {}
+
        # Check if it's a URL
         source_str = str(source)
         parsed_url = urlparse(source_str)
         if parsed_url.scheme in ("http", "https"):
-            return await self._create_or_update_document_from_url(
+            return await self._create_or_update_document_from_url(
+                source_str, title=title, metadata=metadata
+            )
         elif parsed_url.scheme == "file":
             # Handle file:// URI by converting to path
             source_path = Path(parsed_url.path)
@@ -138,37 +145,51 @@ class HaikuRAG:
         uri = source_path.absolute().as_uri()
         md5_hash = hashlib.md5(source_path.read_bytes()).hexdigest()

+        # Get content type from file extension (do before early return)
+        content_type, _ = mimetypes.guess_type(str(source_path))
+        if not content_type:
+            content_type = "application/octet-stream"
+        # Merge metadata with contentType and md5
+        metadata.update({"contentType": content_type, "md5": md5_hash})
+
         # Check if document already exists
         existing_doc = await self.get_document_by_uri(uri)
         if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
-            # MD5 unchanged
+            # MD5 unchanged; update title/metadata if provided
+            updated = False
+            if title is not None and title != existing_doc.title:
+                existing_doc.title = title
+                updated = True
+            if metadata:
+                existing_doc.metadata = {**(existing_doc.metadata or {}), **metadata}
+                updated = True
+            if updated:
+                return await self.document_repository.update(existing_doc)
             return existing_doc

+        # Parse file only when content changed or new document
         docling_document = FileReader.parse_file(source_path)

-        # Get content type from file extension
-        content_type, _ = mimetypes.guess_type(str(source_path))
-        if not content_type:
-            content_type = "application/octet-stream"
-
-        # Merge metadata with contentType and md5
-        metadata.update({"contentType": content_type, "md5": md5_hash})
-
         if existing_doc:
             # Update existing document
             existing_doc.content = docling_document.export_to_markdown()
             existing_doc.metadata = metadata
+            if title is not None:
+                existing_doc.title = title
             return await self.document_repository._update_with_docling(
                 existing_doc, docling_document
             )
         else:
             # Create new document using DoclingDocument
             return await self._create_document_with_docling(
-                docling_document=docling_document,
+                docling_document=docling_document,
+                uri=uri,
+                title=title,
+                metadata=metadata,
             )

     async def _create_or_update_document_from_url(
-        self, url: str, metadata: dict =
+        self, url: str, title: str | None = None, metadata: dict | None = None
     ) -> Document:
         """Create or update a document from a URL by downloading and parsing the content.

@@ -188,20 +209,35 @@ class HaikuRAG:
             ValueError: If the content cannot be parsed
             httpx.RequestError: If URL request fails
         """
+        metadata = metadata or {}
+
         async with httpx.AsyncClient() as client:
             response = await client.get(url)
             response.raise_for_status()

             md5_hash = hashlib.md5(response.content).hexdigest()

+            # Get content type early (used for potential no-op update)
+            content_type = response.headers.get("content-type", "").lower()
+
             # Check if document already exists
             existing_doc = await self.get_document_by_uri(url)
             if existing_doc and existing_doc.metadata.get("md5") == md5_hash:
-                # MD5 unchanged
+                # MD5 unchanged; update title/metadata if provided
+                updated = False
+                if title is not None and title != existing_doc.title:
+                    existing_doc.title = title
+                    updated = True
+                metadata.update({"contentType": content_type, "md5": md5_hash})
+                if metadata:
+                    existing_doc.metadata = {
+                        **(existing_doc.metadata or {}),
+                        **metadata,
+                    }
+                    updated = True
+                if updated:
+                    return await self.document_repository.update(existing_doc)
                 return existing_doc
-
-            # Get content type to determine file extension
-            content_type = response.headers.get("content-type", "").lower()
             file_extension = self._get_extension_from_content_type_or_url(
                 url, content_type
             )
@@ -228,12 +264,17 @@ class HaikuRAG:
             if existing_doc:
                 existing_doc.content = docling_document.export_to_markdown()
                 existing_doc.metadata = metadata
+                if title is not None:
+                    existing_doc.title = title
                 return await self.document_repository._update_with_docling(
                     existing_doc, docling_document
                 )
             else:
                 return await self._create_document_with_docling(
-                    docling_document=docling_document,
+                    docling_document=docling_document,
+                    uri=url,
+                    title=title,
+                    metadata=metadata,
                 )

     def _get_extension_from_content_type_or_url(
@@ -418,6 +459,7 @@ class HaikuRAG:
             content="".join(combined_content_parts),
             metadata=original_chunk.metadata,
             document_uri=original_chunk.document_uri,
+            document_title=original_chunk.document_title,
             document_meta=original_chunk.document_meta,
         )

@@ -524,7 +566,7 @@ class HaikuRAG:

         # Try to re-create from source (this creates the document with chunks)
         new_doc = await self.create_document_from_source(
-            doc.uri, doc.metadata or {}
+            source=doc.uri, metadata=doc.metadata or {}
         )

         assert new_doc.id is not None, "New document ID should not be None"
```
haiku/rag/config.py
CHANGED
```diff
@@ -53,6 +53,10 @@ class AppConfig(BaseModel):
     ANTHROPIC_API_KEY: str = ""
     COHERE_API_KEY: str = ""

+    # If true, refuse to auto-create a new LanceDB database or tables
+    # and error out when the database does not already exist.
+    DISABLE_DB_AUTOCREATE: bool = False
+
     @field_validator("MONITOR_DIRECTORIES", mode="before")
     @classmethod
     def parse_monitor_directories(cls, v):
```
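A sketch of the new flag, assuming `Config` is the module-level `AppConfig` instance (as imports elsewhere in this diff suggest) and that the field, like its siblings, presumably maps to an environment variable of the same name:

```python
from haiku.rag.config import Config

# Refuse to create a brand-new database; opening a missing path will now fail
# loudly in Store.__init__ instead of silently creating an empty LanceDB dir.
Config.DISABLE_DB_AUTOCREATE = True
```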
haiku/rag/mcp.py
CHANGED
```diff
@@ -17,6 +17,7 @@ class DocumentResult(BaseModel):
     id: str | None
     content: str
     uri: str | None = None
+    title: str | None = None
     metadata: dict[str, Any] = {}
     created_at: str
     updated_at: str
@@ -28,13 +29,15 @@ def create_mcp_server(db_path: Path) -> FastMCP:

     @mcp.tool()
     async def add_document_from_file(
-        file_path: str,
+        file_path: str,
+        metadata: dict[str, Any] | None = None,
+        title: str | None = None,
     ) -> str | None:
         """Add a document to the RAG system from a file path."""
         try:
             async with HaikuRAG(db_path) as rag:
                 document = await rag.create_document_from_source(
-                    Path(file_path), metadata or {}
+                    Path(file_path), title=title, metadata=metadata or {}
                 )
                 return document.id
         except Exception:
@@ -42,24 +45,31 @@ def create_mcp_server(db_path: Path) -> FastMCP:

     @mcp.tool()
     async def add_document_from_url(
-        url: str, metadata: dict[str, Any] | None = None
+        url: str, metadata: dict[str, Any] | None = None, title: str | None = None
     ) -> str | None:
         """Add a document to the RAG system from a URL."""
         try:
             async with HaikuRAG(db_path) as rag:
-                document = await rag.create_document_from_source(
+                document = await rag.create_document_from_source(
+                    url, title=title, metadata=metadata or {}
+                )
                 return document.id
         except Exception:
             return None

     @mcp.tool()
     async def add_document_from_text(
-        content: str,
+        content: str,
+        uri: str | None = None,
+        metadata: dict[str, Any] | None = None,
+        title: str | None = None,
     ) -> str | None:
         """Add a document to the RAG system from text content."""
         try:
             async with HaikuRAG(db_path) as rag:
-                document = await rag.create_document(
+                document = await rag.create_document(
+                    content, uri, title=title, metadata=metadata or {}
+                )
                 return document.id
         except Exception:
             return None
@@ -102,6 +112,7 @@ def create_mcp_server(db_path: Path) -> FastMCP:
             id=document.id,
             content=document.content,
             uri=document.uri,
+            title=document.title,
             metadata=document.metadata,
             created_at=str(document.created_at),
             updated_at=str(document.updated_at),
@@ -123,6 +134,7 @@ def create_mcp_server(db_path: Path) -> FastMCP:
             id=doc.id,
             content=doc.content,
             uri=doc.uri,
+            title=doc.title,
             metadata=doc.metadata,
             created_at=str(doc.created_at),
             updated_at=str(doc.updated_at),
```
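The MCP tools stay thin wrappers over `HaikuRAG`; this sketch shows the equivalent direct calls for the new `title`/`uri` parameters (paths, URIs, and values are illustrative):

```python
from pathlib import Path

from haiku.rag.client import HaikuRAG


async def add_examples(db_path: Path) -> None:
    async with HaikuRAG(db_path) as rag:
        # Equivalent of the add_document_from_file tool:
        await rag.create_document_from_source(
            Path("docs/guide.md"), title="User Guide", metadata={}
        )
        # Equivalent of the add_document_from_text tool:
        await rag.create_document(
            "plain text content", "mem://note-1", title="Note 1", metadata={}
        )
```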
haiku/rag/migration.py
CHANGED
```diff
@@ -51,7 +51,7 @@ class SQLiteToLanceDBMigrator:

             sqlite_conn.enable_load_extension(True)
             sqlite_vec.load(sqlite_conn)
-            self.console.print("[
+            self.console.print("[cyan]Loaded sqlite-vec extension[/cyan]")
         except Exception as e:
             self.console.print(
                 f"[yellow]Warning: Could not load sqlite-vec extension: {e}[/yellow]"
@@ -92,7 +92,7 @@ class SQLiteToLanceDBMigrator:
         sqlite_conn.close()

         # Optimize and cleanup using centralized vacuum
-        self.console.print("[
+        self.console.print("[cyan]Optimizing LanceDB...[/cyan]")
         try:
             lance_store.vacuum()
             self.console.print("[green]✅ Optimization completed[/green]")
```
haiku/rag/qa/agent.py
CHANGED
```diff
@@ -12,7 +12,9 @@ from haiku.rag.qa.prompts import QA_SYSTEM_PROMPT, QA_SYSTEM_PROMPT_WITH_CITATIONS
 class SearchResult(BaseModel):
     content: str = Field(description="The document text content")
     score: float = Field(description="Relevance score (higher is more relevant)")
-    document_uri: str = Field(
+    document_uri: str = Field(
+        description="Source title (if available) or URI/path of the document"
+    )


 class Dependencies(BaseModel):
@@ -59,7 +61,7 @@ class QuestionAnswerAgent:
             SearchResult(
                 content=chunk.content,
                 score=score,
-                document_uri=chunk.document_uri or "",
+                document_uri=(chunk.document_title or chunk.document_uri or ""),
             )
             for chunk, score in expanded_results
         ]
```
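The only behavioral change here is the label fallback; a self-contained sketch of the chain used above:

```python
def citation_label(document_title: str | None, document_uri: str | None) -> str:
    # Prefer the human-readable title, fall back to the URI, then to "".
    return document_title or document_uri or ""


assert citation_label("Q3 Report", "file:///tmp/q3.pdf") == "Q3 Report"
assert citation_label(None, "file:///tmp/q3.pdf") == "file:///tmp/q3.pdf"
assert citation_label(None, None) == ""
```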
haiku/rag/qa/prompts.py
CHANGED
```diff
@@ -44,9 +44,9 @@ Guidelines:

 Citation Format:
 After your answer, include a "Citations:" section that lists:
-- The document URI from each search result used
+- The document title (if available) or URI from each search result used
 - A brief excerpt (first 50-100 characters) of the content that supported your answer
-- Format: "Citations:\n- [
+- Format: "Citations:\n- [document title or URI]: [content_excerpt]..."

 Example response format:
 [Your answer here]
```
haiku/rag/research/models.py
CHANGED
```diff
@@ -19,8 +19,8 @@ class SearchAnswer(BaseModel):
     )
     sources: list[str] = Field(
         description=(
-            "Document
-            " answer (one
+            "Document titles (if available) or URIs corresponding to the"
+            " snippets actually used in the answer (one per snippet; omit if none)"
         ),
         default_factory=list,
     )
```
haiku/rag/research/nodes/search.py
CHANGED
```diff
@@ -59,7 +59,9 @@ class SearchDispatchNode(BaseNode[ResearchState, ResearchDeps, ResearchReport]):
             {
                 "text": chunk.content,
                 "score": score,
-                "document_uri": (
+                "document_uri": (
+                    chunk.document_title or chunk.document_uri or ""
+                ),
             }
             for chunk, score in expanded
         ]
```
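Both changes enforce the same contract: one source label per snippet, in snippet order, preferring the document title over its URI. A plain-Python sketch with made-up values:

```python
# Stand-ins for real search results (title may be missing).
snippets = [
    {"text": "Revenue grew 12%...", "title": "Q3 Report", "uri": "file:///q3.pdf"},
    {"text": "Headcount was flat...", "title": None, "uri": "https://example.com/hr"},
]

context = [s["text"] for s in snippets]
sources = [s["title"] or s["uri"] or "" for s in snippets]

assert len(sources) == len(context)
assert sources == ["Q3 Report", "https://example.com/hr"]
```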
haiku/rag/research/prompts.py
CHANGED
```diff
@@ -27,13 +27,14 @@ Tasks:
 Tool usage:
 - Always call search_and_answer before drafting any answer.
 - The tool returns snippets with verbatim `text`, a relevance `score`, and the
-  originating
+  originating document identifier (document title if available, otherwise URI).
 - You may call the tool multiple times to refine or broaden context, but do not
   exceed 3 total calls. Favor precision over volume.
 - Use scores to prioritize evidence, but include only the minimal subset of
   snippet texts (verbatim) in SearchAnswer.context (typically 1‑4).
-- Set SearchAnswer.sources to the corresponding
-  you used (
+- Set SearchAnswer.sources to the corresponding document identifiers for the
+  snippets you used (title if available, otherwise URI; one per snippet; same
+  order as context). Context must be text‑only.
 - If no relevant information is found, clearly say so and return an empty
   context list and sources list.

```
haiku/rag/store/__init__.py
CHANGED
haiku/rag/store/engine.py
CHANGED
```diff
@@ -19,6 +19,7 @@ class DocumentRecord(LanceModel):
     id: str = Field(default_factory=lambda: str(uuid4()))
     content: str
     uri: str | None = None
+    title: str | None = None
     metadata: str = Field(default="{}")
     created_at: str = Field(default_factory=lambda: "")
     updated_at: str = Field(default_factory=lambda: "")
@@ -54,6 +55,19 @@ class Store:
         # Create the ChunkRecord model with the correct vector dimension
         self.ChunkRecord = create_chunk_model(self.embedder._vector_dim)

+        # Local filesystem handling for DB directory
+        if not self._has_cloud_config():
+            if Config.DISABLE_DB_AUTOCREATE:
+                # LanceDB uses a directory path for local databases; enforce presence
+                if not db_path.exists():
+                    raise FileNotFoundError(
+                        f"LanceDB path does not exist: {db_path}. Auto-creation is disabled."
+                    )
+            else:
+                # Ensure parent directories exist when autocreation allowed
+                if not db_path.parent.exists():
+                    Path.mkdir(db_path.parent, parents=True)
+
         # Connect to LanceDB
         self.db = self._connect_to_lancedb(db_path)

```
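A sketch of the guard's observable behavior for a local (non-cloud) store; the path is illustrative:

```python
from pathlib import Path

from haiku.rag.config import Config
from haiku.rag.store.engine import Store

Config.DISABLE_DB_AUTOCREATE = True
try:
    Store(Path("/does/not/exist.lancedb"))
except FileNotFoundError as exc:
    print(exc)  # LanceDB path does not exist: ... Auto-creation is disabled.
```

With the flag unset (the default), the constructor keeps the old behavior of creating missing parent directories before connecting.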
haiku/rag/store/models/chunk.py
CHANGED
haiku/rag/store/models/document.py
CHANGED
```diff
@@ -11,6 +11,7 @@ class Document(BaseModel):
     id: str | None = None
     content: str
     uri: str | None = None
+    title: str | None = None
     metadata: dict = {}
     created_at: datetime = Field(default_factory=datetime.now)
     updated_at: datetime = Field(default_factory=datetime.now)
```
haiku/rag/store/repositories/chunk.py
CHANGED
```diff
@@ -317,6 +317,7 @@ class ChunkRepository:
         )

         doc_uri = doc_results[0].uri if doc_results else None
+        doc_title = doc_results[0].title if doc_results else None
         doc_meta = doc_results[0].metadata if doc_results else "{}"

         chunks: list[Chunk] = []
@@ -330,6 +331,7 @@ class ChunkRepository:
                 metadata=md,
                 order=rec.order,
                 document_uri=doc_uri,
+                document_title=doc_title,
                 document_meta=json.loads(doc_meta),
             )
         )
@@ -398,6 +400,7 @@ class ChunkRepository:
             # Get document info from pre-fetched map
             doc = documents_map.get(chunk_record.document_id)
             doc_uri = doc.uri if doc else None
+            doc_title = doc.title if doc else None
             doc_meta = doc.metadata if doc else "{}"

             md = json.loads(chunk_record.metadata)
@@ -409,6 +412,7 @@ class ChunkRepository:
                 metadata=md,
                 order=chunk_record.order,
                 document_uri=doc_uri,
+                document_title=doc_title,
                 document_meta=json.loads(doc_meta),
             )

```
haiku/rag/store/repositories/document.py
CHANGED
```diff
@@ -34,6 +34,7 @@ class DocumentRepository:
             id=record.id,
             content=record.content,
             uri=record.uri,
+            title=record.title,
             metadata=json.loads(record.metadata),
             created_at=datetime.fromisoformat(record.created_at)
             if record.created_at
@@ -56,6 +57,7 @@ class DocumentRepository:
             id=doc_id,
             content=entity.content,
             uri=entity.uri,
+            title=entity.title,
             metadata=json.dumps(entity.metadata),
             created_at=now,
             updated_at=now,
@@ -97,6 +99,7 @@ class DocumentRepository:
             values={
                 "content": entity.content,
                 "uri": entity.uri,
+                "title": entity.title,
                 "metadata": json.dumps(entity.metadata),
                 "updated_at": now,
             },
```
haiku/rag/store/upgrades/__init__.py
CHANGED
```diff
@@ -55,6 +55,8 @@ def run_pending_upgrades(store: Store, from_version: str, to_version: str) -> None:

     from .v0_9_3 import upgrade_fts_phrase as upgrade_0_9_3_fts  # noqa: E402
     from .v0_9_3 import upgrade_order as upgrade_0_9_3_order  # noqa: E402
+    from .v0_10_1 import upgrade_add_title as upgrade_0_10_1_add_title  # noqa: E402

     upgrades.append(upgrade_0_9_3_order)
     upgrades.append(upgrade_0_9_3_fts)
+    upgrades.append(upgrade_0_10_1_add_title)
```
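Upgrades run in registration order, so the v0.9.3 steps still precede the new one. A sketch of driving the runner directly, using the signature from the hunk header above (the positional call and store path are assumptions):

```python
from pathlib import Path

from haiku.rag.store.engine import Store
from haiku.rag.store.upgrades import run_pending_upgrades

store = Store(Path("notes.lancedb"))
run_pending_upgrades(store, "0.10.0", "0.10.2")
```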
haiku/rag/store/upgrades/v0_10_1.py
ADDED
```diff
@@ -0,0 +1,64 @@
+import json
+
+from lancedb.pydantic import LanceModel
+from pydantic import Field
+
+from haiku.rag.store.engine import Store
+from haiku.rag.store.upgrades import Upgrade
+
+
+def _apply_add_document_title(store: Store) -> None:
+    """Add a nullable 'title' column to the documents table."""
+
+    # Read existing rows using Arrow for schema-agnostic access
+    try:
+        docs_arrow = store.documents_table.search().to_arrow()
+        rows = docs_arrow.to_pylist()
+    except Exception:
+        rows = []
+
+    class DocumentRecordV2(LanceModel):
+        id: str
+        content: str
+        uri: str | None = None
+        title: str | None = None
+        metadata: str = Field(default="{}")
+        created_at: str = Field(default_factory=lambda: "")
+        updated_at: str = Field(default_factory=lambda: "")
+
+    # Drop and recreate documents table with the new schema
+    try:
+        store.db.drop_table("documents")
+    except Exception:
+        pass
+
+    store.documents_table = store.db.create_table("documents", schema=DocumentRecordV2)
+
+    # Reinsert previous rows with title=None
+    if rows:
+        backfilled = []
+        for row in rows:
+            backfilled.append(
+                DocumentRecordV2(
+                    id=row.get("id"),
+                    content=row.get("content", ""),
+                    uri=row.get("uri"),
+                    title=None,
+                    metadata=(
+                        row.get("metadata")
+                        if isinstance(row.get("metadata"), str)
+                        else json.dumps(row.get("metadata") or {})
+                    ),
+                    created_at=row.get("created_at", ""),
+                    updated_at=row.get("updated_at", ""),
+                )
+            )
+
+        store.documents_table.add(backfilled)
+
+
+upgrade_add_title = Upgrade(
+    version="0.10.1",
+    apply=_apply_add_document_title,
+    description="Add nullable 'title' column to documents table",
+)
```
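Rather than altering the table in place, the upgrade reads all rows out via Arrow, drops the table, and recreates it with the wider schema, which keeps the migration independent of whatever schema the old table had. A sketch of applying it by hand through the `Upgrade` record's fields (the store path is illustrative):

```python
from pathlib import Path

from haiku.rag.store.engine import Store
from haiku.rag.store.upgrades.v0_10_1 import upgrade_add_title

store = Store(Path("notes.lancedb"))
print(upgrade_add_title.version, upgrade_add_title.description)
# Drops and recreates "documents", backfilling title=None for existing rows.
upgrade_add_title.apply(store)
```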
haiku/rag/utils.py
CHANGED
```diff
@@ -9,10 +9,6 @@ from io import BytesIO
 from pathlib import Path
 from types import ModuleType

-import httpx
-from docling.document_converter import DocumentConverter
-from docling_core.types.doc.document import DoclingDocument
-from docling_core.types.io import DocumentStream
 from packaging.version import Version, parse


@@ -82,6 +78,9 @@ async def is_up_to_date() -> tuple[bool, Version, Version]:
         the running version and the latest version.
     """

+    # Lazy import to avoid pulling httpx (and its deps) on module import
+    import httpx
+
    async with httpx.AsyncClient() as client:
        running_version = parse(metadata.version("haiku.rag"))
        try:
@@ -94,7 +93,7 @@ async def is_up_to_date() -> tuple[bool, Version, Version]:
        return running_version >= pypi_version, running_version, pypi_version


-def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocument:
+def text_to_docling_document(text: str, name: str = "content.md"):
     """Convert text content to a DoclingDocument.

     Args:
@@ -104,6 +103,10 @@ def text_to_docling_document(text: str, name: str = "content.md") -> DoclingDocument:
     Returns:
         A DoclingDocument created from the text content.
     """
+    # Lazy import docling deps to keep import-time light
+    from docling.document_converter import DocumentConverter  # type: ignore
+    from docling_core.types.io import DocumentStream  # type: ignore
+
    bytes_io = BytesIO(text.encode("utf-8"))
    doc_stream = DocumentStream(name=name, stream=bytes_io)
    converter = DocumentConverter()
@@ -160,3 +163,37 @@ def load_callable(path: str):
             f"Attribute '{func_name}' in module '{module_part}' is not callable"
         )
     return func
+
+
+def prefetch_models():
+    """Prefetch runtime models (Docling + Ollama as configured)."""
+    import httpx
+    from docling.utils.model_downloader import download_models
+
+    from haiku.rag.config import Config
+
+    download_models()
+
+    # Collect Ollama models from config
+    required_models: set[str] = set()
+    if Config.EMBEDDINGS_PROVIDER == "ollama":
+        required_models.add(Config.EMBEDDINGS_MODEL)
+    if Config.QA_PROVIDER == "ollama":
+        required_models.add(Config.QA_MODEL)
+    if Config.RESEARCH_PROVIDER == "ollama":
+        required_models.add(Config.RESEARCH_MODEL)
+    if Config.RERANK_PROVIDER == "ollama":
+        required_models.add(Config.RERANK_MODEL)
+
+    if not required_models:
+        return
+
+    base_url = Config.OLLAMA_BASE_URL
+
+    with httpx.Client(timeout=None) as client:
+        for model in sorted(required_models):
+            with client.stream(
+                "POST", f"{base_url}/api/pull", json={"model": model}
+            ) as r:
+                for _ in r.iter_lines():
+                    pass
```