rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
"""
|
|
2
|
+
KV Store - SQLite-Backed Key-Value Storage for Full Text
|
|
3
|
+
|
|
4
|
+
The Skeleton Index pattern requires separating:
|
|
5
|
+
- Summaries (stored in vector index for retrieval)
|
|
6
|
+
- Full Text (stored externally in this KV Store)
|
|
7
|
+
|
|
8
|
+
This prevents full text from polluting the LLM context until
|
|
9
|
+
explicitly requested during synthesis.
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
kv = SQLiteKVStore("./data/document_kv.db")
|
|
13
|
+
kv.put("node_123", "Full text content here...")
|
|
14
|
+
content = kv.get("node_123")
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import hashlib
|
|
20
|
+
import sqlite3
|
|
21
|
+
from contextlib import contextmanager
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Iterator, Union
|
|
24
|
+
|
|
25
|
+
import structlog
|
|
26
|
+
|
|
27
|
+
from rnsr.exceptions import IndexingError
|
|
28
|
+
|
|
29
|
+
logger = structlog.get_logger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class SQLiteKVStore:
    """
    SQLite-backed key-value store for document content.

    Stores full text content separately from the vector index,
    allowing the skeleton index to contain only summaries.

    Attributes:
        db_path: Path to the SQLite database file.
    """

    # SQLite's default bound-parameter limit (SQLITE_MAX_VARIABLE_NUMBER)
    # is 999 on older builds; stay safely below it when expanding the
    # IN (...) placeholder list in get_batch().
    _MAX_BATCH_PARAMS = 900

    def __init__(self, db_path: Path | str):
        """
        Initialize the KV store.

        Args:
            db_path: Path to the SQLite database file.
                Will be created if it doesn't exist.
        """
        self.db_path = Path(db_path)
        # Ensure the containing directory exists before sqlite3 touches it.
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

        self._init_db()

        logger.info("kv_store_initialized", db_path=str(self.db_path))

    def _init_db(self) -> None:
        """Create the database schema if it doesn't exist."""
        with self._connect() as conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS documents (
                    node_id TEXT PRIMARY KEY,
                    content TEXT NOT NULL,
                    content_hash TEXT NOT NULL,
                    char_count INTEGER NOT NULL,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_content_hash
                ON documents(content_hash)
            """)

            conn.commit()

    @contextmanager
    def _connect(self) -> Iterator[sqlite3.Connection]:
        """Context manager yielding a connection that is always closed."""
        conn = sqlite3.connect(self.db_path)
        # Row factory lets callers index rows by column name.
        conn.row_factory = sqlite3.Row
        try:
            yield conn
        finally:
            conn.close()

    def put(self, node_id: str, content: str) -> str:
        """
        Store content for a node (upsert).

        Args:
            node_id: Unique identifier for the node.
            content: Full text content to store.

        Returns:
            First 16 hex characters of the content's SHA-256 digest.
        """
        content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
        char_count = len(content)

        with self._connect() as conn:
            # Upsert: keep created_at, refresh updated_at on replacement.
            conn.execute(
                """
                INSERT INTO documents (node_id, content, content_hash, char_count)
                VALUES (?, ?, ?, ?)
                ON CONFLICT(node_id) DO UPDATE SET
                    content = excluded.content,
                    content_hash = excluded.content_hash,
                    char_count = excluded.char_count,
                    updated_at = CURRENT_TIMESTAMP
                """,
                (node_id, content, content_hash, char_count),
            )
            conn.commit()

        logger.debug(
            "kv_put",
            node_id=node_id,
            char_count=char_count,
            hash=content_hash,
        )

        return content_hash

    def get(self, node_id: str) -> str | None:
        """
        Retrieve content for a node.

        Args:
            node_id: Unique identifier for the node.

        Returns:
            Full text content, or None if not found.
        """
        with self._connect() as conn:
            cursor = conn.execute(
                "SELECT content FROM documents WHERE node_id = ?",
                (node_id,),
            )
            row = cursor.fetchone()

        if row is None:
            logger.debug("kv_miss", node_id=node_id)
            return None

        logger.debug("kv_hit", node_id=node_id)
        return row["content"]

    def get_batch(self, node_ids: list[str]) -> dict[str, str | None]:
        """
        Retrieve content for multiple nodes.

        Args:
            node_ids: List of node identifiers.

        Returns:
            Dictionary mapping node_id to content (or None if not found).
        """
        result: dict[str, str | None] = {nid: None for nid in node_ids}

        if not node_ids:
            return result

        with self._connect() as conn:
            # Query in chunks so a large batch never exceeds SQLite's
            # bound-parameter limit (999 by default on older builds).
            for start in range(0, len(node_ids), self._MAX_BATCH_PARAMS):
                chunk = node_ids[start : start + self._MAX_BATCH_PARAMS]
                placeholders = ",".join("?" * len(chunk))
                cursor = conn.execute(
                    f"SELECT node_id, content FROM documents WHERE node_id IN ({placeholders})",
                    chunk,
                )
                for row in cursor:
                    result[row["node_id"]] = row["content"]

        found = sum(1 for v in result.values() if v is not None)
        logger.debug("kv_batch_get", requested=len(node_ids), found=found)

        return result

    def delete(self, node_id: str) -> bool:
        """
        Delete content for a node.

        Args:
            node_id: Unique identifier for the node.

        Returns:
            True if deleted, False if not found.
        """
        with self._connect() as conn:
            cursor = conn.execute(
                "DELETE FROM documents WHERE node_id = ?",
                (node_id,),
            )
            conn.commit()
            deleted = cursor.rowcount > 0

        logger.debug("kv_delete", node_id=node_id, deleted=deleted)
        return deleted

    def exists(self, node_id: str) -> bool:
        """Check if a node exists in the store."""
        with self._connect() as conn:
            cursor = conn.execute(
                "SELECT 1 FROM documents WHERE node_id = ? LIMIT 1",
                (node_id,),
            )
            return cursor.fetchone() is not None

    def count(self) -> int:
        """Get the total number of stored documents."""
        with self._connect() as conn:
            cursor = conn.execute("SELECT COUNT(*) FROM documents")
            return cursor.fetchone()[0]

    def get_metadata(self, node_id: str) -> dict | None:
        """
        Get metadata about a stored document.

        Args:
            node_id: Unique identifier for the node.

        Returns:
            Dictionary with hash, char_count, timestamps, or None.
        """
        with self._connect() as conn:
            cursor = conn.execute(
                """
                SELECT content_hash, char_count, created_at, updated_at
                FROM documents WHERE node_id = ?
                """,
                (node_id,),
            )
            row = cursor.fetchone()

        if row is None:
            return None

        return {
            "content_hash": row["content_hash"],
            "char_count": row["char_count"],
            "created_at": row["created_at"],
            "updated_at": row["updated_at"],
        }

    def clear(self) -> int:
        """
        Delete all documents from the store.

        Returns:
            Number of documents deleted.
        """
        with self._connect() as conn:
            cursor = conn.execute("DELETE FROM documents")
            count = cursor.rowcount
            conn.commit()

        logger.warning("kv_store_cleared", count=count)
        return count
|
|
262
|
+
|
|
263
|
+
class InMemoryKVStore:
    """
    In-memory key-value store for testing and ephemeral usage.

    API-compatible with SQLiteKVStore.
    """

    def __init__(self):
        # node_id -> full text content
        self._store: dict[str, str] = {}
        # node_id -> {"content_hash": ..., "char_count": ...}
        self._metadata: dict[str, dict] = {}

    def put(self, node_id: str, content: str) -> str:
        """Store content; returns the first 16 hex chars of its SHA-256."""
        digest = hashlib.sha256(content.encode()).hexdigest()[:16]
        self._store[node_id] = content
        self._metadata[node_id] = {
            "content_hash": digest,
            "char_count": len(content),
        }
        return digest

    def get(self, node_id: str) -> str | None:
        """Return stored content, or None when the node is unknown."""
        return self._store.get(node_id)

    def get_batch(self, node_ids: list[str]) -> dict[str, str | None]:
        """Map each requested node_id to its content (None if missing)."""
        out: dict[str, str | None] = {}
        for nid in node_ids:
            out[nid] = self._store.get(nid)
        return out

    def delete(self, node_id: str) -> bool:
        """Remove a node; True when something was actually deleted."""
        if node_id not in self._store:
            return False
        self._store.pop(node_id)
        self._metadata.pop(node_id, None)
        return True

    def exists(self, node_id: str) -> bool:
        """True when the node has stored content."""
        return node_id in self._store

    def count(self) -> int:
        """Number of stored documents."""
        return len(self._store)

    def get_metadata(self, node_id: str) -> dict | None:
        """Hash/size metadata recorded at put() time, or None."""
        return self._metadata.get(node_id)

    def clear(self) -> int:
        """Drop everything; returns how many documents were removed."""
        removed = len(self._store)
        self._store.clear()
        self._metadata.clear()
        return removed
+
|
|
311
|
+
|
|
312
|
+
# Type alias accepted anywhere a KV store is needed: both classes expose the
# same put/get/get_batch/delete/exists/count/get_metadata/clear interface.
KVStore = Union[SQLiteKVStore, InMemoryKVStore]
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Persistence Module - Save and Load RNSR Indexes
|
|
3
|
+
|
|
4
|
+
Provides functionality to persist and restore:
|
|
5
|
+
- Skeleton Index (SkeletonNode structures)
|
|
6
|
+
- KV Store (already SQLite-backed, but needs export/import)
|
|
7
|
+
- Document metadata
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
from rnsr.indexing import save_index, load_index
|
|
11
|
+
|
|
12
|
+
# Save after indexing
|
|
13
|
+
skeleton, kv_store = build_skeleton_index(tree)
|
|
14
|
+
save_index(skeleton, kv_store, "./my_document_index/")
|
|
15
|
+
|
|
16
|
+
# Load later (no re-processing needed!)
|
|
17
|
+
skeleton, kv_store = load_index("./my_document_index/")
|
|
18
|
+
answer = run_navigator("question", skeleton, kv_store)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import json
|
|
24
|
+
import shutil
|
|
25
|
+
from dataclasses import asdict
|
|
26
|
+
from datetime import datetime
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
import structlog
|
|
31
|
+
|
|
32
|
+
from rnsr.exceptions import IndexingError
|
|
33
|
+
from rnsr.indexing.kv_store import InMemoryKVStore, KVStore, SQLiteKVStore
|
|
34
|
+
from rnsr.models import SkeletonNode
|
|
35
|
+
|
|
36
|
+
logger = structlog.get_logger(__name__)
|
|
37
|
+
|
|
38
|
+
# Version for format compatibility
|
|
39
|
+
INDEX_FORMAT_VERSION = "1.0"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def save_index(
    skeleton: dict[str, SkeletonNode],
    kv_store: KVStore,
    index_dir: str | Path,
    metadata: dict[str, Any] | None = None,
) -> Path:
    """
    Save a skeleton index and KV store to disk.

    Creates a directory structure:
        index_dir/
            manifest.json   # Version, metadata, timestamps
            skeleton.json   # SkeletonNode structures
            content.db      # SQLite KV store (copied or created)

    Args:
        skeleton: Dictionary of node_id -> SkeletonNode
        kv_store: KV store containing full text
        index_dir: Directory to save the index
        metadata: Optional metadata (title, source, etc.)

    Returns:
        Path to the index directory

    Raises:
        IndexingError: If the KV store is not a supported type.

    Example:
        skeleton, kv = build_skeleton_index(tree)
        save_index(skeleton, kv, "./indexes/contract_2024/")
    """
    index_path = Path(index_dir)
    index_path.mkdir(parents=True, exist_ok=True)

    # Build manifest
    manifest: dict[str, Any] = {
        "version": INDEX_FORMAT_VERSION,
        "created_at": datetime.now().isoformat(),
        "node_count": len(skeleton),
        "metadata": metadata or {},
    }

    # Find root node (level 0) for extra info; tolerate its absence.
    root = next((n for n in skeleton.values() if n.level == 0), None)
    if root:
        manifest["root_id"] = root.node_id
        manifest["root_header"] = root.header

    # Save manifest (utf-8 so non-ASCII headers survive on any platform)
    manifest_path = index_path / "manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

    # Save skeleton nodes
    skeleton_path = index_path / "skeleton.json"
    skeleton_data = {
        node_id: _skeleton_node_to_dict(node)
        for node_id, node in skeleton.items()
    }
    with open(skeleton_path, "w", encoding="utf-8") as f:
        json.dump(skeleton_data, f, indent=2)

    # Handle KV store
    content_path = index_path / "content.db"

    if isinstance(kv_store, SQLiteKVStore):
        # Copy the existing database unless source and target are the same
        # file; resolve() so differently-spelled paths compare equal and we
        # never copy a file onto itself.
        if kv_store.db_path.resolve() != content_path.resolve():
            shutil.copy2(kv_store.db_path, content_path)
    elif isinstance(kv_store, InMemoryKVStore):
        # Export in-memory store to SQLite. Compare against None (not
        # truthiness) so empty-string content is not silently dropped.
        sqlite_kv = SQLiteKVStore(content_path)
        for node_id in skeleton.keys():
            content = kv_store.get(node_id)
            if content is not None:
                sqlite_kv.put(node_id, content)
    else:
        raise IndexingError(f"Unsupported KV store type: {type(kv_store)}")

    logger.info(
        "index_saved",
        path=str(index_path),
        nodes=len(skeleton),
    )

    return index_path
|
+
|
|
126
|
+
|
|
127
|
+
def load_index(
    index_dir: str | Path,
    in_memory: bool = False,
) -> tuple[dict[str, SkeletonNode], KVStore]:
    """
    Load a skeleton index and KV store from disk.

    Args:
        index_dir: Directory containing the saved index
        in_memory: If True, load KV store into memory (faster but uses more RAM)

    Returns:
        Tuple of (skeleton dict, kv_store)

    Raises:
        IndexingError: If the index directory, manifest, skeleton file,
            or content database is missing.

    Example:
        skeleton, kv = load_index("./indexes/contract_2024/")
        answer = run_navigator("What are the payment terms?", skeleton, kv)
    """
    index_path = Path(index_dir)

    if not index_path.exists():
        raise IndexingError(f"Index directory not found: {index_path}")

    # Load and validate manifest
    manifest_path = index_path / "manifest.json"
    if not manifest_path.exists():
        raise IndexingError(f"Manifest not found: {manifest_path}")

    with open(manifest_path, encoding="utf-8") as f:
        manifest = json.load(f)

    version = manifest.get("version", "unknown")
    if version != INDEX_FORMAT_VERSION:
        # Other format versions may still load fine; warn instead of failing.
        logger.warning(
            "index_version_mismatch",
            expected=INDEX_FORMAT_VERSION,
            found=version,
        )

    # Load skeleton nodes
    skeleton_path = index_path / "skeleton.json"
    if not skeleton_path.exists():
        raise IndexingError(f"Skeleton index not found: {skeleton_path}")

    with open(skeleton_path, encoding="utf-8") as f:
        skeleton_data = json.load(f)

    skeleton: dict[str, SkeletonNode] = {
        node_id: _dict_to_skeleton_node(data)
        for node_id, data in skeleton_data.items()
    }

    # Load KV store
    content_path = index_path / "content.db"
    if not content_path.exists():
        raise IndexingError(f"Content database not found: {content_path}")

    kv_store: KVStore
    if in_memory:
        # Copy every node's content out of SQLite into RAM for faster access.
        # Compare against None so empty-string content survives the copy.
        sqlite_kv = SQLiteKVStore(content_path)
        kv_store = InMemoryKVStore()
        for node_id in skeleton.keys():
            content = sqlite_kv.get(node_id)
            if content is not None:
                kv_store.put(node_id, content)
    else:
        # Use SQLite directly
        kv_store = SQLiteKVStore(content_path)

    logger.info(
        "index_loaded",
        path=str(index_path),
        nodes=len(skeleton),
        version=version,
    )

    return skeleton, kv_store
204
|
+
|
|
205
|
+
|
|
206
|
+
def get_index_info(index_dir: str | Path) -> dict[str, Any]:
    """
    Get information about a saved index without loading it.

    Args:
        index_dir: Directory containing the saved index

    Returns:
        Dictionary with index metadata

    Raises:
        IndexingError: If no manifest exists in the directory.

    Example:
        info = get_index_info("./indexes/contract_2024/")
        print(f"Index has {info['node_count']} nodes")
    """
    index_path = Path(index_dir)
    manifest_path = index_path / "manifest.json"

    if not manifest_path.exists():
        raise IndexingError(f"Manifest not found: {manifest_path}")

    with open(manifest_path) as f:
        manifest = json.load(f)

    # Attach on-disk sizes for the files that are present.
    size_fields = (
        ("content.db", "content_size_bytes"),
        ("skeleton.json", "skeleton_size_bytes"),
    )
    for filename, key in size_fields:
        file_path = index_path / filename
        if file_path.exists():
            manifest[key] = file_path.stat().st_size

    return manifest
239
|
+
|
|
240
|
+
|
|
241
|
+
def delete_index(index_dir: str | Path) -> bool:
    """
    Delete a saved index.

    Args:
        index_dir: Directory containing the saved index

    Returns:
        True if deleted, False if not found
    """
    target = Path(index_dir)

    # Nothing to do when the directory never existed.
    if not target.exists():
        return False

    shutil.rmtree(target)
    logger.info("index_deleted", path=str(target))
    return True
259
|
+
|
|
260
|
+
|
|
261
|
+
def list_indexes(base_dir: str | Path) -> list[dict[str, Any]]:
    """
    List all indexes in a directory.

    Args:
        base_dir: Directory to search for indexes

    Returns:
        List of index info dictionaries

    Example:
        indexes = list_indexes("./indexes/")
        for idx in indexes:
            print(f"{idx['path']}: {idx['node_count']} nodes")
    """
    base_path = Path(base_dir)
    found: list[dict[str, Any]] = []

    if not base_path.exists():
        return found

    for entry in base_path.iterdir():
        # Only subdirectories carrying a manifest count as indexes.
        if not (entry.is_dir() and (entry / "manifest.json").exists()):
            continue
        try:
            info = get_index_info(entry)
            info["path"] = str(entry)
            found.append(info)
        except Exception as exc:
            # A single corrupt index shouldn't break the listing.
            logger.warning("failed_to_read_index", path=str(entry), error=str(exc))

    return found
292
|
+
|
|
293
|
+
|
|
294
|
+
# =============================================================================
|
|
295
|
+
# Serialization Helpers
|
|
296
|
+
# =============================================================================
|
|
297
|
+
|
|
298
|
+
def _skeleton_node_to_dict(node: SkeletonNode) -> dict[str, Any]:
    """Convert SkeletonNode to JSON-serializable dict."""
    field_names = (
        "node_id",
        "parent_id",
        "level",
        "header",
        "summary",
        "child_ids",
        "page_num",
        "metadata",
    )
    return {name: getattr(node, name) for name in field_names}
310
|
+
|
|
311
|
+
|
|
312
|
+
def _dict_to_skeleton_node(data: dict[str, Any]) -> SkeletonNode:
    """Convert dict back to SkeletonNode."""
    # node_id and level are required; everything else falls back to a
    # sensible default so older skeleton.json files still load.
    kwargs = {
        "node_id": data["node_id"],
        "parent_id": data.get("parent_id"),
        "level": data["level"],
        "header": data.get("header", ""),
        "summary": data.get("summary", ""),
        "child_ids": data.get("child_ids", []),
        "page_num": data.get("page_num"),
        "metadata": data.get("metadata", {}),
    }
    return SkeletonNode(**kwargs)