rnsr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/document_store.py
ADDED
@@ -0,0 +1,394 @@
"""
Document Store - Multi-Document Management

Provides a high-level interface for managing multiple indexed documents.
Handles persistence, loading, and querying across a document collection.

Usage:
    from rnsr import DocumentStore

    # Create or open a document store
    store = DocumentStore("./my_documents/")

    # Add documents
    store.add_document("contract.pdf")
    store.add_document("report.pdf", metadata={"year": 2024})

    # Query a specific document
    answer = store.query("contract", "What are the payment terms?")

    # List all documents
    for doc in store.list_documents():
        print(f"{doc['id']}: {doc['title']}")
"""

from __future__ import annotations

import hashlib
import json
import shutil
from dataclasses import dataclass, field, asdict
from datetime import datetime
from pathlib import Path
from typing import Any, Iterator

import structlog

from rnsr.exceptions import IndexingError
from rnsr.indexing.kv_store import KVStore, SQLiteKVStore
from rnsr.indexing.persistence import (
    save_index,
    load_index,
    get_index_info,
    delete_index,
)
from rnsr.indexing.skeleton_index import build_skeleton_index
from rnsr.ingestion import ingest_document
from rnsr.models import SkeletonNode

logger = structlog.get_logger(__name__)


@dataclass
class DocumentInfo:
    """Information about an indexed document."""

    id: str
    title: str
    source_path: str | None
    node_count: int
    created_at: str
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        return asdict(self)


class DocumentStore:
    """
    Manages a collection of indexed documents.

    Provides:
    - Add/remove documents
    - Persistent storage
    - Query individual documents
    - List and search documents

    Example:
        store = DocumentStore("./documents/")
        store.add_document("contract.pdf")
        answer = store.query("contract", "What are the terms?")
    """

    def __init__(self, store_path: str | Path):
        """
        Initialize or open a document store.

        Args:
            store_path: Directory for storing document indexes
        """
        self.store_path = Path(store_path)
        self.store_path.mkdir(parents=True, exist_ok=True)

        self._catalog_path = self.store_path / "catalog.json"
        self._catalog: dict[str, DocumentInfo] = {}

        # Load existing catalog if present
        if self._catalog_path.exists():
            self._load_catalog()

        logger.info(
            "document_store_initialized",
            path=str(self.store_path),
            documents=len(self._catalog),
        )

    def _load_catalog(self) -> None:
        """Load the document catalog from disk."""
        with open(self._catalog_path) as f:
            data = json.load(f)

        self._catalog = {
            doc_id: DocumentInfo(**info)
            for doc_id, info in data.get("documents", {}).items()
        }

    def _save_catalog(self) -> None:
        """Save the document catalog to disk."""
        data = {
            "version": "1.0",
            "updated_at": datetime.now().isoformat(),
            "documents": {
                doc_id: info.to_dict()
                for doc_id, info in self._catalog.items()
            }
        }

        with open(self._catalog_path, "w") as f:
            json.dump(data, f, indent=2)

    def add_document(
        self,
        source: str | Path,
        doc_id: str | None = None,
        title: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> str:
        """
        Add and index a document.

        Args:
            source: Path to PDF file
            doc_id: Optional custom ID (defaults to filename hash)
            title: Optional title (defaults to filename)
            metadata: Optional metadata dictionary

        Returns:
            Document ID

        Example:
            doc_id = store.add_document("report.pdf", metadata={"year": 2024})
        """
        source_path = Path(source)

        if not source_path.exists():
            raise IndexingError(f"Source file not found: {source_path}")

        # Generate ID if not provided
        if doc_id is None:
            # Hash of filename + file size for uniqueness
            hash_input = f"{source_path.name}_{source_path.stat().st_size}"
            doc_id = hashlib.md5(hash_input.encode()).hexdigest()[:12]

        # Check if already exists
        if doc_id in self._catalog:
            logger.warning("document_already_exists", doc_id=doc_id)
            return doc_id

        # Ingest document
        logger.info("ingesting_document", source=str(source_path))
        result = ingest_document(str(source_path))

        # Build skeleton index
        skeleton, kv_store = build_skeleton_index(result.tree)

        # Save to store
        index_path = self.store_path / doc_id
        save_index(skeleton, kv_store, index_path)

        # Update catalog
        info = DocumentInfo(
            id=doc_id,
            title=title or source_path.stem,
            source_path=str(source_path),
            node_count=len(skeleton),
            created_at=datetime.now().isoformat(),
            metadata=metadata or {},
        )
        self._catalog[doc_id] = info
        self._save_catalog()

        logger.info(
            "document_added",
            doc_id=doc_id,
            title=info.title,
            nodes=info.node_count,
        )

        return doc_id

    def add_from_text(
        self,
        text: str | list[str],
        doc_id: str,
        title: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> str:
        """
        Add and index a document from raw text.

        Args:
            text: Text content or list of text chunks
            doc_id: Document ID
            title: Optional title
            metadata: Optional metadata

        Returns:
            Document ID
        """
        from rnsr.ingestion import build_tree_from_text

        # Check if already exists
        if doc_id in self._catalog:
            logger.warning("document_already_exists", doc_id=doc_id)
            return doc_id

        # Build tree from text
        tree = build_tree_from_text(text)

        # Build skeleton index
        skeleton, kv_store = build_skeleton_index(tree)

        # Save to store
        index_path = self.store_path / doc_id
        save_index(skeleton, kv_store, index_path)

        # Update catalog
        info = DocumentInfo(
            id=doc_id,
            title=title or doc_id,
            source_path=None,
            node_count=len(skeleton),
            created_at=datetime.now().isoformat(),
            metadata=metadata or {},
        )
        self._catalog[doc_id] = info
        self._save_catalog()

        logger.info(
            "document_added_from_text",
            doc_id=doc_id,
            title=info.title,
            nodes=info.node_count,
        )

        return doc_id

    def remove_document(self, doc_id: str) -> bool:
        """
        Remove a document from the store.

        Args:
            doc_id: Document ID to remove

        Returns:
            True if removed, False if not found
        """
        if doc_id not in self._catalog:
            return False

        # Delete index files
        index_path = self.store_path / doc_id
        delete_index(index_path)

        # Remove from catalog
        del self._catalog[doc_id]
        self._save_catalog()

        logger.info("document_removed", doc_id=doc_id)
        return True

    def get_document(
        self,
        doc_id: str,
    ) -> tuple[dict[str, SkeletonNode], KVStore] | None:
        """
        Load a document's index.

        Args:
            doc_id: Document ID

        Returns:
            Tuple of (skeleton, kv_store) or None if not found
        """
        if doc_id not in self._catalog:
            return None

        index_path = self.store_path / doc_id
        return load_index(index_path)

    def query(
        self,
        doc_id: str,
        question: str,
    ) -> str:
        """
        Query a document.

        Args:
            doc_id: Document ID
            question: Question to ask

        Returns:
            Answer string

        Example:
            answer = store.query("contract_123", "What are the payment terms?")
        """
        from rnsr.agent import run_navigator

        index_result = self.get_document(doc_id)
        if index_result is None:
            raise IndexingError(f"Document not found: {doc_id}")

        skeleton, kv_store = index_result
        nav_result = run_navigator(question, skeleton, kv_store)
        return nav_result.get("answer", "No answer found.")

    def list_documents(self) -> list[dict[str, Any]]:
        """
        List all documents in the store.

        Returns:
            List of document info dictionaries
        """
        return [info.to_dict() for info in self._catalog.values()]

    def get_document_info(self, doc_id: str) -> DocumentInfo | None:
        """
        Get information about a document.

        Args:
            doc_id: Document ID

        Returns:
            DocumentInfo or None if not found
        """
        return self._catalog.get(doc_id)

    def search_documents(
        self,
        query: str | None = None,
        metadata_filter: dict[str, Any] | None = None,
    ) -> list[DocumentInfo]:
        """
        Search documents by title or metadata.

        Args:
            query: Optional text to search in titles
            metadata_filter: Optional metadata key-value pairs to match

        Returns:
            List of matching DocumentInfo objects
        """
        results = []

        for info in self._catalog.values():
            # Title search
            if query and query.lower() not in info.title.lower():
                continue

            # Metadata filter
            if metadata_filter:
                match = all(
                    info.metadata.get(k) == v
                    for k, v in metadata_filter.items()
                )
                if not match:
                    continue

            results.append(info)

        return results

    def __len__(self) -> int:
        """Number of documents in the store."""
        return len(self._catalog)

    def __contains__(self, doc_id: str) -> bool:
        """Check if a document exists."""
        return doc_id in self._catalog

    def __iter__(self) -> Iterator[str]:
        """Iterate over document IDs."""
        return iter(self._catalog.keys())
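Taken together, `DocumentStore` is a thin catalog (`catalog.json` plus one saved index directory per document ID) over the package's ingestion and navigation pipelines. A minimal usage sketch of the public surface above; the file names and metadata values are illustrative:

```python
from rnsr import DocumentStore

store = DocumentStore("./my_documents/")

# PDF path: doc_id defaults to a 12-char MD5 of "<filename>_<size>"
doc_id = store.add_document("report.pdf", metadata={"year": 2024})

# Raw-text path: an explicit doc_id is required
store.add_from_text("Section 1. Terms ...", doc_id="notes", title="Notes")

# Catalog-level helpers
print(len(store), "notes" in store)  # __len__ / __contains__
for info in store.search_documents(metadata_filter={"year": 2024}):
    print(info.id, info.title)

# query() loads the saved index and delegates to rnsr.agent.run_navigator
answer = store.query(doc_id, "What are the payment terms?")
```

Note that adding a document whose ID is already in the catalog is a no-op: the store logs `document_already_exists` and returns the existing ID rather than re-indexing.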
rnsr/exceptions.py
ADDED
@@ -0,0 +1,74 @@
"""
RNSR Custom Exceptions

All module-specific exceptions inherit from RNSRError.
"""


class RNSRError(Exception):
    """Base exception for all RNSR errors."""

    pass


# Ingestion Exceptions
class IngestionError(RNSRError):
    """Base exception for ingestion errors."""

    pass


class FontAnalysisError(IngestionError):
    """Raised when font histogram analysis fails."""

    pass


class SegmentationError(IngestionError):
    """Raised when page segmentation fails."""

    pass


class OCRError(IngestionError):
    """Raised when OCR fallback fails."""

    pass


# Indexing Exceptions
class IndexingError(RNSRError):
    """Base exception for indexing errors."""

    pass


class SummaryGenerationError(IndexingError):
    """Raised when LLM summary generation fails."""

    pass


class KVStoreError(IndexingError):
    """Raised when KV store operations fail."""

    pass


# Agent Exceptions
class AgentError(RNSRError):
    """Base exception for agent errors."""

    pass


class VariableNotFoundError(AgentError):
    """Raised when a variable pointer cannot be resolved."""

    pass


class NavigationError(AgentError):
    """Raised when document navigation fails."""

    pass
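Since every exception derives from `RNSRError`, callers can catch at whichever granularity suits them. A short sketch (the missing document ID is invented):

```python
from rnsr import DocumentStore
from rnsr.exceptions import IndexingError, RNSRError

store = DocumentStore("./documents/")

try:
    store.query("no-such-doc", "What are the terms?")
except IndexingError as exc:
    # DocumentStore.query raises IndexingError for unknown IDs
    print(f"index problem: {exc}")
except RNSRError:
    # Fallback for any other rnsr failure (ingestion, agent, ...)
    raise
```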
rnsr/extraction/__init__.py
ADDED
@@ -0,0 +1,172 @@
"""
RNSR Extraction Module

Entity and relationship extraction for ontological document understanding.

## Recommended: RLMUnifiedExtractor

Use the unified RLM extractor for all extraction needs:

```python
from rnsr.extraction import RLMUnifiedExtractor, extract_entities_and_relationships

# Simple API
result = extract_entities_and_relationships(node_id, doc_id, header, content)

# Full control
extractor = RLMUnifiedExtractor()
result = extractor.extract(node_id, doc_id, header, content)
```

This extractor:
1. LLM writes extraction code based on document (adaptive)
2. Code executes on DOC_VAR (grounded in text)
3. ToT validates with probabilities (accurate)
4. Cross-validates entities and relationships (comprehensive)
5. Learns new types from usage (domain-adaptive)

## Adaptive Learning

The system learns from your document workload:
- Entity types: `LearnedTypeRegistry`
- Relationship types: `LearnedRelationshipTypeRegistry`
- Normalization patterns: `LearnedNormalizationPatterns`
- Stop words: `LearnedStopWords`
- Header thresholds: `LearnedHeaderThresholds`
- Query patterns: `LearnedQueryPatterns`

All learned data persists in `~/.rnsr/`.
"""

from rnsr.extraction.models import (
    Entity,
    EntityLink,
    EntityType,
    ExtractionResult,
    Mention,
    Relationship,
    RelationType,
)

# Primary extractor (recommended)
from rnsr.extraction.rlm_unified_extractor import (
    RLMUnifiedExtractor,
    RLMUnifiedResult,
    extract_entities_and_relationships,
)

# Legacy/alternative extractors
from rnsr.extraction.entity_extractor import (
    EntityExtractor,  # DEPRECATED
    merge_entities,
)
from rnsr.extraction.grounded_extractor import (
    GroundedEntityExtractor,
    ValidationMode,
)
from rnsr.extraction.unified_extractor import (
    UnifiedGroundedExtractor,
    UnifiedExtractionResult,
)
from rnsr.extraction.rlm_extractor import (
    RLMEntityExtractor,
    RLMExtractionResult,
    LightweightREPL,
)
from rnsr.extraction.tot_validator import (
    TotEntityValidator,
    TotBatchResult,
    TotValidationResult,
)
from rnsr.extraction.relationship_validator import (
    RelationshipValidator,
    RelationshipValidationResult,
    RelationshipBatchResult,
)
from rnsr.extraction.candidate_extractor import (
    CandidateExtractor,
    EntityCandidate,
    extract_candidates_from_text,
)
from rnsr.extraction.relationship_patterns import (
    RelationshipPatternExtractor,
    RelationshipCandidate,
    extract_relationship_candidates,
)
from rnsr.extraction.relationship_extractor import (
    RelationshipExtractor,  # DEPRECATED
    extract_implicit_relationships,
)
from rnsr.extraction.entity_linker import (
    EntityLinker,
    LearnedNormalizationPatterns,
    get_learned_normalization_patterns,
)
from rnsr.extraction.learned_types import (
    LearnedTypeRegistry,
    LearnedRelationshipTypeRegistry,
    get_learned_type_registry,
    get_learned_relationship_type_registry,
    record_learned_type,
    record_learned_relationship_type,
)

__all__ = [
    # Models
    "Entity",
    "EntityLink",
    "EntityType",
    "ExtractionResult",
    "Mention",
    "Relationship",
    "RelationType",

    # PRIMARY EXTRACTOR (recommended)
    "RLMUnifiedExtractor",
    "RLMUnifiedResult",
    "extract_entities_and_relationships",  # Simple function API

    # Alternative extractors
    "UnifiedGroundedExtractor",
    "UnifiedExtractionResult",
    "RLMEntityExtractor",
    "RLMExtractionResult",
    "GroundedEntityExtractor",
    "ValidationMode",

    # Legacy extractors (DEPRECATED - emit warnings)
    "EntityExtractor",
    "RelationshipExtractor",

    # Supporting components
    "CandidateExtractor",
    "RelationshipPatternExtractor",
    "EntityLinker",
    "TotEntityValidator",
    "TotBatchResult",
    "TotValidationResult",
    "RelationshipValidator",
    "RelationshipValidationResult",
    "RelationshipBatchResult",
    "LightweightREPL",

    # Data classes
    "EntityCandidate",
    "RelationshipCandidate",

    # Adaptive Learning Registries
    "LearnedTypeRegistry",
    "LearnedRelationshipTypeRegistry",
    "LearnedNormalizationPatterns",
    "get_learned_type_registry",
    "get_learned_relationship_type_registry",
    "get_learned_normalization_patterns",
    "record_learned_type",
    "record_learned_relationship_type",

    # Utility functions
    "merge_entities",
    "extract_implicit_relationships",
    "extract_candidates_from_text",
    "extract_relationship_candidates",
]
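Per the module docstring, the recommended entry points are `extract_entities_and_relationships` and `RLMUnifiedExtractor.extract`, both called with `(node_id, doc_id, header, content)`. A hedged sketch: the IDs and section text below are invented, and the fields of the returned `RLMUnifiedResult` are not shown in this diff:

```python
from rnsr.extraction import (
    RLMUnifiedExtractor,
    extract_entities_and_relationships,
    get_learned_type_registry,
)

# Simple function API; arguments ordered as in the module docstring
result = extract_entities_and_relationships(
    "sec-1",         # node_id (illustrative)
    "contract_123",  # doc_id (illustrative)
    "1. Payment Terms",
    "Acme Corp shall pay Vendor within 30 days of invoice.",
)

# Reusable extractor for repeated calls across nodes
extractor = RLMUnifiedExtractor()
result = extractor.extract("sec-1", "contract_123", "1. Payment Terms", "...")

# Entity types learned from the workload persist under ~/.rnsr/
registry = get_learned_type_registry()
```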