rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/document_store.py ADDED
@@ -0,0 +1,394 @@
+ """
+ Document Store - Multi-Document Management
+
+ Provides a high-level interface for managing multiple indexed documents.
+ Handles persistence, loading, and querying across a document collection.
+
+ Usage:
+     from rnsr import DocumentStore
+
+     # Create or open a document store
+     store = DocumentStore("./my_documents/")
+
+     # Add documents
+     store.add_document("contract.pdf")
+     store.add_document("report.pdf", metadata={"year": 2024})
+
+     # Query a specific document
+     answer = store.query("contract", "What are the payment terms?")
+
+     # List all documents
+     for doc in store.list_documents():
+         print(f"{doc['id']}: {doc['title']}")
+ """
+
+ from __future__ import annotations
+
+ import hashlib
+ import json
+ import shutil
+ from dataclasses import dataclass, field, asdict
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Iterator
+
+ import structlog
+
+ from rnsr.exceptions import IndexingError
+ from rnsr.indexing.kv_store import KVStore, SQLiteKVStore
+ from rnsr.indexing.persistence import (
+     save_index,
+     load_index,
+     get_index_info,
+     delete_index,
+ )
+ from rnsr.indexing.skeleton_index import build_skeleton_index
+ from rnsr.ingestion import ingest_document
+ from rnsr.models import SkeletonNode
+
+ logger = structlog.get_logger(__name__)
+
+
+ @dataclass
+ class DocumentInfo:
+     """Information about an indexed document."""
+
+     id: str
+     title: str
+     source_path: str | None
+     node_count: int
+     created_at: str
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def to_dict(self) -> dict[str, Any]:
+         return asdict(self)
+
+
+ class DocumentStore:
+     """
+     Manages a collection of indexed documents.
+
+     Provides:
+     - Add/remove documents
+     - Persistent storage
+     - Query individual documents
+     - List and search documents
+
+     Example:
+         store = DocumentStore("./documents/")
+         store.add_document("contract.pdf")
+         answer = store.query("contract", "What are the terms?")
+     """
+
+     def __init__(self, store_path: str | Path):
+         """
+         Initialize or open a document store.
+
+         Args:
+             store_path: Directory for storing document indexes
+         """
+         self.store_path = Path(store_path)
+         self.store_path.mkdir(parents=True, exist_ok=True)
+
+         self._catalog_path = self.store_path / "catalog.json"
+         self._catalog: dict[str, DocumentInfo] = {}
+
+         # Load existing catalog if present
+         if self._catalog_path.exists():
+             self._load_catalog()
+
+         logger.info(
+             "document_store_initialized",
+             path=str(self.store_path),
+             documents=len(self._catalog),
+         )
+
+     def _load_catalog(self) -> None:
+         """Load the document catalog from disk."""
+         with open(self._catalog_path) as f:
+             data = json.load(f)
+
+         self._catalog = {
+             doc_id: DocumentInfo(**info)
+             for doc_id, info in data.get("documents", {}).items()
+         }
+
+     def _save_catalog(self) -> None:
+         """Save the document catalog to disk."""
+         data = {
+             "version": "1.0",
+             "updated_at": datetime.now().isoformat(),
+             "documents": {
+                 doc_id: info.to_dict()
+                 for doc_id, info in self._catalog.items()
+             },
+         }
+
+         with open(self._catalog_path, "w") as f:
+             json.dump(data, f, indent=2)
+
+     def add_document(
+         self,
+         source: str | Path,
+         doc_id: str | None = None,
+         title: str | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> str:
+         """
+         Add and index a document.
+
+         Args:
+             source: Path to PDF file
+             doc_id: Optional custom ID (defaults to filename hash)
+             title: Optional title (defaults to filename)
+             metadata: Optional metadata dictionary
+
+         Returns:
+             Document ID
+
+         Example:
+             doc_id = store.add_document("report.pdf", metadata={"year": 2024})
+         """
+         source_path = Path(source)
+
+         if not source_path.exists():
+             raise IndexingError(f"Source file not found: {source_path}")
+
+         # Generate ID if not provided
+         if doc_id is None:
+             # Hash of filename + file size for uniqueness
+             hash_input = f"{source_path.name}_{source_path.stat().st_size}"
+             doc_id = hashlib.md5(hash_input.encode()).hexdigest()[:12]
+
+         # Check if already exists
+         if doc_id in self._catalog:
+             logger.warning("document_already_exists", doc_id=doc_id)
+             return doc_id
+
+         # Ingest document
+         logger.info("ingesting_document", source=str(source_path))
+         result = ingest_document(str(source_path))
+
+         # Build skeleton index
+         skeleton, kv_store = build_skeleton_index(result.tree)
+
+         # Save to store
+         index_path = self.store_path / doc_id
+         save_index(skeleton, kv_store, index_path)
+
+         # Update catalog
+         info = DocumentInfo(
+             id=doc_id,
+             title=title or source_path.stem,
+             source_path=str(source_path),
+             node_count=len(skeleton),
+             created_at=datetime.now().isoformat(),
+             metadata=metadata or {},
+         )
+         self._catalog[doc_id] = info
+         self._save_catalog()
+
+         logger.info(
+             "document_added",
+             doc_id=doc_id,
+             title=info.title,
+             nodes=info.node_count,
+         )
+
+         return doc_id
+
+     def add_from_text(
+         self,
+         text: str | list[str],
+         doc_id: str,
+         title: str | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> str:
+         """
+         Add and index a document from raw text.
+
+         Args:
+             text: Text content or list of text chunks
+             doc_id: Document ID
+             title: Optional title
+             metadata: Optional metadata
+
+         Returns:
+             Document ID
+         """
+         from rnsr.ingestion import build_tree_from_text
+
+         # Check if already exists
+         if doc_id in self._catalog:
+             logger.warning("document_already_exists", doc_id=doc_id)
+             return doc_id
+
+         # Build tree from text
+         tree = build_tree_from_text(text)
+
+         # Build skeleton index
+         skeleton, kv_store = build_skeleton_index(tree)
+
+         # Save to store
+         index_path = self.store_path / doc_id
+         save_index(skeleton, kv_store, index_path)
+
+         # Update catalog
+         info = DocumentInfo(
+             id=doc_id,
+             title=title or doc_id,
+             source_path=None,
+             node_count=len(skeleton),
+             created_at=datetime.now().isoformat(),
+             metadata=metadata or {},
+         )
+         self._catalog[doc_id] = info
+         self._save_catalog()
+
+         logger.info(
+             "document_added_from_text",
+             doc_id=doc_id,
+             title=info.title,
+             nodes=info.node_count,
+         )
+
+         return doc_id
+
+     def remove_document(self, doc_id: str) -> bool:
+         """
+         Remove a document from the store.
+
+         Args:
+             doc_id: Document ID to remove
+
+         Returns:
+             True if removed, False if not found
+         """
+         if doc_id not in self._catalog:
+             return False
+
+         # Delete index files
+         index_path = self.store_path / doc_id
+         delete_index(index_path)
+
+         # Remove from catalog
+         del self._catalog[doc_id]
+         self._save_catalog()
+
+         logger.info("document_removed", doc_id=doc_id)
+         return True
+
+     def get_document(
+         self,
+         doc_id: str,
+     ) -> tuple[dict[str, SkeletonNode], KVStore] | None:
+         """
+         Load a document's index.
+
+         Args:
+             doc_id: Document ID
+
+         Returns:
+             Tuple of (skeleton, kv_store) or None if not found
+         """
+         if doc_id not in self._catalog:
+             return None
+
+         index_path = self.store_path / doc_id
+         return load_index(index_path)
+
+     def query(
+         self,
+         doc_id: str,
+         question: str,
+     ) -> str:
+         """
+         Query a document.
+
+         Args:
+             doc_id: Document ID
+             question: Question to ask
+
+         Returns:
+             Answer string
+
+         Example:
+             answer = store.query("contract_123", "What are the payment terms?")
+         """
+         from rnsr.agent import run_navigator
+
+         index_result = self.get_document(doc_id)
+         if index_result is None:
+             raise IndexingError(f"Document not found: {doc_id}")
+
+         skeleton, kv_store = index_result
+         nav_result = run_navigator(question, skeleton, kv_store)
+         return nav_result.get("answer", "No answer found.")
+
+     def list_documents(self) -> list[dict[str, Any]]:
+         """
+         List all documents in the store.
+
+         Returns:
+             List of document info dictionaries
+         """
+         return [info.to_dict() for info in self._catalog.values()]
+
+     def get_document_info(self, doc_id: str) -> DocumentInfo | None:
+         """
+         Get information about a document.
+
+         Args:
+             doc_id: Document ID
+
+         Returns:
+             DocumentInfo or None if not found
+         """
+         return self._catalog.get(doc_id)
+
+     def search_documents(
+         self,
+         query: str | None = None,
+         metadata_filter: dict[str, Any] | None = None,
+     ) -> list[DocumentInfo]:
+         """
+         Search documents by title or metadata.
+
+         Args:
+             query: Optional text to search in titles
+             metadata_filter: Optional metadata key-value pairs to match
+
+         Returns:
+             List of matching DocumentInfo objects
+         """
+         results = []
+
+         for info in self._catalog.values():
+             # Title search
+             if query and query.lower() not in info.title.lower():
+                 continue
+
+             # Metadata filter
+             if metadata_filter:
+                 match = all(
+                     info.metadata.get(k) == v
+                     for k, v in metadata_filter.items()
+                 )
+                 if not match:
+                     continue
+
+             results.append(info)
+
+         return results
+
+     def __len__(self) -> int:
+         """Number of documents in the store."""
+         return len(self._catalog)
+
+     def __contains__(self, doc_id: str) -> bool:
+         """Check if a document exists."""
+         return doc_id in self._catalog
+
+     def __iter__(self) -> Iterator[str]:
+         """Iterate over document IDs."""
+         return iter(self._catalog.keys())
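
A minimal end-to-end sketch of the `DocumentStore` API added above. The store directory, file names, metadata, and question are illustrative only, and `contract.pdf` is assumed to exist on disk:

```python
from rnsr import DocumentStore

store = DocumentStore("./my_documents/")  # creates catalog.json on first use

# Index a PDF; the returned ID is a 12-char hash unless doc_id= is given
doc_id = store.add_document("contract.pdf", metadata={"year": 2024})

# Membership, size, and iteration use the dunder methods defined above
assert doc_id in store
print(len(store), list(store))

# Metadata search matches exact key-value pairs; query= filters titles
for info in store.search_documents(metadata_filter={"year": 2024}):
    print(info.id, info.title)

answer = store.query(doc_id, "What are the payment terms?")
print(answer)
```
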
rnsr/exceptions.py ADDED
@@ -0,0 +1,74 @@
+ """
+ RNSR Custom Exceptions
+
+ All module-specific exceptions inherit from RNSRError.
+ """
+
+
+ class RNSRError(Exception):
+     """Base exception for all RNSR errors."""
+
+     pass
+
+
+ # Ingestion Exceptions
+ class IngestionError(RNSRError):
+     """Base exception for ingestion errors."""
+
+     pass
+
+
+ class FontAnalysisError(IngestionError):
+     """Raised when font histogram analysis fails."""
+
+     pass
+
+
+ class SegmentationError(IngestionError):
+     """Raised when page segmentation fails."""
+
+     pass
+
+
+ class OCRError(IngestionError):
+     """Raised when OCR fallback fails."""
+
+     pass
+
+
+ # Indexing Exceptions
+ class IndexingError(RNSRError):
+     """Base exception for indexing errors."""
+
+     pass
+
+
+ class SummaryGenerationError(IndexingError):
+     """Raised when LLM summary generation fails."""
+
+     pass
+
+
+ class KVStoreError(IndexingError):
+     """Raised when KV store operations fail."""
+
+     pass
+
+
+ # Agent Exceptions
+ class AgentError(RNSRError):
+     """Base exception for agent errors."""
+
+     pass
+
+
+ class VariableNotFoundError(AgentError):
+     """Raised when a variable pointer cannot be resolved."""
+
+     pass
+
+
+ class NavigationError(AgentError):
+     """Raised when document navigation fails."""
+
+     pass
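
Because every exception above derives from `RNSRError`, callers can catch as narrowly or broadly as they like. A short sketch; the missing-file case is the one `add_document` raises in `document_store.py` above:

```python
from rnsr import DocumentStore
from rnsr.exceptions import IndexingError, RNSRError

store = DocumentStore("./my_documents/")

try:
    store.add_document("missing.pdf")  # raises IndexingError if the file is absent
except IndexingError as exc:
    print(f"Indexing failed: {exc}")
except RNSRError:
    # Catch-all for any other RNSR failure (ingestion, agent, ...)
    raise
```
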
rnsr/extraction/__init__.py ADDED
@@ -0,0 +1,172 @@
+ """
+ RNSR Extraction Module
+
+ Entity and relationship extraction for ontological document understanding.
+
+ ## Recommended: RLMUnifiedExtractor
+
+ Use the unified RLM extractor for all extraction needs:
+
+ ```python
+ from rnsr.extraction import RLMUnifiedExtractor, extract_entities_and_relationships
+
+ # Simple API
+ result = extract_entities_and_relationships(node_id, doc_id, header, content)
+
+ # Full control
+ extractor = RLMUnifiedExtractor()
+ result = extractor.extract(node_id, doc_id, header, content)
+ ```
+
+ This extractor:
+ 1. Has the LLM write extraction code tailored to the document (adaptive)
+ 2. Executes that code on DOC_VAR (grounded in the text)
+ 3. Validates with ToT probabilities (accurate)
+ 4. Cross-validates entities and relationships (comprehensive)
+ 5. Learns new types from usage (domain-adaptive)
+
+ ## Adaptive Learning
+
+ The system learns from your document workload:
+ - Entity types: `LearnedTypeRegistry`
+ - Relationship types: `LearnedRelationshipTypeRegistry`
+ - Normalization patterns: `LearnedNormalizationPatterns`
+ - Stop words: `LearnedStopWords`
+ - Header thresholds: `LearnedHeaderThresholds`
+ - Query patterns: `LearnedQueryPatterns`
+
+ All learned data persists in `~/.rnsr/`.
+ """
+
+ from rnsr.extraction.models import (
+     Entity,
+     EntityLink,
+     EntityType,
+     ExtractionResult,
+     Mention,
+     Relationship,
+     RelationType,
+ )
+
+ # Primary extractor (recommended)
+ from rnsr.extraction.rlm_unified_extractor import (
+     RLMUnifiedExtractor,
+     RLMUnifiedResult,
+     extract_entities_and_relationships,
+ )
+
+ # Legacy/alternative extractors
+ from rnsr.extraction.entity_extractor import (
+     EntityExtractor,  # DEPRECATED
+     merge_entities,
+ )
+ from rnsr.extraction.grounded_extractor import (
+     GroundedEntityExtractor,
+     ValidationMode,
+ )
+ from rnsr.extraction.unified_extractor import (
+     UnifiedGroundedExtractor,
+     UnifiedExtractionResult,
+ )
+ from rnsr.extraction.rlm_extractor import (
+     RLMEntityExtractor,
+     RLMExtractionResult,
+     LightweightREPL,
+ )
+ from rnsr.extraction.tot_validator import (
+     TotEntityValidator,
+     TotBatchResult,
+     TotValidationResult,
+ )
+ from rnsr.extraction.relationship_validator import (
+     RelationshipValidator,
+     RelationshipValidationResult,
+     RelationshipBatchResult,
+ )
+ from rnsr.extraction.candidate_extractor import (
+     CandidateExtractor,
+     EntityCandidate,
+     extract_candidates_from_text,
+ )
+ from rnsr.extraction.relationship_patterns import (
+     RelationshipPatternExtractor,
+     RelationshipCandidate,
+     extract_relationship_candidates,
+ )
+ from rnsr.extraction.relationship_extractor import (
+     RelationshipExtractor,  # DEPRECATED
+     extract_implicit_relationships,
+ )
+ from rnsr.extraction.entity_linker import (
+     EntityLinker,
+     LearnedNormalizationPatterns,
+     get_learned_normalization_patterns,
+ )
+ from rnsr.extraction.learned_types import (
+     LearnedTypeRegistry,
+     LearnedRelationshipTypeRegistry,
+     get_learned_type_registry,
+     get_learned_relationship_type_registry,
+     record_learned_type,
+     record_learned_relationship_type,
+ )
+
+ __all__ = [
+     # Models
+     "Entity",
+     "EntityLink",
+     "EntityType",
+     "ExtractionResult",
+     "Mention",
+     "Relationship",
+     "RelationType",
+
+     # PRIMARY EXTRACTOR (recommended)
+     "RLMUnifiedExtractor",
+     "RLMUnifiedResult",
+     "extract_entities_and_relationships",  # Simple function API
+
+     # Alternative extractors
+     "UnifiedGroundedExtractor",
+     "UnifiedExtractionResult",
+     "RLMEntityExtractor",
+     "RLMExtractionResult",
+     "GroundedEntityExtractor",
+     "ValidationMode",
+
+     # Legacy extractors (DEPRECATED - emit warnings)
+     "EntityExtractor",
+     "RelationshipExtractor",
+
+     # Supporting components
+     "CandidateExtractor",
+     "RelationshipPatternExtractor",
+     "EntityLinker",
+     "TotEntityValidator",
+     "TotBatchResult",
+     "TotValidationResult",
+     "RelationshipValidator",
+     "RelationshipValidationResult",
+     "RelationshipBatchResult",
+     "LightweightREPL",
+
+     # Data classes
+     "EntityCandidate",
+     "RelationshipCandidate",
+
+     # Adaptive Learning Registries
+     "LearnedTypeRegistry",
+     "LearnedRelationshipTypeRegistry",
+     "LearnedNormalizationPatterns",
+     "get_learned_type_registry",
+     "get_learned_relationship_type_registry",
+     "get_learned_normalization_patterns",
+     "record_learned_type",
+     "record_learned_relationship_type",
+
+     # Utility functions
+     "merge_entities",
+     "extract_implicit_relationships",
+     "extract_candidates_from_text",
+     "extract_relationship_candidates",
+ ]
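
For reference, a sketch of the recommended entry points exported above, following the signature shown in the module docstring. The node/document IDs, header, and sample text are invented for illustration, the result's field layout is not specified here, and an LLM backend is assumed to be configured as rnsr expects:

```python
from rnsr.extraction import (
    RLMUnifiedExtractor,
    extract_entities_and_relationships,
)

header = "4. Payment Terms"
content = "Acme Corp shall pay Beta LLC $10,000 within 30 days of invoice."

# One-shot helper for a single section
result = extract_entities_and_relationships("node-4", "contract-001", header, content)

# Reusable extractor when processing many sections of the same workload
extractor = RLMUnifiedExtractor()
result = extractor.extract("node-4", "contract-001", header, content)
print(result)
```
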