rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,592 @@
1
+ Metadata-Version: 2.4
2
+ Name: rnsr
3
+ Version: 0.1.0
4
+ Summary: Recursive Neural-Symbolic Retriever - Hierarchical document retrieval with font-based structure analysis
5
+ Author: RNSR Contributors
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/theeufj/RNSR
8
+ Project-URL: Documentation, https://github.com/theeufj/RNSR#readme
9
+ Project-URL: Repository, https://github.com/theeufj/RNSR.git
10
+ Project-URL: Issues, https://github.com/theeufj/RNSR/issues
11
+ Keywords: rag,retrieval,document-processing,llm,hierarchical-indexing,pdf-parsing,neural-symbolic
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.14
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Text Processing :: Indexing
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: pymupdf>=1.23.0
28
+ Requires-Dist: pdfplumber>=0.10.0
29
+ Requires-Dist: numpy>=1.24.0
30
+ Requires-Dist: scipy>=1.10.0
31
+ Requires-Dist: structlog>=23.0.0
32
+ Requires-Dist: python-dotenv>=1.0.0
33
+ Requires-Dist: llama-index>=0.10.0
34
+ Requires-Dist: langgraph>=0.2.0
35
+ Requires-Dist: langchain-core>=0.2.0
36
+ Requires-Dist: transformers>=4.35.0
37
+ Requires-Dist: torch>=2.0.0
38
+ Requires-Dist: torchvision>=0.15.0
39
+ Requires-Dist: pillow>=10.0.0
40
+ Requires-Dist: pdf2image>=1.16.0
41
+ Requires-Dist: scikit-learn>=1.3.0
42
+ Requires-Dist: pymupdf>=1.23.0
43
+ Provides-Extra: openai
44
+ Requires-Dist: openai>=1.0.0; extra == "openai"
45
+ Requires-Dist: llama-index-llms-openai>=0.1.0; extra == "openai"
46
+ Requires-Dist: llama-index-embeddings-openai>=0.1.0; extra == "openai"
47
+ Provides-Extra: anthropic
48
+ Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
49
+ Requires-Dist: llama-index-llms-anthropic>=0.1.0; extra == "anthropic"
50
+ Provides-Extra: gemini
51
+ Requires-Dist: google-genai>=1.0.0; extra == "gemini"
52
+ Provides-Extra: all
53
+ Requires-Dist: rnsr[anthropic,gemini,openai]; extra == "all"
54
+ Provides-Extra: benchmarks
55
+ Requires-Dist: datasets>=2.0.0; extra == "benchmarks"
56
+ Requires-Dist: ragas>=0.1.0; extra == "benchmarks"
57
+ Provides-Extra: demo
58
+ Requires-Dist: gradio>=4.0.0; extra == "demo"
59
+ Provides-Extra: dev
60
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
61
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
62
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
63
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
64
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
65
+ Dynamic: license-file
66
+
67
+ # RNSR - Recursive Neural-Symbolic Retriever
68
+
69
+ A state-of-the-art document retrieval system that preserves hierarchical structure for superior RAG performance. Combines PageIndex, Recursive Language Models (RLM), Knowledge Graphs, and Tree of Thoughts navigation.
70
+
71
+ ## Overview
72
+
73
+ RNSR combines neural and symbolic approaches to achieve accurate document understanding:
74
+
75
+ - **Font Histogram Algorithm** - Automatically detects document hierarchy from font sizes (no training required)
76
+ - **Skeleton Index Pattern** - Lightweight summaries with KV store for efficient retrieval
77
+ - **Tree-of-Thoughts Navigation** - LLM reasons about document structure to find answers
78
+ - **RLM Unified Extraction** - LLM writes extraction code, grounded in actual text
79
+ - **Knowledge Graph** - Entity and relationship storage for cross-document linking
80
+ - **Self-Reflection Loop** - Iterative answer improvement through self-critique
81
+ - **Adaptive Learning** - System learns from your document workload over time
82
+
83
+ ## Key Features
84
+
85
+ | Feature | Description |
86
+ |---------|-------------|
87
+ | **Hierarchical Extraction** | Preserves document structure (sections, subsections, paragraphs) |
88
+ | **RLM Unified Extractor** | LLM writes extraction code + ToT validation (grounded, no hallucination) |
89
+ | **Provenance System** | Every answer traces back to exact document citations |
90
+ | **LLM Response Cache** | Semantic-aware caching for 10x cost/speed improvement |
91
+ | **Self-Reflection** | Iterative self-correction improves answer quality |
92
+ | **Reasoning Memory** | Learns successful query patterns for faster future queries |
93
+ | **Query Clarification** | Detects ambiguous queries and asks clarifying questions |
94
+ | **Table/Chart Parsing** | SQL-like queries over tables, chart trend analysis |
95
+ | **Adaptive Learning** | 6 registries that learn from usage and persist to disk |
96
+ | **Multi-Document Detection** | Automatically splits bundled PDFs |
97
+ | **Vision Mode** | OCR-free analysis for scanned documents and charts |
98
+
99
+ ## Installation
100
+
101
+ ```bash
102
+ # Clone the repository
103
+ git clone https://github.com/theeufj/RNSR.git
104
+ cd RNSR
105
+
106
+ # Create virtual environment
107
+ python -m venv .venv
108
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
109
+
110
+ # Install with all LLM providers
111
+ pip install -e ".[all]"
112
+
113
+ # Or install with specific provider
114
+ pip install -e ".[openai]" # OpenAI only
115
+ pip install -e ".[anthropic]" # Anthropic only
116
+ pip install -e ".[gemini]" # Google Gemini only
117
+ ```
118
+
119
+ ## Quick Start
120
+
121
+ ### 1. Set up API keys
122
+
123
+ Create a `.env` file:
124
+
125
+ ```bash
126
+ cp .env.example .env
127
+ # Edit .env with your API keys
128
+ ```
129
+
130
+ ```env
131
+ # Choose your preferred LLM provider
132
+ OPENAI_API_KEY=sk-...
133
+ # or
134
+ ANTHROPIC_API_KEY=sk-ant-...
135
+ # or
136
+ GOOGLE_API_KEY=AI...
137
+
138
+ # Optional: Override default models
139
+ LLM_PROVIDER=anthropic
140
+ SUMMARY_MODEL=claude-sonnet-4-5
141
+ ```
142
+
143
+ ### 2. Use the Python API
144
+
145
+ ```python
146
+ from rnsr import RNSRClient
147
+
148
+ # Simple one-line Q&A
149
+ client = RNSRClient()
150
+ answer = client.ask("contract.pdf", "What are the payment terms?")
151
+ print(answer)
152
+
153
+ # Advanced navigation with verification and self-reflection
154
+ result = client.ask_advanced(
155
+ "complex_report.pdf",
156
+ "Compare liability clauses in sections 5 and 8",
157
+ enable_verification=True,
158
+ enable_self_reflection=True,
159
+ max_recursion_depth=3,
160
+ )
161
+ ```
162
+
163
+ ### 3. Run the Demo UI
164
+
165
+ ```bash
166
+ python demo.py
167
+ # Open http://localhost:7860 in your browser
168
+ ```
169
+
170
+ ## New Features
171
+
172
+ ### Provenance System
173
+
174
+ Every answer includes traceable citations:
175
+
176
+ ```python
177
+ from rnsr.agent import ProvenanceTracker, format_citations_for_display
178
+
179
+ tracker = ProvenanceTracker(kv_store=kv_store, skeleton=skeleton)
180
+ record = tracker.create_provenance_record(
181
+ answer="The payment terms are net 30.",
182
+ question="What are the payment terms?",
183
+ variables=navigation_variables,
184
+ )
185
+
186
+ print(f"Confidence: {record.aggregate_confidence:.0%}")
187
+ print(format_citations_for_display(record.citations))
188
+ # Output:
189
+ # **Sources:**
190
+ # 1. [contract.pdf] Section: Payment Terms, Page 5: "Payment shall be due within 30 days..."
191
+ ```
192
+
193
+ ### LLM Response Caching
194
+
195
+ Automatic caching reduces costs and latency:
196
+
197
+ ```python
198
+ from rnsr.agent import wrap_llm_with_cache, get_global_cache
199
+
200
+ # Wrap any LLM function with caching
201
+ cached_llm = wrap_llm_with_cache(llm.complete, ttl_seconds=3600)
202
+
203
+ # Use cached LLM - repeated prompts hit cache
204
+ response = cached_llm("What is 2+2?") # Calls LLM
205
+ response = cached_llm("What is 2+2?") # Returns cached (instant)
206
+
207
+ # Check cache stats
208
+ print(get_global_cache().get_stats())
209
+ # {'entries': 150, 'hits': 89, 'hit_rate': 0.59}
210
+ ```
211
+
212
+ ### Self-Reflection Loop
213
+
214
+ Answers are automatically critiqued and improved:
215
+
216
+ ```python
217
+ from rnsr.agent import SelfReflectionEngine, reflect_on_answer
218
+
219
+ # Quick one-liner
220
+ result = reflect_on_answer(
221
+ answer="The contract expires in 2024.",
222
+ question="When does the contract expire?",
223
+ evidence="Contract dated 2023, 2-year term...",
224
+ )
225
+
226
+ print(f"Improved: {result.improved}")
227
+ print(f"Final answer: {result.final_answer}")
228
+ print(f"Iterations: {result.total_iterations}")
229
+ ```
230
+
231
+ ### Reasoning Chain Memory
232
+
233
+ The system learns from successful queries:
234
+
235
+ ```python
236
+ from rnsr.agent import get_reasoning_memory, find_similar_chains
237
+
238
+ # Find similar past queries
239
+ matches = find_similar_chains("What is the liability cap?")
240
+ for match in matches:
241
+ print(f"Similar query: {match.chain.query}")
242
+ print(f"Similarity: {match.similarity:.0%}")
243
+ print(f"Past answer: {match.chain.answer}")
244
+ ```
245
+
246
+ ### Table Parsing
247
+
248
+ Extract and query tables from documents:
249
+
250
+ ```python
251
+ from rnsr.ingestion import TableParser, TableQueryEngine
252
+
253
+ parser = TableParser()
254
+ tables = parser.parse_from_text(document_text)
255
+
256
+ # SQL-like queries
257
+ engine = TableQueryEngine(tables[0])
258
+ results = engine.select(
259
+ columns=["Name", "Amount"],
260
+ where={"Status": "Active"},
261
+ order_by="Amount",
262
+ )
263
+
264
+ # Aggregations
265
+ total = engine.aggregate("Amount", "sum")
266
+ ```
267
+
268
+ ### Query Clarification
269
+
270
+ Handle ambiguous queries gracefully:
271
+
272
+ ```python
273
+ from rnsr.agent import QueryClarifier, needs_clarification
274
+
275
+ # Check if query needs clarification
276
+ is_ambiguous, analysis = needs_clarification(
277
+ "What does it say about the clause?"
278
+ )
279
+
280
+ if is_ambiguous:
281
+ print(f"Ambiguity: {analysis.ambiguity_type}")
282
+ print(f"Clarifying question: {analysis.suggested_clarification}")
283
+ # "What does 'it' refer to in your question?"
284
+ ```
285
+
286
+ ## Adaptive Learning
287
+
288
+ RNSR learns from your document workload. All learned data persists in `~/.rnsr/`:
289
+
290
+ ```
291
+ ~/.rnsr/
292
+ ├── learned_entity_types.json # New entity types discovered
293
+ ├── learned_relationship_types.json # New relationship types
294
+ ├── learned_normalization.json # Title/suffix patterns
295
+ ├── learned_stop_words.json # Domain-specific stop words
296
+ ├── learned_header_thresholds.json # Document-type font thresholds
297
+ ├── learned_query_patterns.json # Successful query patterns
298
+ ├── reasoning_chains.json # Successful reasoning chains
299
+ └── llm_cache.db # LLM response cache
300
+ ```
301
+
302
+ The more you use RNSR, the better it gets at understanding your domain.
303
+
304
+ ## How It Works
305
+
306
+ ### Document Ingestion Pipeline
307
+
308
+ ```
309
+ PDF → Font Analysis → Header Classification → Tree Building → Skeleton Index
310
+ ↓ ↓ ↓ ↓
311
+ Detect font sizes Classify H1/H2/H3 Build hierarchy Create summaries
312
+
313
+ Multi-doc detection
314
+ (page number resets)
315
+ ```
316
+
317
+ ### Query Processing
318
+
319
+ ```
320
+ Question → Clarify → Pre-Filter → Tree Navigation → Answer → Self-Reflect → Verify
321
+ ↓ ↓ ↓ ↓ ↓ ↓
322
+ Ask if ambig Keyword scan ToT reasoning Synthesize Critique Fact-check
323
+ ↓ ↓
324
+ Sub-LLM recursion Improve answer
325
+ (complex queries) (if issues)
326
+ ```
327
+
328
+ ### Entity Extraction (RLM Unified)
329
+
330
+ ```
331
+ Document → LLM writes code → Execute on DOC_VAR → ToT validation → Cross-validate
332
+ ↓ ↓ ↓ ↓
333
+ Generates regex/Python Grounded results Probability scores Entity↔Relationship
334
+
335
+ All tied to exact text spans
336
+ ```
337
+
338
+ ## Architecture
339
+
340
+ ```
341
+ rnsr/
342
+ ├── agent/ # Query processing
343
+ │ ├── rlm_navigator.py # Main navigation agent
344
+ │ ├── provenance.py # Citation tracking (NEW)
345
+ │ ├── llm_cache.py # Response caching (NEW)
346
+ │ ├── self_reflection.py # Answer improvement (NEW)
347
+ │ ├── reasoning_memory.py # Chain memory (NEW)
348
+ │ ├── query_clarifier.py # Ambiguity handling (NEW)
349
+ │ ├── graph.py # LangGraph workflow
350
+ │ └── variable_store.py # Context management
351
+ ├── extraction/ # Entity/relationship extraction
352
+ │ ├── rlm_unified_extractor.py # Best extractor (NEW)
353
+ │ ├── learned_types.py # Adaptive type learning
354
+ │ ├── entity_linker.py # Cross-document linking
355
+ │ └── models.py # Entity/Relationship models
356
+ ├── indexing/ # Index construction
357
+ │ ├── skeleton_index.py # Summary generation
358
+ │ ├── knowledge_graph.py # Entity/relationship storage
359
+ │ ├── kv_store.py # SQLite/in-memory storage
360
+ │ └── semantic_search.py # Optional vector search
361
+ ├── ingestion/ # Document processing
362
+ │ ├── pipeline.py # Main ingestion orchestrator
363
+ │ ├── font_histogram.py # Font-based structure detection
364
+ │ ├── header_classifier.py # H1/H2/H3 classification
365
+ │ ├── table_parser.py # Table extraction (NEW)
366
+ │ ├── chart_parser.py # Chart interpretation (NEW)
367
+ │ └── tree_builder.py # Hierarchical tree construction
368
+ ├── llm.py # Multi-provider LLM abstraction
369
+ ├── client.py # High-level API
370
+ └── models.py # Data structures
371
+ ```
372
+
373
+ ## API Reference
374
+
375
+ ### High-Level API
376
+
377
+ ```python
378
+ from rnsr import RNSRClient
379
+
380
+ client = RNSRClient(
381
+ llm_provider="anthropic", # or "openai", "gemini"
382
+ llm_model="claude-sonnet-4-5"
383
+ )
384
+
385
+ # Simple query
386
+ answer = client.ask("document.pdf", "What is the main topic?")
387
+
388
+ # Vision mode (for scanned docs)
389
+ answer = client.ask_vision("scanned.pdf", "What does the chart show?")
390
+ ```
391
+
392
+ ### Low-Level API
393
+
394
+ ```python
395
+ from rnsr import (
396
+ ingest_document,
397
+ build_skeleton_index,
398
+ run_rlm_navigator,
399
+ SQLiteKVStore
400
+ )
401
+ from rnsr.extraction import RLMUnifiedExtractor
402
+ from rnsr.agent import ProvenanceTracker, SelfReflectionEngine
403
+
404
+ # Step 1: Ingest document
405
+ result = ingest_document("document.pdf")
406
+ print(f"Extracted {result.tree.total_nodes} nodes")
407
+
408
+ # Step 2: Build index
409
+ kv_store = SQLiteKVStore("./data/index.db")
410
+ skeleton = build_skeleton_index(result.tree, kv_store)
411
+
412
+ # Step 3: Extract entities (grounded, no hallucination)
413
+ extractor = RLMUnifiedExtractor()
414
+ extraction = extractor.extract(
415
+ node_id="section_1",
416
+ doc_id="document",
417
+ header="Introduction",
418
+ content="..."
419
+ )
420
+
421
+ # Step 4: Query with provenance
422
+ answer = run_rlm_navigator(
423
+ question="What are the key findings?",
424
+ skeleton=skeleton,
425
+ kv_store=kv_store
426
+ )
427
+
428
+ # Step 5: Get citations
429
+ tracker = ProvenanceTracker(kv_store=kv_store)
430
+ record = tracker.create_provenance_record(answer, question, variables)
431
+ ```
432
+
433
+ ## Configuration
434
+
435
+ ### Environment Variables
436
+
437
+ | Variable | Description | Default |
438
+ |----------|-------------|---------|
439
+ | `LLM_PROVIDER` | Primary LLM provider | `auto` (detect from keys) |
440
+ | `SUMMARY_MODEL` | Model for summarization | Provider default |
441
+ | `AGENT_MODEL` | Model for navigation | Provider default |
442
+ | `EMBEDDING_MODEL` | Embedding model | `text-embedding-3-small` |
443
+ | `KV_STORE_PATH` | SQLite database path | `./data/kv_store.db` |
444
+ | `LOG_LEVEL` | Logging verbosity | `INFO` |
445
+ | `RNSR_LLM_CACHE_PATH` | Custom cache location | `~/.rnsr/llm_cache.db` |
446
+ | `RNSR_REASONING_MEMORY_PATH` | Custom memory location | `~/.rnsr/reasoning_chains.json` |
447
+
448
+ ### Supported Models
449
+
450
+ | Provider | Models |
451
+ |----------|--------|
452
+ | **OpenAI** | `gpt-5.2`, `gpt-5-mini`, `gpt-5-nano`, `gpt-4.1`, `gpt-4o-mini` |
453
+ | **Anthropic** | `claude-opus-4-5`, `claude-sonnet-4-5`, `claude-haiku-4-5` |
454
+ | **Gemini** | `gemini-3-pro-preview`, `gemini-3-flash-preview`, `gemini-2.5-pro`, `gemini-2.5-flash` |
455
+
456
+ ## Benchmarks
457
+
458
+ RNSR is designed for complex document understanding tasks:
459
+
460
+ - **Multi-document PDFs** - Automatically detects and separates bundled documents
461
+ - **Hierarchical queries** - "Compare section 3.2 with section 5.1"
462
+ - **Cross-reference questions** - "What does the appendix say about the claim in section 2?"
463
+ - **Entity extraction** - Grounded extraction with ToT validation (no hallucination)
464
+ - **Table queries** - "What is the total for Q4 2024?"
465
+
466
+ ## Sample Documents
467
+
468
+ RNSR includes sample documents for testing and demonstration:
469
+
470
+ ### Synthetic Documents (`samples/`)
471
+
472
+ | File | Type | Features Demonstrated |
473
+ |------|------|----------------------|
474
+ | `sample_contract.md` | Legal Contract | Entities (people, orgs), relationships, payment tables, legal terms |
475
+ | `sample_financial_report.md` | Financial Report | Financial tables, metrics, executive names, quarterly data |
476
+ | `sample_research_paper.md` | Academic Paper | Citations, hierarchical sections, technical content, tables |
477
+
478
+ ### Real Test Documents (`rnsr/test-documents/`)
479
+
480
+ Legal documents from the Djokovic visa case (public court records) for testing with actual PDFs:
481
+ - Affidavits and court applications
482
+ - Legal submissions and orders
483
+ - Interview transcripts
484
+
485
+ ### Using Sample Documents
486
+
487
+ ```python
488
+ from pathlib import Path
489
+ from rnsr.ingestion import TableParser
490
+ from rnsr.extraction import CandidateExtractor
491
+
492
+ # Parse a sample document
493
+ sample = Path("samples/sample_contract.md").read_text()
494
+
495
+ # Extract tables
496
+ parser = TableParser()
497
+ tables = parser.parse_from_text(sample)
498
+ print(f"Found {len(tables)} tables")
499
+
500
+ # Extract entities
501
+ extractor = CandidateExtractor()
502
+ candidates = extractor.extract_candidates(sample)
503
+ print(f"Found {len(candidates)} entity candidates")
504
+ ```
505
+
506
+ ## Testing
507
+
508
+ ### Test Suite Overview
509
+
510
+ RNSR has comprehensive test coverage with **281+ tests**:
511
+
512
+ ```bash
513
+ # Run all tests
514
+ pytest tests/ -v
515
+
516
+ # Run specific feature tests
517
+ pytest tests/test_provenance.py tests/test_llm_cache.py -v
518
+
519
+ # Run end-to-end workflow tests
520
+ pytest tests/test_e2e_workflow.py -v
521
+
522
+ # Run with coverage
523
+ pytest tests/ --cov=rnsr --cov-report=html
524
+ ```
525
+
526
+ ### Test Categories
527
+
528
+ | Test File | Tests | Coverage |
529
+ |-----------|-------|----------|
530
+ | `test_e2e_workflow.py` | 18 | Full pipeline: ingestion → extraction → KG → query → provenance |
531
+ | `test_provenance.py` | 17 | Citations, contradictions, provenance records |
532
+ | `test_llm_cache.py` | 17 | Cache get/set, TTL, persistence |
533
+ | `test_self_reflection.py` | 13 | Critique, refinement, iteration limits |
534
+ | `test_reasoning_memory.py` | 15 | Chain storage, similarity matching |
535
+ | `test_query_clarifier.py` | 19 | Ambiguity detection, clarification |
536
+ | `test_table_parser.py` | 26 | Markdown/ASCII tables, SQL-like queries |
537
+ | `test_chart_parser.py` | 16 | Chart detection, trend analysis |
538
+ | `test_rlm_unified.py` | 13 | REPL execution, code cleaning |
539
+ | `test_learned_types.py` | 13 | Adaptive learning registries |
540
+
541
+ ### End-to-End Workflow Tests
542
+
543
+ The `test_e2e_workflow.py` demonstrates the complete pipeline:
544
+
545
+ ```python
546
+ # Tests cover:
547
+ # 1. Document Ingestion - Parse structure and tables
548
+ # 2. Entity Extraction - Pattern-based grounded extraction
549
+ # 3. Knowledge Graph - Store entities and relationships
550
+ # 4. Query Processing - Ambiguity detection, table queries
551
+ # 5. Provenance - Citations and evidence tracking
552
+ # 6. Self-Reflection - Answer improvement loop
553
+ # 7. Reasoning Memory - Learn from successful queries
554
+ # 8. LLM Cache - Response caching
555
+ # 9. Adaptive Learning - Type discovery
556
+ # 10. Full Workflow - Contract and financial analysis
557
+ ```
558
+
559
+ ## Development
560
+
561
+ ```bash
562
+ # Install dev dependencies
563
+ pip install -e ".[dev]"
564
+
565
+ # Run linting
566
+ ruff check .
567
+
568
+ # Type checking
569
+ mypy rnsr/
570
+ ```
571
+
572
+ ## Requirements
573
+
574
+ - Python 3.10+
575
+ - At least one LLM API key (OpenAI, Anthropic, or Gemini)
576
+
577
+ ## License
578
+
579
+ MIT License - see [LICENSE](LICENSE) for details.
580
+
581
+ ## Contributing
582
+
583
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
584
+
585
+ ## Research
586
+
587
+ RNSR is inspired by:
588
+ - [Hybrid Document Retrieval System Design](Research/Hybrid%20Document%20Retrieval%20System%20Design.pdf) - Core architecture and design principles
589
+ - [PageIndex (VectifyAI)](https://github.com/VectifyAI/PageIndex) - Vectorless reasoning-based tree search
590
+ - [Recursive Language Models](https://arxiv.org/html/2512.24601v1) - REPL environment with recursive sub-LLM calls
591
+ - Tree of Thoughts - LLM-based decision making with probabilities
592
+ - Self-Refine / Reflexion - Iterative self-correction patterns
@@ -0,0 +1,72 @@
1
+ rnsr/__init__.py,sha256=KOphXkwumziihpMKN0R1hipflpKoKD2BJ2vZHhVgBAw,3588
2
+ rnsr/__main__.py,sha256=bupD9Fx6wJuDvmUMWQ4JWkOsk0nrhevmXuhqvolsaz4,6901
3
+ rnsr/client.py,sha256=23JKz_5xUnp-QDq8FysqMMQxTtDoJ28Zo8QQU5tby6g,19041
4
+ rnsr/document_store.py,sha256=NsYhkMAp8uB1DjkoZ_hKZiGQ9ut9ws2GCwA_3njtQh0,11168
5
+ rnsr/exceptions.py,sha256=ukfZG6lv6L1G1o0_2nu2hMdcHDoaHEtx7rA-qo1coi0,1209
6
+ rnsr/llm.py,sha256=Tyd5jF1NLKWOKoLPkMaCVlbtkTdQVU1Bq1lHIMCE2EY,26394
7
+ rnsr/models.py,sha256=XTxyb5o8LFpyummhLfsPhuLzHnHcFj6_3TXcaeVuvZA,4288
8
+ rnsr/py.typed,sha256=ixa8YukDZ3kLo0WsFJRGohLMyHzbMur1ALmmASML2cs,64
9
+ rnsr/agent/__init__.py,sha256=UjSESsMqu84EgEWV2zMzRM0CmVAtZmGafFhVf8gA5sI,5622
10
+ rnsr/agent/cross_doc_navigator.py,sha256=QD8m0EFy3P7Klp68hq57VK7QviIV5TFan_kZvu0ljnM,24118
11
+ rnsr/agent/graph.py,sha256=khxfdwHU9s2Fi4tWkShA_6_pJ4UxklgFtgP8DHUonPI,52516
12
+ rnsr/agent/llm_cache.py,sha256=mz4_IblfaTwhQQz58snn5Imc9HkfzW3DpMdmrRghsKY,17467
13
+ rnsr/agent/navigator_api.py,sha256=cvYpqPjDUeSsP-UflPN_E6a6X44qpkAwM-N5jXDg-k0,15747
14
+ rnsr/agent/provenance.py,sha256=pFYAVec-kbluhm2ibVAeQZuADQkexdHDWnLP7-Ycauk,25084
15
+ rnsr/agent/query_clarifier.py,sha256=t3BvmtxQp37U5Xx4qsxxCE2X-Bzegxm7BgQsujn5YWg,21071
16
+ rnsr/agent/reasoning_memory.py,sha256=d_JWt1rbWgXAPfcNh3_gi_75_JYFhCxIiurvqBij7pY,23984
17
+ rnsr/agent/repl_env.py,sha256=2lurSa0C9VGGwm0SJxMzZjA1S-sjKmGlPhcDvuWrM8s,25346
18
+ rnsr/agent/rlm_navigator.py,sha256=TSa3x85MywwERuaLfNfNN9GESQlnAX7b2ycpLwFuXPM,72070
19
+ rnsr/agent/self_reflection.py,sha256=XnQ6D9dVCNlr1zWGSxwWRgAkDWOYvSbtjeYbUcCq4jQ,19231
20
+ rnsr/agent/variable_store.py,sha256=NfKx2JzzRtP7WjPG0Zzr_3IFjac9BZl6jBb4KNNJ0-k,9187
21
+ rnsr/benchmarks/__init__.py,sha256=A2WNIIzsAdRFg7WYtcG3f6kemQFiE3m4_zpq4ldS7cI,3085
22
+ rnsr/benchmarks/comprehensive_benchmark.py,sha256=HqZNQ9DZiZ0LkgeEVCv3uhiATbSAYm73PookujXqe_c,24085
23
+ rnsr/benchmarks/evaluation_suite.py,sha256=gXmkXyAXQMtxLXGGveBdZYfuY10-9ma6u0yboIhNpU4,44904
24
+ rnsr/benchmarks/finance_bench.py,sha256=bSKsBwMtuabcWyKQEYJFYhnnJ2jRw0khzE9NBsTiJRY,5330
25
+ rnsr/benchmarks/pdf_merger.py,sha256=UNQHefghWTh7eSkWE2qD_1Rhi97gHcfWs0jXY8F9MDM,6323
26
+ rnsr/benchmarks/performance.py,sha256=bZnR-4xZtII8legY1c9b9ingtML9aSiHkHvQbc4oRbE,9728
27
+ rnsr/benchmarks/quality.py,sha256=Mog6nThF-5N9A0kFkFnXlfl1eOcLoKMUbHI_EK-adko,10054
28
+ rnsr/benchmarks/runner.py,sha256=hzbJZBAk1MgGDAPBC0bE3Gd_KKCobhSpdWiZOhh0cfg,9740
29
+ rnsr/benchmarks/standard_benchmarks.py,sha256=1xSLZH6jPYNwhZpdVTHp4vckoAm-P9UmUru9gpb47P0,34793
30
+ rnsr/extraction/__init__.py,sha256=4EBU5M2cIf3gqR26n7EuALO96VcZ8tfJUJhxGbmUr9A,4528
31
+ rnsr/extraction/candidate_extractor.py,sha256=9pmuxG3PhoRvDrRvyr0gF9ZntUMkS7iTDapc28Bhy7I,13457
32
+ rnsr/extraction/entity_extractor.py,sha256=DRH-tdr0jc7ujPGa4VuvG5xVDfkSavf4OhU7LVQhmnc,19496
33
+ rnsr/extraction/entity_linker.py,sha256=peqzkqvKkrWo8ZJ3MQF1qBVK9BoXY9tNmtPRIIZE9YA,28279
34
+ rnsr/extraction/grounded_extractor.py,sha256=aBiB7ckkFWnQiUXk9IrRYzgDN2PXujzdR6dqAGsYrJo,24512
35
+ rnsr/extraction/learned_types.py,sha256=XdAmwQkaQ2hMlakYYvM1sw_9TK1RLAbKoePhbKuDJBc,19117
36
+ rnsr/extraction/models.py,sha256=GwoGkczyp6hwnYrzUGR9mevATlEkxPRzOMEeLrwYo98,8921
37
+ rnsr/extraction/relationship_extractor.py,sha256=9a5U3iZECC-b5mwazFP2N3_fiwEz99heONq0PeIA-Ek,20080
38
+ rnsr/extraction/relationship_patterns.py,sha256=ByFAOHXqINqVsD-LlisYPS92lZNUKG60rWkAP6WFPyQ,18910
39
+ rnsr/extraction/relationship_validator.py,sha256=56aPDpKIx_EXRbPvdjBUc7fqBZU4mV8RehPNuh19pbg,13483
40
+ rnsr/extraction/rlm_extractor.py,sha256=UsSyMPPqcyPJ02jzkttjn4CzEK7u8TDYF0ciAFpnRLo,20366
41
+ rnsr/extraction/rlm_unified_extractor.py,sha256=JPHyOW04AamYs6RT4h1IEbFg3OBHSW8h7bAa2gjeXp8,36762
42
+ rnsr/extraction/tot_validator.py,sha256=mmM6TtEoyTV-8pCUucBy8TcJ3hPOpABa9yeFgLRDGfI,21586
43
+ rnsr/extraction/unified_extractor.py,sha256=AuRhPJ0Zna1_7vTpZ2fnl0-s_7xjQfGHQ6oNKGJzDH8,12540
44
+ rnsr/indexing/__init__.py,sha256=on-bohmE6qoLIxMQNBHQCXSRKr98aAC0RzVOkrVlIVA,1417
45
+ rnsr/indexing/knowledge_graph.py,sha256=xuceER-kevyZJHGNlCcIHxnoFifZWxmTvjnqPgxFQa8,38592
46
+ rnsr/indexing/kv_store.py,sha256=iXNwGGIKev-AhMDx-dinYqqhUE3UT20TGM2uTOSHQRM,9438
47
+ rnsr/indexing/persistence.py,sha256=GhE8VWCsB0SKAiN_SQiw5fLsYAXm849SqRdhUb9BMDE,9427
48
+ rnsr/indexing/semantic_retriever.py,sha256=ypNl0_gi6ySPBDhgYwML0_OkS5bgbOg5H1yV0xy8tQU,7365
49
+ rnsr/indexing/semantic_search.py,sha256=Qv_25cr3D-uX7Duk2clt_SJR6wATU4UdQogzeTeVYfw,10395
50
+ rnsr/indexing/skeleton_index.py,sha256=jNC6U0JdT-7zoQ-Z85mpB0xk_mgmTlEUfDUVAb6cFvk,11717
51
+ rnsr/ingestion/__init__.py,sha256=eWEWlWp7qmD_RkAjjNwUnWUHfSOnyNnPQdprochhgZM,4583
52
+ rnsr/ingestion/chart_parser.py,sha256=O5y7FfgeCoHL8W3DY0AQSPPlOAMV4JhvkbpxJnPlmlw,18672
53
+ rnsr/ingestion/document_boundary.py,sha256=aGsFxa7haXSCFuefxbTWiWVtXmHKmsTjsZI1y0GhAgc,25447
54
+ rnsr/ingestion/font_histogram.py,sha256=B8PqlJjjEASyYkd97JPPFPtNN6zO7uVBi4p6XqVFaTc,11370
55
+ rnsr/ingestion/header_classifier.py,sha256=9AT7BHNB4yXHyPpxROb0oydmcHUYJyMtshB6UTqnT_o,19535
56
+ rnsr/ingestion/hierarchical_cluster.py,sha256=-A77Kfvzwl6y6XfKjt-kF_Xk7tOy5aPc-K0QBIP2bOQ,17343
57
+ rnsr/ingestion/layout_detector.py,sha256=kHarT41pbETINewhI_MU-5bmB5OmU2LiJFeHHOzbDZo,10693
58
+ rnsr/ingestion/layout_model.py,sha256=41XtQsjxIny0Z6WfeuclXNACh6ayVYjlXILmQzjHljw,11629
59
+ rnsr/ingestion/ocr_fallback.py,sha256=WaUiY6YnoNYtD-smIogSltRKYwQ4MNDNId2mOT5fsD8,4615
60
+ rnsr/ingestion/pipeline.py,sha256=ciGeq51mzWPNViNUxdEccJ9gsKVp5nCa68xF9451zh8,30449
61
+ rnsr/ingestion/semantic_fallback.py,sha256=LhnY4qfqRDhNgutBVJqW_6I_uKyX0IvxM6tT6YhYwSo,13207
62
+ rnsr/ingestion/table_parser.py,sha256=-QzXsFLyZm4nsic6zLKBQWFaB8xlSYXg9QtpneojJ0o,24699
63
+ rnsr/ingestion/text_builder.py,sha256=NGc80EfehQCNQdf0do4U1lnhKpiObu6VsV0u9EepvSI,14700
64
+ rnsr/ingestion/tree_builder.py,sha256=2XrpJDhk1rPQNTR4FQKELKPOXFa5TDDmDGCXYebYmXY,12561
65
+ rnsr/ingestion/vision_retrieval.py,sha256=an5IOIQlWavZ1rgNHz8pdP_QVTLaBsBfWnxqj7gSoxI,31000
66
+ rnsr/ingestion/xy_cut.py,sha256=2KvC9RFtWZ0c2JhPkKtT5saOOuLaWAGgCMh40Fx1_7Q,18025
67
+ rnsr-0.1.0.dist-info/licenses/LICENSE,sha256=VplyxAvQdVnyS4R-X2Iakr_ffeFSgRAiIfEU7kZgxf8,1074
68
+ rnsr-0.1.0.dist-info/METADATA,sha256=P2lAd_Y1WkACXjbOIib1zTlHF5n-0eA4BqOrut35OeQ,19867
69
+ rnsr-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
70
+ rnsr-0.1.0.dist-info/entry_points.txt,sha256=WFGf4rTMJbDEOt_PvjaHHtHRmJHNk0qfaJpghduMaM8,44
71
+ rnsr-0.1.0.dist-info/top_level.txt,sha256=hylIhN9Hbr5V92x_rNExhkodqj8VvlbhZ8Evog443dY,5
72
+ rnsr-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ rnsr = rnsr.__main__:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 RNSR Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.