rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/client.py ADDED
@@ -0,0 +1,560 @@
+ """
+ RNSR Client - Simple High-Level API
+
+ Provides the simplest possible interface for using RNSR.
+ Handles all the complexity of ingestion, indexing, and navigation.
+
+ This is the state-of-the-art hybrid retrieval system combining:
+ - PageIndex: Vectorless, reasoning-based tree search
+ - RLMs: REPL environment with recursive sub-LLM calls
+ - Vision: OCR-free image-based document analysis
+
+ Usage:
+     from rnsr import RNSRClient
+
+     # One-line document Q&A
+     client = RNSRClient()
+     answer = client.ask("contract.pdf", "What are the payment terms?")
+
+     # With caching (faster for repeated queries)
+     client = RNSRClient(cache_dir="./cache")
+     answer = client.ask("contract.pdf", "What are the terms?")
+     answer2 = client.ask("contract.pdf", "Who are the parties?") # Uses cache
+
+     # Advanced: RLM Navigator with full features
+     result = client.ask_advanced(
+         "complex_report.pdf",
+         "Compare revenue in Q1 vs Q2",
+         use_rlm=True,
+         enable_verification=True,
+     )
+
+     # Vision-based analysis (for scanned docs, charts)
+     result = client.ask_vision(
+         "scanned_document.pdf",
+         "What is shown in the chart?",
+     )
+ """
+
+ from __future__ import annotations
+
+ import hashlib
+ from pathlib import Path
+ from typing import Any
+
+ import structlog
+
+ from rnsr.agent import run_navigator
+ from rnsr.document_store import DocumentStore
+ from rnsr.indexing import build_skeleton_index, save_index, load_index
+ from rnsr.indexing.kv_store import InMemoryKVStore, KVStore
+ from rnsr.ingestion import ingest_document
+ from rnsr.models import SkeletonNode
+
+ logger = structlog.get_logger(__name__)
+
+
+ class RNSRClient:
+     """
+     High-level client for RNSR document Q&A.
+
+     This is the simplest way to use RNSR. It handles:
+     - Document ingestion
+     - Skeleton index building
+     - Optional caching/persistence
+     - Navigation and answer generation
+
+     Example:
+         # Basic usage (no caching)
+         client = RNSRClient()
+         answer = client.ask("document.pdf", "What is the main topic?")
+
+         # With caching (recommended for production)
+         client = RNSRClient(cache_dir="./rnsr_cache")
+         answer = client.ask("document.pdf", "What is the main topic?")
+
+         # From raw text
+         answer = client.ask_text(
+             "This is my document content...",
+             "What is this about?"
+         )
+     """
+
+     def __init__(
+         self,
+         cache_dir: str | Path | None = None,
+         llm_provider: str | None = None,
+         llm_model: str | None = None,
+     ):
+         """
+         Initialize the RNSR client.
+
+         Args:
+             cache_dir: Optional directory for caching indexes.
+                 If provided, indexes are persisted and reused.
+             llm_provider: LLM provider ("openai", "anthropic", "gemini")
+             llm_model: LLM model name
+         """
+         self.cache_dir = Path(cache_dir) if cache_dir else None
+         self.llm_provider = llm_provider
+         self.llm_model = llm_model
+
+         # In-memory cache for session
+         self._session_cache: dict[str, tuple[dict[str, SkeletonNode], KVStore]] = {}
+
+         if self.cache_dir:
+             self.cache_dir.mkdir(parents=True, exist_ok=True)
+
+         logger.info(
+             "rnsr_client_initialized",
+             cache_dir=str(self.cache_dir) if self.cache_dir else None,
+         )
+
+     def ask(
+         self,
+         document: str | Path,
+         question: str,
+         force_reindex: bool = False,
+     ) -> str:
+         """
+         Ask a question about a PDF document.
+
+         Args:
+             document: Path to PDF file
+             question: Question to ask
+             force_reindex: If True, re-process even if cached
+
+         Returns:
+             Answer string
+
+         Example:
+             answer = client.ask("contract.pdf", "What are the payment terms?")
+         """
+         doc_path = Path(document)
+         if not doc_path.exists():
+             raise FileNotFoundError(f"Document not found: {doc_path}")
+
+         # Get or create index
+         skeleton, kv_store = self._get_or_create_index(doc_path, force_reindex)
+
+         # Run navigator
+         result = run_navigator(question, skeleton, kv_store)
+         return result.get("answer", "No answer found.")
+
+     def ask_text(
+         self,
+         text: str | list[str],
+         question: str,
+         cache_key: str | None = None,
+     ) -> str:
+         """
+         Ask a question about raw text.
+
+         Args:
+             text: Text content or list of text chunks
+             question: Question to ask
+             cache_key: Optional key for caching (if cache_dir is set)
+
+         Returns:
+             Answer string
+
+         Example:
+             answer = client.ask_text(
+                 "The company was founded in 2020...",
+                 "When was the company founded?"
+             )
+         """
+         from rnsr.ingestion import build_tree_from_text
+
+         # Generate cache key from content if not provided
+         if cache_key is None:
+             content = text if isinstance(text, str) else "\n".join(text)
+             cache_key = f"text_{hashlib.md5(content[:1000].encode()).hexdigest()[:12]}"
+
+         # Check caches
+         if cache_key in self._session_cache:
+             skeleton, kv_store = self._session_cache[cache_key]
+         elif self.cache_dir and (self.cache_dir / cache_key).exists():
+             skeleton, kv_store = load_index(self.cache_dir / cache_key)
+             self._session_cache[cache_key] = (skeleton, kv_store)
+         else:
+             # Build index
+             tree = build_tree_from_text(text)
+             skeleton, kv_store = build_skeleton_index(tree)
+             self._session_cache[cache_key] = (skeleton, kv_store)
+
+             # Persist if cache_dir is set
+             if self.cache_dir:
+                 save_index(skeleton, kv_store, self.cache_dir / cache_key)
+
+         result = run_navigator(question, skeleton, kv_store)
+         return result.get("answer", "No answer found.")
+
+     def ask_multiple(
+         self,
+         document: str | Path,
+         questions: list[str],
+         force_reindex: bool = False,
+     ) -> list[str]:
+         """
+         Ask multiple questions about a document.
+
+         More efficient than calling ask() multiple times because
+         the document is only indexed once.
+
+         Args:
+             document: Path to PDF file
+             questions: List of questions
+             force_reindex: If True, re-process even if cached
+
+         Returns:
+             List of answers
+
+         Example:
+             answers = client.ask_multiple(
+                 "contract.pdf",
+                 ["What are the terms?", "Who are the parties?"]
+             )
+         """
+         doc_path = Path(document)
+         skeleton, kv_store = self._get_or_create_index(doc_path, force_reindex)
+
+         return [
+             run_navigator(q, skeleton, kv_store).get("answer", "No answer found.")
+             for q in questions
+         ]
+
+     def get_document_info(self, document: str | Path) -> dict[str, Any]:
+         """
+         Get information about a document without querying it.
+
+         Args:
+             document: Path to PDF file
+
+         Returns:
+             Dictionary with document metadata
+         """
+         doc_path = Path(document)
+         cache_key = self._get_cache_key(doc_path)
+
+         # Check if cached
+         if cache_key in self._session_cache:
+             skeleton, _ = self._session_cache[cache_key]
+             return {
+                 "path": str(doc_path),
+                 "cached": True,
+                 "nodes": len(skeleton),
+             }
+
+         if self.cache_dir and (self.cache_dir / cache_key).exists():
+             from rnsr.indexing import get_index_info
+             info = get_index_info(self.cache_dir / cache_key)
+             info["path"] = str(doc_path)
+             info["cached"] = True
+             return info
+
+         return {
+             "path": str(doc_path),
+             "cached": False,
+             "exists": doc_path.exists(),
+         }
+
+     def clear_cache(self) -> int:
+         """
+         Clear all cached indexes.
+
+         Returns:
+             Number of caches cleared
+         """
+         count = len(self._session_cache)
+         self._session_cache.clear()
+
+         if self.cache_dir:
+             import shutil
+             for item in self.cache_dir.iterdir():
+                 if item.is_dir():
+                     shutil.rmtree(item)
+                     count += 1
+
+         logger.info("cache_cleared", count=count)
+         return count
+
+     def _get_cache_key(self, doc_path: Path) -> str:
+         """Generate a cache key for a document."""
+         stat = doc_path.stat()
+         hash_input = f"{doc_path.name}_{stat.st_size}_{stat.st_mtime}"
+         return hashlib.md5(hash_input.encode()).hexdigest()[:16]
+
+     def _get_or_create_index(
+         self,
+         doc_path: Path,
+         force_reindex: bool,
+     ) -> tuple[dict[str, SkeletonNode], KVStore]:
+         """Get cached index or create new one."""
+         cache_key = self._get_cache_key(doc_path)
+
+         # Check session cache first
+         if not force_reindex and cache_key in self._session_cache:
+             logger.debug("using_session_cache", key=cache_key)
+             return self._session_cache[cache_key]
+
+         # Check persistent cache
+         if not force_reindex and self.cache_dir:
+             cache_path = self.cache_dir / cache_key
+             if cache_path.exists():
+                 logger.debug("using_persistent_cache", key=cache_key)
+                 skeleton, kv_store = load_index(cache_path)
+                 self._session_cache[cache_key] = (skeleton, kv_store)
+                 return skeleton, kv_store
+
+         # Create new index
+         logger.info("creating_new_index", path=str(doc_path))
+         result = ingest_document(str(doc_path))
+         skeleton, kv_store = build_skeleton_index(result.tree)
+
+         # Store in session cache
+         self._session_cache[cache_key] = (skeleton, kv_store)
+
+         # Persist if cache_dir is set
+         if self.cache_dir:
+             save_index(skeleton, kv_store, self.cache_dir / cache_key)
+
+         return skeleton, kv_store
+
+     # =========================================================================
+     # Advanced Navigation Methods
+     # =========================================================================
+
+     def ask_advanced(
+         self,
+         document: str | Path,
+         question: str,
+         use_rlm: bool = True,
+         enable_pre_filtering: bool = True,
+         enable_verification: bool = True,
+         max_recursion_depth: int = 3,
+         force_reindex: bool = False,
+         metadata: dict[str, Any] | None = None,
+     ) -> dict[str, Any]:
+         """
+         Advanced Q&A using the full RLM Navigator.
+
+         This uses the state-of-the-art Recursive Language Model approach:
+         - Pre-filtering with keyword extraction before LLM calls
+         - Deep recursive sub-LLM calls (configurable depth)
+         - Answer verification with sub-LLM validation
+
+         Args:
+             document: Path to PDF file.
+             question: Question to ask.
+             use_rlm: Use RLM Navigator (True) or standard navigator (False).
+             enable_pre_filtering: Use keyword filtering before ToT evaluation.
+             enable_verification: Verify answers with sub-LLM.
+             max_recursion_depth: Max depth for recursive sub-LLM calls.
+             force_reindex: Re-process even if cached.
+             metadata: Optional metadata (e.g., multiple choice options).
+
+         Returns:
+             Full result dictionary with answer, confidence, trace.
+
+         Example:
+             result = client.ask_advanced(
+                 "complex_report.pdf",
+                 "Compare the liability clauses in section 5 and section 8",
+                 enable_verification=True,
+             )
+             print(f"Answer: {result['answer']}")
+             print(f"Confidence: {result['confidence']}")
+         """
+         doc_path = Path(document)
+         if not doc_path.exists():
+             raise FileNotFoundError(f"Document not found: {doc_path}")
+
+         # Get or create index
+         skeleton, kv_store = self._get_or_create_index(doc_path, force_reindex)
+
+         if use_rlm:
+             # Use the new RLM Navigator
+             from rnsr.agent.rlm_navigator import RLMConfig, run_rlm_navigator
+
+             config = RLMConfig(
+                 max_recursion_depth=max_recursion_depth,
+                 enable_pre_filtering=enable_pre_filtering,
+                 enable_verification=enable_verification,
+             )
+
+             result = run_rlm_navigator(
+                 question,
+                 skeleton,
+                 kv_store,
+                 config=config,
+                 metadata=metadata,
+             )
+
+             return result
+         else:
+             # Use standard navigator
+             result = run_navigator(question, skeleton, kv_store, metadata=metadata)
+             return result
+
+     def ask_vision(
+         self,
+         document: str | Path,
+         question: str,
+         use_hybrid: bool = True,
+         metadata: dict[str, Any] | None = None,
+     ) -> dict[str, Any]:
+         """
+         Vision-based Q&A working directly on page images.
+
+         This is ideal for:
+         - Scanned documents where OCR quality is poor
+         - Documents with charts, graphs, or diagrams
+         - Image-heavy presentations or reports
+
+         Args:
+             document: Path to PDF file.
+             question: Question to ask.
+             use_hybrid: Combine text+vision analysis (True) or vision-only (False).
+             metadata: Optional metadata (e.g., multiple choice options).
+
+         Returns:
+             Result dictionary with answer, confidence, selected_pages.
+
+         Example:
+             result = client.ask_vision(
+                 "financial_report.pdf",
+                 "What does the revenue chart show?",
+             )
+             print(f"Answer: {result['answer']}")
+             print(f"Pages analyzed: {result['selected_pages']}")
+         """
+         doc_path = Path(document)
+         if not doc_path.exists():
+             raise FileNotFoundError(f"Document not found: {doc_path}")
+
+         from rnsr.ingestion.vision_retrieval import (
+             VisionConfig,
+             create_vision_navigator,
+             create_hybrid_navigator,
+         )
+
+         config = VisionConfig()
+
+         if use_hybrid:
+             # Get text-based index for hybrid mode
+             try:
+                 skeleton, kv_store = self._get_or_create_index(doc_path, False)
+             except Exception:
+                 skeleton, kv_store = None, None
+
+             navigator = create_hybrid_navigator(
+                 doc_path,
+                 skeleton=skeleton,
+                 kv_store=kv_store,
+                 vision_config=config,
+             )
+
+             result = navigator.navigate(question, metadata)
+             return {
+                 "answer": result.get("combined_answer"),
+                 "confidence": result.get("confidence", 0),
+                 "method_used": result.get("method_used"),
+                 "text_result": result.get("text_result"),
+                 "vision_result": result.get("vision_result"),
+             }
+         else:
+             # Vision-only mode
+             navigator = create_vision_navigator(doc_path, config)
+             return navigator.navigate(question, metadata)
+
+     def analyze_document_structure(
+         self,
+         document: str | Path,
+         force_reindex: bool = False,
+     ) -> dict[str, Any]:
+         """
+         Analyze a document's structure without querying it.
+
+         Returns detailed information about the document hierarchy,
+         sections, and content distribution.
+
+         Args:
+             document: Path to PDF file.
+             force_reindex: Re-process even if cached.
+
+         Returns:
+             Dictionary with structure analysis.
+
+         Example:
+             info = client.analyze_document_structure("contract.pdf")
+             print(f"Sections: {info['section_count']}")
+             print(f"Max depth: {info['max_depth']}")
+         """
+         doc_path = Path(document)
+         skeleton, kv_store = self._get_or_create_index(doc_path, force_reindex)
+
+         # Analyze structure
+         section_count = 0
+         max_depth = 0
+         total_chars = 0
+         level_counts: dict[int, int] = {}
+
+         for node_id, node in skeleton.items():
+             section_count += 1
+             max_depth = max(max_depth, node.level)
+             level_counts[node.level] = level_counts.get(node.level, 0) + 1
+
+             content = kv_store.get(node_id)
+             if content:
+                 total_chars += len(content)
+
+         return {
+             "path": str(doc_path),
+             "section_count": section_count,
+             "max_depth": max_depth,
+             "level_distribution": level_counts,
+             "total_characters": total_chars,
+             "average_section_length": total_chars // max(section_count, 1),
+         }
+
+     def get_document_outline(
+         self,
+         document: str | Path,
+         max_depth: int = 2,
+         force_reindex: bool = False,
+     ) -> list[dict[str, Any]]:
+         """
+         Get a document outline (table of contents).
+
+         Args:
+             document: Path to PDF file.
+             max_depth: Maximum heading depth to include.
+             force_reindex: Re-process even if cached.
+
+         Returns:
+             List of section dictionaries with header and level.
+
+         Example:
+             outline = client.get_document_outline("report.pdf")
+             for section in outline:
+                 indent = " " * section['level']
+                 print(f"{indent}{section['header']}")
+         """
+         doc_path = Path(document)
+         skeleton, _ = self._get_or_create_index(doc_path, force_reindex)
+
+         outline = []
+         for node in skeleton.values():
+             if node.level <= max_depth:
+                 outline.append({
+                     "id": node.node_id,
+                     "header": node.header,
+                     "level": node.level,
+                     "summary": node.summary[:100] if node.summary else "",
+                     "child_count": len(node.child_ids),
+                 })
+
+         # Sort by node_id to maintain document order
+         outline.sort(key=lambda x: x["id"])
+         return outline
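
For reference, a minimal usage sketch based only on the RNSRClient API shown in this diff. It assumes the rnsr 0.1.0 wheel is installed and LLM credentials for the chosen provider are configured in the environment; the file names and questions are illustrative.

# Sketch of intended usage per the docstrings above (not part of the package).
from rnsr import RNSRClient

# Persist indexes between runs so repeated questions reuse the same index
client = RNSRClient(cache_dir="./rnsr_cache")

# Single question -> answer string
answer = client.ask("contract.pdf", "What are the payment terms?")
print(answer)

# Several questions against a single index build
for a in client.ask_multiple("contract.pdf", ["Who are the parties?", "What are the terms?"]):
    print(a)

# Full RLM navigation; returns a result dictionary with answer, confidence, trace
result = client.ask_advanced(
    "complex_report.pdf",
    "Compare revenue in Q1 vs Q2",
    enable_verification=True,
)
print(result["answer"], result["confidence"])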