rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/indexing/skeleton_index.py
@@ -0,0 +1,395 @@
+ """
+ Skeleton Index - Summary-Only Vector Index with External Content
+
+ The Skeleton Index pattern implements a two-layer retrieval approach:
+
+ 1. **Skeleton Layer** (Vector Index): Contains ONLY summaries and metadata
+    - Each IndexNode's .text field contains a 50-100 word summary
+    - Child node IDs stored in metadata for navigation
+    - Used for initial retrieval and expand/traverse decisions
+
+ 2. **Content Layer** (KV Store): Contains full text content
+    - Stored separately to prevent context pollution
+    - Only fetched during synthesis when explicitly needed
+    - Accessed via node_id pointers
+
+ Agent Decision Protocol:
+     if summary_answers_question(node.text):
+         # EXPAND: Fetch full content from KV Store
+         content = kv_store.get(node.node_id)
+         store_as_variable(content)
+     else:
+         # TRAVERSE: Navigate to child nodes
+         children = [get_node(cid) for cid in node.child_ids]
+         continue_navigation(children)
+ """
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ import structlog
+
+ from rnsr.exceptions import IndexingError
+ from rnsr.indexing.kv_store import InMemoryKVStore, KVStore, SQLiteKVStore
+ from rnsr.models import DocumentNode, DocumentTree, SkeletonNode
+
+ logger = structlog.get_logger(__name__)
+
+
+ def generate_summary(content: str, max_words: int = 100) -> str:
+     """
+     Generate a summary for a node's content.
+
+     EXTRACTIVE approach: Take first portion to preserve key facts,
+     entities, and concrete details that ToT needs for evaluation.
+
+     Args:
+         content: Full text content.
+         max_words: Maximum words in summary.
+
+     Returns:
+         Summary text (50-100 words).
+     """
+     if not content:
+         return ""
+
+     words = content.split()
+
+     if len(words) <= max_words:
+         return content
+
+     # EXTRACTIVE SUMMARY: Take first max_words to preserve:
+     # - Opening sentences (often contain key context)
+     # - Named entities (people, places, concepts)
+     # - Concrete facts (numbers, dates, specific actions)
+     # This gives ToT better signal than arbitrary truncation
+     return " ".join(words[:max_words]) + "..."
+
+
+ async def generate_summary_llm(
+     content: str,
+     llm: Any = None,
+     max_words: int = 75,
+     provider: str | None = None,
+ ) -> str:
+     """
+     Generate a summary using an LLM.
+
+     Supports OpenAI, Anthropic, and Gemini providers.
+
+     Args:
+         content: Full text content.
+         llm: LlamaIndex LLM instance (optional). If None, creates one.
+         max_words: Target word count.
+         provider: LLM provider ("openai", "anthropic", "gemini", or None for auto).
+
+     Returns:
+         LLM-generated summary.
+     """
+     if not content or len(content.strip()) < 50:
+         return content
+
+     # If no LLM provided, try to create one
+     if llm is None:
+         llm = _get_llm_for_summary(provider)
+         if llm is None:
+             return generate_summary(content, max_words)
+
+     prompt = f"""Summarize the following text in {max_words} words or less.
+
+ IMPORTANT: Use an EXTRACTIVE approach - preserve:
+ - Key facts, entities, names, and concrete details (who, what, when, where)
+ - Specific actions, events, and outcomes
+ - Numbers, dates, and measurements
+ - The main subject and what happens to/with it
+
+ Avoid:
+ - Vague generalizations ("discusses various topics")
+ - Meta-commentary ("this section explains...")
+ - Abstractions without specifics
+
+ TEXT:
+ {content}
+
+ EXTRACTIVE SUMMARY:"""
+
+     try:
+         response = await llm.acomplete(prompt)
+         return str(response).strip()
+     except Exception as e:
+         logger.warning("llm_summary_failed", error=str(e))
+         return generate_summary(content, max_words)
+
+
+ def _get_llm_for_summary(provider: str | None = None) -> Any:
+     """
+     Get an LLM instance for summary generation.
+
+     Supports: OpenAI, Anthropic, Gemini, auto-detect.
+
+     Args:
+         provider: "openai", "anthropic", "gemini", or None for auto-detect.
+
+     Returns:
+         LlamaIndex-compatible LLM, or None if unavailable.
+     """
+     import os
+
+     # Auto-detect provider if not specified
+     if provider is None:
+         if os.getenv("GOOGLE_API_KEY"):
+             provider = "gemini"
+         elif os.getenv("ANTHROPIC_API_KEY"):
+             provider = "anthropic"
+         elif os.getenv("OPENAI_API_KEY"):
+             provider = "openai"
+         else:
+             logger.warning("no_llm_api_key_found")
+             return None
+
+     provider = provider.lower()
+
+     try:
+         if provider == "gemini":
+             from llama_index.llms.gemini import Gemini
+
+             logger.info("using_gemini_llm")
+             return Gemini(model="gemini-2.5-flash")
+
+         elif provider == "anthropic":
+             from llama_index.llms.anthropic import Anthropic
+
+             logger.info("using_anthropic_llm")
+             return Anthropic(model="claude-sonnet-4-5")
+
+         elif provider == "openai":
+             from llama_index.llms.openai import OpenAI
+
+             logger.info("using_openai_llm")
+             return OpenAI(model="gpt-5-mini")
+
+         else:
+             logger.warning("unknown_llm_provider", provider=provider)
+             return None
+
+     except ImportError as e:
+         logger.warning("llm_import_failed", provider=provider, error=str(e))
+         return None
+
+
+ class SkeletonIndexBuilder:
+     """
+     Builds a Skeleton Index from a DocumentTree.
+
+     The index consists of:
+     1. SkeletonNode objects (summaries + metadata)
+     2. KV Store entries (full content)
+
+     Attributes:
+         kv_store: Key-value store for full content.
+         nodes: Dictionary of node_id -> SkeletonNode.
+     """
+
+     def __init__(self, kv_store: KVStore | None = None):
+         """
+         Initialize the builder.
+
+         Args:
+             kv_store: KV store instance. Defaults to InMemoryKVStore.
+         """
+         self.kv_store = kv_store or InMemoryKVStore()
+         self.nodes: dict[str, SkeletonNode] = {}
+
+         logger.info("skeleton_builder_initialized")
+
+     def build_from_tree(self, tree: DocumentTree) -> dict[str, SkeletonNode]:
+         """
+         Build a skeleton index from a DocumentTree.
+
+         Args:
+             tree: The document tree to index.
+
+         Returns:
+             Dictionary mapping node_id to SkeletonNode.
+         """
+         self.nodes.clear()
+
+         logger.info(
+             "building_skeleton_index",
+             doc_id=tree.id,
+             total_nodes=tree.total_nodes,
+         )
+
+         # Recursively process the tree
+         self._process_node(tree.root, parent_id=None)
+
+         logger.info(
+             "skeleton_index_complete",
+             indexed_nodes=len(self.nodes),
+             kv_entries=self.kv_store.count(),
+         )
+
+         return self.nodes
+
+     def _process_node(
+         self,
+         node: DocumentNode,
+         parent_id: str | None,
+     ) -> SkeletonNode:
+         """
+         Recursively process a document node.
+
+         1. Store full content in KV Store
+         2. Generate summary
+         3. Create SkeletonNode
+         4. Process children
+         """
+         # Store full content in KV Store
+         full_content = self._collect_content(node)
+         if full_content:
+             self.kv_store.put(node.id, full_content)
+
+         # Generate summary (summary-only in skeleton!)
+         summary = generate_summary(full_content)
+
+         # Create skeleton node
+         skeleton = SkeletonNode(
+             node_id=node.id,
+             parent_id=parent_id,
+             level=node.level,
+             header=node.header,
+             summary=summary,  # ONLY summary in index
+             child_ids=[c.id for c in node.children],
+             page_num=node.page_num,
+             metadata={
+                 "has_children": len(node.children) > 0,
+                 "content_chars": len(full_content),
+             },
+         )
+
+         self.nodes[node.id] = skeleton
+
+         # Process children
+         for child in node.children:
+             self._process_node(child, parent_id=node.id)
+
+         return skeleton
+
+     def _collect_content(self, node: DocumentNode) -> str:
+         """
+         Collect content from a node (header + body content).
+         """
+         parts = []
+
+         if node.header:
+             parts.append(node.header)
+
+         if node.content:
+             parts.append(node.content)
+
+         return "\n\n".join(parts)
+
+     def get_node(self, node_id: str) -> SkeletonNode | None:
+         """Get a skeleton node by ID."""
+         return self.nodes.get(node_id)
+
+     def get_content(self, node_id: str) -> str | None:
+         """Get full content for a node from KV Store."""
+         return self.kv_store.get(node_id)
+
+     def get_children(self, node_id: str) -> list[SkeletonNode]:
+         """Get child skeleton nodes."""
+         node = self.nodes.get(node_id)
+         if node is None:
+             return []
+
+         return [
+             self.nodes[cid]
+             for cid in node.child_ids
+             if cid in self.nodes
+         ]
+
+     def get_root(self) -> SkeletonNode | None:
+         """Get the root node (level 0)."""
+         for node in self.nodes.values():
+             if node.level == 0:
+                 return node
+         return None
+
+
+ def build_skeleton_index(
+     tree: DocumentTree,
+     kv_store: KVStore | None = None,
+ ) -> tuple[dict[str, SkeletonNode], KVStore]:
+     """
+     Convenience function to build a skeleton index.
+
+     Args:
+         tree: Document tree to index.
+         kv_store: Optional KV store (defaults to InMemoryKVStore).
+
+     Returns:
+         Tuple of (skeleton_nodes dict, kv_store).
+
+     Example:
+         tree = ingest_document("contract.pdf").tree
+         skeleton, kv = build_skeleton_index(tree)
+
+         # Navigate skeleton
+         root = skeleton[tree.root.id]
+         for child_id in root.child_ids:
+             child = skeleton[child_id]
+             print(f"{child.header}: {child.summary}")
+
+         # Only fetch full content when needed
+         if need_full_content:
+             content = kv.get(child_id)
+     """
+     kv_store = kv_store or InMemoryKVStore()
+     builder = SkeletonIndexBuilder(kv_store)
+     nodes = builder.build_from_tree(tree)
+     return nodes, kv_store
+
+
+ # For LlamaIndex integration
+ def create_llama_index_nodes(
+     skeleton_nodes: dict[str, SkeletonNode],
+ ) -> list:
+     """
+     Create LlamaIndex IndexNode objects from skeleton nodes.
+
+     Each IndexNode's .text field contains ONLY the summary,
+     with child_ids in metadata for navigation.
+
+     Returns:
+         List of LlamaIndex IndexNode objects.
+     """
+     try:
+         from llama_index.core.schema import IndexNode
+     except ImportError:
+         raise IndexingError(
+             "LlamaIndex not installed. "
+             "Install with: pip install llama-index"
+         )
+
+     llama_nodes = []
+
+     for skel in skeleton_nodes.values():
+         # IndexNode.text = summary ONLY (not full content!)
+         node = IndexNode(
+             text=skel.summary,
+             index_id=skel.node_id,
+             obj={
+                 "node_id": skel.node_id,
+                 "parent_id": skel.parent_id,
+                 "level": skel.level,
+                 "header": skel.header,
+                 "child_ids": skel.child_ids,
+                 "has_children": len(skel.child_ids) > 0,
+             },
+         )
+         llama_nodes.append(node)
+
+     logger.info("llama_nodes_created", count=len(llama_nodes))
+     return llama_nodes
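The expand/traverse protocol in the module docstring above is pseudocode. Below is a minimal runnable sketch of that loop written against this file's public accessors (get_root, get_children, get_content); the relevance check summary_mentions() is a hypothetical keyword stand-in for the ToT/LLM evaluation the package actually performs:

    from rnsr.indexing.skeleton_index import SkeletonIndexBuilder

    def summary_mentions(summary: str, query: str) -> bool:
        # Hypothetical stand-in for the ToT/LLM relevance check.
        return any(w.lower() in summary.lower() for w in query.split())

    def collect_relevant_content(builder: SkeletonIndexBuilder, query: str) -> list[str]:
        """Walk the skeleton layer; touch the content layer only on EXPAND."""
        expanded: list[str] = []
        frontier = [builder.get_root()]
        while frontier:
            node = frontier.pop()
            if node is None:
                continue
            if summary_mentions(node.summary, query):
                # EXPAND: fetch full text from the KV Store via the node_id pointer
                content = builder.get_content(node.node_id)
                if content:
                    expanded.append(content)
            else:
                # TRAVERSE: descend via child_ids without loading full content
                frontier.extend(builder.get_children(node.node_id))
        return expanded

Only expanded nodes ever pull full text into memory; everything else stays at summary granularity, which is the point of the two-layer design.
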
rnsr/ingestion/__init__.py
@@ -0,0 +1,161 @@
+ """
+ Ingestion Module - Latent TOC Reconstruction + Vision Retrieval
+
+ Implements the "Latent Hierarchy Generator" from the research paper (Section 4):
+ - Visual-Geometric Analysis (Font Histogram + XY-Cut)
+ - Semantic Boundary Detection (SemanticSplitter + Hierarchical Clustering)
+ - Synthetic Header Generation (LLM-based titles for flat documents)
+ - Vision-Based Retrieval (OCR-free page image analysis)
+ - Table and Chart Parsing (Deep structured data extraction)
+
+ Responsible for:
+ 1. Font Histogram Analysis (PRIMARY - Section 6.1)
+ 2. Recursive XY-Cut (Visual Segmentation - Section 4.1.1)
+ 3. Hierarchical Clustering (Multi-resolution topics - Section 4.2.2)
+ 4. Synthetic Header Generation (LLM titles - Section 6.3)
+ 5. Graceful Degradation (3-tier fallback)
+ 6. Vision-Based Retrieval (PageIndex-inspired OCR-free mode)
+ 7. Table Parsing (NEW - SQL-like queries over tables)
+ 8. Chart Parsing (NEW - Trend analysis and data extraction)
+
+ Primary Entry Point:
+     ingest_document(pdf_path) -> IngestionResult
+
+ ALWAYS use ingest_document() - it handles fallbacks automatically.
+ """
+
+ from rnsr.ingestion.font_histogram import FontHistogramAnalyzer, FontAnalysis
+ from rnsr.ingestion.header_classifier import (
+     HeaderClassifier,
+     LearnedHeaderThresholds,
+     get_learned_header_thresholds,
+ )
+ from rnsr.ingestion.tree_builder import TreeBuilder, build_document_tree
+ from rnsr.ingestion.text_builder import build_tree_from_text, build_tree_from_contexts
+ from rnsr.ingestion.pipeline import ingest_document
+ from rnsr.ingestion.semantic_fallback import try_semantic_splitter_ingestion
+ from rnsr.ingestion.ocr_fallback import try_ocr_ingestion, check_ocr_available
+ from rnsr.ingestion.xy_cut import (
+     RecursiveXYCutter,
+     segment_pdf_with_xy_cut,
+     SegmentNode,
+     BoundingRegion,
+ )
+ from rnsr.ingestion.hierarchical_cluster import (
+     HierarchicalSemanticClusterer,
+     cluster_document_hierarchically,
+     TextCluster,
+ )
+ from rnsr.ingestion.layout_detector import detect_layout_complexity, LayoutComplexity
+ from rnsr.ingestion.layout_model import (
+     get_layout_model,
+     classify_layout_blocks,
+     check_layout_model_available,
+     get_layout_model_info,
+     LAYOUT_MODEL_BASE,
+     LAYOUT_MODEL_LARGE,
+ )
+ from rnsr.ingestion.vision_retrieval import (
+     VisionConfig,
+     VisionNavigator,
+     HybridVisionNavigator,
+     PageImageExtractor,
+     VisionLLM,
+     create_vision_navigator,
+     create_hybrid_navigator,
+ )
+ from rnsr.ingestion.table_parser import (
+     TableParser,
+     ParsedTable,
+     TableCell,
+     TableRow,
+     TableColumn,
+     TableQueryEngine,
+     parse_tables_from_text,
+     query_table,
+ )
+ from rnsr.ingestion.chart_parser import (
+     ChartParser,
+     ParsedChart,
+     ChartSeries,
+     DataPoint,
+     ChartType,
+     ChartAnalysis,
+     describe_chart,
+ )
+ from rnsr.models import IngestionResult
+
+ __all__ = [
+     # Pipeline (Primary Entry Point)
+     "ingest_document",
+     "IngestionResult",
+
+     # Tier 1: Font Histogram
+     "FontHistogramAnalyzer",
+     "FontAnalysis",
+     "HeaderClassifier",
+     "LearnedHeaderThresholds",
+     "get_learned_header_thresholds",
+     "TreeBuilder",
+     "build_document_tree",
+
+     # Text-to-Tree (for benchmarks using raw text)
+     "build_tree_from_text",
+     "build_tree_from_contexts",
+
+     # Tier 1b: Visual Analysis (LayoutLM + XY-Cut)
+     "detect_layout_complexity",
+     "LayoutComplexity",
+     "get_layout_model",
+     "classify_layout_blocks",
+     "check_layout_model_available",
+     "get_layout_model_info",
+     "LAYOUT_MODEL_BASE",
+     "LAYOUT_MODEL_LARGE",
+
+     # Tier 1b: Recursive XY-Cut (Visual Segmentation)
+     "RecursiveXYCutter",
+     "segment_pdf_with_xy_cut",
+     "SegmentNode",
+     "BoundingRegion",
+
+     # Tier 2: Semantic Splitter
+     "try_semantic_splitter_ingestion",
+
+     # Tier 2b: Hierarchical Clustering
+     "HierarchicalSemanticClusterer",
+     "cluster_document_hierarchically",
+     "TextCluster",
+
+     # Tier 3: OCR
+     "try_ocr_ingestion",
+     "check_ocr_available",
+
+     # Vision-Based Retrieval (PageIndex-inspired)
+     "VisionConfig",
+     "VisionNavigator",
+     "HybridVisionNavigator",
+     "PageImageExtractor",
+     "VisionLLM",
+     "create_vision_navigator",
+     "create_hybrid_navigator",
+
+     # Table Parsing (NEW)
+     "TableParser",
+     "ParsedTable",
+     "TableCell",
+     "TableRow",
+     "TableColumn",
+     "TableQueryEngine",
+     "parse_tables_from_text",
+     "query_table",
+
+     # Chart Parsing (NEW)
+     "ChartParser",
+     "ParsedChart",
+     "ChartSeries",
+     "DataPoint",
+     "ChartType",
+     "ChartAnalysis",
+     "describe_chart",
+ ]
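
Combined with skeleton_index.py above, the intended end-to-end flow is short. A sketch, assuming ingestion succeeds and relying on the .tree attribute of IngestionResult that the build_skeleton_index docstring example itself uses:

    from rnsr.ingestion import ingest_document
    from rnsr.indexing.skeleton_index import build_skeleton_index

    # ingest_document() applies the 3-tier fallback internally:
    # font histogram -> semantic splitter -> OCR.
    result = ingest_document("contract.pdf")

    # Two-layer index: summaries in `skeleton`, full text behind `kv`.
    skeleton, kv = build_skeleton_index(result.tree)

    root = skeleton[result.tree.root.id]
    for child_id in root.child_ids:
        child = skeleton[child_id]
        print(f"{child.header}: {child.summary}")  # skeleton layer only
        # full_text = kv.get(child_id)             # content layer, on demand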