rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,402 @@
1
+ """
2
+ Tree Builder - Document Tree Assembly
3
+
4
+ This module builds a hierarchical document tree from classified spans.
5
+ Uses a stack-based parser to:
6
+ 1. Process blocks in reading order (top-to-bottom, left-to-right)
7
+ 2. Assign body text to the nearest preceding header
8
+ 3. Output nested DocumentNode structure
9
+
10
+ The tree structure enables:
11
+ - Hierarchical navigation by the agent
12
+ - Section-based summarization
13
+ - Context-aware retrieval
14
+
15
+ Multi-Document Support:
16
+ - Detects document boundaries in combined PDFs
17
+ - Creates separate subtrees for each logical document
18
+ - Preserves internal hierarchy within each document
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from typing import TYPE_CHECKING
24
+ from uuid import uuid4
25
+
26
+ import structlog
27
+
28
+ from rnsr.models import ClassifiedSpan, DocumentNode, DocumentTree
29
+
30
+ if TYPE_CHECKING:
31
+ from rnsr.ingestion.document_boundary import DocumentSegment
32
+
33
+ logger = structlog.get_logger(__name__)
34
+
35
+
36
class TreeBuilder:
    """
    Builds a hierarchical document tree from classified spans.

    Uses a stack-based approach where:
    - Headers push new nodes onto the stack (at appropriate level)
    - Body text appends to the current node's content
    - The stack maintains the current path in the hierarchy
    """

    def __init__(self):
        """Initialize the Tree Builder."""
        self._current_page = -1
        # Section IDs already issued for the tree currently being built.
        # Used to guarantee that the short random IDs never repeat in a tree.
        self._used_ids: set[str] = set()

    def build_tree(
        self,
        spans: list[ClassifiedSpan],
        document_title: str = "",
    ) -> DocumentTree:
        """
        Build a document tree from classified spans.

        Args:
            spans: List of ClassifiedSpan (headers and body text).
            document_title: Optional title for the document. When empty, the
                title is taken from the spans (see ``_extract_title``).

        Returns:
            DocumentTree with hierarchical structure. For an empty ``spans``
            list the tree holds only a root node and is titled
            ``document_title`` or "Untitled".
        """
        # Each tree gets its own ID namespace.
        self._used_ids = set()

        if not spans:
            logger.warning("empty_spans_list")
            root = DocumentNode(id="root", level=0, header="Document")
            return DocumentTree(
                title=document_title or "Untitled",
                root=root,
                total_nodes=1,
            )

        # Sort spans by reading order (page, then top-to-bottom, left-to-right)
        sorted_spans = self._sort_by_reading_order(spans)

        # Initialize root node
        root = DocumentNode(
            id="root",
            level=0,
            header=document_title or self._extract_title(sorted_spans),
        )

        # Stack tracks current path: [(level, node), ...]
        # Start with root at level 0; the root is never popped.
        stack: list[tuple[int, DocumentNode]] = [(0, root)]

        # Headers open new nodes; every other role is treated as body text.
        for span in sorted_spans:
            if span.role == "header":
                self._process_header(span, stack)
            else:
                self._process_body(span, stack)

        total_nodes = self._count_nodes(root)

        logger.info(
            "tree_built",
            total_nodes=total_nodes,
            max_depth=self._get_max_depth(root),
        )

        return DocumentTree(
            title=root.header,
            root=root,
            total_nodes=total_nodes,
        )

    def _sort_by_reading_order(
        self,
        spans: list[ClassifiedSpan],
    ) -> list[ClassifiedSpan]:
        """
        Sort spans by reading order: page number, then y position, then x position.

        Returns a new list; the input list is not modified.
        """
        return sorted(
            spans,
            key=lambda s: (
                s.page_num,
                s.bbox.y0,  # Top to bottom
                s.bbox.x0,  # Left to right
            ),
        )

    def _extract_title(self, spans: list[ClassifiedSpan]) -> str:
        """
        Extract the document title from the header spans.

        Preference order:
        1. The first level-1 header with non-blank text.
        2. Otherwise the first header of any level with non-blank text.
        3. Otherwise the literal "Untitled Document".

        Headers whose text is empty after stripping are skipped so that a
        whitespace-only header can never produce an empty title.
        """
        fallback = ""
        for span in spans:
            if span.role != "header":
                continue
            text = span.text.strip()
            if not text:
                continue
            if span.header_level == 1:
                return text
            if not fallback:
                # Remember the first usable header in case no H1 exists.
                fallback = text

        return fallback or "Untitled Document"

    def _new_section_id(self) -> str:
        """
        Return a ``sec_<6 hex chars>`` ID not yet used in the current tree.

        Six hex characters are only 24 bits of randomness, so duplicates
        become likely once a tree has a few thousand sections. Issued IDs are
        tracked and a colliding candidate is re-drawn.
        """
        used = getattr(self, "_used_ids", None)
        if used is None:
            # Instance was created without __init__; start tracking lazily.
            used = self._used_ids = set()

        while True:
            candidate = f"sec_{uuid4().hex[:6]}"
            if candidate not in used:
                used.add(candidate)
                return candidate

    def _process_header(
        self,
        span: ClassifiedSpan,
        stack: list[tuple[int, DocumentNode]],
    ) -> None:
        """
        Process a header span by adding a new node to the tree.

        The stack is adjusted so that:
        - Headers pop nodes until finding a parent with lower level
        - Then push the new header node

        Args:
            span: The header span to insert.
            stack: Current path of ``(level, node)`` pairs; mutated in place.
        """
        level = span.header_level
        if level is None:
            # NOTE(review): assumes header_level can be unset on a header
            # span — confirm against ClassifiedSpan. Without a level the
            # comparison below would raise TypeError, so treat it as top-level.
            level = 1

        # Pop stack until we find a parent with level < this header.
        # The ``len(stack) > 1`` guard keeps the root on the stack.
        while len(stack) > 1 and stack[-1][0] >= level:
            stack.pop()

        new_node = DocumentNode(
            id=self._new_section_id(),
            level=level,
            header=span.text.strip(),
            page_num=span.page_num,
            bbox=span.bbox,
        )

        # Add as child of current top of stack
        parent_node = stack[-1][1]
        parent_node.children.append(new_node)

        # The new header becomes the current node for following body text.
        stack.append((level, new_node))

        logger.debug(
            "header_added",
            level=level,
            header=span.text[:50],
            parent=parent_node.header[:30] if parent_node.header else "root",
        )

    def _process_body(
        self,
        span: ClassifiedSpan,
        stack: list[tuple[int, DocumentNode]],
    ) -> None:
        """
        Process body text by appending to the current node's content.

        Text is stripped and joined to existing content with a single space.
        Whitespace-only spans add no text, so they cannot leave stray
        separators behind. The node's page number is still filled in from
        the span if it was not set.

        Args:
            span: The non-header span to attach.
            stack: Current path of ``(level, node)`` pairs; its top entry
                receives the text.
        """
        if not stack:
            return

        current_node = stack[-1][1]
        text = span.text.strip()

        if current_node.content:
            if text:
                current_node.content += " " + text
        else:
            current_node.content = text

        # Update page number if not set
        if current_node.page_num is None:
            current_node.page_num = span.page_num

    def _count_nodes(self, node: DocumentNode) -> int:
        """Recursively count all nodes in the subtree rooted at ``node``."""
        return 1 + sum(self._count_nodes(child) for child in node.children)

    def _get_max_depth(self, node: DocumentNode, current_depth: int = 0) -> int:
        """
        Get the maximum depth of the tree.

        A node without children is at ``current_depth``; each level of
        children adds one.
        """
        if not node.children:
            return current_depth

        return max(
            self._get_max_depth(child, current_depth + 1)
            for child in node.children
        )
222
+
223
+
224
def build_document_tree(
    spans: list[ClassifiedSpan],
    title: str = "",
) -> DocumentTree:
    """
    Build a document tree from classified spans in a single call.

    Creates a fresh TreeBuilder and delegates to its ``build_tree`` method.

    Args:
        spans: List of ClassifiedSpan from header classification.
        title: Optional document title.

    Returns:
        DocumentTree with hierarchical structure.

    Example:
        analysis, raw_spans = analyze_font_histogram("doc.pdf")
        classified = classify_headers(raw_spans, analysis)
        tree = build_document_tree(classified)
    """
    return TreeBuilder().build_tree(spans, title)
245
+
246
+
247
def build_multi_document_tree(
    segments: list[DocumentSegment],
    container_title: str = "",
) -> DocumentTree:
    """
    Build a tree from multiple document segments.

    Creates a structure where:
    - Root node represents the container (e.g., the PDF file)
    - Each document segment becomes a level-1 child
    - Internal structure of each document is preserved below that

    Special cases:
    - No segments: a single root node titled ``container_title`` or
      "Documents".
    - Exactly one segment: a regular (non-nested) tree for that segment,
      titled with the segment title or ``container_title``.
    - A segment without spans is never passed to the font analyzer or the
      header classifier, in both the single- and multi-segment paths.

    Args:
        segments: List of DocumentSegment from boundary detection
        container_title: Title for the root container node

    Returns:
        DocumentTree with multi-document structure
    """
    # Imported lazily inside the function, as in the original implementation.
    from rnsr.ingestion.header_classifier import classify_headers
    from rnsr.ingestion.font_histogram import FontHistogramAnalyzer

    if not segments:
        root = DocumentNode(id="root", level=0, header=container_title or "Documents")
        return DocumentTree(
            title=container_title or "Documents",
            root=root,
            total_nodes=1,
        )

    builder = TreeBuilder()
    analyzer = FontHistogramAnalyzer()

    # If only one segment, build a regular tree
    if len(segments) == 1:
        only_segment = segments[0]
        if only_segment.spans:
            # Spans need to be classified before the tree can be built.
            analysis = analyzer.analyze_spans(only_segment.spans)
            classified = classify_headers(only_segment.spans, analysis)
        else:
            # Mirror the multi-segment path: nothing to analyze, so let the
            # builder produce its empty-document tree.
            classified = []
        return builder.build_tree(classified, only_segment.title or container_title)

    # Create container root node
    root_title = container_title or f"{len(segments)} Documents"
    root = DocumentNode(
        id="root",
        level=0,
        header=root_title,
    )

    total_nodes = 1  # the container root itself

    for i, segment in enumerate(segments):
        doc_title = segment.title or f"Document {i + 1}"

        logger.debug(
            "building_document_subtree",
            doc_index=i,
            title=doc_title[:50],
            span_count=len(segment.spans),
            pages=f"{segment.start_page}-{segment.end_page}",
        )

        if segment.spans:
            # Analyze and classify spans for this segment
            analysis = analyzer.analyze_spans(segment.spans)
            classified = classify_headers(segment.spans, analysis)

            # Build subtree for this document
            subtree = builder.build_tree(classified, doc_title)

            # The subtree's root becomes a child of our container root, so
            # every level moves down by 1. doc_index is passed so the copied
            # nodes get new IDs that cannot collide with the container root.
            doc_node = _shift_node_levels(subtree.root, level_shift=1, doc_index=i)
            doc_node.header = doc_title  # Ensure title is preserved

            root.children.append(doc_node)
            # The subtree's own root is counted here as the document node.
            total_nodes += subtree.total_nodes
        else:
            # Empty segment - just create a placeholder
            doc_node = DocumentNode(
                id=f"doc_{i:03d}",
                level=1,
                header=doc_title,
                page_num=segment.start_page,
            )
            root.children.append(doc_node)
            total_nodes += 1

    logger.info(
        "multi_document_tree_built",
        documents=len(segments),
        total_nodes=total_nodes,
    )

    return DocumentTree(
        title=root_title,
        root=root,
        total_nodes=total_nodes,
    )
347
+
348
+
349
def _shift_node_levels(node: DocumentNode, level_shift: int, doc_index: int = 0) -> DocumentNode:
    """
    Copy a subtree, offsetting every node's level and re-keying its IDs.

    Used when embedding a subtree under a higher-level node. The input
    nodes are left untouched; new DocumentNode objects are created that
    carry over header, content, page_num and bbox.

    ID scheme:
    - a node whose id is "root" becomes ``doc_<doc_index, 3 digits>``
    - any other node becomes ``d<doc_index>_<original id>``

    Args:
        node: Root of the subtree to copy.
        level_shift: Amount added to each node's level.
        doc_index: Index of the owning document, used in the new IDs.

    Returns:
        The root of the copied subtree.
    """
    is_subtree_root = node.id == "root"
    shifted_id = f"doc_{doc_index:03d}" if is_subtree_root else f"d{doc_index}_{node.id}"

    shifted = DocumentNode(
        id=shifted_id,
        level=node.level + level_shift,
        header=node.header,
        content=node.content,
        page_num=node.page_num,
        bbox=node.bbox,
        children=[],
    )

    # Children are copied depth-first with the same shift and document index.
    shifted.children.extend(
        _shift_node_levels(child, level_shift, doc_index)
        for child in node.children
    )

    return shifted
376
+
377
+
378
def tree_to_dict(tree: DocumentTree) -> dict:
    """
    Serialize a DocumentTree into plain nested dictionaries.

    The top-level dict carries the tree's id, title, total_nodes,
    ingestion_tier and ingestion_method, plus the serialized root under
    "root". Each node dict holds id, type, level, header, content, page_num
    and a "children" list of node dicts. A node's type is "section" when
    its level is greater than 0 and "document" otherwise.
    """
    def serialize(current: DocumentNode) -> dict:
        if current.level > 0:
            node_type = "section"
        else:
            node_type = "document"

        serialized_children = [serialize(child) for child in current.children]

        return dict(
            id=current.id,
            type=node_type,
            level=current.level,
            header=current.header,
            content=current.content,
            page_num=current.page_num,
            children=serialized_children,
        )

    return dict(
        id=tree.id,
        title=tree.title,
        total_nodes=tree.total_nodes,
        ingestion_tier=tree.ingestion_tier,
        ingestion_method=tree.ingestion_method,
        root=serialize(tree.root),
    )