rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tree Builder - Document Tree Assembly
|
|
3
|
+
|
|
4
|
+
This module builds a hierarchical document tree from classified spans.
|
|
5
|
+
Uses a stack-based parser to:
|
|
6
|
+
1. Process blocks in reading order (top-to-bottom, left-to-right)
|
|
7
|
+
2. Assign body text to the nearest preceding header
|
|
8
|
+
3. Output nested DocumentNode structure
|
|
9
|
+
|
|
10
|
+
The tree structure enables:
|
|
11
|
+
- Hierarchical navigation by the agent
|
|
12
|
+
- Section-based summarization
|
|
13
|
+
- Context-aware retrieval
|
|
14
|
+
|
|
15
|
+
Multi-Document Support:
|
|
16
|
+
- Detects document boundaries in combined PDFs
|
|
17
|
+
- Creates separate subtrees for each logical document
|
|
18
|
+
- Preserves internal hierarchy within each document
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from typing import TYPE_CHECKING
|
|
24
|
+
from uuid import uuid4
|
|
25
|
+
|
|
26
|
+
import structlog
|
|
27
|
+
|
|
28
|
+
from rnsr.models import ClassifiedSpan, DocumentNode, DocumentTree
|
|
29
|
+
|
|
30
|
+
if TYPE_CHECKING:
|
|
31
|
+
from rnsr.ingestion.document_boundary import DocumentSegment
|
|
32
|
+
|
|
33
|
+
logger = structlog.get_logger(__name__)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class TreeBuilder:
    """
    Builds a hierarchical document tree from classified spans.

    Uses a stack-based approach where:
    - Headers push new nodes onto the stack (at appropriate level)
    - Body text appends to the current node's content
    - The stack maintains the current path in the hierarchy

    Section IDs are short random tokens; the builder remembers the IDs it
    has issued during one ``build_tree`` run so that no two sections of the
    same tree share an ID.
    """

    def __init__(self):
        """Initialize the Tree Builder."""
        # Not read anywhere in this class; kept so the attribute remains
        # available to any external code that may inspect it.
        self._current_page = -1
        # Section IDs already handed out during the current build.
        self._used_ids: set[str] = set()

    def build_tree(
        self,
        spans: list[ClassifiedSpan],
        document_title: str = "",
    ) -> DocumentTree:
        """
        Build a document tree from classified spans.

        Args:
            spans: List of ClassifiedSpan (headers and body text).
            document_title: Optional title for the document. When empty, the
                title is taken from the spans (see ``_extract_title``).

        Returns:
            DocumentTree with hierarchical structure. For an empty ``spans``
            list the tree holds only a root node with header "Document" and
            the title falls back to "Untitled".
        """
        # IDs only need to be unique within one tree, so start fresh.
        self._used_ids = set()

        if not spans:
            logger.warning("empty_spans_list")
            root = DocumentNode(id="root", level=0, header="Document")
            return DocumentTree(
                title=document_title or "Untitled",
                root=root,
                total_nodes=1,
            )

        # Sort spans by reading order (page, then top-to-bottom, left-to-right)
        sorted_spans = self._sort_by_reading_order(spans)

        # Initialize root node
        root = DocumentNode(
            id="root",
            level=0,
            header=document_title or self._extract_title(sorted_spans),
        )

        # Stack tracks current path: [(level, node), ...]
        # Start with root at level 0
        stack: list[tuple[int, DocumentNode]] = [(0, root)]

        # Any role other than "header" is treated as body text.
        for span in sorted_spans:
            if span.role == "header":
                self._process_header(span, stack)
            else:
                self._process_body(span, stack)

        total_nodes = self._count_nodes(root)

        logger.info(
            "tree_built",
            total_nodes=total_nodes,
            max_depth=self._get_max_depth(root),
        )

        return DocumentTree(
            title=root.header,
            root=root,
            total_nodes=total_nodes,
        )

    def _sort_by_reading_order(
        self,
        spans: list[ClassifiedSpan],
    ) -> list[ClassifiedSpan]:
        """
        Sort spans by reading order: page number, then y position, then x position.

        Returns a new list; the input list is not modified.
        """
        return sorted(
            spans,
            key=lambda s: (
                s.page_num,
                s.bbox.y0,  # Top to bottom
                s.bbox.x0,  # Left to right
            ),
        )

    def _extract_title(self, spans: list[ClassifiedSpan]) -> str:
        """
        Extract document title from the first H1 header.

        Falls back to the first header of any level, and to
        "Untitled Document" when the spans contain no header at all.
        """
        for span in spans:
            if span.role == "header" and span.header_level == 1:
                return span.text.strip()

        # Fallback: use first header of any level
        for span in spans:
            if span.role == "header":
                return span.text.strip()

        return "Untitled Document"

    def _new_section_id(self) -> str:
        """
        Return a ``sec_<6 hex chars>`` ID not yet issued in the current build.

        Six hex characters give only 24 bits of randomness, so collisions
        become likely once a document has a few thousand sections. A
        colliding candidate is discarded and redrawn.
        """
        candidate = f"sec_{uuid4().hex[:6]}"
        while candidate in self._used_ids:
            candidate = f"sec_{uuid4().hex[:6]}"
        self._used_ids.add(candidate)
        return candidate

    def _process_header(
        self,
        span: ClassifiedSpan,
        stack: list[tuple[int, DocumentNode]],
    ) -> None:
        """
        Process a header span by adding a new node to the tree.

        The stack is adjusted so that:
        - Headers pop nodes until finding a parent with lower level
        - Then push the new header node

        The root entry (bottom of the stack) is never popped.
        """
        level = span.header_level

        # Pop stack until we find a parent with level < this header
        while len(stack) > 1 and stack[-1][0] >= level:
            stack.pop()

        # Create new node for this header
        new_node = DocumentNode(
            id=self._new_section_id(),
            level=level,
            header=span.text.strip(),
            page_num=span.page_num,
            bbox=span.bbox,
        )

        # Add as child of current top of stack
        parent_node = stack[-1][1]
        parent_node.children.append(new_node)

        # Push onto stack
        stack.append((level, new_node))

        logger.debug(
            "header_added",
            level=level,
            header=span.text[:50],
            parent=parent_node.header[:30] if parent_node.header else "root",
        )

    def _process_body(
        self,
        span: ClassifiedSpan,
        stack: list[tuple[int, DocumentNode]],
    ) -> None:
        """
        Process body text by appending to the current node's content.

        Text is joined with a single space; no paragraph breaks are
        inserted. Whitespace-only spans add nothing to existing content.
        """
        if not stack:
            return

        current_node = stack[-1][1]
        text = span.text.strip()

        if not current_node.content:
            current_node.content = text
        elif text:
            # Skip empty text so the content never gains a trailing space.
            current_node.content += " " + text

        # Update page number if not set
        if current_node.page_num is None:
            current_node.page_num = span.page_num

    def _count_nodes(self, node: DocumentNode) -> int:
        """Recursively count all nodes in the tree, including ``node``."""
        count = 1  # Count this node
        for child in node.children:
            count += self._count_nodes(child)
        return count

    def _get_max_depth(self, node: DocumentNode, current_depth: int = 0) -> int:
        """Get the maximum depth of the tree (a childless root has depth 0)."""
        if not node.children:
            return current_depth

        return max(
            self._get_max_depth(child, current_depth + 1)
            for child in node.children
        )
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def build_document_tree(
    spans: list[ClassifiedSpan],
    title: str = "",
) -> DocumentTree:
    """
    Build a document tree from classified spans in a single call.

    Creates a fresh TreeBuilder and delegates to its ``build_tree`` method.

    Args:
        spans: List of ClassifiedSpan from header classification.
        title: Optional document title.

    Returns:
        DocumentTree with hierarchical structure.

    Example:
        analysis, raw_spans = analyze_font_histogram("doc.pdf")
        classified = classify_headers(raw_spans, analysis)
        tree = build_document_tree(classified)
    """
    return TreeBuilder().build_tree(spans, title)
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def build_multi_document_tree(
    segments: list[DocumentSegment],
    container_title: str = "",
) -> DocumentTree:
    """
    Build a tree from multiple document segments.

    Creates a structure where:
    - Root node represents the container (e.g., the PDF file)
    - Each document segment becomes a level-1 child
    - Internal structure of each document is preserved below that

    Special cases:
    - No segments: a lone root node titled ``container_title`` or "Documents".
    - Exactly one segment: a regular single-document tree (no container
      level), titled with the segment title or ``container_title``.
    - A segment without spans is never analyzed or classified. In the
      multi-segment case it becomes a placeholder node; in the
      single-segment case it yields the builder's empty tree.

    Args:
        segments: List of DocumentSegment from boundary detection
        container_title: Title for the root container node

    Returns:
        DocumentTree with multi-document structure
    """
    # Imported here rather than at module level, as in the original code
    # (presumably to avoid an import cycle — TODO confirm).
    from rnsr.ingestion.header_classifier import classify_headers
    from rnsr.ingestion.font_histogram import FontHistogramAnalyzer

    if not segments:
        root = DocumentNode(id="root", level=0, header=container_title or "Documents")
        return DocumentTree(
            title=container_title or "Documents",
            root=root,
            total_nodes=1,
        )

    builder = TreeBuilder()
    analyzer = FontHistogramAnalyzer()

    # If only one segment, build a regular tree
    if len(segments) == 1:
        only_segment = segments[0]
        single_title = only_segment.title or container_title
        if not only_segment.spans:
            # Same guard as the multi-segment loop below: nothing to analyze
            # or classify, so hand the builder an empty span list.
            return builder.build_tree([], single_title)
        # Need to classify spans first
        analysis = analyzer.analyze_spans(only_segment.spans)
        classified = classify_headers(only_segment.spans, analysis)
        return builder.build_tree(classified, single_title)

    # Create container root node
    root = DocumentNode(
        id="root",
        level=0,
        header=container_title or f"{len(segments)} Documents",
    )

    total_nodes = 1  # the container root itself

    for i, segment in enumerate(segments):
        # Create a document node for this segment
        doc_title = segment.title or f"Document {i + 1}"

        logger.debug(
            "building_document_subtree",
            doc_index=i,
            title=doc_title[:50],
            span_count=len(segment.spans),
            pages=f"{segment.start_page}-{segment.end_page}",
        )

        # Analyze and classify spans for this segment
        if segment.spans:
            analysis = analyzer.analyze_spans(segment.spans)
            classified = classify_headers(segment.spans, analysis)

            # Build subtree for this document
            subtree = builder.build_tree(classified, doc_title)

            # The subtree's root becomes a child of our container root
            # But we need to shift levels down by 1
            # Pass doc_index to generate unique IDs and avoid collision with container root
            doc_node = _shift_node_levels(subtree.root, level_shift=1, doc_index=i)
            doc_node.header = doc_title  # Ensure title is preserved

            root.children.append(doc_node)
            # subtree.total_nodes already counts the subtree root, which is
            # now the document node, so no extra +1 is needed.
            total_nodes += subtree.total_nodes
        else:
            # Empty segment - just create a placeholder
            doc_node = DocumentNode(
                id=f"doc_{i:03d}",
                level=1,
                header=doc_title,
                page_num=segment.start_page,
            )
            root.children.append(doc_node)
            total_nodes += 1

    logger.info(
        "multi_document_tree_built",
        documents=len(segments),
        total_nodes=total_nodes,
    )

    return DocumentTree(
        title=container_title or f"{len(segments)} Documents",
        root=root,
        total_nodes=total_nodes,
    )
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _shift_node_levels(node: DocumentNode, level_shift: int, doc_index: int = 0) -> DocumentNode:
    """
    Return a copy of ``node`` and its descendants with levels offset by ``level_shift``.

    Used when embedding a subtree under a higher-level node. The input tree
    is left untouched; every node is rebuilt with a new ID:
    - a node whose ID is "root" becomes ``doc_<doc_index, 3 digits>``
    - any other node gets its old ID prefixed with ``d<doc_index>_``

    Only id, level, header, content, page_num and bbox are carried over to
    the rebuilt nodes.
    """
    # Re-key the node so it cannot clash with the container root or with
    # nodes of other embedded documents.
    is_subtree_root = node.id == "root"
    shifted_id = f"doc_{doc_index:03d}" if is_subtree_root else f"d{doc_index}_{node.id}"

    clone = DocumentNode(
        id=shifted_id,
        level=node.level + level_shift,
        header=node.header,
        content=node.content,
        page_num=node.page_num,
        bbox=node.bbox,
        children=[],
    )

    # Rebuild the descendants in their original order.
    clone.children.extend(
        _shift_node_levels(descendant, level_shift, doc_index)
        for descendant in node.children
    )

    return clone
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def tree_to_dict(tree: DocumentTree) -> dict:
    """
    Serialize a DocumentTree into plain nested dictionaries.

    The top-level dict carries the tree's id, title, total_nodes,
    ingestion_tier and ingestion_method, plus the recursively converted
    root node under "root". Each node dict holds id, type, level, header,
    content, page_num and a "children" list; "type" is "document" for
    nodes at level 0 or below and "section" otherwise.
    """
    def _serialize(current: DocumentNode) -> dict:
        # Level 0 (or below) marks the document node; deeper nodes are sections.
        kind = "document" if current.level <= 0 else "section"
        return {
            "id": current.id,
            "type": kind,
            "level": current.level,
            "header": current.header,
            "content": current.content,
            "page_num": current.page_num,
            "children": list(map(_serialize, current.children)),
        }

    payload = {
        "id": tree.id,
        "title": tree.title,
        "total_nodes": tree.total_nodes,
        "ingestion_tier": tree.ingestion_tier,
        "ingestion_method": tree.ingestion_method,
        "root": _serialize(tree.root),
    }
    return payload
|