rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Layout Complexity Detector - Auto-detect when to use visual analysis
|
|
3
|
+
|
|
4
|
+
Analyzes document layout characteristics to determine when LayoutLM
|
|
5
|
+
visual analysis should be triggered:
|
|
6
|
+
|
|
7
|
+
- Multi-column layouts (text bboxes overlap vertically)
|
|
8
|
+
- Empty/image-only pages (no extractable text)
|
|
9
|
+
- Complex L-shaped wraps (irregular bounding box patterns)
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
from rnsr.ingestion.layout_detector import detect_layout_complexity
|
|
13
|
+
|
|
14
|
+
complexity = detect_layout_complexity("document.pdf")
|
|
15
|
+
|
|
16
|
+
if complexity.needs_visual_analysis:
|
|
17
|
+
# Use LayoutLM + XY-Cut
|
|
18
|
+
pass
|
|
19
|
+
else:
|
|
20
|
+
# Use simple font histogram
|
|
21
|
+
pass
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from dataclasses import dataclass
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
import fitz # PyMuPDF
|
|
30
|
+
import structlog
|
|
31
|
+
|
|
32
|
+
from rnsr.models import SpanInfo, BoundingBox
|
|
33
|
+
|
|
34
|
+
logger = structlog.get_logger(__name__)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class LayoutComplexity:
    """Outcome of a layout complexity analysis for one document.

    Every field has a default, so a bare ``LayoutComplexity()`` carries
    all flags cleared, neutral metrics and an empty ``reason``.
    """

    # --- Detection flags ---
    has_multi_column: bool = False
    has_empty_pages: bool = False
    has_complex_wrapping: bool = False

    # --- Measurements backing the flags ---
    avg_columns_per_page: float = 1.0
    empty_page_ratio: float = 0.0
    bbox_overlap_score: float = 0.0

    # --- Overall verdict ---
    complexity_score: float = 0.0  # scale: 0.0 (simple) .. 1.0 (complex)
    needs_visual_analysis: bool = False

    # --- Human-readable explanation of the verdict ---
    reason: str = ""
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def detect_multi_column(spans: list[SpanInfo], page_height: float) -> bool:
    """
    Detect if page has multi-column layout.

    Algorithm:
    1. Group spans into fixed-height horizontal bands ("rows") by their
       top Y coordinate.
    2. Within each row, sweep the spans left to right and look for a
       horizontal gap wider than 50 units between the right-most edge
       seen so far and the next span.
    3. If >30% of all rows contain such a gap, it's multi-column.

    Args:
        spans: List of text spans from a page. Only ``bbox.x0``,
            ``bbox.x1`` and ``bbox.y0`` of each span are read.
        page_height: Height of the page. Currently unused; kept so the
            signature stays compatible with existing callers.

    Returns:
        True if multi-column layout detected. Pages with fewer than 10
        spans are always reported as not multi-column.
    """
    if len(spans) < 10:
        return False

    row_height = 20.0  # Approximate line height
    min_column_gap = 50.0  # Horizontal whitespace that counts as a gutter

    # Bucket spans into rows. Floor division keeps every band exactly
    # ``row_height`` tall, including for negative coordinates (plain
    # truncation would merge the two bands around zero).
    rows: dict[int, list[SpanInfo]] = {}
    for span in spans:
        rows.setdefault(int(span.bbox.y0 // row_height), []).append(span)

    multi_column_rows = 0
    for row_spans in rows.values():
        if len(row_spans) < 2:
            continue

        ordered = sorted(row_spans, key=lambda s: s.bbox.x0)

        # Measure each gap from the furthest right edge seen so far, not
        # from the previous span alone: a wide span that covers later,
        # narrower spans must not produce a phantom gap between them.
        right_edge = ordered[0].bbox.x1
        for span in ordered[1:]:
            if span.bbox.x0 - right_edge > min_column_gap:
                multi_column_rows += 1
                break
            right_edge = max(right_edge, span.bbox.x1)

    # At least 10 spans were bucketed, so there is at least one row.
    total_rows = len(rows)
    ratio = multi_column_rows / total_rows
    is_multi_column = ratio > 0.3  # Threshold: >30% rows are multi-column

    logger.debug(
        "multi_column_detection",
        multi_column_rows=multi_column_rows,
        total_rows=total_rows,
        ratio=ratio,
        result=is_multi_column,
    )

    return is_multi_column
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def detect_empty_pages(pdf_path: Path | str, min_text_threshold: int = 10) -> tuple[bool, float]:
    """
    Detect if document has empty or image-only pages.

    A page counts as empty when its extracted text contains fewer than
    ``min_text_threshold`` whitespace-separated words. The document is
    flagged when more than 10% of its pages are empty.

    Args:
        pdf_path: Path to PDF file.
        min_text_threshold: Minimum word count to consider page non-empty.

    Returns:
        Tuple of (has_empty_pages, empty_page_ratio). Returns
        ``(False, 0.0)`` when the file cannot be opened (the failure is
        logged) or when the document has no pages.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        # Best-effort detector: an unreadable file is reported, not raised.
        logger.error("pdf_open_failed", path=str(pdf_path), error=str(e))
        return False, 0.0

    empty_pages = 0
    try:
        total_pages = len(doc)

        for page in doc:
            # str.split() with no separator already ignores leading and
            # trailing whitespace, so no explicit strip is needed.
            word_count = len(page.get_text().split())  # type: ignore[union-attr]

            if word_count < min_text_threshold:
                empty_pages += 1
                logger.debug("empty_page_detected", page_num=page.number, words=word_count)
    finally:
        # Release the document handle even if text extraction fails midway.
        doc.close()

    if total_pages == 0:
        return False, 0.0

    ratio = empty_pages / total_pages
    has_empty = ratio > 0.1  # >10% pages empty

    logger.info(
        "empty_page_detection",
        empty_pages=empty_pages,
        total_pages=total_pages,
        ratio=ratio,
        result=has_empty,
    )

    return has_empty, ratio
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def calculate_bbox_overlap_score(
    spans: list[SpanInfo],
    min_horizontal_separation: float = 100.0,
) -> float:
    """
    Calculate how much text bounding boxes overlap vertically.

    Every unordered pair of spans is examined. A pair counts as an
    overlap when the two Y ranges intersect (touching edges included)
    and the horizontal centres of the boxes are more than
    ``min_horizontal_separation`` apart, i.e. the spans sit side by side
    rather than being neighbouring words on one line.

    High overlap suggests complex wrapping or multi-column layout.

    Args:
        spans: List of text spans. Only the ``bbox`` coordinates
            (``x0``, ``y0``, ``x1``, ``y1``) of each span are read.
        min_horizontal_separation: Minimum distance between the two
            horizontal centres for a vertically overlapping pair to be
            counted. Defaults to 100.0, the previously hard-coded value.

    Returns:
        Overlap score from 0.0 (no overlap) to 1.0 (high overlap): the
        fraction of span pairs that overlap. 0.0 for fewer than 2 spans.
    """
    span_count = len(spans)
    if span_count < 2:
        return 0.0

    # Compute each span's vertical extent and horizontal centre once
    # instead of once per pair.
    boxes = [
        (s.bbox.y0, s.bbox.y1, (s.bbox.x0 + s.bbox.x1) / 2)
        for s in spans
    ]

    overlap_count = 0
    for i, (top_a, bottom_a, center_a) in enumerate(boxes):
        for top_b, bottom_b, center_b in boxes[i + 1:]:
            # Disjoint vertically: one box ends before the other begins.
            if bottom_a < top_b or bottom_b < top_a:
                continue

            if abs(center_a - center_b) > min_horizontal_separation:
                overlap_count += 1

    # Number of unordered pairs; at least 1 because span_count >= 2.
    total_comparisons = span_count * (span_count - 1) // 2
    score = overlap_count / total_comparisons
    logger.debug("bbox_overlap_calculated", score=score, overlaps=overlap_count)
    return score
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _extract_page_spans(page: fitz.Page, page_num: int) -> list[SpanInfo]:
    """Collect the text spans of one page as ``SpanInfo`` objects.

    Spans whose stripped text is shorter than 2 characters are skipped.
    Blocks without a ``"lines"`` entry contribute nothing. Bold/italic
    flags are not inspected here and are always set to False.
    """
    spans: list[SpanInfo] = []
    page_dict = page.get_text("dict")  # type: ignore[assignment]

    for block in page_dict.get("blocks", []):  # type: ignore[union-attr]
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = str(span.get("text", "")).strip()
                if len(text) < 2:
                    continue

                bbox = span.get("bbox", [0, 0, 0, 0])

                spans.append(SpanInfo(
                    text=text,
                    font_size=float(span.get("size", 12.0)),
                    font_name=str(span.get("font", "Unknown")),
                    is_bold=False,
                    is_italic=False,
                    bbox=BoundingBox(
                        x0=float(bbox[0]),
                        y0=float(bbox[1]),
                        x1=float(bbox[2]),
                        y1=float(bbox[3]),
                    ),
                    page_num=page_num,
                ))

    return spans


def detect_layout_complexity(
    pdf_path: Path | str,
    threshold: float = 0.3,
) -> LayoutComplexity:
    """
    Analyze document layout to determine if visual analysis is needed.

    The empty-page check covers the whole document; the multi-column and
    bounding-box overlap checks sample only the first three pages.

    Args:
        pdf_path: Path to PDF file.
        threshold: Complexity threshold (0.0-1.0) for triggering visual
            analysis. Analysis is requested only when the score is
            strictly greater than this value.

    Returns:
        LayoutComplexity object with analysis results. If the file cannot
        be opened, the failure is logged and a default (all-clear)
        result is returned.

    Example:
        complexity = detect_layout_complexity("report.pdf")

        if complexity.needs_visual_analysis:
            print(f"Reason: {complexity.reason}")
            # Use LayoutLM + XY-Cut
        else:
            # Use simple font histogram
    """
    pdf_path = Path(pdf_path)

    logger.info("detecting_layout_complexity", path=pdf_path.name)

    # Initialize result
    result = LayoutComplexity()

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logger.error("pdf_open_failed", path=str(pdf_path), error=str(e))
        return result

    multi_column_pages = 0
    total_overlap_score = 0.0

    try:
        # 1. Check for empty pages
        result.has_empty_pages, result.empty_page_ratio = detect_empty_pages(pdf_path)

        # 2. Analyze first few pages for multi-column and overlap
        pages_to_check = min(3, len(doc))

        for page_num in range(pages_to_check):
            page = doc[page_num]
            spans = _extract_page_spans(page, page_num)

            if detect_multi_column(spans, page.rect.height):
                multi_column_pages += 1

            total_overlap_score += calculate_bbox_overlap_score(spans)
    finally:
        # Release the document handle even if page analysis fails.
        doc.close()

    # 3. Calculate metrics
    if pages_to_check > 0:
        # Each multi-column page is counted as 2 columns and every other
        # page as 1, so the average lies between 1.0 and 2.0.
        result.avg_columns_per_page = 1.0 + multi_column_pages / pages_to_check
        result.bbox_overlap_score = total_overlap_score / pages_to_check

    # Require two multi-column pages, or all sampled pages when fewer
    # than two were sampled, so a one-page document can still qualify.
    enough_multi_column_pages = (
        pages_to_check > 0
        and multi_column_pages >= min(2, pages_to_check)
    )
    result.has_multi_column = enough_multi_column_pages or result.bbox_overlap_score > 0.2
    result.has_complex_wrapping = result.bbox_overlap_score > 0.3

    # 4. Calculate overall complexity score
    complexity_factors = []

    if result.has_multi_column:
        complexity_factors.append(0.5)
    if result.has_empty_pages:
        complexity_factors.append(0.3)
    if result.has_complex_wrapping:
        complexity_factors.append(0.4)

    # Plain mean of the weights of the factors that fired.
    # NOTE(review): with only the empty-page factor the score is exactly
    # 0.3, which is not strictly above the default threshold of 0.3, so
    # such documents do not request visual analysis - confirm intended.
    if complexity_factors:
        result.complexity_score = sum(complexity_factors) / len(complexity_factors)

    # 5. Determine if visual analysis needed
    result.needs_visual_analysis = result.complexity_score > threshold

    # 6. Generate reasoning
    reasons = []
    if result.has_multi_column:
        reasons.append("multi-column layout detected")
    if result.has_empty_pages:
        reasons.append(f"{result.empty_page_ratio:.0%} pages are empty/image-only")
    if result.has_complex_wrapping:
        reasons.append("complex text wrapping detected")

    if reasons:
        result.reason = "; ".join(reasons)
    else:
        result.reason = "simple single-column layout"

    logger.info(
        "layout_complexity_detected",
        score=result.complexity_score,
        needs_visual=result.needs_visual_analysis,
        reason=result.reason,
    )

    return result
|