rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,356 @@
1
+ """
2
+ Layout Complexity Detector - Auto-detect when to use visual analysis
3
+
4
+ Analyzes document layout characteristics to determine when LayoutLM
5
+ visual analysis should be triggered:
6
+
7
+ - Multi-column layouts (text bboxes overlap vertically)
8
+ - Empty/image-only pages (no extractable text)
9
+ - Complex L-shaped wraps (irregular bounding box patterns)
10
+
11
+ Usage:
12
+ from rnsr.ingestion.layout_detector import detect_layout_complexity
13
+
14
+ complexity = detect_layout_complexity("document.pdf")
15
+
16
+ if complexity.needs_visual_analysis:
17
+ # Use LayoutLM + XY-Cut
18
+ pass
19
+ else:
20
+ # Use simple font histogram
21
+ pass
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from dataclasses import dataclass
27
+ from pathlib import Path
28
+
29
+ import fitz # PyMuPDF
30
+ import structlog
31
+
32
+ from rnsr.models import SpanInfo, BoundingBox
33
+
34
+ logger = structlog.get_logger(__name__)
35
+
36
+
37
@dataclass
class LayoutComplexity:
    """Outcome of a layout complexity analysis for one document.

    Every field has a default, so a freshly constructed instance describes
    a simple document for which no visual analysis is requested.
    """

    # --- Detection flags -------------------------------------------------
    has_multi_column: bool = False  # multi-column layout was detected
    has_empty_pages: bool = False  # empty / image-only pages were detected
    has_complex_wrapping: bool = False  # irregular text wrapping was detected

    # --- Metrics ---------------------------------------------------------
    avg_columns_per_page: float = 1.0  # estimated columns per analysed page
    empty_page_ratio: float = 0.0  # fraction of pages judged empty
    bbox_overlap_score: float = 0.0  # mean bounding-box overlap score

    # --- Overall assessment ----------------------------------------------
    complexity_score: float = 0.0  # 0.0 (simple) to 1.0 (complex)
    needs_visual_analysis: bool = False  # whether visual analysis is advised

    # --- Reasoning -------------------------------------------------------
    reason: str = ""  # human-readable explanation of the assessment
57
+
58
+
59
def detect_multi_column(spans: list[SpanInfo], page_height: float) -> bool:
    """
    Decide whether the spans of one page form a multi-column layout.

    Spans are bucketed into horizontal bands of fixed height using the top
    edge of their bounding box. A band counts as multi-column when two
    neighbouring spans in it (ordered by left edge) are separated by a
    horizontal gap of more than 50 units. The page is reported as
    multi-column when more than 30% of all bands qualify; bands holding a
    single span count towards the total but can never qualify.

    Args:
        spans: List of text spans from a page.
        page_height: Height of the page. Not read by this implementation;
            it is part of the established signature.

    Returns:
        True if multi-column layout detected. Always False when fewer than
        10 spans are supplied.
    """
    # Too little text to make a meaningful call.
    if len(spans) < 10:
        return False

    band_height = 20.0  # Approximate line height
    bands: dict[int, list[SpanInfo]] = {}
    for item in spans:
        bands.setdefault(int(item.bbox.y0 / band_height), []).append(item)

    def has_wide_gap(band: list[SpanInfo]) -> bool:
        # Compare each span with its right-hand neighbour; a band with a
        # single span yields no pairs and therefore no gap.
        ordered = sorted(band, key=lambda s: s.bbox.x0)
        return any(
            right.bbox.x0 - left.bbox.x1 > 50
            for left, right in zip(ordered, ordered[1:])
        )

    # At least 10 spans were grouped above, so there is at least one band.
    band_count = len(bands)
    wide_gap_bands = sum(1 for band in bands.values() if has_wide_gap(band))

    ratio = wide_gap_bands / band_count
    is_multi_column = ratio > 0.3  # Threshold: >30% of bands

    logger.debug(
        "multi_column_detection",
        multi_column_rows=wide_gap_bands,
        total_rows=band_count,
        ratio=ratio,
        result=is_multi_column,
    )

    return is_multi_column
126
+
127
+
128
def detect_empty_pages(pdf_path: Path | str, min_text_threshold: int = 10) -> tuple[bool, float]:
    """
    Detect if document has empty or image-only pages.

    A page is treated as empty when its extracted text contains fewer than
    ``min_text_threshold`` whitespace-separated words. The document as a
    whole is flagged when more than 10% of its pages are empty.

    Args:
        pdf_path: Path to PDF file.
        min_text_threshold: Minimum word count to consider page non-empty.

    Returns:
        Tuple of (has_empty_pages, empty_page_ratio). ``(False, 0.0)`` is
        returned when the file cannot be opened or has no pages.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logger.error("pdf_open_failed", path=str(pdf_path), error=str(e))
        return False, 0.0

    empty_pages = 0

    # Always release the document handle, even if text extraction raises
    # part-way through the page loop.
    try:
        total_pages = len(doc)

        for page in doc:
            # str.split() with no arguments already ignores leading and
            # trailing whitespace, so no separate strip() is needed.
            word_count = len(page.get_text().split())  # type: ignore[union-attr]

            if word_count < min_text_threshold:
                empty_pages += 1
                logger.debug("empty_page_detected", page_num=page.number, words=word_count)
    finally:
        doc.close()

    if total_pages == 0:
        return False, 0.0

    ratio = empty_pages / total_pages
    has_empty = ratio > 0.1  # >10% pages empty

    logger.info(
        "empty_page_detection",
        empty_pages=empty_pages,
        total_pages=total_pages,
        ratio=ratio,
        result=has_empty,
    )

    return has_empty, ratio
173
+
174
+
175
def calculate_bbox_overlap_score(spans: list[SpanInfo]) -> float:
    """
    Score how often spans share a vertical range while sitting far apart.

    Every unordered pair of spans is examined. A pair is counted when the
    two Y ranges intersect (touching edges count as intersecting) and the
    horizontal centres of the two boxes are more than 100 units apart.
    The score is the counted pairs divided by all pairs.

    High overlap suggests complex wrapping or multi-column layout.

    Args:
        spans: List of text spans.

    Returns:
        Overlap score from 0.0 (no overlap) to 1.0 (high overlap); 0.0 when
        fewer than two spans are given.
    """
    span_count = len(spans)
    if span_count < 2:
        return 0.0

    # Number of unordered pairs among span_count items.
    pair_total = span_count * (span_count - 1) // 2
    separated_pairs = 0

    for idx, anchor in enumerate(spans):
        for other in spans[idx + 1:]:
            # Skip pairs whose vertical ranges are disjoint.
            if anchor.bbox.y1 < other.bbox.y0 or other.bbox.y1 < anchor.bbox.y0:
                continue

            anchor_mid = (anchor.bbox.x0 + anchor.bbox.x1) / 2
            other_mid = (other.bbox.x0 + other.bbox.x1) / 2

            # Same vertical range but clearly separated horizontally.
            if abs(anchor_mid - other_mid) > 100:
                separated_pairs += 1

    score = separated_pairs / pair_total
    logger.debug("bbox_overlap_calculated", score=score, overlaps=separated_pairs)
    return score
217
+
218
+
219
def _collect_page_spans(page: fitz.Page, page_num: int) -> list[SpanInfo]:
    """Build SpanInfo records for every text span on ``page`` with 2+ characters."""
    page_dict = page.get_text("dict")  # type: ignore[assignment]
    collected: list[SpanInfo] = []

    for block in page_dict.get("blocks", []):  # type: ignore[union-attr]
        # Blocks without a "lines" entry carry no text spans.
        if "lines" not in block:
            continue

        for line in block["lines"]:
            for span in line["spans"]:
                text = str(span.get("text", "")).strip()
                if len(text) < 2:
                    continue

                bbox = span.get("bbox", [0, 0, 0, 0])

                collected.append(SpanInfo(
                    text=text,
                    font_size=float(span.get("size", 12.0)),
                    font_name=str(span.get("font", "Unknown")),
                    is_bold=False,
                    is_italic=False,
                    bbox=BoundingBox(
                        x0=float(bbox[0]),
                        y0=float(bbox[1]),
                        x1=float(bbox[2]),
                        y1=float(bbox[3]),
                    ),
                    page_num=page_num,
                ))

    return collected


def detect_layout_complexity(
    pdf_path: Path | str,
    threshold: float = 0.3,
) -> LayoutComplexity:
    """
    Analyze document layout to determine if visual analysis is needed.

    The whole document is checked for empty/image-only pages, and the first
    (up to) three pages are checked for multi-column layout and bounding-box
    overlap. Each flag that fires contributes a fixed weight (multi-column
    0.5, empty pages 0.3, complex wrapping 0.4); the complexity score is the
    mean of the contributing weights.

    Args:
        pdf_path: Path to PDF file.
        threshold: Complexity threshold (0.0-1.0) for triggering visual
            analysis. The score must be strictly greater than this value.

    Returns:
        LayoutComplexity object with analysis results. If the PDF cannot be
        opened, a default (simple) result is returned.

    Example:
        complexity = detect_layout_complexity("report.pdf")

        if complexity.needs_visual_analysis:
            print(f"Reason: {complexity.reason}")
            # Use LayoutLM + XY-Cut
        else:
            # Use simple font histogram
            pass
    """
    pdf_path = Path(pdf_path)

    logger.info("detecting_layout_complexity", path=pdf_path.name)

    # Initialize result
    result = LayoutComplexity()

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logger.error("pdf_open_failed", error=str(e))
        return result

    multi_column_pages = 0
    total_overlap_score = 0.0

    # Close the document even if analysis of a page raises.
    try:
        # 1. Check for empty pages
        has_empty, empty_ratio = detect_empty_pages(pdf_path)
        result.has_empty_pages = has_empty
        result.empty_page_ratio = empty_ratio

        # 2. Analyze first few pages for multi-column and overlap
        pages_to_check = min(3, len(doc))

        for page_num in range(pages_to_check):
            page = doc[page_num]
            spans = _collect_page_spans(page, page_num)

            # Check multi-column
            if detect_multi_column(spans, page.rect.height):
                multi_column_pages += 1

            # Calculate overlap
            total_overlap_score += calculate_bbox_overlap_score(spans)
    finally:
        doc.close()

    # 3. Calculate metrics
    if pages_to_check > 0:
        # A single-column page counts as 1 column and a multi-column page is
        # estimated as 2, so the average runs from 1.0 to 2.0.
        result.avg_columns_per_page = 1.0 + multi_column_pages / pages_to_check
        result.bbox_overlap_score = total_overlap_score / pages_to_check

        # Require two multi-column pages, but never more pages than were
        # actually analysed; otherwise a one-page document could never
        # satisfy the page-count criterion.
        required_pages = min(2, pages_to_check)
        result.has_multi_column = (
            multi_column_pages >= required_pages
            or result.bbox_overlap_score > 0.2
        )
        result.has_complex_wrapping = result.bbox_overlap_score > 0.3

    # 4. Calculate overall complexity score
    complexity_factors = []

    if result.has_multi_column:
        complexity_factors.append(0.5)
    if result.has_empty_pages:
        complexity_factors.append(0.3)
    if result.has_complex_wrapping:
        complexity_factors.append(0.4)

    # Plain mean of the weights of the flags that fired.
    # NOTE(review): a document whose only flag is has_empty_pages scores
    # exactly 0.3, which does not exceed the default threshold of 0.3 -
    # confirm whether image-only documents are meant to trigger here.
    if complexity_factors:
        result.complexity_score = sum(complexity_factors) / len(complexity_factors)

    # 5. Determine if visual analysis needed
    result.needs_visual_analysis = result.complexity_score > threshold

    # 6. Generate reasoning
    reasons = []
    if result.has_multi_column:
        reasons.append("multi-column layout detected")
    if result.has_empty_pages:
        reasons.append(f"{result.empty_page_ratio:.0%} pages are empty/image-only")
    if result.has_complex_wrapping:
        reasons.append("complex text wrapping detected")

    result.reason = "; ".join(reasons) if reasons else "simple single-column layout"

    logger.info(
        "layout_complexity_detected",
        score=result.complexity_score,
        needs_visual=result.needs_visual_analysis,
        reason=result.reason,
    )

    return result