rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,555 @@
1
+ """
2
+ Recursive XY-Cut Algorithm - Visual-Geometric Segmentation
3
+
4
+ Implements the Recursive XY-Cut (RXYC) algorithm from Section 4.1.1:
5
+ "A top-down page segmentation technique that is particularly effective
6
+ for discovering document structure without relying on text content."
7
+
8
+ The algorithm:
9
+ 1. Treats document page as a binary image
10
+ 2. Calculates projection profiles (sum of black pixels) along X and Y axes
11
+ 3. Identifies "valleys" (whitespace gaps) as natural separators
12
+ 4. Recursively cuts at widest valleys to produce a tree of bounding boxes
13
+ 5. Larger boxes (detected early) = major structural elements
14
+ 6. Smaller, deeply nested boxes = paragraphs/cells
15
+
16
+ Use this for:
17
+ - Multi-column layouts
18
+ - Complex L-shaped text wraps
19
+ - Documents with visual structure but no font variance
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from dataclasses import dataclass, field
25
+ from pathlib import Path
26
+ from typing import Any
27
+
28
+ import numpy as np
29
+ import structlog
30
+
31
+ logger = structlog.get_logger(__name__)
32
+
33
+
34
@dataclass
class BoundingRegion:
    """
    Axis-aligned rectangle on a page.

    ``(x0, y0)`` is one corner and ``(x1, y1)`` the opposite one;
    ``page_num`` records which page the rectangle belongs to.
    """

    x0: float
    y0: float
    x1: float
    y1: float
    page_num: int = 0

    @property
    def width(self) -> float:
        """Horizontal extent (``x1 - x0``)."""
        return self.x1 - self.x0

    @property
    def height(self) -> float:
        """Vertical extent (``y1 - y0``)."""
        return self.y1 - self.y0

    @property
    def area(self) -> float:
        """Product of ``width`` and ``height``."""
        return self.width * self.height

    def contains(self, other: "BoundingRegion") -> bool:
        """Return True when ``other`` lies inside this region (edges may touch)."""
        starts_inside = self.x0 <= other.x0 and self.y0 <= other.y0
        ends_inside = self.x1 >= other.x1 and self.y1 >= other.y1
        return starts_inside and ends_inside
64
+
65
+
66
@dataclass
class SegmentNode:
    """
    One node of the XY-Cut segmentation tree.

    A node owns a rectangular ``region`` and the sub-regions it was split
    into (``children``). ``depth`` is the distance from the page root.
    """

    region: BoundingRegion
    children: list["SegmentNode"] = field(default_factory=list)
    text: str = ""
    node_type: str = "region"  # "region", "text_block", "header", "body"
    depth: int = 0

    @property
    def is_leaf(self) -> bool:
        """True when the node has no children."""
        return not self.children
79
+
80
+
81
class RecursiveXYCutter:
    """
    Implements the Recursive XY-Cut algorithm for document segmentation.

    Per Section 6.2 of the research paper:
    "A major failure mode of simple parsing is complex layouts
    (e.g., a figure spanning two columns, or an L-shaped text wrap).
    The Recursive XY-Cut handles this."

    The page is rendered to a binary ink mask; each region is split at its
    widest whitespace valley (horizontal cuts are tried before vertical
    ones) and the two halves are processed recursively.
    """

    def __init__(
        self,
        min_gap_ratio: float = 0.02,  # Minimum gap as ratio of page dimension
        min_region_ratio: float = 0.01,  # Minimum region size ratio
        max_depth: int = 10,  # Maximum recursion depth
        valley_threshold: float = 0.1,  # Threshold for valley detection
    ):
        """
        Initialize the XY-Cutter.

        Args:
            min_gap_ratio: Minimum whitespace gap size as ratio of page size.
            min_region_ratio: Minimum region size to consider.
            max_depth: Maximum recursion depth.
            valley_threshold: Threshold for detecting valleys in projection
                (fraction of the projection's peak value).
        """
        self.min_gap_ratio = min_gap_ratio
        self.min_region_ratio = min_region_ratio
        self.max_depth = max_depth
        self.valley_threshold = valley_threshold

    def segment_pdf(self, pdf_path: Path | str) -> list[SegmentNode]:
        """
        Segment all pages of a PDF using XY-Cut.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            List of SegmentNode trees (one per page). Only geometry is
            computed here; ``SegmentNode.text`` is left empty.
        """
        import fitz

        page_trees: list[SegmentNode] = []

        # The context manager closes the document even when a page fails
        # to render or segment.
        with fitz.open(Path(pdf_path)) as doc:
            for page_num, page in enumerate(doc):
                logger.debug("segmenting_page", page=page_num)
                page_trees.append(self.segment_page(page, page_num))

        logger.info(
            "xy_cut_complete",
            pages=len(page_trees),
            total_regions=sum(self._count_nodes(t) for t in page_trees),
        )

        return page_trees

    def segment_page(self, page: Any, page_num: int = 0) -> SegmentNode:
        """
        Segment a single page using XY-Cut.

        Args:
            page: A fitz.Page object.
            page_num: Page number for metadata.

        Returns:
            Root SegmentNode with hierarchy of regions.
        """
        import fitz

        page_width = page.rect.width
        page_height = page.rect.height

        # The root covers the full page.
        root = SegmentNode(
            region=BoundingRegion(
                x0=0,
                y0=0,
                x1=page_width,
                y1=page_height,
                page_num=page_num,
            ),
            depth=0,
        )

        # Render at 1x (72 dpi) for speed; projections do not need detail.
        pix = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

        # Use the pixmap's own channel count instead of assuming RGB.
        samples = np.frombuffer(pix.samples, dtype=np.uint8)
        gray = samples.reshape(pix.height, pix.width, pix.n).mean(axis=2)

        # Binarize (invert so ink = 1, background = 0).
        binary = (gray < 240).astype(np.float32)

        # Thresholds are derived from the page size in points and applied
        # to pixel indices; the two coincide because of the 1x render.
        min_gap_x = int(page_width * self.min_gap_ratio)
        min_gap_y = int(page_height * self.min_gap_ratio)
        min_region_w = int(page_width * self.min_region_ratio)
        min_region_h = int(page_height * self.min_region_ratio)

        self._recursive_cut(
            binary, root,
            0, 0, pix.width, pix.height,
            min_gap_x, min_gap_y,
            min_region_w, min_region_h,
            page_width / pix.width,  # Pixel -> page scale factors
            page_height / pix.height,
        )

        return root

    def _recursive_cut(
        self,
        binary: np.ndarray,
        parent: SegmentNode,
        x0: int, y0: int, x1: int, y1: int,
        min_gap_x: int, min_gap_y: int,
        min_region_w: int, min_region_h: int,
        scale_x: float, scale_y: float,
    ) -> None:
        """
        Recursively cut the pixel rectangle ``[x0, x1) x [y0, y1)``.

        Per the research paper algorithm:
        1. Calculate projection profiles
        2. Find valleys (gaps of whitespace)
        3. Split horizontally first (Y-cut), then vertically (X-cut)
        4. Recurse on sub-regions

        Children are appended to ``parent``. When no cut is possible the
        parent is marked as a ``"text_block"`` leaf.

        Args:
            binary: Full-page ink mask (1 = ink, 0 = background).
            parent: Node that represents the rectangle being cut.
            x0, y0, x1, y1: Rectangle bounds in pixel coordinates.
            min_gap_x, min_gap_y: Minimum valley width per axis, in pixels.
            min_region_w, min_region_h: Minimum region size, in pixels.
            scale_x, scale_y: Pixel-to-page coordinate scale factors.
        """
        if parent.depth >= self.max_depth:
            return

        if x1 - x0 < min_region_w or y1 - y0 < min_region_h:
            return

        region_pixels = binary[y0:y1, x0:x1]
        if region_pixels.size == 0:
            return

        params = (
            min_gap_x, min_gap_y,
            min_region_w, min_region_h,
            scale_x, scale_y,
        )

        # Horizontal cut first (Y-cut - splits top/bottom). The projection
        # is indexed relative to the slice, so the helper adds y0 back to
        # obtain an absolute page coordinate.
        cut_y = self._widest_valley_cut(
            region_pixels.sum(axis=1), min_gap_y, y0
        )
        if cut_y is not None:
            if cut_y - y0 > min_region_h:
                self._add_child(binary, parent, (x0, y0, x1, cut_y), params)
            if y1 - cut_y > min_region_h:
                self._add_child(binary, parent, (x0, cut_y, x1, y1), params)
            return

        # No horizontal cut found - try vertical (X-cut - splits columns).
        cut_x = self._widest_valley_cut(
            region_pixels.sum(axis=0), min_gap_x, x0
        )
        if cut_x is not None:
            if cut_x - x0 > min_region_w:
                self._add_child(binary, parent, (x0, y0, cut_x, y1), params)
            if x1 - cut_x > min_region_w:
                self._add_child(binary, parent, (cut_x, y0, x1, y1), params)
            return

        # No cuts possible - this is a leaf (text block).
        parent.node_type = "text_block"

    def _widest_valley_cut(
        self,
        projection: np.ndarray,
        min_gap: int,
        offset: int,
    ) -> int | None:
        """
        Return the absolute cut position for the widest valley, if any.

        Args:
            projection: 1D projection profile of the current region.
            min_gap: Minimum valley width to consider.
            offset: Absolute coordinate of ``projection[0]``.

        Returns:
            ``offset`` plus the midpoint of the widest valley, or ``None``
            when the projection has no valley of at least ``min_gap``.
        """
        valleys = self._find_valleys(projection, min_gap)
        if not valleys:
            return None
        start, end = max(valleys, key=lambda v: v[1] - v[0])
        return offset + (start + end) // 2

    def _add_child(
        self,
        binary: np.ndarray,
        parent: SegmentNode,
        box: tuple[int, int, int, int],
        params: tuple[int, int, int, int, float, float],
    ) -> None:
        """Attach a child node for pixel rectangle ``box`` and cut it further."""
        bx0, by0, bx1, by1 = box
        scale_x, scale_y = params[4], params[5]
        child = SegmentNode(
            region=BoundingRegion(
                x0=bx0 * scale_x,
                y0=by0 * scale_y,
                x1=bx1 * scale_x,
                y1=by1 * scale_y,
                page_num=parent.region.page_num,
            ),
            depth=parent.depth + 1,
        )
        parent.children.append(child)
        self._recursive_cut(binary, child, bx0, by0, bx1, by1, *params)

    def _find_valleys(
        self,
        projection: np.ndarray,
        min_gap: int,
    ) -> list[tuple[int, int]]:
        """
        Find valleys (whitespace gaps) in a projection profile.

        A valley is a contiguous run where the projection, normalized by
        its peak, is below ``valley_threshold``.

        Args:
            projection: 1D array of projection values.
            min_gap: Minimum gap size to consider.

        Returns:
            List of (start, end) tuples for each valley, with ``end``
            exclusive and indices relative to ``projection``. Empty when
            the projection is empty or entirely zero.
        """
        if len(projection) == 0:
            return []

        peak = np.max(projection)
        if peak == 0:
            return []

        below = (projection / peak) < self.valley_threshold

        # Pad with zeros so every run has both a rising and a falling
        # edge, including runs that touch either end of the profile.
        edges = np.diff(np.concatenate(([0], below.astype(np.int8), [0])))
        starts = np.flatnonzero(edges == 1)
        ends = np.flatnonzero(edges == -1)

        return [
            (int(start), int(end))
            for start, end in zip(starts, ends)
            if end - start >= min_gap
        ]

    def _count_nodes(self, node: SegmentNode) -> int:
        """Count total nodes in a tree (the node itself plus descendants)."""
        return 1 + sum(self._count_nodes(c) for c in node.children)

    def extract_text_in_regions(
        self,
        page: Any,
        root: SegmentNode,
    ) -> None:
        """
        Extract text content for each leaf region.

        Leaf nodes have their ``text`` attribute filled in place.

        Args:
            page: A fitz.Page object.
            root: Root SegmentNode from segment_page().
        """
        self._extract_text_recursive(page, root)

    def _extract_text_recursive(
        self,
        page: Any,
        node: SegmentNode,
    ) -> None:
        """Recursively extract text for leaf nodes."""
        import fitz

        if not node.is_leaf:
            for child in node.children:
                self._extract_text_recursive(page, child)
            return

        # Clip text extraction to this leaf's rectangle.
        region = node.region
        clip = fitz.Rect(region.x0, region.y0, region.x1, region.y1)
        node.text = page.get_text("text", clip=clip).strip()
416
+
417
+
418
def segment_pdf_with_xy_cut(pdf_path: Path | str) -> list[SegmentNode]:
    """
    Convenience function to segment a PDF using XY-Cut with default settings.

    Only the geometry of each region is computed. ``segment_pdf`` does not
    populate ``SegmentNode.text``; call
    ``RecursiveXYCutter.extract_text_in_regions`` on each page tree when the
    text is needed.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of SegmentNode trees (one per page).

    Example:
        trees = segment_pdf_with_xy_cut("document.pdf")
        for page_tree in trees:
            for leaf in get_leaves(page_tree):
                print(leaf.node_type, leaf.region)
    """
    return RecursiveXYCutter().segment_pdf(pdf_path)
436
+
437
+
438
def get_leaves(node: SegmentNode) -> list[SegmentNode]:
    """Collect the leaf nodes under ``node`` in left-to-right tree order."""
    found = []
    pending = [node]
    while pending:
        current = pending.pop()
        if current.is_leaf:
            found.append(current)
        else:
            # Push in reverse so the first child is popped (visited) first.
            pending.extend(reversed(current.children))
    return found
447
+
448
+
449
def analyze_document_with_xycut(
    pdf_path: Path | str,
    use_layoutlm: bool = True,
) -> Any:
    """
    Analyze document using XY-Cut + LayoutLM visual classification.

    Combines geometric segmentation (XY-Cut) with visual analysis (LayoutLM)
    to create a document tree. Every non-empty leaf region becomes one
    level-1 child of the root: leaves labelled "header" or "title" use their
    text as the header, all other leaves get a synthetic header and keep
    their text as content.

    Args:
        pdf_path: Path to PDF file.
        use_layoutlm: Use LayoutLM to classify block types (Header/Body/Title).
            Classification is best-effort: any failure is logged as a
            warning and the XY-Cut node types are kept.

    Returns:
        DocumentTree with visually-detected structure.
    """
    import fitz

    from rnsr.ingestion.semantic_fallback import _generate_synthetic_header
    from rnsr.models import DocumentNode, DocumentTree

    pdf_path = Path(pdf_path)

    # Segment with XY-Cut
    cutter = RecursiveXYCutter()
    page_trees = cutter.segment_pdf(pdf_path)

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        # Extract text for each region
        for page_num, tree in enumerate(page_trees):
            cutter.extract_text_in_regions(doc[page_num], tree)

        # Optionally classify blocks with LayoutLM
        if use_layoutlm:
            try:
                # Imported inside the try so a missing optional dependency
                # degrades to plain XY-Cut instead of failing the analysis.
                from rnsr.ingestion.layout_model import classify_layout_blocks
                from PIL import Image

                for page_num, tree in enumerate(page_trees):
                    # Render page as image
                    pix = doc[page_num].get_pixmap(dpi=150)
                    img = Image.frombytes(
                        "RGB", [pix.width, pix.height], pix.samples
                    )

                    # NOTE(review): leaf regions are in 72-dpi page units
                    # while this image is rendered at 150 dpi - confirm
                    # that classify_layout_blocks rescales the boxes.
                    leaves = get_leaves(tree)
                    bboxes = [
                        (
                            leaf.region.x0,
                            leaf.region.y0,
                            leaf.region.x1,
                            leaf.region.y1,
                        )
                        for leaf in leaves
                    ]
                    texts = [leaf.text for leaf in leaves]

                    if bboxes:
                        results = classify_layout_blocks(img, bboxes, texts)

                        # Update node types based on classification
                        for leaf, result in zip(leaves, results):
                            leaf.node_type = result["label"].lower()

            except Exception as e:
                logger.warning("layoutlm_classification_failed", error=str(e))

    # Convert to DocumentTree
    root = DocumentNode(id="root", level=0, header=pdf_path.stem)

    section_num = 0
    for page_tree in page_trees:
        for leaf in get_leaves(page_tree):
            stripped = leaf.text.strip()
            if not stripped:
                continue

            section_num += 1
            section_id = f"sec_{section_num:03d}"

            # Header vs. body is decided by the (LayoutLM) node type.
            if leaf.node_type in ("header", "title"):
                section = DocumentNode(
                    id=section_id,
                    level=1,
                    header=stripped,
                    page_num=leaf.region.page_num,
                )
            else:
                # Body node with a synthetic header
                section = DocumentNode(
                    id=section_id,
                    level=1,
                    header=_generate_synthetic_header(leaf.text, section_num),
                    content=leaf.text,
                    page_num=leaf.region.page_num,
                )

            root.children.append(section)

    return DocumentTree(
        title=pdf_path.stem,
        root=root,
        total_nodes=section_num + 1,  # sections plus the root
        ingestion_tier=1,
        ingestion_method="xy_cut_layoutlm" if use_layoutlm else "xy_cut",
    )
554
+
555
+