rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/ingestion/xy_cut.py
ADDED
|
@@ -0,0 +1,555 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Recursive XY-Cut Algorithm - Visual-Geometric Segmentation
|
|
3
|
+
|
|
4
|
+
Implements the Recursive XY-Cut (RXYC) algorithm from Section 4.1.1:
|
|
5
|
+
"A top-down page segmentation technique that is particularly effective
|
|
6
|
+
for discovering document structure without relying on text content."
|
|
7
|
+
|
|
8
|
+
The algorithm:
|
|
9
|
+
1. Treats document page as a binary image
|
|
10
|
+
2. Calculates projection profiles (sum of black pixels) along X and Y axes
|
|
11
|
+
3. Identifies "valleys" (whitespace gaps) as natural separators
|
|
12
|
+
4. Recursively cuts at widest valleys to produce a tree of bounding boxes
|
|
13
|
+
5. Larger boxes (detected early) = major structural elements
|
|
14
|
+
6. Smaller, deeply nested boxes = paragraphs/cells
|
|
15
|
+
|
|
16
|
+
Use this for:
|
|
17
|
+
- Multi-column layouts
|
|
18
|
+
- Complex L-shaped text wraps
|
|
19
|
+
- Documents with visual structure but no font variance
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
import numpy as np
|
|
29
|
+
import structlog
|
|
30
|
+
|
|
31
|
+
logger = structlog.get_logger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class BoundingRegion:
    """Axis-aligned rectangle on a single page.

    The rectangle is described by two corners, ``(x0, y0)`` and
    ``(x1, y1)``; ``page_num`` records which page it belongs to.
    """

    # First corner of the rectangle.
    x0: float
    y0: float
    # Opposite corner of the rectangle.
    x1: float
    y1: float
    # Page the rectangle was found on (defaults to the first page).
    page_num: int = 0

    @property
    def width(self) -> float:
        """Horizontal extent, computed as ``x1 - x0``."""
        horizontal_span = self.x1 - self.x0
        return horizontal_span

    @property
    def height(self) -> float:
        """Vertical extent, computed as ``y1 - y0``."""
        vertical_span = self.y1 - self.y0
        return vertical_span

    @property
    def area(self) -> float:
        """Surface of the rectangle (``width * height``)."""
        return self.width * self.height

    def contains(self, other: "BoundingRegion") -> bool:
        """Return True when ``other`` lies entirely inside this region.

        Edges are inclusive, so a region contains itself. Only the four
        coordinates are compared; ``page_num`` is not consulted.
        """
        starts_inside = self.x0 <= other.x0 and self.y0 <= other.y0
        ends_inside = self.x1 >= other.x1 and self.y1 >= other.y1
        return starts_inside and ends_inside
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass
class SegmentNode:
    """One node of the tree produced by XY-Cut segmentation.

    Every node owns a rectangular ``region``; nodes that were split further
    hold the resulting pieces in ``children``.
    """

    # Rectangle covered by this node.
    region: BoundingRegion
    # Sub-regions produced by cutting this node (empty for a leaf).
    children: list["SegmentNode"] = field(default_factory=list)
    # Text found inside the region; stays empty until it is filled in.
    text: str = ""
    # One of "region", "text_block", "header", "body".
    node_type: str = "region"
    # Distance from the page root (the root itself is 0).
    depth: int = 0

    @property
    def is_leaf(self) -> bool:
        """True when the node has no children."""
        return not self.children
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class RecursiveXYCutter:
    """
    Implements the Recursive XY-Cut algorithm for document segmentation.

    Per Section 6.2 of the research paper:
    "A major failure mode of simple parsing is complex layouts
    (e.g., a figure spanning two columns, or an L-shaped text wrap).
    The Recursive XY-Cut handles this."

    A page is rendered to a binary ink mask, projection profiles are taken
    along both axes, and the region is split at the widest whitespace
    valley. Horizontal (top/bottom) cuts are tried before vertical
    (left/right) cuts, and the procedure recurses on each half.
    """

    def __init__(
        self,
        min_gap_ratio: float = 0.02,  # Minimum gap as ratio of page dimension
        min_region_ratio: float = 0.01,  # Minimum region size ratio
        max_depth: int = 10,  # Maximum recursion depth
        valley_threshold: float = 0.1,  # Threshold for valley detection
    ):
        """
        Initialize the XY-Cutter.

        Args:
            min_gap_ratio: Minimum whitespace gap size as ratio of page size.
            min_region_ratio: Minimum region size to consider.
            max_depth: Maximum recursion depth.
            valley_threshold: Threshold for detecting valleys in projection
                (compared against the profile normalized by its maximum).
        """
        self.min_gap_ratio = min_gap_ratio
        self.min_region_ratio = min_region_ratio
        self.max_depth = max_depth
        self.valley_threshold = valley_threshold

    def segment_pdf(self, pdf_path: Path | str) -> list[SegmentNode]:
        """
        Segment all pages of a PDF using XY-Cut.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            List of SegmentNode trees (one per page).
        """
        import fitz

        pdf_path = Path(pdf_path)
        doc = fitz.open(pdf_path)

        page_trees = []
        try:
            for page_num, page in enumerate(doc):
                logger.debug("segmenting_page", page=page_num)
                page_trees.append(self.segment_page(page, page_num))
        finally:
            # Release the document even when a page fails to segment.
            doc.close()

        logger.info(
            "xy_cut_complete",
            pages=len(page_trees),
            total_regions=sum(self._count_nodes(t) for t in page_trees),
        )

        return page_trees

    def segment_page(self, page: Any, page_num: int = 0) -> SegmentNode:
        """
        Segment a single page using XY-Cut.

        Args:
            page: A fitz.Page object.
            page_num: Page number for metadata.

        Returns:
            Root SegmentNode with hierarchy of regions.
        """
        import fitz

        # Get page dimensions
        rect = page.rect
        page_width = rect.width
        page_height = rect.height

        # Create initial region (full page)
        root_region = BoundingRegion(
            x0=0, y0=0,
            x1=page_width, y1=page_height,
            page_num=page_num,
        )

        # Render page to pixmap for projection analysis.
        # Identity matrix keeps the render at 1x for speed.
        mat = fitz.Matrix(1, 1)
        pix = page.get_pixmap(matrix=mat, alpha=False)

        # Convert to numpy array and average the three channels to grayscale
        img = np.frombuffer(pix.samples, dtype=np.uint8)
        img = img.reshape(pix.height, pix.width, 3)
        gray = np.mean(img, axis=2)

        # Binarize (invert so text = 1, background = 0)
        binary = (gray < 240).astype(np.float32)

        # Calculate minimum dimensions
        min_gap_x = int(page_width * self.min_gap_ratio)
        min_gap_y = int(page_height * self.min_gap_ratio)
        min_region_w = int(page_width * self.min_region_ratio)
        min_region_h = int(page_height * self.min_region_ratio)

        # Recursive cut
        root = SegmentNode(region=root_region, depth=0)
        self._recursive_cut(
            binary, root,
            0, 0, pix.width, pix.height,
            min_gap_x, min_gap_y,
            min_region_w, min_region_h,
            page_width / pix.width,  # Scale factor: pixels -> page units
            page_height / pix.height,
        )

        return root

    def _recursive_cut(
        self,
        binary: np.ndarray,
        parent: SegmentNode,
        x0: int, y0: int, x1: int, y1: int,
        min_gap_x: int, min_gap_y: int,
        min_region_w: int, min_region_h: int,
        scale_x: float, scale_y: float,
    ) -> None:
        """
        Recursively cut a region.

        Per the research paper algorithm:
        1. Calculate projection profiles
        2. Find valleys (gaps of whitespace)
        3. Split horizontally first (Y-cut), then vertically (X-cut)
        4. Recurse on sub-regions

        Args:
            binary: 2D ink mask of the whole page (rows = y, columns = x).
            parent: Node whose region is being split; children are appended
                to it, or it is marked "text_block" when no cut is found.
            x0, y0, x1, y1: Pixel bounds of the region inside ``binary``.
            min_gap_x, min_gap_y: Minimum valley width per axis, in pixels.
            min_region_w, min_region_h: Minimum region size, in pixels.
            scale_x, scale_y: Factors converting pixel coordinates to the
                coordinates stored on the child BoundingRegion.
        """
        if parent.depth >= self.max_depth:
            return

        # Check minimum size
        if (x1 - x0) < min_region_w or (y1 - y0) < min_region_h:
            return

        # Extract region
        region_pixels = binary[y0:y1, x0:x1]
        if region_pixels.size == 0:
            return

        # Parameters that are passed unchanged to every recursive call.
        limits = (
            min_gap_x, min_gap_y,
            min_region_w, min_region_h,
            scale_x, scale_y,
        )

        # Try horizontal cut first (Y-cut - splits top/bottom)
        y_valleys = self._find_valleys(np.sum(region_pixels, axis=1), min_gap_y)
        if y_valleys:
            # Valley indices are relative to the slice, so offset by y0.
            cut_y = self._cut_position(y_valleys, y0)
            if cut_y - y0 > min_region_h:
                self._add_child(binary, parent, x0, y0, x1, cut_y, *limits)
            if y1 - cut_y > min_region_h:
                self._add_child(binary, parent, x0, cut_y, x1, y1, *limits)
            return

        # No horizontal cut found - try vertical (X-cut - splits columns)
        x_valleys = self._find_valleys(np.sum(region_pixels, axis=0), min_gap_x)
        if x_valleys:
            # Valley indices are relative to the slice, so offset by x0.
            cut_x = self._cut_position(x_valleys, x0)
            if cut_x - x0 > min_region_w:
                self._add_child(binary, parent, x0, y0, cut_x, y1, *limits)
            if x1 - cut_x > min_region_w:
                self._add_child(binary, parent, cut_x, y0, x1, y1, *limits)
            return

        # No cuts possible - this is a leaf (text block)
        parent.node_type = "text_block"

    @staticmethod
    def _cut_position(valleys: list[tuple[int, int]], offset: int) -> int:
        """
        Return the absolute cut coordinate for the widest valley.

        Args:
            valleys: Non-empty list of (start, end) index pairs, relative to
                the start of the projected region.
            offset: Absolute coordinate of the region's first row/column.

        Returns:
            Midpoint of the widest valley, shifted by ``offset``. On ties the
            first widest valley wins.
        """
        start, end = max(valleys, key=lambda v: v[1] - v[0])
        return offset + (start + end) // 2

    def _add_child(
        self,
        binary: np.ndarray,
        parent: SegmentNode,
        x0: int, y0: int, x1: int, y1: int,
        min_gap_x: int, min_gap_y: int,
        min_region_w: int, min_region_h: int,
        scale_x: float, scale_y: float,
    ) -> None:
        """Attach a child node for the given pixel bounds and recurse into it."""
        child = SegmentNode(
            region=BoundingRegion(
                x0=x0 * scale_x, y0=y0 * scale_y,
                x1=x1 * scale_x, y1=y1 * scale_y,
                page_num=parent.region.page_num,
            ),
            depth=parent.depth + 1,
        )
        parent.children.append(child)
        self._recursive_cut(
            binary, child,
            x0, y0, x1, y1,
            min_gap_x, min_gap_y,
            min_region_w, min_region_h,
            scale_x, scale_y,
        )

    def _find_valleys(
        self,
        projection: np.ndarray,
        min_gap: int,
    ) -> list[tuple[int, int]]:
        """
        Find valleys (whitespace gaps) in a projection profile.

        A valley is a contiguous run where the projection, divided by its
        maximum, is below ``self.valley_threshold``.

        Args:
            projection: 1D array of projection values.
            min_gap: Minimum gap size to consider.

        Returns:
            List of (start, end) tuples for each valley, with ``end``
            exclusive and indices relative to ``projection``. Empty when the
            projection is empty or entirely zero.
        """
        profile = np.asarray(projection)
        if profile.size == 0:
            return []

        # Normalize projection
        peak = np.max(profile)
        if peak == 0:
            return []

        below = (profile / peak) < self.valley_threshold

        # Pad with False on both sides so every run has a rising and a
        # falling edge; edges then alternate start, end, start, end, ...
        padded = np.concatenate(([False], below, [False]))
        edges = np.flatnonzero(padded[1:] != padded[:-1])

        return [
            (int(start), int(end))
            for start, end in zip(edges[::2], edges[1::2])
            if end - start >= min_gap
        ]

    def _count_nodes(self, node: SegmentNode) -> int:
        """Count total nodes in a tree (the node itself plus descendants)."""
        return 1 + sum(self._count_nodes(c) for c in node.children)

    def extract_text_in_regions(
        self,
        page: Any,
        root: SegmentNode,
    ) -> None:
        """
        Extract text content for each leaf region.

        The text is stored on each leaf's ``text`` attribute; nothing is
        returned.

        Args:
            page: A fitz.Page object.
            root: Root SegmentNode from segment_page().
        """
        self._extract_text_recursive(page, root)

    def _extract_text_recursive(
        self,
        page: Any,
        node: SegmentNode,
    ) -> None:
        """Recursively extract text for leaf nodes."""
        import fitz

        if not node.is_leaf:
            for child in node.children:
                self._extract_text_recursive(page, child)
            return

        # Extract text clipped to this leaf's rectangle
        rect = fitz.Rect(
            node.region.x0,
            node.region.y0,
            node.region.x1,
            node.region.y1,
        )
        node.text = page.get_text("text", clip=rect).strip()
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def segment_pdf_with_xy_cut(pdf_path: Path | str) -> list[SegmentNode]:
    """
    Segment a PDF with a default-configured RecursiveXYCutter.

    Shortcut for building a cutter with its default settings and calling
    ``segment_pdf`` on it.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        List of SegmentNode trees (one per page).

    Example:
        trees = segment_pdf_with_xy_cut("document.pdf")
        for page_tree in trees:
            for leaf in get_leaves(page_tree):
                print(leaf.region)
    """
    return RecursiveXYCutter().segment_pdf(pdf_path)
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
def get_leaves(node: SegmentNode) -> list[SegmentNode]:
    """Collect the leaf nodes under ``node`` in left-to-right order.

    A node that is itself a leaf is returned as a single-element list.
    """
    collected = []
    # Explicit stack; children are pushed reversed so the first child is
    # visited first, giving the same order as a depth-first recursion.
    pending = [node]
    while pending:
        current = pending.pop()
        if current.is_leaf:
            collected.append(current)
        else:
            pending.extend(reversed(current.children))
    return collected
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def analyze_document_with_xycut(
    pdf_path: Path | str,
    use_layoutlm: bool = True,
) -> Any:
    """
    Analyze document using XY-Cut + LayoutLM visual classification.

    Combines geometric segmentation (XY-Cut) with visual analysis (LayoutLM)
    to create a document tree: every non-empty leaf region becomes a level-1
    node under a single root.

    Args:
        pdf_path: Path to PDF file.
        use_layoutlm: Use LayoutLM to classify block types (Header/Body/Title).
            Classification is best-effort: any failure is logged as a
            warning and the leaves keep whatever type they already had.

    Returns:
        DocumentTree with visually-detected structure.
    """
    from rnsr.models import DocumentNode, DocumentTree
    import fitz

    pdf_path = Path(pdf_path)

    # Segment with XY-Cut
    cutter = RecursiveXYCutter()
    page_trees = cutter.segment_pdf(pdf_path)

    doc = fitz.open(pdf_path)
    try:
        # Extract text for each region
        for page_num, tree in enumerate(page_trees):
            cutter.extract_text_in_regions(doc[page_num], tree)

        # Optionally classify blocks with LayoutLM
        if use_layoutlm:
            try:
                from rnsr.ingestion.layout_model import classify_layout_blocks
                from PIL import Image

                for page_num, tree in enumerate(page_trees):
                    # Render page as image
                    # NOTE(review): the image is rendered at 150 dpi while the
                    # bboxes below are the leaf regions passed through
                    # unscaled - confirm classify_layout_blocks expects that.
                    page = doc[page_num]
                    pix = page.get_pixmap(dpi=150)
                    img = Image.frombytes(
                        "RGB", [pix.width, pix.height], pix.samples
                    )

                    # Get all leaf regions
                    leaves = get_leaves(tree)
                    if not leaves:
                        continue

                    bboxes = [
                        (leaf.region.x0, leaf.region.y0,
                         leaf.region.x1, leaf.region.y1)
                        for leaf in leaves
                    ]
                    texts = [leaf.text for leaf in leaves]

                    # Classify with LayoutLM
                    results = classify_layout_blocks(img, bboxes, texts)

                    # Update node types based on classification
                    for leaf, result in zip(leaves, results):
                        leaf.node_type = result["label"].lower()

            except Exception as e:
                # Deliberately broad: classification is optional, so a missing
                # dependency or model error must not abort the analysis.
                logger.warning("layoutlm_classification_failed", error=str(e))
    finally:
        # Always release the document, including when text extraction raises.
        doc.close()

    # Convert to DocumentTree
    root = DocumentNode(id="root", level=0, header=pdf_path.stem)

    section_num = 0
    for page_tree in page_trees:
        for leaf in get_leaves(page_tree):
            stripped = leaf.text.strip()
            if not stripped:
                continue

            section_num += 1

            # Determine if it's a header based on LayoutLM classification
            if leaf.node_type in ("header", "title"):
                # Create header node (the text itself is the header)
                section = DocumentNode(
                    id=f"sec_{section_num:03d}",
                    level=1,
                    header=stripped,
                    page_num=leaf.region.page_num,
                )
            else:
                # Create body node with synthetic header.
                # Imported lazily so it is only required when a body block
                # actually exists.
                from rnsr.ingestion.semantic_fallback import _generate_synthetic_header

                section = DocumentNode(
                    id=f"sec_{section_num:03d}",
                    level=1,
                    header=_generate_synthetic_header(leaf.text, section_num),
                    content=leaf.text,
                    page_num=leaf.region.page_num,
                )

            root.children.append(section)

    return DocumentTree(
        title=pdf_path.stem,
        root=root,
        total_nodes=section_num + 1,  # sections plus the root node
        ingestion_tier=1,
        ingestion_method="xy_cut_layoutlm" if use_layoutlm else "xy_cut",
    )
|
|
554
|
+
|
|
555
|
+
|