rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,662 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Boundary Detection for Multi-Document PDFs
|
|
3
|
+
|
|
4
|
+
This module detects boundaries between separate documents within a single PDF file.
|
|
5
|
+
It analyzes visual and textual signals to identify where one document ends and
|
|
6
|
+
another begins, enabling proper segmentation before tree building.
|
|
7
|
+
|
|
8
|
+
Boundary Detection Signals:
|
|
9
|
+
1. Title Page Patterns: Large font text at top of page (potential new document)
|
|
10
|
+
2. Page Number Resets: Page numbers restarting from 1 (strong signal)
|
|
11
|
+
3. Style Discontinuity: Dramatic font/style changes between pages
|
|
12
|
+
4. Document Type Indicators: Form headers, letterheads, report titles
|
|
13
|
+
5. LLM Validation: Optional LLM review of potential boundaries
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import re
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from statistics import mean, stdev
|
|
22
|
+
|
|
23
|
+
import structlog
|
|
24
|
+
|
|
25
|
+
from rnsr.models import BoundingBox, SpanInfo
|
|
26
|
+
|
|
27
|
+
logger = structlog.get_logger(__name__)

# Regex patterns for page number detection.
# Every pattern captures the numeric page number in group 1 so callers can
# read it uniformly via match.group(1). Patterns are anchored (^...$) and
# matched against the stripped text of a single span.
PAGE_NUMBER_PATTERNS = [
    re.compile(r"^page\s*(\d+)$", re.IGNORECASE),  # "Page 1", "page 1"
    re.compile(r"^(\d+)\s*of\s*\d+$", re.IGNORECASE),  # "1 of 10"
    re.compile(r"^-\s*(\d+)\s*-$"),  # "- 1 -"
    re.compile(r"^(\d+)$"),  # Just "1"
    re.compile(r"^page\s*(\d+)\s*of\s*\d+$", re.IGNORECASE),  # "Page 1 of 10"
]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class DocumentBoundary:
    """Represents a detected document boundary.

    A boundary marks the first PDF page of a new logical document inside a
    multi-document PDF, together with the evidence that triggered detection.
    """

    page_num: int  # Page where new document starts
    confidence: float  # Combined signal confidence, 0.0 - 1.0
    signals: list[str]  # Human-readable tags describing what triggered detection
    title_candidate: str = ""  # Potential document title (largest top-of-page text)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@dataclass
class DocumentSegment:
    """A segment of spans belonging to one logical document.

    Produced by ``DocumentBoundaryDetector.segment_spans``; segments partition
    the input spans page-by-page, so every span appears in exactly one segment.
    """

    doc_index: int  # 0-based position of this document within the PDF
    start_page: int  # First PDF page of the segment (inclusive)
    end_page: int  # Last PDF page of the segment (inclusive)
    spans: list[SpanInfo] = field(default_factory=list)  # All spans on the segment's pages
    title: str = ""  # Title taken from the boundary, or extracted heuristically
    boundary: DocumentBoundary | None = None  # None for the first segment (no boundary precedes it)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class DocumentBoundaryDetector:
    """
    Detects boundaries between documents in a multi-document PDF.

    Works by analyzing:
    - Font size patterns (title pages have larger fonts)
    - Page number resets (page numbers restarting from 1)
    - Style discontinuities between pages
    - Document type indicators
    - Optional LLM validation of boundaries

    Typical usage: ``detect_boundaries`` to find boundaries, then
    ``segment_spans`` to split the span stream into per-document segments.
    """

    def __init__(
        self,
        title_size_ratio: float = 2.0,  # Title must be 2x body size (very conservative)
        min_title_size: float = 18.0,  # Minimum font size for title (larger)
        page_top_fraction: float = 0.15,  # "Top of page" = top 15% only
        min_confidence: float = 0.75,  # High confidence required
        min_signals: int = 2,  # Require at least 2 signals (page reset is strong)
        use_llm_validation: bool = True,  # Use LLM to validate boundaries
    ) -> None:
        self.title_size_ratio = title_size_ratio
        self.min_title_size = min_title_size
        self.page_top_fraction = page_top_fraction
        self.min_confidence = min_confidence
        self.min_signals = min_signals
        self.use_llm_validation = use_llm_validation

    def _extract_page_number(self, spans: list[SpanInfo], page_height: float) -> int | None:
        """
        Extract page number from footer/header of a page.

        Looks for page number patterns in the bottom 15% or top 10% of the page.
        Returns the first match found, or None if no span matches a pattern.
        """
        if not spans:
            return None

        # Look in footer (bottom 15%) and header (top 10%)
        footer_threshold = page_height * 0.85
        header_threshold = page_height * 0.10

        # Get spans from footer and header areas.
        # NOTE(review): assumes bbox y grows downward (y0 = top edge) — the
        # usual PDF-extraction convention; confirm against SpanInfo producers.
        edge_spans = [
            s for s in spans
            if s.bbox.y0 > footer_threshold or s.bbox.y1 < header_threshold
        ]

        for span in edge_spans:
            text = span.text.strip()

            # Try each pattern; all patterns capture the number in group 1.
            for pattern in PAGE_NUMBER_PATTERNS:
                match = pattern.match(text)
                if match:
                    try:
                        return int(match.group(1))
                    except (ValueError, IndexError):
                        continue

        return None

    def _detect_page_number_resets(
        self,
        pages: dict[int, list[SpanInfo]],
        page_heights: dict[int, float] | None,
    ) -> dict[int, bool]:
        """
        Detect pages where page numbers reset to 1 or restart a sequence.

        Returns dict mapping page_num -> True if page number reset detected.
        Pages with no detectable page number simply carry the previous value
        forward (they neither trigger nor block a reset).
        """
        page_nums = sorted(pages.keys())
        resets: dict[int, bool] = {}

        if len(page_nums) < 2:
            return resets

        prev_page_num: int | None = None

        for pdf_page in page_nums:
            page_spans = pages[pdf_page]

            # Estimate page height when not provided (tallest span + margin)
            if page_heights and pdf_page in page_heights:
                page_height = page_heights[pdf_page]
            else:
                max_y = max((s.bbox.y1 for s in page_spans), default=800)
                page_height = max_y * 1.1

            # Extract the printed (in-document) page number for this PDF page
            doc_page_num = self._extract_page_number(page_spans, page_height)

            if doc_page_num is not None:
                # A reset means the printed number is exactly 1 while the
                # previously seen printed number was greater than 1.
                if doc_page_num == 1 and prev_page_num is not None and prev_page_num > 1:
                    resets[pdf_page] = True
                    logger.debug(
                        "page_number_reset_detected",
                        pdf_page=pdf_page,
                        doc_page=doc_page_num,
                        prev_doc_page=prev_page_num,
                    )

                prev_page_num = doc_page_num

        return resets

    def detect_boundaries(
        self,
        spans: list[SpanInfo],
        page_heights: dict[int, float] | None = None,
    ) -> list[DocumentBoundary]:
        """
        Detect document boundaries in a list of spans.

        Args:
            spans: List of SpanInfo from the PDF
            page_heights: Optional dict mapping page_num -> page_height

        Returns:
            List of DocumentBoundary objects (sorted by page number)
        """
        if not spans:
            return []

        # Group spans by page
        pages = self._group_by_page(spans)
        page_nums = sorted(pages.keys())

        if len(page_nums) < 2:
            return []  # Single page, no boundaries to detect

        # Calculate global statistics: the modal font size is treated as
        # body text, used as the baseline for "large title" detection.
        all_sizes = [s.font_size for s in spans]
        body_size = self._estimate_body_size(all_sizes)

        # STEP 1: Detect page number resets (very strong signal)
        page_resets = self._detect_page_number_resets(pages, page_heights)

        logger.info(
            "page_number_resets_detected",
            count=len(page_resets),
            pages=list(page_resets.keys()),
        )

        boundaries: list[DocumentBoundary] = []

        # Analyze each page (skip first - it's always the start)
        for i, page_num in enumerate(page_nums[1:], 1):
            prev_page_num = page_nums[i - 1]

            # Check if this page has a page number reset
            has_page_reset = page_resets.get(page_num, False)

            boundary = self._analyze_page_boundary(
                current_page=pages[page_num],
                prev_page=pages[prev_page_num],
                page_num=page_num,
                body_size=body_size,
                page_heights=page_heights,
                has_page_reset=has_page_reset,
            )

            # _analyze_page_boundary already applies min_confidence; the
            # re-check here is a defensive guard.
            if boundary and boundary.confidence >= self.min_confidence:
                boundaries.append(boundary)
                logger.debug(
                    "boundary_detected",
                    page=page_num,
                    confidence=boundary.confidence,
                    signals=boundary.signals,
                    title=boundary.title_candidate[:50] if boundary.title_candidate else "",
                )

        # STEP 2: Optional LLM validation (may drop false positives)
        if self.use_llm_validation and boundaries:
            boundaries = self._validate_boundaries_with_llm(boundaries, pages)

        logger.info(
            "boundary_detection_complete",
            total_pages=len(page_nums),
            boundaries_found=len(boundaries),
        )

        return boundaries

    def segment_spans(
        self,
        spans: list[SpanInfo],
        boundaries: list[DocumentBoundary],
    ) -> list[DocumentSegment]:
        """
        Split spans into document segments based on detected boundaries.

        Args:
            spans: All spans from the PDF
            boundaries: Detected document boundaries

        Returns:
            List of DocumentSegment, each containing spans for one document
        """
        if not spans:
            return []

        # Sort boundaries by page number
        sorted_boundaries = sorted(boundaries, key=lambda b: b.page_num)

        # Get boundary page numbers for O(1) membership checks
        boundary_pages = {b.page_num for b in sorted_boundaries}

        # Group spans by page
        pages = self._group_by_page(spans)
        page_nums = sorted(pages.keys())

        segments: list[DocumentSegment] = []
        current_segment = DocumentSegment(
            doc_index=0,
            start_page=page_nums[0] if page_nums else 0,
            end_page=page_nums[0] if page_nums else 0,
        )

        for page_num in page_nums:
            page_spans = pages[page_num]

            # Check if this page starts a new document
            if page_num in boundary_pages:
                # Save current segment (skipped if the very first page is
                # itself a boundary and nothing has accumulated yet)
                if current_segment.spans:
                    segments.append(current_segment)

                # Find the boundary for this page
                boundary = next(b for b in sorted_boundaries if b.page_num == page_num)

                # Start new segment
                current_segment = DocumentSegment(
                    doc_index=len(segments),
                    start_page=page_num,
                    end_page=page_num,
                    title=boundary.title_candidate,
                    boundary=boundary,
                )

            # Add page spans to current segment
            current_segment.spans.extend(page_spans)
            current_segment.end_page = page_num

        # Don't forget the last segment
        if current_segment.spans:
            segments.append(current_segment)

        # If no title was detected for first segment, try to extract one
        if segments and not segments[0].title:
            segments[0].title = self._extract_title_from_spans(segments[0].spans)

        logger.info(
            "spans_segmented",
            total_segments=len(segments),
            spans_per_segment=[len(s.spans) for s in segments],
        )

        return segments

    def _group_by_page(self, spans: list[SpanInfo]) -> dict[int, list[SpanInfo]]:
        """Group spans by page number, preserving input order within a page."""
        pages: dict[int, list[SpanInfo]] = {}
        for span in spans:
            if span.page_num not in pages:
                pages[span.page_num] = []
            pages[span.page_num].append(span)
        return pages

    def _estimate_body_size(self, sizes: list[float]) -> float:
        """Estimate body text size (mode of font sizes).

        Falls back to 12.0 when no sizes are available.
        """
        if not sizes:
            return 12.0

        # Round to 1 decimal and find mode
        rounded = [round(s, 1) for s in sizes]
        return max(set(rounded), key=rounded.count)

    def _analyze_page_boundary(
        self,
        current_page: list[SpanInfo],
        prev_page: list[SpanInfo],
        page_num: int,
        body_size: float,
        page_heights: dict[int, float] | None,
        has_page_reset: bool = False,
    ) -> DocumentBoundary | None:
        """
        Analyze whether a page represents a document boundary.

        Collects weighted positive signals, applies a continuity penalty, and
        combines the weights with diminishing returns.

        Returns DocumentBoundary if signals indicate a new document, None otherwise.
        """
        signals: list[str] = []
        confidence_factors: list[float] = []
        title_candidate = ""

        # STRONGEST SIGNAL: Page number reset to 1
        # This is a very reliable indicator of a new document
        if has_page_reset:
            signals.append("page_number_reset")
            confidence_factors.append(0.7)  # Very high weight

        # Get page dimensions
        if page_heights and page_num in page_heights:
            page_height = page_heights[page_num]
        else:
            # Estimate from spans
            max_y = max((s.bbox.y1 for s in current_page), default=800)
            page_height = max_y * 1.1  # Add margin

        top_threshold = page_height * self.page_top_fraction

        # Signal 1: Large text at top of page (title pattern)
        top_spans = [s for s in current_page if s.bbox.y0 < top_threshold]
        if top_spans:
            max_top_size = max(s.font_size for s in top_spans)

            # Check for title-like text
            if max_top_size >= body_size * self.title_size_ratio:
                signals.append(f"large_title_font_{max_top_size:.1f}pt")
                confidence_factors.append(0.4)  # Reduced from 0.7 - needs other signals

                # Get the title text (exact float-equality match on the max size)
                title_spans = [s for s in top_spans if s.font_size == max_top_size]
                if title_spans:
                    title_candidate = " ".join(s.text.strip() for s in title_spans[:3])

            # Check for very large text (stronger signal)
            if max_top_size >= self.min_title_size * 1.5:
                signals.append("very_large_header")
                confidence_factors.append(0.35)

        # Signal 2: Complete style discontinuity with previous page
        # Only triggers if there's a DRAMATIC change, not just minor differences
        if prev_page and current_page:
            prev_sizes = {round(s.font_size, 1) for s in prev_page}
            curr_sizes = {round(s.font_size, 1) for s in current_page}

            # Check for completely different font sizes (NO overlap)
            overlap = prev_sizes & curr_sizes
            if len(overlap) == 0:  # Zero overlap = very strong signal
                signals.append("complete_style_change")
                confidence_factors.append(0.5)

            # Check for completely different fonts (strong indicator)
            prev_fonts = {s.font_name for s in prev_page}
            curr_fonts = {s.font_name for s in current_page}
            if prev_fonts and curr_fonts and not prev_fonts & curr_fonts:
                signals.append("font_family_change")
                confidence_factors.append(0.4)

        # Signal 3: Previous page ended with typical document end patterns
        if prev_page:
            # Look at the three spans lowest on the previous page
            last_spans = sorted(prev_page, key=lambda s: s.bbox.y1, reverse=True)[:3]
            last_text = " ".join(s.text.strip().lower() for s in last_spans)

            end_patterns = [
                "signature", "signed", "date:", "end of document",
                "appendix", "attachment", "annex", "exhibit",
                "page", "of", "---", "___",
            ]

            if any(p in last_text for p in end_patterns):
                signals.append("prev_page_end_pattern")
                confidence_factors.append(0.3)

        # Signal 4: Current page has STRONG document type indicators
        # These are patterns that strongly suggest a new document, not just a section
        if top_spans:
            top_text = " ".join(s.text.strip().lower() for s in top_spans)

            # Strong indicators - things that typically start new documents
            strong_patterns = [
                ("certificate of", 0.5),  # Certificate of capacity, etc.
                ("report to", 0.45),  # Report to Parliament, etc.
                ("form 10:", 0.5),  # Form numbers
                ("whs form", 0.5),  # WHS forms
                ("incident", 0.35),  # Incident reports
                ("court of", 0.45),  # Supreme Court of...
                ("comprehensive", 0.35),  # Comprehensive checkup, etc.
                ("clinical notes", 0.45),  # Medical notes
                ("court attendance", 0.5),  # Court attendance notice
                ("machine safety", 0.4),  # Safety documents
            ]

            for pattern, weight in strong_patterns:
                if pattern in top_text:
                    signals.append(f"strong_doc_indicator_{pattern.replace(' ', '_')}")
                    confidence_factors.append(weight)
                    break  # Only count the strongest match

        # Signal 5: Sparse content page (potential divider or cover page)
        total_chars = sum(len(s.text) for s in current_page)
        if total_chars < 200 and top_spans:  # Little text but has header
            max_size = max(s.font_size for s in current_page)
            if max_size >= body_size * self.title_size_ratio:
                signals.append("sparse_title_page")
                confidence_factors.append(0.4)

        # Calculate overall confidence
        if not confidence_factors:
            return None

        # Require minimum number of signals for a document boundary
        # This prevents section headers from being mistaken for doc boundaries
        if len(signals) < self.min_signals:
            return None

        # NEGATIVE SIGNAL: Check for style continuity with previous page
        # If styles are very similar, this is likely a continuation, not a new doc
        if prev_page and current_page:
            prev_sizes = [round(s.font_size, 0) for s in prev_page]
            curr_sizes = [round(s.font_size, 0) for s in current_page]

            if prev_sizes and curr_sizes:
                # Check if the most common sizes match
                prev_common = max(set(prev_sizes), key=prev_sizes.count)
                curr_common = max(set(curr_sizes), key=curr_sizes.count)

                if prev_common == curr_common:
                    # Same body text size - likely same document
                    # Reduce confidence significantly
                    confidence_factors = [f * 0.6 for f in confidence_factors]
                    signals.append("style_continuity_penalty")

        # Combine confidence factors (diminishing returns): strongest factor
        # counts fully, each subsequent one is scaled by another 0.7.
        confidence = 0.0
        for i, factor in enumerate(sorted(confidence_factors, reverse=True)):
            confidence += factor * (0.7 ** i)

        # Cap at 1.0
        confidence = min(confidence, 1.0)

        if confidence >= self.min_confidence:
            return DocumentBoundary(
                page_num=page_num,
                confidence=confidence,
                signals=signals,
                title_candidate=title_candidate,
            )

        return None

    def _validate_boundaries_with_llm(
        self,
        boundaries: list[DocumentBoundary],
        pages: dict[int, list[SpanInfo]],
    ) -> list[DocumentBoundary]:
        """
        Use LLM to validate detected boundaries.

        Sends context from around each boundary to the LLM to confirm
        if it's a true document boundary or just a section break.

        If the LLM is unavailable, all boundaries are returned unchanged; if a
        single call fails, that boundary is kept only when confidence >= 0.8.
        """
        try:
            from rnsr.llm import get_llm
            llm = get_llm()
        except Exception as e:
            logger.warning("llm_validation_unavailable", error=str(e))
            return boundaries

        validated: list[DocumentBoundary] = []

        for boundary in boundaries:
            # Get context: last 500 chars of previous page, first 500 chars of current page
            # NOTE(review): assumes the page before the boundary is page_num - 1;
            # pages missing from `pages` simply yield empty context.
            prev_page_num = boundary.page_num - 1

            prev_text = ""
            if prev_page_num in pages:
                prev_spans = sorted(pages[prev_page_num], key=lambda s: (s.bbox.y0, s.bbox.x0))
                prev_text = " ".join(s.text.strip() for s in prev_spans)[-500:]

            curr_text = ""
            if boundary.page_num in pages:
                curr_spans = sorted(pages[boundary.page_num], key=lambda s: (s.bbox.y0, s.bbox.x0))
                curr_text = " ".join(s.text.strip() for s in curr_spans)[:500]

            # Build prompt for LLM
            prompt = f"""You are analyzing a multi-document PDF to detect document boundaries.

The system detected a potential document boundary at this location.
Review the text from BEFORE and AFTER the boundary and determine if this is:
- A TRUE document boundary (a completely new, separate document starts here)
- A FALSE boundary (this is just a section/chapter break within the same document)

SIGNALS DETECTED: {', '.join(boundary.signals)}
POTENTIAL NEW DOCUMENT TITLE: {boundary.title_candidate or 'Unknown'}
CONFIDENCE: {boundary.confidence:.2f}

--- END OF PREVIOUS DOCUMENT ---
{prev_text}

--- START OF POTENTIAL NEW DOCUMENT ---
{curr_text}

Is this a TRUE document boundary (a completely separate document starts here)?
Answer ONLY with: TRUE or FALSE

Your answer:"""

            try:
                response = llm.complete(prompt)
                response_text = str(response).strip().upper()

                # Substring check: any response containing "TRUE" validates
                is_valid = "TRUE" in response_text

                logger.debug(
                    "llm_boundary_validation",
                    page=boundary.page_num,
                    title=boundary.title_candidate[:30] if boundary.title_candidate else "",
                    llm_response=response_text[:50],
                    is_valid=is_valid,
                )

                if is_valid:
                    validated.append(boundary)
                else:
                    logger.info(
                        "boundary_rejected_by_llm",
                        page=boundary.page_num,
                        title=boundary.title_candidate[:30] if boundary.title_candidate else "",
                    )

            except Exception as e:
                logger.warning("llm_validation_failed", page=boundary.page_num, error=str(e))
                # On LLM error, keep boundaries with high confidence
                if boundary.confidence >= 0.8:
                    validated.append(boundary)

        logger.info(
            "llm_validation_complete",
            original_count=len(boundaries),
            validated_count=len(validated),
            rejected_count=len(boundaries) - len(validated),
        )

        return validated

    def _extract_title_from_spans(self, spans: list[SpanInfo]) -> str:
        """Extract a title from the first page's spans.

        Returns the text of the largest-font spans in the top 30% of the first
        page, or the literal "Document" when nothing usable is found.
        """
        if not spans:
            return "Document"

        # Get spans from first page
        first_page = min(s.page_num for s in spans)
        first_page_spans = [s for s in spans if s.page_num == first_page]

        if not first_page_spans:
            return "Document"

        # Find largest text near top (page height approximated by lowest span)
        page_height = max(s.bbox.y1 for s in first_page_spans)
        top_spans = [s for s in first_page_spans if s.bbox.y0 < page_height * 0.3]

        if top_spans:
            max_size = max(s.font_size for s in top_spans)
            title_spans = [s for s in top_spans if s.font_size == max_size]
            return " ".join(s.text.strip() for s in title_spans[:3])

        return "Document"
|
|
622
|
+
|
|
623
|
+
|
|
624
|
+
def detect_document_boundaries(
    spans: list[SpanInfo],
    page_heights: dict[int, float] | None = None,
    min_confidence: float = 0.5,
) -> list[DocumentBoundary]:
    """
    Convenience function to detect document boundaries.

    Builds a ``DocumentBoundaryDetector`` with the given confidence threshold
    (all other detector settings keep their defaults) and runs detection.

    Args:
        spans: List of SpanInfo from the PDF
        page_heights: Optional dict mapping page_num -> page_height
        min_confidence: Minimum confidence threshold (0.0-1.0)

    Returns:
        List of DocumentBoundary objects
    """
    return DocumentBoundaryDetector(min_confidence=min_confidence).detect_boundaries(
        spans, page_heights
    )
|
|
642
|
+
|
|
643
|
+
|
|
644
|
+
def segment_by_documents(
    spans: list[SpanInfo],
    page_heights: dict[int, float] | None = None,
    min_confidence: float = 0.5,
) -> list[DocumentSegment]:
    """
    Convenience function to segment spans into separate documents.

    Runs boundary detection and then splits the span stream at each detected
    boundary, using a single ``DocumentBoundaryDetector`` instance for both
    steps.

    Args:
        spans: List of SpanInfo from the PDF
        page_heights: Optional dict mapping page_num -> page_height
        min_confidence: Minimum confidence threshold (0.0-1.0)

    Returns:
        List of DocumentSegment, each containing spans for one document
    """
    detector = DocumentBoundaryDetector(min_confidence=min_confidence)
    found = detector.detect_boundaries(spans, page_heights)
    return detector.segment_spans(spans, found)
|