rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,662 @@
1
+ """
2
+ Document Boundary Detection for Multi-Document PDFs
3
+
4
+ This module detects boundaries between separate documents within a single PDF file.
5
+ It analyzes visual and textual signals to identify where one document ends and
6
+ another begins, enabling proper segmentation before tree building.
7
+
8
+ Boundary Detection Signals:
9
+ 1. Title Page Patterns: Large font text at top of page (potential new document)
10
+ 2. Page Number Resets: Page numbers restarting from 1 (strong signal)
11
+ 3. Style Discontinuity: Dramatic font/style changes between pages
12
+ 4. Document Type Indicators: Form headers, letterheads, report titles
13
+ 5. LLM Validation: Optional LLM review of potential boundaries
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import re
19
+ from dataclasses import dataclass, field
20
+ from pathlib import Path
21
+ from statistics import mean, stdev
22
+
23
+ import structlog
24
+
25
+ from rnsr.models import BoundingBox, SpanInfo
26
+
27
+ logger = structlog.get_logger(__name__)
28
+
29
# Regex patterns for page number detection.
# Tried in order against the stripped text of header/footer spans;
# group(1) always captures the page number itself.
PAGE_NUMBER_PATTERNS = [
    re.compile(r"^page\s*(\d+)$", re.IGNORECASE),  # "Page 1", "page 1"
    re.compile(r"^(\d+)\s*of\s*\d+$", re.IGNORECASE),  # "1 of 10"
    re.compile(r"^-\s*(\d+)\s*-$"),  # "- 1 -"
    re.compile(r"^(\d+)$"),  # Just "1" (NOTE(review): matches any bare number in the page edge)
    re.compile(r"^page\s*(\d+)\s*of\s*\d+$", re.IGNORECASE),  # "Page 1 of 10"
]
37
+
38
+
39
@dataclass
class DocumentBoundary:
    """Represents a detected document boundary.

    A boundary marks the page on which a NEW logical document begins inside
    a multi-document PDF; the previous document ends on the preceding page.
    """

    page_num: int  # Page where new document starts
    confidence: float  # Combined signal confidence, 0.0 - 1.0
    signals: list[str]  # Names of the heuristics that triggered detection
    title_candidate: str = ""  # Potential document title (may be empty)
47
+
48
+
49
@dataclass
class DocumentSegment:
    """A segment of spans belonging to one logical document."""

    doc_index: int  # 0-based position of this document within the PDF
    start_page: int  # First page covered by this segment
    end_page: int  # Last page covered by this segment (inclusive)
    spans: list[SpanInfo] = field(default_factory=list)  # All spans in the segment
    title: str = ""  # Detected or extracted title; "" if unknown
    boundary: DocumentBoundary | None = None  # None for the first segment (no boundary precedes it)
59
+
60
+
61
class DocumentBoundaryDetector:
    """
    Detects boundaries between documents in a multi-document PDF.

    Works by analyzing:
    - Font size patterns (title pages have larger fonts)
    - Page number resets (page numbers restarting from 1)
    - Style discontinuities between pages
    - Document type indicators
    - Optional LLM validation of boundaries

    The default thresholds are deliberately conservative so that section
    headers within a single document are not mistaken for new documents.
    """

    def __init__(
        self,
        title_size_ratio: float = 2.0,  # Title must be 2x body size (very conservative)
        min_title_size: float = 18.0,  # Minimum font size for title (larger)
        page_top_fraction: float = 0.15,  # "Top of page" = top 15% only
        min_confidence: float = 0.75,  # High confidence required
        min_signals: int = 2,  # Require at least 2 signals (page reset is strong)
        use_llm_validation: bool = True,  # Use LLM to validate boundaries
    ):
        # All parameters are stored verbatim; they tune the heuristics in
        # detect_boundaries() / _analyze_page_boundary().
        self.title_size_ratio = title_size_ratio
        self.min_title_size = min_title_size
        self.page_top_fraction = page_top_fraction
        self.min_confidence = min_confidence
        self.min_signals = min_signals
        self.use_llm_validation = use_llm_validation
88
+
89
+ def _extract_page_number(self, spans: list[SpanInfo], page_height: float) -> int | None:
90
+ """
91
+ Extract page number from footer/header of a page.
92
+
93
+ Looks for page number patterns in the bottom 15% or top 10% of the page.
94
+ """
95
+ if not spans:
96
+ return None
97
+
98
+ # Look in footer (bottom 15%) and header (top 10%)
99
+ footer_threshold = page_height * 0.85
100
+ header_threshold = page_height * 0.10
101
+
102
+ # Get spans from footer and header areas
103
+ edge_spans = [
104
+ s for s in spans
105
+ if s.bbox.y0 > footer_threshold or s.bbox.y1 < header_threshold
106
+ ]
107
+
108
+ for span in edge_spans:
109
+ text = span.text.strip()
110
+
111
+ # Try each pattern
112
+ for pattern in PAGE_NUMBER_PATTERNS:
113
+ match = pattern.match(text)
114
+ if match:
115
+ try:
116
+ return int(match.group(1))
117
+ except (ValueError, IndexError):
118
+ continue
119
+
120
+ return None
121
+
122
+ def _detect_page_number_resets(
123
+ self,
124
+ pages: dict[int, list[SpanInfo]],
125
+ page_heights: dict[int, float] | None,
126
+ ) -> dict[int, bool]:
127
+ """
128
+ Detect pages where page numbers reset to 1 or restart a sequence.
129
+
130
+ Returns dict mapping page_num -> True if page number reset detected.
131
+ """
132
+ page_nums = sorted(pages.keys())
133
+ resets: dict[int, bool] = {}
134
+
135
+ if len(page_nums) < 2:
136
+ return resets
137
+
138
+ prev_page_num: int | None = None
139
+
140
+ for pdf_page in page_nums:
141
+ page_spans = pages[pdf_page]
142
+
143
+ # Estimate page height
144
+ if page_heights and pdf_page in page_heights:
145
+ page_height = page_heights[pdf_page]
146
+ else:
147
+ max_y = max((s.bbox.y1 for s in page_spans), default=800)
148
+ page_height = max_y * 1.1
149
+
150
+ # Extract page number from this page
151
+ doc_page_num = self._extract_page_number(page_spans, page_height)
152
+
153
+ if doc_page_num is not None:
154
+ # Check for reset: page number is 1, or significantly less than previous
155
+ if doc_page_num == 1 and prev_page_num is not None and prev_page_num > 1:
156
+ resets[pdf_page] = True
157
+ logger.debug(
158
+ "page_number_reset_detected",
159
+ pdf_page=pdf_page,
160
+ doc_page=doc_page_num,
161
+ prev_doc_page=prev_page_num,
162
+ )
163
+
164
+ prev_page_num = doc_page_num
165
+
166
+ return resets
167
+
168
+ def detect_boundaries(
169
+ self,
170
+ spans: list[SpanInfo],
171
+ page_heights: dict[int, float] | None = None,
172
+ ) -> list[DocumentBoundary]:
173
+ """
174
+ Detect document boundaries in a list of spans.
175
+
176
+ Args:
177
+ spans: List of SpanInfo from the PDF
178
+ page_heights: Optional dict mapping page_num -> page_height
179
+
180
+ Returns:
181
+ List of DocumentBoundary objects (sorted by page number)
182
+ """
183
+ if not spans:
184
+ return []
185
+
186
+ # Group spans by page
187
+ pages = self._group_by_page(spans)
188
+ page_nums = sorted(pages.keys())
189
+
190
+ if len(page_nums) < 2:
191
+ return [] # Single page, no boundaries to detect
192
+
193
+ # Calculate global statistics
194
+ all_sizes = [s.font_size for s in spans]
195
+ body_size = self._estimate_body_size(all_sizes)
196
+
197
+ # STEP 1: Detect page number resets (very strong signal)
198
+ page_resets = self._detect_page_number_resets(pages, page_heights)
199
+
200
+ logger.info(
201
+ "page_number_resets_detected",
202
+ count=len(page_resets),
203
+ pages=list(page_resets.keys()),
204
+ )
205
+
206
+ boundaries: list[DocumentBoundary] = []
207
+
208
+ # Analyze each page (skip first - it's always the start)
209
+ for i, page_num in enumerate(page_nums[1:], 1):
210
+ prev_page_num = page_nums[i - 1]
211
+
212
+ # Check if this page has a page number reset
213
+ has_page_reset = page_resets.get(page_num, False)
214
+
215
+ boundary = self._analyze_page_boundary(
216
+ current_page=pages[page_num],
217
+ prev_page=pages[prev_page_num],
218
+ page_num=page_num,
219
+ body_size=body_size,
220
+ page_heights=page_heights,
221
+ has_page_reset=has_page_reset,
222
+ )
223
+
224
+ if boundary and boundary.confidence >= self.min_confidence:
225
+ boundaries.append(boundary)
226
+ logger.debug(
227
+ "boundary_detected",
228
+ page=page_num,
229
+ confidence=boundary.confidence,
230
+ signals=boundary.signals,
231
+ title=boundary.title_candidate[:50] if boundary.title_candidate else "",
232
+ )
233
+
234
+ # STEP 2: Optional LLM validation
235
+ if self.use_llm_validation and boundaries:
236
+ boundaries = self._validate_boundaries_with_llm(boundaries, pages)
237
+
238
+ logger.info(
239
+ "boundary_detection_complete",
240
+ total_pages=len(page_nums),
241
+ boundaries_found=len(boundaries),
242
+ )
243
+
244
+ return boundaries
245
+
246
+ def segment_spans(
247
+ self,
248
+ spans: list[SpanInfo],
249
+ boundaries: list[DocumentBoundary],
250
+ ) -> list[DocumentSegment]:
251
+ """
252
+ Split spans into document segments based on detected boundaries.
253
+
254
+ Args:
255
+ spans: All spans from the PDF
256
+ boundaries: Detected document boundaries
257
+
258
+ Returns:
259
+ List of DocumentSegment, each containing spans for one document
260
+ """
261
+ if not spans:
262
+ return []
263
+
264
+ # Sort boundaries by page number
265
+ sorted_boundaries = sorted(boundaries, key=lambda b: b.page_num)
266
+
267
+ # Get boundary page numbers
268
+ boundary_pages = {b.page_num for b in sorted_boundaries}
269
+
270
+ # Group spans by page
271
+ pages = self._group_by_page(spans)
272
+ page_nums = sorted(pages.keys())
273
+
274
+ segments: list[DocumentSegment] = []
275
+ current_segment = DocumentSegment(
276
+ doc_index=0,
277
+ start_page=page_nums[0] if page_nums else 0,
278
+ end_page=page_nums[0] if page_nums else 0,
279
+ )
280
+
281
+ for page_num in page_nums:
282
+ page_spans = pages[page_num]
283
+
284
+ # Check if this page starts a new document
285
+ if page_num in boundary_pages:
286
+ # Save current segment
287
+ if current_segment.spans:
288
+ segments.append(current_segment)
289
+
290
+ # Find the boundary for this page
291
+ boundary = next(b for b in sorted_boundaries if b.page_num == page_num)
292
+
293
+ # Start new segment
294
+ current_segment = DocumentSegment(
295
+ doc_index=len(segments),
296
+ start_page=page_num,
297
+ end_page=page_num,
298
+ title=boundary.title_candidate,
299
+ boundary=boundary,
300
+ )
301
+
302
+ # Add page spans to current segment
303
+ current_segment.spans.extend(page_spans)
304
+ current_segment.end_page = page_num
305
+
306
+ # Don't forget the last segment
307
+ if current_segment.spans:
308
+ segments.append(current_segment)
309
+
310
+ # If no title was detected for first segment, try to extract one
311
+ if segments and not segments[0].title:
312
+ segments[0].title = self._extract_title_from_spans(segments[0].spans)
313
+
314
+ logger.info(
315
+ "spans_segmented",
316
+ total_segments=len(segments),
317
+ spans_per_segment=[len(s.spans) for s in segments],
318
+ )
319
+
320
+ return segments
321
+
322
+ def _group_by_page(self, spans: list[SpanInfo]) -> dict[int, list[SpanInfo]]:
323
+ """Group spans by page number."""
324
+ pages: dict[int, list[SpanInfo]] = {}
325
+ for span in spans:
326
+ if span.page_num not in pages:
327
+ pages[span.page_num] = []
328
+ pages[span.page_num].append(span)
329
+ return pages
330
+
331
+ def _estimate_body_size(self, sizes: list[float]) -> float:
332
+ """Estimate body text size (mode of font sizes)."""
333
+ if not sizes:
334
+ return 12.0
335
+
336
+ # Round to 1 decimal and find mode
337
+ rounded = [round(s, 1) for s in sizes]
338
+ return max(set(rounded), key=rounded.count)
339
+
340
    def _analyze_page_boundary(
        self,
        current_page: list[SpanInfo],
        prev_page: list[SpanInfo],
        page_num: int,
        body_size: float,
        page_heights: dict[int, float] | None,
        has_page_reset: bool = False,
    ) -> DocumentBoundary | None:
        """
        Analyze whether a page represents a document boundary.

        Combines several weighted heuristic signals (page-number reset,
        title-sized text at the top of the page, style discontinuity,
        document-end patterns on the previous page, strong document-type
        phrases, sparse cover pages) into a single confidence score with
        diminishing returns, then applies a style-continuity penalty.

        Args:
            current_page: Spans on the page being tested as a boundary.
            prev_page: Spans on the immediately preceding page.
            page_num: PDF page number of current_page.
            body_size: Estimated body-text font size for the whole PDF.
            page_heights: Optional known page heights by page number.
            has_page_reset: True if the printed page number resets here.

        Returns:
            DocumentBoundary if signals indicate a new document AND the
            combined confidence reaches self.min_confidence, None otherwise.
        """
        signals: list[str] = []
        confidence_factors: list[float] = []
        title_candidate = ""

        # STRONGEST SIGNAL: Page number reset to 1
        # This is a very reliable indicator of a new document
        if has_page_reset:
            signals.append("page_number_reset")
            confidence_factors.append(0.7)  # Very high weight

        # Get page dimensions (known height, or estimated from the lowest span)
        if page_heights and page_num in page_heights:
            page_height = page_heights[page_num]
        else:
            # Estimate from spans
            max_y = max((s.bbox.y1 for s in current_page), default=800)
            page_height = max_y * 1.1  # Add margin

        top_threshold = page_height * self.page_top_fraction

        # Signal 1: Large text at top of page (title pattern)
        top_spans = [s for s in current_page if s.bbox.y0 < top_threshold]
        if top_spans:
            max_top_size = max(s.font_size for s in top_spans)

            # Check for title-like text
            if max_top_size >= body_size * self.title_size_ratio:
                signals.append(f"large_title_font_{max_top_size:.1f}pt")
                confidence_factors.append(0.4)  # Reduced from 0.7 - needs other signals

                # Get the title text (first few spans at the max size)
                title_spans = [s for s in top_spans if s.font_size == max_top_size]
                if title_spans:
                    title_candidate = " ".join(s.text.strip() for s in title_spans[:3])

            # Check for very large text (stronger signal)
            if max_top_size >= self.min_title_size * 1.5:
                signals.append("very_large_header")
                confidence_factors.append(0.35)

        # Signal 2: Complete style discontinuity with previous page
        # Only triggers if there's a DRAMATIC change, not just minor differences
        if prev_page and current_page:
            prev_sizes = {round(s.font_size, 1) for s in prev_page}
            curr_sizes = {round(s.font_size, 1) for s in current_page}

            # Check for completely different font sizes (NO overlap)
            overlap = prev_sizes & curr_sizes
            if len(overlap) == 0:  # Zero overlap = very strong signal
                signals.append("complete_style_change")
                confidence_factors.append(0.5)

            # Check for completely different fonts (strong indicator)
            prev_fonts = {s.font_name for s in prev_page}
            curr_fonts = {s.font_name for s in current_page}
            if prev_fonts and curr_fonts and not prev_fonts & curr_fonts:
                signals.append("font_family_change")
                confidence_factors.append(0.4)

        # Signal 4: Previous page ended with typical document end patterns
        # (there is no Signal 3 — numbering gap is historical)
        if prev_page:
            last_spans = sorted(prev_page, key=lambda s: s.bbox.y1, reverse=True)[:3]
            last_text = " ".join(s.text.strip().lower() for s in last_spans)

            # NOTE(review): "page" and "of" match almost any footer like
            # "Page 3 of 10", so this signal likely fires very often —
            # confirm that the 0.3 weight is intended to be near-universal.
            end_patterns = [
                "signature", "signed", "date:", "end of document",
                "appendix", "attachment", "annex", "exhibit",
                "page", "of", "---", "___",
            ]

            if any(p in last_text for p in end_patterns):
                signals.append("prev_page_end_pattern")
                confidence_factors.append(0.3)

        # Signal 5: Current page has STRONG document type indicators
        # These are patterns that strongly suggest a new document, not just a section
        if top_spans:
            top_text = " ".join(s.text.strip().lower() for s in top_spans)

            # Strong indicators - things that typically start new documents
            strong_patterns = [
                ("certificate of", 0.5),  # Certificate of capacity, etc.
                ("report to", 0.45),  # Report to Parliament, etc.
                ("form 10:", 0.5),  # Form numbers
                ("whs form", 0.5),  # WHS forms
                ("incident", 0.35),  # Incident reports
                ("court of", 0.45),  # Supreme Court of...
                ("comprehensive", 0.35),  # Comprehensive checkup, etc.
                ("clinical notes", 0.45),  # Medical notes
                ("court attendance", 0.5),  # Court attendance notice
                ("machine safety", 0.4),  # Safety documents
            ]

            for pattern, weight in strong_patterns:
                if pattern in top_text:
                    signals.append(f"strong_doc_indicator_{pattern.replace(' ', '_')}")
                    confidence_factors.append(weight)
                    break  # Only count the strongest match

        # Signal 6: Sparse content page (potential divider or cover page)
        total_chars = sum(len(s.text) for s in current_page)
        if total_chars < 200 and top_spans:  # Little text but has header
            max_size = max(s.font_size for s in current_page)
            if max_size >= body_size * self.title_size_ratio:
                signals.append("sparse_title_page")
                confidence_factors.append(0.4)

        # Calculate overall confidence
        if not confidence_factors:
            return None

        # Require minimum number of signals for a document boundary
        # This prevents section headers from being mistaken for doc boundaries
        if len(signals) < self.min_signals:
            return None

        # NEGATIVE SIGNAL: Check for style continuity with previous page
        # If styles are very similar, this is likely a continuation, not a new doc
        if prev_page and current_page:
            prev_sizes = [round(s.font_size, 0) for s in prev_page]
            curr_sizes = [round(s.font_size, 0) for s in current_page]

            if prev_sizes and curr_sizes:
                # Check if the most common sizes match
                prev_common = max(set(prev_sizes), key=prev_sizes.count)
                curr_common = max(set(curr_sizes), key=curr_sizes.count)

                if prev_common == curr_common:
                    # Same body text size - likely same document
                    # Reduce confidence significantly
                    confidence_factors = [f * 0.6 for f in confidence_factors]
                    signals.append("style_continuity_penalty")

        # Combine confidence factors (diminishing returns):
        # strongest factor counts in full, each subsequent one is scaled by 0.7^i
        confidence = 0.0
        for i, factor in enumerate(sorted(confidence_factors, reverse=True)):
            confidence += factor * (0.7 ** i)

        # Cap at 1.0
        confidence = min(confidence, 1.0)

        if confidence >= self.min_confidence:
            return DocumentBoundary(
                page_num=page_num,
                confidence=confidence,
                signals=signals,
                title_candidate=title_candidate,
            )

        return None
504
+
505
    def _validate_boundaries_with_llm(
        self,
        boundaries: list[DocumentBoundary],
        pages: dict[int, list[SpanInfo]],
    ) -> list[DocumentBoundary]:
        """
        Use LLM to validate detected boundaries.

        Sends context from around each boundary (up to 500 characters on
        either side, in reading order) to the LLM to confirm if it's a
        true document boundary or just a section break.

        If the LLM is unavailable, all boundaries are returned unchanged;
        if a single call fails, only boundaries with confidence >= 0.8
        survive that failure.
        """
        try:
            from rnsr.llm import get_llm
            llm = get_llm()
        except Exception as e:
            # No LLM available: keep every heuristic boundary as-is.
            logger.warning("llm_validation_unavailable", error=str(e))
            return boundaries

        validated: list[DocumentBoundary] = []

        for boundary in boundaries:
            # Get context: last 500 chars of previous page, first 500 chars of current page
            prev_page_num = boundary.page_num - 1

            prev_text = ""
            if prev_page_num in pages:
                # Sort spans into reading order (top-to-bottom, left-to-right).
                prev_spans = sorted(pages[prev_page_num], key=lambda s: (s.bbox.y0, s.bbox.x0))
                prev_text = " ".join(s.text.strip() for s in prev_spans)[-500:]

            curr_text = ""
            if boundary.page_num in pages:
                curr_spans = sorted(pages[boundary.page_num], key=lambda s: (s.bbox.y0, s.bbox.x0))
                curr_text = " ".join(s.text.strip() for s in curr_spans)[:500]

            # Build prompt for LLM
            prompt = f"""You are analyzing a multi-document PDF to detect document boundaries.

The system detected a potential document boundary at this location.
Review the text from BEFORE and AFTER the boundary and determine if this is:
- A TRUE document boundary (a completely new, separate document starts here)
- A FALSE boundary (this is just a section/chapter break within the same document)

SIGNALS DETECTED: {', '.join(boundary.signals)}
POTENTIAL NEW DOCUMENT TITLE: {boundary.title_candidate or 'Unknown'}
CONFIDENCE: {boundary.confidence:.2f}

--- END OF PREVIOUS DOCUMENT ---
{prev_text}

--- START OF POTENTIAL NEW DOCUMENT ---
{curr_text}

Is this a TRUE document boundary (a completely separate document starts here)?
Answer ONLY with: TRUE or FALSE

Your answer:"""

            try:
                response = llm.complete(prompt)
                response_text = str(response).strip().upper()

                # NOTE(review): substring match — a response containing both
                # words (e.g. "FALSE, not TRUE") or "UNTRUE" counts as valid;
                # consider startswith("TRUE") if that matters.
                is_valid = "TRUE" in response_text

                logger.debug(
                    "llm_boundary_validation",
                    page=boundary.page_num,
                    title=boundary.title_candidate[:30] if boundary.title_candidate else "",
                    llm_response=response_text[:50],
                    is_valid=is_valid,
                )

                if is_valid:
                    validated.append(boundary)
                else:
                    logger.info(
                        "boundary_rejected_by_llm",
                        page=boundary.page_num,
                        title=boundary.title_candidate[:30] if boundary.title_candidate else "",
                    )

            except Exception as e:
                logger.warning("llm_validation_failed", page=boundary.page_num, error=str(e))
                # On LLM error, keep boundaries with high confidence
                if boundary.confidence >= 0.8:
                    validated.append(boundary)

        logger.info(
            "llm_validation_complete",
            original_count=len(boundaries),
            validated_count=len(validated),
            rejected_count=len(boundaries) - len(validated),
        )

        return validated
599
+
600
+ def _extract_title_from_spans(self, spans: list[SpanInfo]) -> str:
601
+ """Extract a title from the first page's spans."""
602
+ if not spans:
603
+ return "Document"
604
+
605
+ # Get spans from first page
606
+ first_page = min(s.page_num for s in spans)
607
+ first_page_spans = [s for s in spans if s.page_num == first_page]
608
+
609
+ if not first_page_spans:
610
+ return "Document"
611
+
612
+ # Find largest text near top
613
+ page_height = max(s.bbox.y1 for s in first_page_spans)
614
+ top_spans = [s for s in first_page_spans if s.bbox.y0 < page_height * 0.3]
615
+
616
+ if top_spans:
617
+ max_size = max(s.font_size for s in top_spans)
618
+ title_spans = [s for s in top_spans if s.font_size == max_size]
619
+ return " ".join(s.text.strip() for s in title_spans[:3])
620
+
621
+ return "Document"
622
+
623
+
624
def detect_document_boundaries(
    spans: list[SpanInfo],
    page_heights: dict[int, float] | None = None,
    min_confidence: float = 0.5,
) -> list[DocumentBoundary]:
    """
    Convenience function to detect document boundaries.

    Builds a DocumentBoundaryDetector with default settings (apart from
    the confidence threshold) and runs it once.

    Args:
        spans: List of SpanInfo from the PDF
        page_heights: Optional dict mapping page_num -> page_height
        min_confidence: Minimum confidence threshold (0.0-1.0)

    Returns:
        List of DocumentBoundary objects
    """
    return DocumentBoundaryDetector(min_confidence=min_confidence).detect_boundaries(
        spans, page_heights
    )
642
+
643
+
644
def segment_by_documents(
    spans: list[SpanInfo],
    page_heights: dict[int, float] | None = None,
    min_confidence: float = 0.5,
) -> list[DocumentSegment]:
    """
    Convenience function to segment spans into separate documents.

    Detects boundaries with a default-configured DocumentBoundaryDetector
    (apart from the confidence threshold), then splits the spans at those
    boundaries.

    Args:
        spans: List of SpanInfo from the PDF
        page_heights: Optional dict mapping page_num -> page_height
        min_confidence: Minimum confidence threshold (0.0-1.0)

    Returns:
        List of DocumentSegment, each containing spans for one document
    """
    detector = DocumentBoundaryDetector(min_confidence=min_confidence)
    found = detector.detect_boundaries(spans, page_heights)
    return detector.segment_spans(spans, found)