rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/ingestion/pipeline.py
@@ -0,0 +1,936 @@
"""
Ingestion Pipeline - Master Function with Enhanced Latent Hierarchy Generation

This module provides the main `ingest_document()` function that implements
the full Latent Hierarchy Generator from the research paper (Sections 4-6).

TIER 1: Visual-Geometric Analysis (Primary)
    1a. PyMuPDF Font Histogram (Section 6.1)
        - If headers are detected via font variance → build a hierarchical tree
    1b. Recursive XY-Cut (Section 4.1.1) - optional for complex layouts
        - For multi-column documents and L-shaped text wraps

TIER 2: Semantic Boundary Detection (Fallback 1 - flat text)
    2a. LlamaIndex SemanticSplitterNodeParser (Section 4.2.1)
        - Embedding-based splitting at topic shifts
    2b. Hierarchical Clustering (Section 4.2.2) - enhanced option
        - Multi-resolution: micro-clusters → macro-clusters
    2c. Synthetic Header Generation (Section 6.3)
        - LLM-generated titles for each section

TIER 3: OCR + Re-analyze (Fallback 2 - scanned PDFs)
    - Apply Tesseract or docTR OCR
    - Generate a text layer from the page images
    - Build the tree from the OCR output

ALWAYS call `ingest_document()` - never call individual tiers directly.
"""

from __future__ import annotations

from pathlib import Path

import structlog

from rnsr.exceptions import IngestionError
from rnsr.ingestion.document_boundary import (
    DocumentBoundaryDetector,
    segment_by_documents,
)
from rnsr.ingestion.font_histogram import FontHistogramAnalyzer
from rnsr.ingestion.header_classifier import classify_headers
from rnsr.ingestion.layout_detector import detect_layout_complexity
from rnsr.ingestion.ocr_fallback import has_extractable_text, try_ocr_ingestion
from rnsr.ingestion.semantic_fallback import try_semantic_splitter_ingestion
from rnsr.ingestion.tree_builder import build_document_tree, build_multi_document_tree
from rnsr.models import DocumentTree, IngestionResult

logger = structlog.get_logger(__name__)


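# Editor's note (illustrative, not part of the package source): each tier
# stamps a method string on the tree it produces, and the same value is
# echoed in IngestionResult.method. The mapping used in this file:
#
#   Tier 1: "font_histogram", "layoutlm_xycut", "xy_cut_layoutlm", "xy_cut"
#   Tier 2: "semantic_splitter", "hierarchical_clustering"
#   Tier 3: "ocr"

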
def ingest_document(
    pdf_path: Path | str,
    use_visual_analysis: bool = True,
    complexity_threshold: float = 0.3,
) -> IngestionResult:
    """
    Master ingestion function implementing 3-tier graceful degradation.

    ALWAYS call this function - never call individual tiers directly.

    Ingestion Flow:
        0. Pre-analysis: Detect layout complexity (multi-column, empty pages)
        1a. Tier 1a: Font Histogram (simple layouts)
        1b. Tier 1b: LayoutLM + XY-Cut (complex layouts, if use_visual_analysis=True)
        2. Tier 2: Semantic Splitter (flat text, no structure)
        3. Tier 3: OCR (scanned/image-only PDFs)

    Args:
        pdf_path: Path to the PDF file to ingest.
        use_visual_analysis: Enable LayoutLM for complex layouts (default: True).
        complexity_threshold: Threshold for triggering visual analysis (0.0-1.0).

    Returns:
        IngestionResult containing the DocumentTree and metadata.

    Raises:
        IngestionError: If all tiers fail.

    Example:
        # Auto-detect layout complexity
        result = ingest_document("contract.pdf")

        # Force visual analysis
        result = ingest_document("report.pdf", use_visual_analysis=True)

        # Disable visual analysis
        result = ingest_document("simple.pdf", use_visual_analysis=False)
    """
    pdf_path = Path(pdf_path)

    if not pdf_path.exists():
        raise IngestionError(f"PDF file not found: {pdf_path}")

    logger.info("ingestion_started", path=str(pdf_path))

    warnings: list[str] = []
    stats: dict = {"path": str(pdf_path)}

    # Check whether the document has an extractable text layer
    if not has_extractable_text(pdf_path):
        # No text layer - go directly to Tier 3 (OCR)
        logger.info("no_extractable_text", path=str(pdf_path))
        return _try_tier_3(pdf_path, warnings, stats)

    # PRE-ANALYSIS: Detect layout complexity
    if use_visual_analysis:
        try:
            complexity = detect_layout_complexity(pdf_path, threshold=complexity_threshold)

            stats["layout_complexity"] = complexity.complexity_score
            stats["needs_visual"] = complexity.needs_visual_analysis
            stats["complexity_reason"] = complexity.reason

            if complexity.needs_visual_analysis:
                logger.info(
                    "complex_layout_detected",
                    path=str(pdf_path),
                    score=complexity.complexity_score,
                    reason=complexity.reason,
                )

                # Try visual analysis first
                result = _try_tier_1b_visual(pdf_path, warnings, stats)
                if result is not None:
                    return result

                # Fall through to the standard font histogram if visual analysis fails
                warnings.append("Visual analysis failed, using font histogram fallback")
        except Exception as e:
            logger.warning("layout_detection_failed", error=str(e))
            warnings.append(f"Layout detection failed: {e}")

    # TIER 1: Try PyMuPDF Font Histogram
    result = _try_tier_1(pdf_path, warnings, stats)
    if result is not None:
        return result

    # TIER 2: Try Semantic Splitter
    result = _try_tier_2(pdf_path, warnings, stats)
    if result is not None:
        return result

    # Unreachable in practice: _try_tier_2 already falls back to Tier 3,
    # which raises on failure. Kept as a guard against future refactors.
    raise IngestionError("All ingestion tiers failed")


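# Illustrative usage (editor's sketch, not package source): auditing which
# tier actually handled a document. The file name is hypothetical; the
# attributes are the IngestionResult/DocumentTree fields populated above.
#
#     result = ingest_document("filing.pdf")
#     print(result.tier_used, result.method)   # e.g. 1 "font_histogram"
#     print(result.tree.total_nodes)           # size of the recovered tree
#     for w in result.warnings:                # every fallback is recorded
#         print("fallback:", w)

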
def _try_tier_1b_visual(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
) -> IngestionResult | None:
    """
    TIER 1b: Try LayoutLM + XY-Cut for complex layouts.

    Uses visual analysis to detect document structure when
    layout is too complex for the simple font histogram.
    """
    logger.debug("trying_tier_1b_visual", path=str(pdf_path))

    try:
        from rnsr.ingestion.layout_model import check_layout_model_available

        if not check_layout_model_available():
            logger.warning("layout_model_unavailable")
            warnings.append("LayoutLM not available - falling back to font histogram")
            return None

        from rnsr.ingestion.xy_cut import analyze_document_with_xycut

        # Use XY-Cut + LayoutLM for visual analysis
        tree = analyze_document_with_xycut(pdf_path)
        tree.ingestion_tier = 1
        tree.ingestion_method = "layoutlm_xycut"

        logger.info(
            "tier_1b_visual_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=1,
            method="layoutlm_xycut",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("tier_1b_visual_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"LayoutLM visual analysis failed: {e}")
        return None


def _try_tier_1(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
    detect_multi_document: bool = True,
    boundary_confidence: float = 0.5,
) -> IngestionResult | None:
    """
    TIER 1: Try Font Histogram ingestion.

    Now includes multi-document detection for combined PDFs.

    Returns None if ingestion should fall back to Tier 2.
    """
    logger.debug("trying_tier_1", path=str(pdf_path))

    try:
        analyzer = FontHistogramAnalyzer()
        analysis, spans = analyzer.analyze(pdf_path)

        stats["span_count"] = len(spans)
        stats["unique_sizes"] = analysis.unique_sizes
        stats["body_size"] = analysis.body_size

        # Check whether the document has any font-size variance
        if not analyzer.has_font_variance(analysis):
            logger.info("no_font_variance", path=str(pdf_path))
            warnings.append("No font variance detected - using semantic splitter")
            return None  # Trigger Tier 2

        # Check whether headers can be detected
        if not analyzer.has_detectable_headers(analysis, spans):
            logger.info("no_headers_detected", path=str(pdf_path))
            warnings.append("No headers detected - using semantic splitter")
            return None  # Trigger Tier 2

        # NEW: Detect document boundaries for multi-document PDFs
        if detect_multi_document:
            segments = segment_by_documents(
                spans,
                min_confidence=boundary_confidence,
            )

            stats["documents_detected"] = len(segments)

            if len(segments) > 1:
                logger.info(
                    "multi_document_detected",
                    path=str(pdf_path),
                    document_count=len(segments),
                    titles=[s.title[:30] for s in segments],
                )

                # Build a multi-document tree
                tree = build_multi_document_tree(
                    segments,
                    container_title=pdf_path.stem,
                )
                tree.ingestion_tier = 1
                tree.ingestion_method = "font_histogram"

                logger.info(
                    "tier_1_success",
                    path=str(pdf_path),
                    nodes=tree.total_nodes,
                    documents=len(segments),
                )

                return IngestionResult(
                    tree=tree,
                    tier_used=1,
                    method="font_histogram",
                    warnings=warnings,
                    stats=stats,
                )

        # Single document: standard processing.
        # Classify spans into headers and body text.
        classified = classify_headers(spans, analysis)

        header_count = sum(1 for s in classified if s.role == "header")
        stats["header_count"] = header_count

        # Build the tree
        tree = build_document_tree(classified, title=pdf_path.stem)
        tree.ingestion_tier = 1
        tree.ingestion_method = "font_histogram"

        logger.info(
            "tier_1_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=1,
            method="font_histogram",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("tier_1_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"Font histogram failed: {e}")
        return None


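# Illustrative sketch (editor's example, not package source): inspecting the
# same span-level signals Tier 1 uses, e.g. to debug why a PDF fell through
# to the semantic splitter. Names follow the calls above; the path is
# hypothetical.
#
#     analyzer = FontHistogramAnalyzer()
#     analysis, spans = analyzer.analyze(Path("combined.pdf"))
#     print(analysis.unique_sizes, analysis.body_size)
#     segments = segment_by_documents(spans, min_confidence=0.5)
#     print([s.title for s in segments])   # >1 title => multi-document PDF

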
def _try_tier_2(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
    use_hierarchical_clustering: bool = False,
) -> IngestionResult | None:
    """
    TIER 2: Try Semantic Splitter or Hierarchical Clustering ingestion.

    Implements Section 4.2 of the research paper:
    - 4.2.1: SemanticSplitterNodeParser for breakpoint detection
    - 4.2.2: Hierarchical Clustering for multi-resolution topics
    - 6.3: Synthetic Header Generation via LLM
    """
    logger.debug("trying_tier_2", path=str(pdf_path))

    # Option: use hierarchical clustering for richer structure
    if use_hierarchical_clustering:
        try:
            from rnsr.ingestion.hierarchical_cluster import cluster_document_hierarchically

            tree = cluster_document_hierarchically(pdf_path)

            logger.info(
                "tier_2_hierarchical_success",
                path=str(pdf_path),
                nodes=tree.total_nodes,
            )

            return IngestionResult(
                tree=tree,
                tier_used=2,
                method="hierarchical_clustering",
                warnings=warnings,
                stats=stats,
            )
        except Exception as e:
            logger.warning("hierarchical_clustering_failed", error=str(e))
            warnings.append(f"Hierarchical clustering failed: {e}")
            # Fall through to the semantic splitter

    # Default: Semantic Splitter (with LLM-generated headers)
    try:
        tree = try_semantic_splitter_ingestion(pdf_path)

        logger.info(
            "tier_2_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=2,
            method="semantic_splitter",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("tier_2_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"Semantic splitter failed: {e}")
        # Continue to Tier 3
        return _try_tier_3(pdf_path, warnings, stats)


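# Editor's note (illustrative): the effective fallback chain wired up by the
# helpers above and below is
#
#   _try_tier_2 --exception--> _try_tier_3 --exception--> IngestionError
#
# so callers of ingest_document() only ever see a result or an IngestionError.

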
def _try_tier_3(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
) -> IngestionResult:
    """
    TIER 3: Try OCR ingestion (last resort).
    """
    logger.debug("trying_tier_3", path=str(pdf_path))

    try:
        tree = try_ocr_ingestion(pdf_path)

        logger.info(
            "tier_3_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=3,
            method="ocr",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.error("tier_3_failed", path=str(pdf_path), error=str(e))
        raise IngestionError(f"All ingestion tiers failed. Last error: {e}") from e


def ingest_document_enhanced(
    pdf_path: Path | str,
    use_xy_cut: bool = False,
    use_hierarchical_clustering: bool = False,
) -> IngestionResult:
    """
    Enhanced ingestion with all research paper features.

    This exposes the full Latent Hierarchy Generator from the paper:
    - XY-Cut for complex multi-column layouts (Section 4.1.1)
    - Hierarchical Clustering for multi-resolution topics (Section 4.2.2)
    - Synthetic Header Generation via LLM (Section 6.3)

    Args:
        pdf_path: Path to the PDF file to ingest.
        use_xy_cut: Enable Recursive XY-Cut for complex layouts.
        use_hierarchical_clustering: Use clustering instead of simple splits.

    Returns:
        IngestionResult containing the DocumentTree and metadata.

    Example:
        # For a complex multi-column PDF:
        result = ingest_document_enhanced("report.pdf", use_xy_cut=True)

        # For flat text that needs hierarchical structure:
        result = ingest_document_enhanced(
            "transcript.pdf",
            use_hierarchical_clustering=True,
        )
    """
    pdf_path = Path(pdf_path)

    if not pdf_path.exists():
        raise IngestionError(f"PDF file not found: {pdf_path}")

    logger.info(
        "enhanced_ingestion_started",
        path=str(pdf_path),
        xy_cut=use_xy_cut,
        hierarchical=use_hierarchical_clustering,
    )

    warnings: list[str] = []
    stats: dict = {"path": str(pdf_path)}

    # Check whether the document has an extractable text layer
    if not has_extractable_text(pdf_path):
        return _try_tier_3(pdf_path, warnings, stats)

    # Try XY-Cut first if enabled (for complex layouts)
    if use_xy_cut:
        result = _try_xy_cut_ingestion(pdf_path, warnings, stats)
        if result is not None:
            return result

    # TIER 1: Try Font Histogram
    result = _try_tier_1(pdf_path, warnings, stats)
    if result is not None:
        return result

    # TIER 2: Semantic analysis with optional hierarchical clustering
    result = _try_tier_2(pdf_path, warnings, stats, use_hierarchical_clustering)
    if result is not None:
        return result

    raise IngestionError("All ingestion tiers failed")


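# Illustrative comparison (editor's sketch, not package source): when to
# reach for the enhanced entry point instead of ingest_document(). File
# names are hypothetical.
#
#     # Simple report with a clear font hierarchy - the default path suffices:
#     result = ingest_document("annual_report.pdf")
#
#     # Two-column academic paper - force the XY-Cut geometric pass:
#     result = ingest_document_enhanced("paper.pdf", use_xy_cut=True)
#
#     # Flat interview transcript - build structure from topic clusters:
#     result = ingest_document_enhanced(
#         "transcript.pdf", use_hierarchical_clustering=True
#     )

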
def _try_xy_cut_ingestion(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
) -> IngestionResult | None:
    """
    Optional: Use Recursive XY-Cut + LayoutLM for complex layouts.

    Implements Section 4.1.1:
    "A top-down page segmentation technique that is particularly
    effective for discovering document structure."
    """
    logger.debug("trying_xy_cut_with_layoutlm", path=str(pdf_path))

    try:
        # Check if LayoutLM is available
        from rnsr.ingestion.layout_model import check_layout_model_available

        if not check_layout_model_available():
            logger.warning("layout_model_unavailable")
            warnings.append("LayoutLM not available for XY-Cut enhancement")
            return None

        from rnsr.ingestion.xy_cut import analyze_document_with_xycut

        # Use XY-Cut + LayoutLM for visual analysis
        tree = analyze_document_with_xycut(pdf_path)
        tree.ingestion_tier = 1
        tree.ingestion_method = "xy_cut_layoutlm"

        logger.info(
            "xy_cut_layoutlm_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=1,
            method="xy_cut_layoutlm",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("xy_cut_layoutlm_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"XY-Cut + LayoutLM failed: {e}")
        return None


def _try_xy_cut_ingestion_legacy(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
) -> IngestionResult | None:
    """
    Legacy XY-Cut implementation without LayoutLM.

    Implements Section 4.1.1:
    "A top-down page segmentation technique that is particularly
    effective for discovering document structure."
    """
    logger.debug("trying_xy_cut", path=str(pdf_path))

    try:
        import fitz  # PyMuPDF

        from rnsr.ingestion.xy_cut import RecursiveXYCutter

        cutter = RecursiveXYCutter()
        page_trees = cutter.segment_pdf(pdf_path)

        # Extract text for each leaf region
        doc = fitz.open(pdf_path)
        for page_num, page_tree in enumerate(page_trees):
            cutter.extract_text_in_regions(doc[page_num], page_tree)
        doc.close()

        # Convert the XY-Cut trees to a DocumentTree
        from rnsr.ingestion.semantic_fallback import _generate_synthetic_header
        from rnsr.models import DocumentNode

        root = DocumentNode(
            id="root",
            level=0,
            header=pdf_path.stem,
        )

        section_num = 0
        for page_tree in page_trees:
            for leaf in _get_xy_cut_leaves(page_tree):
                if leaf.text.strip():
                    section_num += 1
                    # Generate a synthetic header for the region
                    section = DocumentNode(
                        id=f"xycut_{section_num:03d}",
                        level=1,
                        header=_generate_synthetic_header(leaf.text, section_num),
                        content=leaf.text,
                    )
                    root.children.append(section)

        if section_num == 0:
            warnings.append("XY-Cut found no text regions")
            return None

        tree = DocumentTree(
            title=pdf_path.stem,
            root=root,
            total_nodes=section_num + 1,
            ingestion_tier=1,
            ingestion_method="xy_cut",
        )

        logger.info("xy_cut_success", path=str(pdf_path), nodes=tree.total_nodes)

        return IngestionResult(
            tree=tree,
            tier_used=1,
            method="xy_cut",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("xy_cut_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"XY-Cut failed: {e}")
        return None


def _get_xy_cut_leaves(node) -> list:
    """Get all leaf nodes from an XY-Cut segment tree."""
    if node.is_leaf:
        return [node]
    leaves = []
    for child in node.children:
        leaves.extend(_get_xy_cut_leaves(child))
    return leaves


# =============================================================================
# Entity Extraction Integration
# =============================================================================


def extract_entities_from_tree(
    tree: DocumentTree,
    doc_id: str | None = None,
    extract_relationships: bool = True,
    max_nodes: int = 100,
    sample_strategy: str = "important",
) -> dict:
    """
    Extract entities and relationships from an ingested document tree.

    Uses the RLM Unified Extractor - the most accurate approach:
    1. The LLM writes extraction code based on the document
    2. The code executes on DOC_VAR (grounded in the actual text)
    3. ToT validates with probabilities
    4. Cross-validation between entities and relationships

    Args:
        tree: The ingested DocumentTree.
        doc_id: Document ID (defaults to tree.id).
        extract_relationships: Whether to also extract relationships.
        max_nodes: Maximum nodes to process (for large documents).
        sample_strategy: How to select nodes - "important" (headers first),
            "uniform" (evenly spaced), "first" (document order), or
            "all" (process every node).

    Returns:
        Dictionary containing:
        - entities: List of extracted Entity objects
        - relationships: List of extracted Relationship objects
        - stats: Extraction statistics

    Example:
        result = ingest_document("contract.pdf")
        extraction = extract_entities_from_tree(result.tree)

        # Store in knowledge graph
        for entity in extraction["entities"]:
            kg.add_entity(entity)
    """
    from rnsr.extraction import (
        RLMUnifiedExtractor,
        merge_entities,
    )

    doc_id = doc_id or tree.id

    # Use the RLM Unified Extractor (LLM writes code + ToT validation)
    extractor = RLMUnifiedExtractor(
        enable_type_learning=True,
        enable_tot_validation=True,
        enable_cross_validation=True,
    )

    all_entities = []
    all_relationships = []

    # Collect all nodes for processing
    all_nodes = _collect_nodes(tree.root, doc_id)

    # Sample nodes if there are too many
    if sample_strategy != "all" and len(all_nodes) > max_nodes:
        nodes_to_process = _sample_nodes(all_nodes, max_nodes, sample_strategy)
        logger.info(
            "entity_extraction_sampling",
            total_nodes=len(all_nodes),
            sampled_nodes=len(nodes_to_process),
            strategy=sample_strategy,
        )
    else:
        nodes_to_process = all_nodes

    logger.info(
        "entity_extraction_started",
        doc_id=doc_id,
        node_count=len(nodes_to_process),
        total_nodes=len(all_nodes),
        extractor="RLMUnifiedExtractor",
    )

    # Process nodes in batches so progress can be reported periodically
    batch_size = 10
    processed = 0

    for i in range(0, len(nodes_to_process), batch_size):
        batch = nodes_to_process[i:i + batch_size]

        for node_data in batch:
            try:
                result = extractor.extract(
                    node_id=node_data["node_id"],
                    doc_id=doc_id,
                    header=node_data["header"],
                    content=node_data["content"],
                    page_num=node_data.get("page_num"),
                )

                if result.entities:
                    all_entities.extend(result.entities)

                if extract_relationships and result.relationships:
                    all_relationships.extend(result.relationships)

            except Exception as e:
                logger.warning(
                    "node_extraction_failed",
                    node_id=node_data.get("node_id"),
                    error=str(e)[:100],
                )

        processed += len(batch)
        if processed % 20 == 0:
            logger.info(
                "entity_extraction_progress",
                processed=processed,
                total=len(nodes_to_process),
                entities_so_far=len(all_entities),
            )

    # Merge duplicate entities
    merged_entities = merge_entities(all_entities)

    stats = {
        "nodes_processed": len(nodes_to_process),
        "entities_extracted": len(all_entities),
        "entities_after_merge": len(merged_entities),
        "relationships_extracted": len(all_relationships),
        "entity_types": _count_entity_types(merged_entities),
        "extraction_method": "rlm_unified",
    }

    logger.info(
        "entity_extraction_complete",
        doc_id=doc_id,
        **stats,
    )

    return {
        "entities": merged_entities,
        "relationships": all_relationships,
        "stats": stats,
    }


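# Illustrative usage (editor's sketch, not package source): bounding LLM cost
# on a large document by sampling evenly instead of by importance. The file
# name is hypothetical.
#
#     result = ingest_document("large_filing.pdf")
#     extraction = extract_entities_from_tree(
#         result.tree,
#         max_nodes=50,
#         sample_strategy="uniform",
#     )
#     print(extraction["stats"]["entities_after_merge"])
#     print(extraction["stats"]["entity_types"])   # e.g. {"PERSON": 12, ...}

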
def _collect_nodes(node, doc_id: str, collected: list | None = None) -> list[dict]:
    """
    Recursively collect all nodes from a DocumentNode tree.

    Args:
        node: Root DocumentNode.
        doc_id: Document ID (currently unused by this helper).
        collected: List to collect into.

    Returns:
        List of node data dictionaries.
    """
    if collected is None:
        collected = []

    # Add this node if it has content or a header
    if node.content or node.header:
        collected.append({
            "node_id": node.id,
            "header": node.header,
            "content": node.content,
            "page_num": node.page_num,
            "level": node.level,
        })

    # Recurse into children
    for child in node.children:
        _collect_nodes(child, doc_id, collected)

    return collected


def _sample_nodes(nodes: list[dict], max_nodes: int, strategy: str) -> list[dict]:
    """
    Sample nodes from a large document for efficient processing.

    Args:
        nodes: All collected nodes.
        max_nodes: Maximum number of nodes to return.
        strategy: Sampling strategy - "important", "uniform", or "first".

    Returns:
        Sampled list of nodes.
    """
    if len(nodes) <= max_nodes:
        return nodes

    if strategy == "important":
        # Prioritize nodes with headers and higher-level sections
        scored_nodes = []
        for node in nodes:
            score = 0

            # Prefer nodes with headers
            if node.get("header"):
                score += 10

            # Prefer higher-level (lower number) sections
            level = node.get("level", 3)
            score += max(0, 5 - level)

            # Prefer nodes with substantial content
            # (content may be stored as None, so coalesce before len())
            content_len = len(node.get("content") or "")
            if content_len > 500:
                score += 3
            elif content_len > 200:
                score += 2
            elif content_len > 50:
                score += 1

            scored_nodes.append((score, node))

        # Sort by score (descending) and take the top nodes
        scored_nodes.sort(key=lambda x: x[0], reverse=True)
        return [node for _, node in scored_nodes[:max_nodes]]

    elif strategy == "uniform":
        # Evenly sample across the document (approximate: the integer step
        # plus truncation means the tail can be under-represented)
        step = len(nodes) // max_nodes
        return [nodes[i] for i in range(0, len(nodes), step)][:max_nodes]

    else:  # "first"
        return nodes[:max_nodes]


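# Worked example (editor's note, illustrative): under the "important"
# strategy above, a level-1 node with a header and 600 characters of content
# scores 10 + (5 - 1) + 3 = 17, while a header-less level-3 stub with 40
# characters scores 0 + 2 + 0 = 2, so section headings survive sampling
# long before fragments do.

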
def _count_entity_types(entities: list) -> dict[str, int]:
    """Count entities by type."""
    counts: dict[str, int] = {}
    for entity in entities:
        type_name = entity.type.value
        counts[type_name] = counts.get(type_name, 0) + 1
    return counts


def ingest_with_entities(
    pdf_path: Path | str,
    knowledge_graph=None,
    extract_relationships: bool = True,
    link_entities: bool = True,
    **ingest_kwargs,
) -> dict:
    """
    Ingest a document and extract entities in a single operation.

    Combines document ingestion with entity extraction, optionally
    storing results in a knowledge graph.

    Args:
        pdf_path: Path to the PDF file.
        knowledge_graph: Optional KnowledgeGraph to store entities.
        extract_relationships: Whether to extract relationships.
        link_entities: Whether to link entities across documents.
        **ingest_kwargs: Additional arguments for ingest_document.

    Returns:
        Dictionary containing:
        - ingestion_result: The IngestionResult
        - extraction: Entity extraction results
        - links: Entity links (if link_entities=True)

    Example:
        from rnsr.indexing.knowledge_graph import KnowledgeGraph

        kg = KnowledgeGraph("./data/kg.db")
        result = ingest_with_entities("contract.pdf", knowledge_graph=kg)

        print(f"Found {len(result['extraction']['entities'])} entities")
    """
    # Ingest the document
    ingestion_result = ingest_document(pdf_path, **ingest_kwargs)

    # Extract entities
    extraction = extract_entities_from_tree(
        tree=ingestion_result.tree,
        extract_relationships=extract_relationships,
    )

    result = {
        "ingestion_result": ingestion_result,
        "extraction": extraction,
        "links": [],
    }

    # Store in the knowledge graph if one was provided
    if knowledge_graph is not None:
        from rnsr.extraction import EntityLinker

        # Store entities
        for entity in extraction["entities"]:
            knowledge_graph.add_entity(entity)

        # Store relationships
        for relationship in extraction["relationships"]:
            knowledge_graph.add_relationship(relationship)

        # Link entities if enabled
        if link_entities:
            linker = EntityLinker(knowledge_graph)
            links = linker.link_all_entities_in_document(ingestion_result.tree.id)
            result["links"] = links

        logger.info(
            "stored_in_knowledge_graph",
            doc_id=ingestion_result.tree.id,
            entities=len(extraction["entities"]),
            relationships=len(extraction["relationships"]),
            links=len(result["links"]),
        )

    return result


# Convenience exports
__all__ = [
    "ingest_document",
    "ingest_document_enhanced",
    "ingest_with_entities",
    "extract_entities_from_tree",
    "IngestionResult",
]
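

# Quickstart (editor's sketch, not package source): the full pipeline from
# PDF to knowledge graph, combining the public entry points exported above.
# The database path and PDF name are hypothetical.
#
#     from rnsr.indexing.knowledge_graph import KnowledgeGraph
#     from rnsr.ingestion.pipeline import ingest_with_entities
#
#     kg = KnowledgeGraph("./data/kg.db")
#     out = ingest_with_entities("contract.pdf", knowledge_graph=kg)
#     print(out["ingestion_result"].method)
#     print(out["extraction"]["stats"])
#     print(len(out["links"]), "cross-document entity links")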