rnsr 0.1.0__py3-none-any.whl
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,936 @@
"""
Ingestion Pipeline - Master Function with Enhanced Latent Hierarchy Generation

This module provides the main `ingest_document()` function that implements
the full Latent Hierarchy Generator from the research paper (Section 4-6).

TIER 1: Visual-Geometric Analysis (Primary)
    1a. PyMuPDF Font Histogram (Section 6.1)
        - If headers detected via font variance → Build hierarchical tree
    1b. Recursive XY-Cut (Section 4.1.1) - Optional for complex layouts
        - For multi-column documents, L-shaped text wraps

TIER 2: Semantic Boundary Detection (Fallback 1 - Flat Text)
    2a. LlamaIndex SemanticSplitterNodeParser (Section 4.2.1)
        - Embedding-based splitting at topic shifts
    2b. Hierarchical Clustering (Section 4.2.2) - Enhanced option
        - Multi-resolution: micro-clusters → macro-clusters
    2c. Synthetic Header Generation (Section 6.3)
        - LLM-generated titles for each section

TIER 3: OCR + Re-analyze (Fallback 2 - Scanned PDFs)
    - Apply Tesseract or Doctr OCR
    - Generate text layer from images
    - Build tree from OCR output

ALWAYS call `ingest_document()` - never call individual tiers directly.
"""

from __future__ import annotations

from pathlib import Path

import structlog

from rnsr.exceptions import IngestionError
from rnsr.ingestion.document_boundary import (
    DocumentBoundaryDetector,
    segment_by_documents,
)
from rnsr.ingestion.font_histogram import FontHistogramAnalyzer
from rnsr.ingestion.header_classifier import classify_headers
from rnsr.ingestion.layout_detector import detect_layout_complexity
from rnsr.ingestion.ocr_fallback import has_extractable_text, try_ocr_ingestion
from rnsr.ingestion.semantic_fallback import try_semantic_splitter_ingestion
from rnsr.ingestion.tree_builder import build_document_tree, build_multi_document_tree
from rnsr.models import DocumentTree, IngestionResult

logger = structlog.get_logger(__name__)


def ingest_document(
    pdf_path: Path | str,
    use_visual_analysis: bool = True,
    complexity_threshold: float = 0.3,
) -> IngestionResult:
    """
    Master ingestion function implementing 3-tier graceful degradation.

    ALWAYS call this function - never call individual tiers directly.

    Ingestion Flow:
        0.  Pre-analysis: Detect layout complexity (multi-column, empty pages)
        1a. Tier 1a: Font Histogram (simple layouts)
        1b. Tier 1b: LayoutLM + XY-Cut (complex layouts, if use_visual_analysis=True)
        2.  Tier 2: Semantic Splitter (flat text, no structure)
        3.  Tier 3: OCR (scanned/image-only PDFs)

    Args:
        pdf_path: Path to the PDF file to ingest.
        use_visual_analysis: Enable LayoutLM for complex layouts (default: True).
        complexity_threshold: Threshold for triggering visual analysis (0.0-1.0).

    Returns:
        IngestionResult containing the DocumentTree and metadata.

    Raises:
        IngestionError: If all tiers fail.

    Example:
        # Auto-detect layout complexity
        result = ingest_document("contract.pdf")

        # Force visual analysis
        result = ingest_document("report.pdf", use_visual_analysis=True)

        # Disable visual analysis
        result = ingest_document("simple.pdf", use_visual_analysis=False)
    """
    pdf_path = Path(pdf_path)

    if not pdf_path.exists():
        raise IngestionError(f"PDF file not found: {pdf_path}")

    logger.info("ingestion_started", path=str(pdf_path))

    warnings: list[str] = []
    stats: dict = {"path": str(pdf_path)}

    # Check if document has extractable text
    if not has_extractable_text(pdf_path):
        # No text - go directly to Tier 3 (OCR)
        logger.info("no_extractable_text", path=str(pdf_path))
        return _try_tier_3(pdf_path, warnings, stats)

    # PRE-ANALYSIS: Detect layout complexity
    if use_visual_analysis:
        try:
            complexity = detect_layout_complexity(pdf_path, threshold=complexity_threshold)

            stats["layout_complexity"] = complexity.complexity_score
            stats["needs_visual"] = complexity.needs_visual_analysis
            stats["complexity_reason"] = complexity.reason

            if complexity.needs_visual_analysis:
                logger.info(
                    "complex_layout_detected",
                    path=str(pdf_path),
                    score=complexity.complexity_score,
                    reason=complexity.reason,
                )

                # Try visual analysis first
                result = _try_tier_1b_visual(pdf_path, warnings, stats)
                if result is not None:
                    return result

                # Fall through to standard font histogram if visual fails
                warnings.append("Visual analysis failed, using font histogram fallback")
        except Exception as e:
            logger.warning("layout_detection_failed", error=str(e))
            warnings.append(f"Layout detection failed: {e}")

    # TIER 1: Try PyMuPDF Font Histogram
    result = _try_tier_1(pdf_path, warnings, stats)
    if result is not None:
        return result

    # TIER 2: Try Semantic Splitter
    result = _try_tier_2(pdf_path, warnings, stats)
    if result is not None:
        return result

    # This shouldn't happen (Tier 2 itself falls through to Tier 3), but just in case
    raise IngestionError("All ingestion tiers failed")
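

# A minimal usage sketch of the tier fallback above, assuming "contract.pdf"
# is a text-based PDF on disk (the file name is hypothetical):
#
#     result = ingest_document("contract.pdf")
#     print(result.tier_used)           # 1, 2, or 3
#     print(result.method)              # e.g. "font_histogram"
#     print(result.tree.total_nodes)    # size of the recovered hierarchy
#     for w in result.warnings:         # notes from any tiers that degraded
#         print("warning:", w)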


def _try_tier_1b_visual(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
) -> IngestionResult | None:
    """
    TIER 1b: Try LayoutLM + XY-Cut for complex layouts.

    Uses visual analysis to detect document structure when
    layout is too complex for simple font histogram.
    """
    logger.debug("trying_tier_1b_visual", path=str(pdf_path))

    try:
        from rnsr.ingestion.layout_model import check_layout_model_available

        if not check_layout_model_available():
            logger.warning("layout_model_unavailable")
            warnings.append("LayoutLM not available - falling back to font histogram")
            return None

        from rnsr.ingestion.xy_cut import analyze_document_with_xycut

        # Use XY-Cut + LayoutLM for visual analysis
        tree = analyze_document_with_xycut(pdf_path)
        tree.ingestion_tier = 1
        tree.ingestion_method = "layoutlm_xycut"

        logger.info(
            "tier_1b_visual_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=1,
            method="layoutlm_xycut",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("tier_1b_visual_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"LayoutLM visual analysis failed: {e}")
        return None


def _try_tier_1(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
    detect_multi_document: bool = True,
    boundary_confidence: float = 0.5,
) -> IngestionResult | None:
    """
    TIER 1: Try Font Histogram ingestion.

    Now includes multi-document detection for combined PDFs.

    Returns None if we should fall back to Tier 2.
    """
    logger.debug("trying_tier_1", path=str(pdf_path))

    try:
        analyzer = FontHistogramAnalyzer()
        analysis, spans = analyzer.analyze(pdf_path)

        stats["span_count"] = len(spans)
        stats["unique_sizes"] = analysis.unique_sizes
        stats["body_size"] = analysis.body_size

        # Check if we have font variance
        if not analyzer.has_font_variance(analysis):
            logger.info("no_font_variance", path=str(pdf_path))
            warnings.append("No font variance detected - using semantic splitter")
            return None  # Trigger Tier 2

        # Check if we can detect headers
        if not analyzer.has_detectable_headers(analysis, spans):
            logger.info("no_headers_detected", path=str(pdf_path))
            warnings.append("No headers detected - using semantic splitter")
            return None  # Trigger Tier 2

        # NEW: Detect document boundaries for multi-document PDFs
        if detect_multi_document:
            segments = segment_by_documents(
                spans,
                min_confidence=boundary_confidence,
            )

            stats["documents_detected"] = len(segments)

            if len(segments) > 1:
                logger.info(
                    "multi_document_detected",
                    path=str(pdf_path),
                    document_count=len(segments),
                    titles=[s.title[:30] for s in segments],
                )

                # Build multi-document tree
                tree = build_multi_document_tree(
                    segments,
                    container_title=pdf_path.stem,
                )
                tree.ingestion_tier = 1
                tree.ingestion_method = "font_histogram"

                logger.info(
                    "tier_1_success",
                    path=str(pdf_path),
                    nodes=tree.total_nodes,
                    documents=len(segments),
                )

                return IngestionResult(
                    tree=tree,
                    tier_used=1,
                    method="font_histogram",
                    warnings=warnings,
                    stats=stats,
                )

        # Single document: standard processing
        # Classify spans
        classified = classify_headers(spans, analysis)

        header_count = sum(1 for s in classified if s.role == "header")
        stats["header_count"] = header_count

        # Build tree
        tree = build_document_tree(classified, title=pdf_path.stem)
        tree.ingestion_tier = 1
        tree.ingestion_method = "font_histogram"

        logger.info(
            "tier_1_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=1,
            method="font_histogram",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("tier_1_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"Font histogram failed: {e}")
        return None
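

# A minimal sketch of the boundary detection used above, assuming a combined
# multi-document PDF (the file name is hypothetical) and the default
# confidence of 0.5:
#
#     analysis, spans = FontHistogramAnalyzer().analyze("bundle.pdf")
#     segments = segment_by_documents(spans, min_confidence=0.5)
#     print([seg.title for seg in segments])  # one title per detected document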


def _try_tier_2(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
    use_hierarchical_clustering: bool = False,
) -> IngestionResult | None:
    """
    TIER 2: Try Semantic Splitter or Hierarchical Clustering ingestion.

    Implements Section 4.2 of the research paper:
    - 4.2.1: SemanticSplitterNodeParser for breakpoint detection
    - 4.2.2: Hierarchical Clustering for multi-resolution topics
    - 6.3: Synthetic Header Generation via LLM
    """
    logger.debug("trying_tier_2", path=str(pdf_path))

    # Option: Use hierarchical clustering for richer structure
    if use_hierarchical_clustering:
        try:
            from rnsr.ingestion.hierarchical_cluster import cluster_document_hierarchically

            tree = cluster_document_hierarchically(pdf_path)

            logger.info(
                "tier_2_hierarchical_success",
                path=str(pdf_path),
                nodes=tree.total_nodes,
            )

            return IngestionResult(
                tree=tree,
                tier_used=2,
                method="hierarchical_clustering",
                warnings=warnings,
                stats=stats,
            )
        except Exception as e:
            logger.warning("hierarchical_clustering_failed", error=str(e))
            warnings.append(f"Hierarchical clustering failed: {e}")
            # Fall through to semantic splitter

    # Default: Semantic Splitter (with LLM-generated headers)
    try:
        tree = try_semantic_splitter_ingestion(pdf_path)

        logger.info(
            "tier_2_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=2,
            method="semantic_splitter",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("tier_2_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"Semantic splitter failed: {e}")
        # Continue to Tier 3
        return _try_tier_3(pdf_path, warnings, stats)
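

# A brief sketch of opting into the hierarchical-clustering variant of Tier 2
# for a flat-text PDF that reaches this tier (the file name is hypothetical).
# The public switch is ingest_document_enhanced(), defined later in this module:
#
#     result = ingest_document_enhanced(
#         "transcript.pdf",
#         use_hierarchical_clustering=True,
#     )
#     # "hierarchical_clustering", or "semantic_splitter" if clustering failed
#     print(result.method)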


def _try_tier_3(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
) -> IngestionResult:
    """
    TIER 3: Try OCR ingestion (last resort).
    """
    logger.debug("trying_tier_3", path=str(pdf_path))

    try:
        tree = try_ocr_ingestion(pdf_path)

        logger.info(
            "tier_3_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=3,
            method="ocr",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.error("tier_3_failed", path=str(pdf_path), error=str(e))
        raise IngestionError(f"All ingestion tiers failed. Last error: {e}") from e


def ingest_document_enhanced(
    pdf_path: Path | str,
    use_xy_cut: bool = False,
    use_hierarchical_clustering: bool = False,
) -> IngestionResult:
    """
    Enhanced ingestion with all research paper features.

    This exposes the full Latent Hierarchy Generator from the paper:
    - XY-Cut for complex multi-column layouts (Section 4.1.1)
    - Hierarchical Clustering for multi-resolution topics (Section 4.2.2)
    - Synthetic Header Generation via LLM (Section 6.3)

    Args:
        pdf_path: Path to the PDF file to ingest.
        use_xy_cut: Enable Recursive XY-Cut for complex layouts.
        use_hierarchical_clustering: Use clustering instead of simple splits.

    Returns:
        IngestionResult containing the DocumentTree and metadata.

    Example:
        # For a complex multi-column PDF:
        result = ingest_document_enhanced("report.pdf", use_xy_cut=True)

        # For flat text that needs hierarchical structure:
        result = ingest_document_enhanced(
            "transcript.pdf",
            use_hierarchical_clustering=True
        )
    """
    pdf_path = Path(pdf_path)

    if not pdf_path.exists():
        raise IngestionError(f"PDF file not found: {pdf_path}")

    logger.info(
        "enhanced_ingestion_started",
        path=str(pdf_path),
        xy_cut=use_xy_cut,
        hierarchical=use_hierarchical_clustering,
    )

    warnings: list[str] = []
    stats: dict = {"path": str(pdf_path)}

    # Check if document has extractable text
    if not has_extractable_text(pdf_path):
        return _try_tier_3(pdf_path, warnings, stats)

    # Try XY-Cut first if enabled (for complex layouts)
    if use_xy_cut:
        result = _try_xy_cut_ingestion(pdf_path, warnings, stats)
        if result is not None:
            return result

    # TIER 1: Try Font Histogram
    result = _try_tier_1(pdf_path, warnings, stats)
    if result is not None:
        return result

    # TIER 2: Semantic analysis with optional hierarchical clustering
    result = _try_tier_2(pdf_path, warnings, stats, use_hierarchical_clustering)
    if result is not None:
        return result

    raise IngestionError("All ingestion tiers failed")


def _try_xy_cut_ingestion(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
) -> IngestionResult | None:
    """
    Optional: Use Recursive XY-Cut + LayoutLM for complex layouts.

    Implements Section 4.1.1:
    "A top-down page segmentation technique that is particularly
    effective for discovering document structure."
    """
    logger.debug("trying_xy_cut_with_layoutlm", path=str(pdf_path))

    try:
        # Check if LayoutLM is available
        from rnsr.ingestion.layout_model import check_layout_model_available

        if not check_layout_model_available():
            logger.warning("layout_model_unavailable")
            warnings.append("LayoutLM not available for XY-Cut enhancement")
            return None

        from rnsr.ingestion.xy_cut import analyze_document_with_xycut

        # Use XY-Cut + LayoutLM for visual analysis
        tree = analyze_document_with_xycut(pdf_path)
        tree.ingestion_tier = 1
        tree.ingestion_method = "xy_cut_layoutlm"

        logger.info(
            "xy_cut_layoutlm_success",
            path=str(pdf_path),
            nodes=tree.total_nodes,
        )

        return IngestionResult(
            tree=tree,
            tier_used=1,
            method="xy_cut_layoutlm",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("xy_cut_layoutlm_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"XY-Cut + LayoutLM failed: {e}")
        return None


def _try_xy_cut_ingestion_legacy(
    pdf_path: Path,
    warnings: list[str],
    stats: dict,
) -> IngestionResult | None:
    """
    Legacy XY-Cut implementation without LayoutLM.

    Implements Section 4.1.1:
    "A top-down page segmentation technique that is particularly
    effective for discovering document structure."
    """
    logger.debug("trying_xy_cut", path=str(pdf_path))

    try:
        import fitz

        from rnsr.ingestion.xy_cut import RecursiveXYCutter

        cutter = RecursiveXYCutter()
        page_trees = cutter.segment_pdf(pdf_path)

        # Extract text for each leaf region
        doc = fitz.open(pdf_path)
        for page_num, tree in enumerate(page_trees):
            cutter.extract_text_in_regions(doc[page_num], tree)
        doc.close()

        # Convert XY-Cut tree to DocumentTree
        from rnsr.ingestion.semantic_fallback import _generate_synthetic_header
        from rnsr.models import DocumentNode, DocumentTree

        root = DocumentNode(
            id="root",
            level=0,
            header=pdf_path.stem,
        )

        section_num = 0
        for page_tree in page_trees:
            for leaf in _get_xy_cut_leaves(page_tree):
                if leaf.text.strip():
                    section_num += 1
                    # Generate synthetic header
                    section = DocumentNode(
                        id=f"xycut_{section_num:03d}",
                        level=1,
                        header=_generate_synthetic_header(leaf.text, section_num),
                        content=leaf.text,
                    )
                    root.children.append(section)

        if section_num == 0:
            warnings.append("XY-Cut found no text regions")
            return None

        tree = DocumentTree(
            title=pdf_path.stem,
            root=root,
            total_nodes=section_num + 1,
            ingestion_tier=1,
            ingestion_method="xy_cut",
        )

        logger.info("xy_cut_success", path=str(pdf_path), nodes=tree.total_nodes)

        return IngestionResult(
            tree=tree,
            tier_used=1,
            method="xy_cut",
            warnings=warnings,
            stats=stats,
        )

    except Exception as e:
        logger.warning("xy_cut_failed", path=str(pdf_path), error=str(e))
        warnings.append(f"XY-Cut failed: {e}")
        return None


def _get_xy_cut_leaves(node) -> list:
    """Get all leaf nodes from an XY-Cut segment tree."""
    if node.is_leaf:
        return [node]
    leaves = []
    for child in node.children:
        leaves.extend(_get_xy_cut_leaves(child))
    return leaves


# =============================================================================
# Entity Extraction Integration
# =============================================================================


def extract_entities_from_tree(
    tree: DocumentTree,
    doc_id: str | None = None,
    extract_relationships: bool = True,
    max_nodes: int = 100,
    sample_strategy: str = "important",
) -> dict:
    """
    Extract entities and relationships from an ingested document tree.

    Uses the RLM Unified Extractor - the most accurate approach:
    1. LLM writes extraction code based on the document
    2. Code executes on DOC_VAR (grounded in actual text)
    3. ToT validates with probabilities
    4. Cross-validation between entities and relationships

    Args:
        tree: The ingested DocumentTree.
        doc_id: Document ID (defaults to tree.id).
        extract_relationships: Whether to also extract relationships.
        max_nodes: Maximum nodes to process (for large documents).
        sample_strategy: How to select nodes - "important" (headers first),
            "uniform" (evenly spaced), or "all" (process every node).

    Returns:
        Dictionary containing:
        - entities: List of extracted Entity objects
        - relationships: List of extracted Relationship objects
        - stats: Extraction statistics

    Example:
        result = ingest_document("contract.pdf")
        extraction = extract_entities_from_tree(result.tree)

        # Store in knowledge graph
        for entity in extraction["entities"]:
            kg.add_entity(entity)
    """
    from rnsr.extraction import (
        RLMUnifiedExtractor,
        merge_entities,
    )

    doc_id = doc_id or tree.id

    # Use RLM Unified Extractor (LLM writes code + ToT validation)
    extractor = RLMUnifiedExtractor(
        enable_type_learning=True,
        enable_tot_validation=True,
        enable_cross_validation=True,
    )

    all_entities = []
    all_relationships = []

    # Collect all nodes for processing
    all_nodes = _collect_nodes(tree.root, doc_id)

    # Sample nodes if there are too many
    if sample_strategy != "all" and len(all_nodes) > max_nodes:
        nodes_to_process = _sample_nodes(all_nodes, max_nodes, sample_strategy)
        logger.info(
            "entity_extraction_sampling",
            total_nodes=len(all_nodes),
            sampled_nodes=len(nodes_to_process),
            strategy=sample_strategy,
        )
    else:
        nodes_to_process = all_nodes

    logger.info(
        "entity_extraction_started",
        doc_id=doc_id,
        node_count=len(nodes_to_process),
        total_nodes=len(all_nodes),
        extractor="RLMUnifiedExtractor",
    )

    # Process nodes in batches so progress can be logged periodically
    batch_size = 10
    processed = 0

    for i in range(0, len(nodes_to_process), batch_size):
        batch = nodes_to_process[i:i + batch_size]

        for node_data in batch:
            try:
                result = extractor.extract(
                    node_id=node_data["node_id"],
                    doc_id=doc_id,
                    header=node_data["header"],
                    content=node_data["content"],
                    page_num=node_data.get("page_num"),
                )

                if result.entities:
                    all_entities.extend(result.entities)

                if extract_relationships and result.relationships:
                    all_relationships.extend(result.relationships)

            except Exception as e:
                logger.warning(
                    "node_extraction_failed",
                    node_id=node_data.get("node_id"),
                    error=str(e)[:100],
                )

        processed += len(batch)
        if processed % 20 == 0:
            logger.info(
                "entity_extraction_progress",
                processed=processed,
                total=len(nodes_to_process),
                entities_so_far=len(all_entities),
            )

    # Merge duplicate entities
    merged_entities = merge_entities(all_entities)

    stats = {
        "nodes_processed": len(nodes_to_process),
        "entities_extracted": len(all_entities),
        "entities_after_merge": len(merged_entities),
        "relationships_extracted": len(all_relationships),
        "entity_types": _count_entity_types(merged_entities),
        "extraction_method": "rlm_unified",
    }

    logger.info(
        "entity_extraction_complete",
        doc_id=doc_id,
        **stats,
    )

    return {
        "entities": merged_entities,
        "relationships": all_relationships,
        "stats": stats,
    }
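

# A short sketch of consuming the extraction result above, assuming a tree
# produced by ingest_document() (the file name is hypothetical):
#
#     result = ingest_document("contract.pdf")
#     extraction = extract_entities_from_tree(result.tree, max_nodes=50)
#     print(extraction["stats"]["entities_after_merge"])
#     for rel in extraction["relationships"]:
#         print(rel)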


def _collect_nodes(node, doc_id: str, collected: list | None = None) -> list[dict]:
    """
    Recursively collect all nodes from a DocumentNode tree.

    Args:
        node: Root DocumentNode.
        doc_id: Document ID.
        collected: List to collect into.

    Returns:
        List of node data dictionaries.
    """
    if collected is None:
        collected = []

    # Add this node if it has content
    if node.content or node.header:
        collected.append({
            "node_id": node.id,
            "header": node.header,
            "content": node.content,
            "page_num": node.page_num,
            "level": node.level,
        })

    # Process children
    for child in node.children:
        _collect_nodes(child, doc_id, collected)

    return collected


def _sample_nodes(nodes: list[dict], max_nodes: int, strategy: str) -> list[dict]:
    """
    Sample nodes from a large document for efficient processing.

    Args:
        nodes: All collected nodes.
        max_nodes: Maximum number of nodes to return.
        strategy: Sampling strategy - "important", "uniform", or "first".

    Returns:
        Sampled list of nodes.
    """
    if len(nodes) <= max_nodes:
        return nodes

    if strategy == "important":
        # Prioritize nodes with headers and higher-level sections
        scored_nodes = []
        for node in nodes:
            score = 0

            # Prefer nodes with headers
            if node.get("header"):
                score += 10

            # Prefer higher-level (lower number) sections
            level = node.get("level", 3)
            score += max(0, 5 - level)

            # Prefer nodes with substantial content
            content_len = len(node.get("content", ""))
            if content_len > 500:
                score += 3
            elif content_len > 200:
                score += 2
            elif content_len > 50:
                score += 1

            scored_nodes.append((score, node))

        # Sort by score (descending) and take top nodes
        scored_nodes.sort(key=lambda x: x[0], reverse=True)
        return [node for _, node in scored_nodes[:max_nodes]]

    elif strategy == "uniform":
        # Evenly sample across the document. Spacing indices over the full
        # range avoids the front bias of a truncated fixed stride when
        # len(nodes) is not a multiple of max_nodes.
        if max_nodes == 1:
            return [nodes[0]]
        step = (len(nodes) - 1) / (max_nodes - 1)
        return [nodes[round(i * step)] for i in range(max_nodes)]

    else:  # "first"
        return nodes[:max_nodes]
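

# A worked example of the "important" scoring above (the node contents are
# hypothetical): a level-1 node with a header and 600 characters of content
# scores 10 (header) + 4 (level) + 3 (length) = 17, while a headerless
# level-3 node with 100 characters scores 0 + 2 + 1 = 3.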


def _count_entity_types(entities: list) -> dict[str, int]:
    """Count entities by type."""
    counts: dict[str, int] = {}
    for entity in entities:
        type_name = entity.type.value
        counts[type_name] = counts.get(type_name, 0) + 1
    return counts


def ingest_with_entities(
    pdf_path: Path | str,
    knowledge_graph=None,
    extract_relationships: bool = True,
    link_entities: bool = True,
    **ingest_kwargs,
) -> dict:
    """
    Ingest a document and extract entities in a single operation.

    Combines document ingestion with entity extraction, optionally
    storing results in a knowledge graph.

    Args:
        pdf_path: Path to the PDF file.
        knowledge_graph: Optional KnowledgeGraph to store entities.
        extract_relationships: Whether to extract relationships.
        link_entities: Whether to link entities across documents.
        **ingest_kwargs: Additional arguments for ingest_document.

    Returns:
        Dictionary containing:
        - ingestion_result: The IngestionResult
        - extraction: Entity extraction results
        - links: Entity links (if link_entities=True)

    Example:
        from rnsr.indexing.knowledge_graph import KnowledgeGraph

        kg = KnowledgeGraph("./data/kg.db")
        result = ingest_with_entities("contract.pdf", knowledge_graph=kg)

        print(f"Found {len(result['extraction']['entities'])} entities")
    """
    # Ingest document
    ingestion_result = ingest_document(pdf_path, **ingest_kwargs)

    # Extract entities
    extraction = extract_entities_from_tree(
        tree=ingestion_result.tree,
        extract_relationships=extract_relationships,
    )

    result = {
        "ingestion_result": ingestion_result,
        "extraction": extraction,
        "links": [],
    }

    # Store in knowledge graph if provided
    if knowledge_graph is not None:
        from rnsr.extraction import EntityLinker

        # Store entities
        for entity in extraction["entities"]:
            knowledge_graph.add_entity(entity)

        # Store relationships
        for relationship in extraction["relationships"]:
            knowledge_graph.add_relationship(relationship)

        # Link entities if enabled
        if link_entities:
            linker = EntityLinker(knowledge_graph)
            links = linker.link_all_entities_in_document(ingestion_result.tree.id)
            result["links"] = links

        logger.info(
            "stored_in_knowledge_graph",
            doc_id=ingestion_result.tree.id,
            entities=len(extraction["entities"]),
            relationships=len(extraction["relationships"]),
            links=len(result["links"]),
        )

    return result
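

# A minimal end-to-end sketch without a knowledge graph (the file name is
# hypothetical); with knowledge_graph=None the "links" list stays empty:
#
#     out = ingest_with_entities("filing.pdf")
#     print(out["ingestion_result"].tier_used)
#     print(len(out["extraction"]["entities"]))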


# Convenience exports
__all__ = [
    "ingest_document",
    "ingest_document_enhanced",
    "ingest_with_entities",
    "extract_entities_from_tree",
    "IngestionResult",
]