rnsr-0.1.0-py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/ingestion/layout_model.py
@@ -0,0 +1,379 @@
"""
LayoutLM Model - Visual Document Structure Analysis

Implements LayoutLMv3 for multimodal document understanding using:
- Text: Token sequences from OCR
- Layout: 2D bounding box coordinates
- Image: Visual features from document patches

The model "sees" that text is bold, 24pt font, and centered, allowing
it to classify blocks as Header, Body, Title, Caption, etc.

Usage:
    from rnsr.ingestion.layout_model import get_layout_model, classify_layout_blocks

    # Auto-load default model (layoutlmv3-large)
    model = get_layout_model()

    # Or specify model explicitly
    model = get_layout_model(model_name="microsoft/layoutlmv3-base")

    # Classify document blocks
    labels = classify_layout_blocks(page_image, bboxes, text_spans)
"""

from __future__ import annotations

import os
from typing import Any

import structlog
from PIL import Image

logger = structlog.get_logger(__name__)

# =============================================================================
# Model Configuration
# =============================================================================

# Default models
LAYOUT_MODEL_BASE = "microsoft/layoutlmv3-base"    # 133M params, 400MB
LAYOUT_MODEL_LARGE = "microsoft/layoutlmv3-large"  # 368M params, 1.2GB

DEFAULT_LAYOUT_MODEL = LAYOUT_MODEL_LARGE  # Large by default for 16GB+ RAM

# Label mapping for document structure
LABEL_NAMES = [
    "O",          # Other/None
    "B-TITLE",    # Beginning of title
    "I-TITLE",    # Inside title
    "B-HEADER",   # Beginning of header
    "I-HEADER",   # Inside header
    "B-BODY",     # Beginning of body text
    "I-BODY",     # Inside body text
    "B-CAPTION",  # Beginning of caption
    "I-CAPTION",  # Inside caption
    "B-FOOTER",   # Beginning of footer
    "I-FOOTER",   # Inside footer
    "B-TABLE",    # Beginning of table
    "I-TABLE",    # Inside table
]
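
The B-/I- prefixes follow the BIO tagging scheme; downstream, classify_layout_blocks collapses them to plain block labels by keeping only the part after the hyphen. A quick illustration of that mapping:

    >>> [lbl.split("-")[-1] if "-" in lbl else lbl for lbl in ("B-TITLE", "I-TITLE", "B-BODY", "O")]
    ['TITLE', 'TITLE', 'BODY', 'O']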

# =============================================================================
# Global Model Cache
# =============================================================================

_LAYOUT_MODEL_CACHE: dict[str, Any] = {}


def detect_device() -> str:
    """
    Auto-detect best available device for inference.

    Priority:
    1. CUDA (NVIDIA GPU)
    2. MPS (Apple Silicon GPU)
    3. CPU (fallback)

    Returns:
        Device string ("cuda", "mps", or "cpu").
    """
    try:
        import torch

        if torch.cuda.is_available():
            logger.info("device_detected", device="cuda", gpus=torch.cuda.device_count())
            return "cuda"

        # Check for Apple Silicon MPS
        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            logger.info("device_detected", device="mps", note="Apple Silicon GPU")
            return "mps"

        logger.info("device_detected", device="cpu", note="No GPU available")
        return "cpu"

    except ImportError:
        logger.warning("torch_not_installed", fallback="cpu")
        return "cpu"


def get_model_name_from_env() -> str:
    """
    Get model name from environment variable or default.

    Environment variables:
    - RNSR_LAYOUT_MODEL: Model name or path
      Examples: "microsoft/layoutlmv3-base", "microsoft/layoutlmv3-large"

    Returns:
        Model name or path.
    """
    model = os.getenv("RNSR_LAYOUT_MODEL")
    if model:
        logger.info("layout_model_from_env", model=model)
        return model

    return DEFAULT_LAYOUT_MODEL


def get_device_from_env() -> str:
    """
    Get device from environment variable or auto-detect.

    Environment variables:
    - RNSR_LAYOUT_DEVICE: Device override
      Options: "cuda", "mps", "cpu", "auto"

    Returns:
        Device string.
    """
    device = os.getenv("RNSR_LAYOUT_DEVICE", "auto").lower()

    if device == "auto":
        return detect_device()

    if device not in ("cuda", "mps", "cpu"):
        logger.warning("invalid_device", device=device, fallback="auto")
        return detect_device()

    logger.info("layout_device_from_env", device=device)
    return device
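
Both environment hooks can be checked without loading any weights. A minimal sketch (the values here are illustrative, not defaults):

    import os
    from rnsr.ingestion.layout_model import get_device_from_env, get_model_name_from_env

    os.environ["RNSR_LAYOUT_MODEL"] = "microsoft/layoutlmv3-base"
    os.environ["RNSR_LAYOUT_DEVICE"] = "cpu"

    assert get_model_name_from_env() == "microsoft/layoutlmv3-base"
    assert get_device_from_env() == "cpu"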

def get_layout_model(
    model_name: str | None = None,
    device: str | None = None,
    force_reload: bool = False,
) -> Any:
    """
    Get LayoutLMv3 model instance with caching.

    Args:
        model_name: Model name or path. Uses env var or default if None.
        device: Device for inference ("cuda", "mps", "cpu", "auto").
        force_reload: Force reload model even if cached.

    Returns:
        Cached dict with "model", "processor", and "device" keys
        (not a bare model instance).

    Raises:
        ImportError: If transformers or torch not installed.
        RuntimeError: If model cannot be loaded.

    Example:
        # Default (layoutlmv3-large, auto device)
        model = get_layout_model()

        # Custom model
        model = get_layout_model(model_name="microsoft/layoutlmv3-base")

        # Force CPU
        model = get_layout_model(device="cpu")
    """
    # Resolve model name and device
    model_name = model_name or get_model_name_from_env()
    device = device or get_device_from_env()

    # Check cache
    cache_key = f"{model_name}:{device}"
    if not force_reload and cache_key in _LAYOUT_MODEL_CACHE:
        logger.debug("layout_model_from_cache", model=model_name, device=device)
        return _LAYOUT_MODEL_CACHE[cache_key]

    # Import dependencies
    try:
        import torch  # noqa: F401 (availability check)
        from transformers import AutoModelForTokenClassification, AutoProcessor
    except ImportError as e:
        raise ImportError(
            "transformers and torch required for LayoutLM. "
            "Install with: pip install transformers torch torchvision"
        ) from e

    logger.info(
        "loading_layout_model",
        model=model_name,
        device=device,
        note="First load downloads ~1.2GB" if "large" in model_name else "First load downloads ~400MB",
    )

    try:
        # Load model and processor
        model = AutoModelForTokenClassification.from_pretrained(
            model_name,
            num_labels=len(LABEL_NAMES),
        )
        processor = AutoProcessor.from_pretrained(model_name, apply_ocr=False)

        # Move to device
        if device != "cpu":
            model = model.to(device)

        model.eval()  # Set to evaluation mode

        # Cache model and processor together
        _LAYOUT_MODEL_CACHE[cache_key] = {
            "model": model,
            "processor": processor,
            "device": device,
        }

        logger.info("layout_model_loaded", model=model_name, device=device)
        return _LAYOUT_MODEL_CACHE[cache_key]

    except Exception as e:
        logger.error("layout_model_load_failed", model=model_name, error=str(e))
        raise RuntimeError(f"Failed to load LayoutLM model: {e}") from e
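
Because the cache key is "model:device", repeated calls after the first load are cheap, and force_reload bypasses the cache. A usage sketch (assuming transformers and torch are installed; note the function returns the cached dict, not a bare model):

    from rnsr.ingestion.layout_model import get_layout_model

    bundle = get_layout_model(device="cpu")   # first call: downloads and loads
    model, processor = bundle["model"], bundle["processor"]

    again = get_layout_model(device="cpu")    # second call: served from cache
    assert again is bundle                    # same dict object

    fresh = get_layout_model(device="cpu", force_reload=True)  # reloads weights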

def classify_layout_blocks(
    page_image: Image.Image,
    bboxes: list[tuple[float, float, float, float]],
    text_spans: list[str],
    model_name: str | None = None,
    device: str | None = None,
) -> list[dict[str, Any]]:
    """
    Classify layout blocks using LayoutLMv3.

    Args:
        page_image: PIL Image of the document page.
        bboxes: List of bounding boxes as (x0, y0, x1, y1) tuples, in pixel
            coordinates of page_image.
        text_spans: List of text content for each bounding box.
        model_name: Model name override (uses default if None).
        device: Device override (uses auto-detect if None).

    Returns:
        List of classification results with structure:
        [
            {
                "text": str,
                "bbox": tuple,
                "label": str,  # "TITLE", "HEADER", "BODY", etc.
                "confidence": float,
            },
            ...
        ]

    Example:
        from PIL import Image

        image = Image.open("page.png")
        bboxes = [(10, 10, 100, 30), (10, 50, 200, 70)]
        texts = ["Document Title", "This is the introduction."]

        results = classify_layout_blocks(image, bboxes, texts)
        for r in results:
            print(f"{r['label']}: {r['text']}")
    """
    if len(bboxes) != len(text_spans):
        raise ValueError("Number of bboxes must match number of text_spans")

    if not bboxes:
        logger.warning("no_bboxes_to_classify")
        return []

    # Load model
    model_dict = get_layout_model(model_name, device)
    model = model_dict["model"]
    processor = model_dict["processor"]
    device_str = model_dict["device"]

    try:
        import torch

        # Normalize bboxes to 0-1000 scale (LayoutLM format)
        width, height = page_image.size
        normalized_bboxes = []
        for x0, y0, x1, y1 in bboxes:
            normalized_bboxes.append([
                int((x0 / width) * 1000),
                int((y0 / height) * 1000),
                int((x1 / width) * 1000),
                int((y1 / height) * 1000),
            ])

        # Prepare inputs
        encoding = processor(
            page_image,
            text_spans,
            boxes=normalized_bboxes,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
        )

        # The tokenizer may split each span into several subtokens and adds
        # special tokens (e.g. <s> at position 0), so sequence position i
        # does NOT correspond to text_spans[i]. Record the first subtoken
        # of each span before converting the encoding to plain tensors.
        word_ids = encoding.word_ids(batch_index=0)
        first_token_of_span: dict[int, int] = {}
        for token_pos, word_id in enumerate(word_ids):
            if word_id is not None and word_id not in first_token_of_span:
                first_token_of_span[word_id] = token_pos

        # Move to device
        if device_str != "cpu":
            encoding = {k: v.to(device_str) for k, v in encoding.items()}

        # Run inference
        with torch.no_grad():
            outputs = model(**encoding)
            predictions = torch.argmax(outputs.logits, dim=-1)
            probabilities = torch.softmax(outputs.logits, dim=-1)

        # Extract results, one per input span
        results = []
        for i, (text, bbox) in enumerate(zip(text_spans, bboxes)):
            token_pos = first_token_of_span.get(i)
            if token_pos is None:
                # Span was truncated away; fall back to "O"
                label, confidence = "O", 0.0
            else:
                pred_idx = int(predictions[0, token_pos].item())
                confidence = float(probabilities[0, token_pos, pred_idx].item())
                label = LABEL_NAMES[pred_idx] if pred_idx < len(LABEL_NAMES) else "O"

            # Simplify label (remove B-/I- prefix)
            simplified_label = label.split("-")[-1] if "-" in label else label

            results.append({
                "text": text,
                "bbox": bbox,
                "label": simplified_label,
                "confidence": confidence,
            })

        logger.debug("layout_classification_complete", blocks=len(results))
        return results

    except Exception as e:
        logger.error("layout_classification_failed", error=str(e))
        raise RuntimeError(f"Failed to classify layout blocks: {e}") from e
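
An end-to-end sketch feeding real PDF blocks into the classifier (not part of the package; assumes PyMuPDF, transformers, and torch are installed; rendering at the default 72 dpi keeps PDF point coordinates aligned with pixel coordinates):

    import fitz  # PyMuPDF
    from PIL import Image

    from rnsr.ingestion.layout_model import classify_layout_blocks

    doc = fitz.open("report.pdf")
    page = doc[0]

    # Render the page; with the default matrix, pixmap pixels line up
    # with PDF points, so the block bboxes need no rescaling.
    pix = page.get_pixmap()
    image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    # get_text("blocks") yields (x0, y0, x1, y1, text, block_no, block_type)
    blocks = [b for b in page.get_text("blocks") if b[6] == 0]  # text blocks only
    bboxes = [b[:4] for b in blocks]
    texts = [b[4].strip() for b in blocks]

    for r in classify_layout_blocks(image, bboxes, texts):
        print(f"{r['label']:8s} {r['confidence']:.2f}  {r['text'][:60]}")

    doc.close()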

def check_layout_model_available() -> bool:
    """
    Check if LayoutLM dependencies are available.

    Returns:
        True if transformers and torch are installed.
    """
    try:
        import torch  # noqa: F401
        import transformers  # noqa: F401

        return True
    except ImportError:
        return False


def get_layout_model_info() -> dict[str, Any]:
    """
    Get information about LayoutLM configuration.

    Returns:
        Dictionary with model configuration and availability.
    """
    info = {
        "available": check_layout_model_available(),
        "default_model": DEFAULT_LAYOUT_MODEL,
        "models": {
            "base": LAYOUT_MODEL_BASE,
            "large": LAYOUT_MODEL_LARGE,
        },
        "device": get_device_from_env(),
        "env_model": os.getenv("RNSR_LAYOUT_MODEL"),
        "env_device": os.getenv("RNSR_LAYOUT_DEVICE"),
        "cache_dir": os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface")),
    }

    return info
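A quick pre-flight sketch using the helpers above, useful before committing to a multi-hundred-megabyte download:

    from rnsr.ingestion.layout_model import (
        check_layout_model_available,
        get_layout_model_info,
    )

    if check_layout_model_available():
        info = get_layout_model_info()
        print(f"{info['default_model']} on {info['device']} (cache: {info['cache_dir']})")
    else:
        print("LayoutLM extras missing; install with: pip install transformers torch torchvision")
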
rnsr/ingestion/ocr_fallback.py
@@ -0,0 +1,177 @@
"""
OCR Fallback - TIER 3: For Scanned/Image-Only PDFs

When the document contains no extractable text (scanned PDFs, image-only),
this module applies OCR to generate a text layer, then re-runs analysis.

Use this fallback when:
- PDF contains only images (scanned documents)
- No text can be extracted via PyMuPDF
- Document was scanned without OCR processing

Dependencies:
- pytesseract (OCR engine wrapper)
- pdf2image (PDF to image conversion)
- Tesseract-OCR installed on system
"""

from __future__ import annotations

from pathlib import Path

import structlog

from rnsr.exceptions import OCRError
from rnsr.models import DocumentNode, DocumentTree

logger = structlog.get_logger(__name__)


def check_ocr_available() -> bool:
    """
    Check if OCR dependencies are available.

    Returns:
        True if pytesseract and pdf2image are importable and the
        Tesseract binary is installed.
    """
    try:
        import pytesseract
        from pdf2image import convert_from_path  # noqa: F401

        # Test that the tesseract binary is installed
        pytesseract.get_tesseract_version()
        return True
    except Exception:
        return False

def try_ocr_ingestion(pdf_path: Path | str) -> DocumentTree:
    """
    TIER 3 Fallback: Use OCR for scanned/image-only PDFs.

    This method:
    1. Converts PDF pages to images
    2. Applies Tesseract OCR to each page
    3. Builds a document tree from OCR output

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        DocumentTree from OCR text.

    Raises:
        OCRError: If OCR fails or dependencies not available.
    """
    pdf_path = Path(pdf_path)

    logger.info("using_ocr_fallback", path=str(pdf_path))

    # Check dependencies
    try:
        import pytesseract
        from pdf2image import convert_from_path
    except ImportError as e:
        raise OCRError(
            f"OCR dependencies not available: {e}. "
            "Install with: pip install pytesseract pdf2image"
        ) from e

    try:
        # Convert PDF pages to images
        logger.debug("converting_pdf_to_images", path=str(pdf_path))
        images = convert_from_path(pdf_path, dpi=300)

        logger.info("pdf_converted", pages=len(images))

        # OCR each page
        ocr_texts: list[str] = []
        for i, image in enumerate(images):
            logger.debug("processing_page_ocr", page=i)
            text = pytesseract.image_to_string(image)
            ocr_texts.append(text)

        # Combine and build tree
        full_text = "\n\n".join(ocr_texts)

        if not full_text.strip():
            logger.warning("ocr_no_text_found", path=str(pdf_path))
            root = DocumentNode(id="root", level=0, header="Document")
            return DocumentTree(
                title="Empty OCR Result",
                root=root,
                total_nodes=1,
                ingestion_tier=3,
                ingestion_method="ocr",
            )

        # Build tree from OCR text
        return _build_tree_from_ocr(ocr_texts, pdf_path.stem)

    except Exception as e:
        raise OCRError(f"OCR processing failed: {e}") from e

def _build_tree_from_ocr(
    page_texts: list[str],
    title: str,
) -> DocumentTree:
    """
    Build a document tree from OCR output.

    Creates a simple page-based structure since OCR
    doesn't preserve font information.
    """
    root = DocumentNode(
        id="root",
        level=0,
        header=title,
    )

    for page_num, text in enumerate(page_texts, 1):
        text = text.strip()
        if not text:
            continue

        # Create a section per page
        section = DocumentNode(
            id=f"page_{page_num:03d}",
            level=1,
            header=f"Page {page_num}",
            content=text,
            page_num=page_num - 1,  # 0-indexed
        )
        root.children.append(section)

    return DocumentTree(
        title=title,
        root=root,
        total_nodes=len(root.children) + 1,
        ingestion_tier=3,
        ingestion_method="ocr",
    )
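
The resulting tree is deliberately flat: one level-1 section per non-empty page under the root. A small sketch of walking it, using the node fields set above:

    tree = try_ocr_ingestion("scan.pdf")
    print(tree.title, "-", tree.total_nodes, "nodes via", tree.ingestion_method)
    for section in tree.root.children:
        print(f"{section.header} (page index {section.page_num}): {section.content[:80]}")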

def has_extractable_text(pdf_path: Path | str) -> bool:
    """
    Check if a PDF has extractable text.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        True if text can be extracted, False if OCR is needed.
    """
    import fitz

    pdf_path = Path(pdf_path)
    doc = fitz.open(pdf_path)

    # Close the document even if text extraction raises
    try:
        for page in doc:
            text = str(page.get_text()).strip()
            if text:
                return True
        return False
    finally:
        doc.close()
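How the pieces fit the tiered ingestion flow: a hedged sketch of the tier-3 decision (the package's own dispatch presumably lives in rnsr/ingestion/pipeline.py; this only wires the helpers defined in this file):

    from rnsr.ingestion.ocr_fallback import (
        check_ocr_available,
        has_extractable_text,
        try_ocr_ingestion,
    )

    pdf = "scanned_contract.pdf"
    if not has_extractable_text(pdf):
        if check_ocr_available():
            tree = try_ocr_ingestion(pdf)  # TIER 3: OCR fallback
        else:
            raise RuntimeError("Scanned PDF, but Tesseract/pdf2image are not installed")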