rnsr-0.1.0-py3-none-any.whl

Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/ingestion/layout_model.py
@@ -0,0 +1,379 @@
+ """
+ LayoutLM Model - Visual Document Structure Analysis
+ 
+ Implements LayoutLMv3 for multimodal document understanding using:
+ - Text: Token sequences from OCR
+ - Layout: 2D bounding box coordinates
+ - Image: Visual features from document patches
+ 
+ Through the image channel the model "sees" visual cues such as bold,
+ large, or centered text, allowing it to classify blocks as Header,
+ Body, Title, Caption, etc.
+ 
+ Usage:
+     from rnsr.ingestion.layout_model import get_layout_model, classify_layout_blocks
+ 
+     # Auto-load default model (layoutlmv3-large); returns a dict
+     # with "model", "processor", and "device" keys
+     bundle = get_layout_model()
+ 
+     # Or specify a model explicitly
+     bundle = get_layout_model(model_name="microsoft/layoutlmv3-base")
+ 
+     # Classify document blocks
+     labels = classify_layout_blocks(page_image, bboxes, text_spans)
+ """
+ 
+ from __future__ import annotations
+ 
+ import os
+ from typing import Any
+ 
+ import structlog
+ from PIL import Image
+ 
+ logger = structlog.get_logger(__name__)
+ 
+ # =============================================================================
+ # Model Configuration
+ # =============================================================================
+ 
+ # Default models
+ LAYOUT_MODEL_BASE = "microsoft/layoutlmv3-base"  # 133M params, 400MB
+ LAYOUT_MODEL_LARGE = "microsoft/layoutlmv3-large"  # 368M params, 1.2GB
+ 
+ DEFAULT_LAYOUT_MODEL = LAYOUT_MODEL_LARGE  # Large by default for 16GB+ RAM
+ 
+ # Label mapping for document structure
+ LABEL_NAMES = [
+     "O",          # Other/None
+     "B-TITLE",    # Beginning of title
+     "I-TITLE",    # Inside title
+     "B-HEADER",   # Beginning of header
+     "I-HEADER",   # Inside header
+     "B-BODY",     # Beginning of body text
+     "I-BODY",     # Inside body text
+     "B-CAPTION",  # Beginning of caption
+     "I-CAPTION",  # Inside caption
+     "B-FOOTER",   # Beginning of footer
+     "I-FOOTER",   # Inside footer
+     "B-TABLE",    # Beginning of table
+     "I-TABLE",    # Inside table
+ ]
+ 
+ # =============================================================================
+ # Global Model Cache
+ # =============================================================================
+ 
+ _LAYOUT_MODEL_CACHE: dict[str, Any] = {}
+ 
+ 
+ def detect_device() -> str:
+     """
+     Auto-detect best available device for inference.
+ 
+     Priority:
+     1. CUDA (NVIDIA GPU)
+     2. MPS (Apple Silicon GPU)
+     3. CPU (fallback)
+ 
+     Returns:
+         Device string ("cuda", "mps", or "cpu").
+     """
+     try:
+         import torch
+ 
+         if torch.cuda.is_available():
+             logger.info("device_detected", device="cuda", gpus=torch.cuda.device_count())
+             return "cuda"
+ 
+         # Check for Apple Silicon MPS
+         if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+             logger.info("device_detected", device="mps", note="Apple Silicon GPU")
+             return "mps"
+ 
+         logger.info("device_detected", device="cpu", note="No GPU available")
+         return "cpu"
+ 
+     except ImportError:
+         logger.warning("torch_not_installed", fallback="cpu")
+         return "cpu"
+ 
+ 
+ def get_model_name_from_env() -> str:
+     """
+     Get model name from environment variable or default.
+ 
+     Environment variables:
+     - RNSR_LAYOUT_MODEL: Model name or path
+       Examples: "microsoft/layoutlmv3-base", "microsoft/layoutlmv3-large"
+ 
+     Returns:
+         Model name or path.
+     """
+     model = os.getenv("RNSR_LAYOUT_MODEL")
+     if model:
+         logger.info("layout_model_from_env", model=model)
+         return model
+ 
+     return DEFAULT_LAYOUT_MODEL
+ 
+ 
+ def get_device_from_env() -> str:
+     """
+     Get device from environment variable or auto-detect.
+ 
+     Environment variables:
+     - RNSR_LAYOUT_DEVICE: Device override
+       Options: "cuda", "mps", "cpu", "auto"
+ 
+     Returns:
+         Device string.
+     """
+     device = os.getenv("RNSR_LAYOUT_DEVICE", "auto").lower()
+ 
+     if device == "auto":
+         return detect_device()
+ 
+     if device not in ("cuda", "mps", "cpu"):
+         logger.warning("invalid_device", device=device, fallback="auto")
+         return detect_device()
+ 
+     logger.info("layout_device_from_env", device=device)
+     return device
+ 
+ 
+ def get_layout_model(
+     model_name: str | None = None,
+     device: str | None = None,
+     force_reload: bool = False,
+ ) -> Any:
+     """
+     Get a LayoutLMv3 model/processor bundle, with caching.
+ 
+     Args:
+         model_name: Model name or path. Uses env var or default if None.
+         device: Device for inference ("cuda", "mps", "cpu", "auto").
+         force_reload: Force reload model even if cached.
+ 
+     Returns:
+         Dict with "model", "processor", and "device" keys.
+ 
+     Raises:
+         ImportError: If transformers or torch not installed.
+         RuntimeError: If model cannot be loaded.
+ 
+     Example:
+         # Default (layoutlmv3-large, auto device)
+         bundle = get_layout_model()
+ 
+         # Custom model
+         bundle = get_layout_model(model_name="microsoft/layoutlmv3-base")
+ 
+         # Force CPU
+         bundle = get_layout_model(device="cpu")
+     """
+     # Resolve model name and device
+     model_name = model_name or get_model_name_from_env()
+     device = device or get_device_from_env()
+ 
+     # Check cache
+     cache_key = f"{model_name}:{device}"
+     if not force_reload and cache_key in _LAYOUT_MODEL_CACHE:
+         logger.debug("layout_model_from_cache", model=model_name, device=device)
+         return _LAYOUT_MODEL_CACHE[cache_key]
+ 
+     # Import dependencies
+     try:
+         from transformers import AutoModelForTokenClassification, AutoProcessor
+         import torch
+     except ImportError as e:
+         raise ImportError(
+             "transformers and torch required for LayoutLM. "
+             "Install with: pip install transformers torch torchvision"
+         ) from e
+ 
+     logger.info(
+         "loading_layout_model",
+         model=model_name,
+         device=device,
+         note="First load downloads ~1.2GB" if "large" in model_name else "First load downloads ~400MB"
+     )
+ 
+     try:
+         # Load model and processor. Note: unless the checkpoint was
+         # fine-tuned for token classification with these labels, the
+         # classification head is newly initialized and needs training.
+         model = AutoModelForTokenClassification.from_pretrained(
+             model_name,
+             num_labels=len(LABEL_NAMES),
+         )
+         processor = AutoProcessor.from_pretrained(model_name, apply_ocr=False)
+ 
+         # Move to device
+         if device != "cpu":
+             model = model.to(device)
+ 
+         model.eval()  # Set to evaluation mode
+ 
+         # Cache model and processor together
+         _LAYOUT_MODEL_CACHE[cache_key] = {
+             "model": model,
+             "processor": processor,
+             "device": device,
+         }
+ 
+         logger.info("layout_model_loaded", model=model_name, device=device)
+         return _LAYOUT_MODEL_CACHE[cache_key]
+ 
+     except Exception as e:
+         logger.error("layout_model_load_failed", model=model_name, error=str(e))
+         raise RuntimeError(f"Failed to load LayoutLM model: {e}") from e
+ 
+ 
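Note that get_layout_model() returns the cache entry itself: a dict bundling model, processor, and target device, not a bare model. A minimal sketch of unpacking it (CPU chosen only for illustration):

    bundle = get_layout_model(device="cpu")
    model = bundle["model"]          # LayoutLMv3 token-classification model
    processor = bundle["processor"]  # processor loaded with apply_ocr=False
    device = bundle["device"]        # "cpu" here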
+ def classify_layout_blocks(
+     page_image: Image.Image,
+     bboxes: list[tuple[float, float, float, float]],
+     text_spans: list[str],
+     model_name: str | None = None,
+     device: str | None = None,
+ ) -> list[dict[str, Any]]:
+     """
+     Classify layout blocks using LayoutLMv3.
+ 
+     Args:
+         page_image: PIL Image of the document page.
+         bboxes: List of bounding boxes as (x0, y0, x1, y1) tuples.
+         text_spans: List of text content for each bounding box.
+         model_name: Model name override (uses default if None).
+         device: Device override (uses auto-detect if None).
+ 
+     Returns:
+         List of classification results with structure:
+         [
+             {
+                 "text": str,
+                 "bbox": tuple,
+                 "label": str,  # "TITLE", "HEADER", "BODY", etc.
+                 "confidence": float,
+             },
+             ...
+         ]
+ 
+     Example:
+         from PIL import Image
+ 
+         image = Image.open("page.png")
+         bboxes = [(10, 10, 100, 30), (10, 50, 200, 70)]
+         texts = ["Document Title", "This is the introduction."]
+ 
+         results = classify_layout_blocks(image, bboxes, texts)
+         for r in results:
+             print(f"{r['label']}: {r['text']}")
+     """
+     if len(bboxes) != len(text_spans):
+         raise ValueError("Number of bboxes must match number of text_spans")
+ 
+     if not bboxes:
+         logger.warning("no_bboxes_to_classify")
+         return []
+ 
+     # Load model
+     model_dict = get_layout_model(model_name, device)
+     model = model_dict["model"]
+     processor = model_dict["processor"]
+     device_str = model_dict["device"]
+ 
+     try:
+         import torch
+ 
+         # Normalize bboxes to the 0-1000 scale LayoutLM expects, clamping
+         # so rounding or out-of-page boxes cannot exceed the valid range
+         width, height = page_image.size
+         normalized_bboxes = []
+         for x0, y0, x1, y1 in bboxes:
+             normalized_bboxes.append([
+                 min(1000, max(0, int((x0 / width) * 1000))),
+                 min(1000, max(0, int((y0 / height) * 1000))),
+                 min(1000, max(0, int((x1 / width) * 1000))),
+                 min(1000, max(0, int((y1 / height) * 1000))),
+             ])
+ 
+         # Prepare inputs
+         encoding = processor(
+             page_image,
+             text_spans,
+             boxes=normalized_bboxes,
+             return_tensors="pt",
+             padding="max_length",
+             truncation=True,
+         )
+ 
+         # The tokenizer splits spans into sub-tokens and adds special
+         # tokens, so token index i does NOT correspond to span i. Map
+         # each span to its first sub-token before inference.
+         word_ids = encoding.word_ids(batch_index=0)
+         first_token: dict[int, int] = {}
+         for token_idx, word_id in enumerate(word_ids):
+             if word_id is not None and word_id not in first_token:
+                 first_token[word_id] = token_idx
+ 
+         # Move to device
+         if device_str != "cpu":
+             encoding = {k: v.to(device_str) for k, v in encoding.items()}
+ 
+         # Run inference
+         with torch.no_grad():
+             outputs = model(**encoding)
+             predictions = torch.argmax(outputs.logits, dim=-1)
+             probabilities = torch.softmax(outputs.logits, dim=-1)
+ 
+         # Extract results (one prediction per input span)
+         results = []
+         for i, (text, bbox) in enumerate(zip(text_spans, bboxes)):
+             token_idx = first_token.get(i)
+             if token_idx is None:
+                 # Span fell beyond the truncation limit
+                 results.append({"text": text, "bbox": bbox, "label": "O", "confidence": 0.0})
+                 continue
+ 
+             pred_idx = int(predictions[0, token_idx].item())
+             confidence = float(probabilities[0, token_idx, pred_idx].item())
+ 
+             label = LABEL_NAMES[pred_idx] if pred_idx < len(LABEL_NAMES) else "O"
+ 
+             # Simplify label (remove B-/I- prefix)
+             simplified_label = label.split("-")[-1] if "-" in label else label
+ 
+             results.append({
+                 "text": text,
+                 "bbox": bbox,
+                 "label": simplified_label,
+                 "confidence": confidence,
+             })
+ 
+         logger.debug("layout_classification_complete", blocks=len(results))
+         return results
+ 
+     except Exception as e:
+         logger.error("layout_classification_failed", error=str(e))
+         raise RuntimeError(f"Failed to classify layout blocks: {e}") from e
+ 
+ 
+ def check_layout_model_available() -> bool:
+     """
+     Check if LayoutLM dependencies are available.
+ 
+     Returns:
+         True if transformers and torch are installed.
+     """
+     try:
+         import torch
+         import transformers
+         return True
+     except ImportError:
+         return False
+ 
+ 
+ def get_layout_model_info() -> dict[str, Any]:
+     """
+     Get information about LayoutLM configuration.
+ 
+     Returns:
+         Dictionary with model configuration and availability.
+     """
+     info = {
+         "available": check_layout_model_available(),
+         "default_model": DEFAULT_LAYOUT_MODEL,
+         "models": {
+             "base": LAYOUT_MODEL_BASE,
+             "large": LAYOUT_MODEL_LARGE,
+         },
+         "device": get_device_from_env(),
+         "env_model": os.getenv("RNSR_LAYOUT_MODEL"),
+         "env_device": os.getenv("RNSR_LAYOUT_DEVICE"),
+         "cache_dir": os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface")),
+     }
+ 
+     return info
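Both RNSR_LAYOUT_MODEL and RNSR_LAYOUT_DEVICE are read at call time, so they can be set from Python before the first model load. A small sketch (the chosen values are examples only):

    import os

    # Example overrides; the model must be a LayoutLMv3-style checkpoint.
    os.environ["RNSR_LAYOUT_MODEL"] = "microsoft/layoutlmv3-base"
    os.environ["RNSR_LAYOUT_DEVICE"] = "cpu"

    from rnsr.ingestion.layout_model import get_layout_model_info

    print(get_layout_model_info())  # "device" and "env_model" reflect the overrides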
rnsr/ingestion/ocr_fallback.py
@@ -0,0 +1,177 @@
+ """
+ OCR Fallback - TIER 3: For Scanned/Image-Only PDFs
+ 
+ When the document contains no extractable text (scanned PDFs, image-only),
+ this module applies OCR to generate a text layer, then re-runs analysis.
+ 
+ Use this fallback when:
+ - PDF contains only images (scanned documents)
+ - No text can be extracted via PyMuPDF
+ - Document was scanned without OCR processing
+ 
+ Dependencies:
+ - pytesseract (OCR engine wrapper)
+ - pdf2image (PDF to image conversion)
+ - Tesseract-OCR installed on system
+ """
+ 
+ from __future__ import annotations
+ 
+ from pathlib import Path
+ 
+ import structlog
+ 
+ from rnsr.exceptions import OCRError
+ from rnsr.models import DocumentNode, DocumentTree
+ 
+ logger = structlog.get_logger(__name__)
+ 
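Because Tesseract is a system binary that pip cannot install, a preflight check before batch ingestion avoids failing halfway through a run. A minimal sketch using the helper defined just below:

    from rnsr.ingestion.ocr_fallback import check_ocr_available

    if not check_ocr_available():
        # Needs: pip install pytesseract pdf2image, plus a system Tesseract
        raise RuntimeError("OCR fallback unavailable; see module docstring")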
+ 
+ def check_ocr_available() -> bool:
+     """
+     Check if OCR dependencies are available.
+ 
+     Returns:
+         True if pytesseract and pdf2image are importable and the
+         Tesseract binary responds.
+     """
+     try:
+         import pytesseract
+         from pdf2image import convert_from_path  # noqa: F401
+ 
+         # Test that the tesseract binary is installed and callable
+         pytesseract.get_tesseract_version()
+         return True
+     except Exception:
+         return False
+ 
+ 
+ def try_ocr_ingestion(pdf_path: Path | str) -> DocumentTree:
+     """
+     TIER 3 Fallback: Use OCR for scanned/image-only PDFs.
+ 
+     This method:
+     1. Converts PDF pages to images
+     2. Applies Tesseract OCR to each page
+     3. Builds a document tree from OCR output
+ 
+     Args:
+         pdf_path: Path to the PDF file.
+ 
+     Returns:
+         DocumentTree from OCR text.
+ 
+     Raises:
+         OCRError: If OCR fails or dependencies not available.
+     """
+     pdf_path = Path(pdf_path)
+ 
+     logger.info("using_ocr_fallback", path=str(pdf_path))
+ 
+     # Check dependencies
+     try:
+         import pytesseract
+         from pdf2image import convert_from_path
+     except ImportError as e:
+         raise OCRError(
+             f"OCR dependencies not available: {e}. "
+             "Install with: pip install pytesseract pdf2image"
+         ) from e
+ 
+     try:
+         # Convert PDF pages to images
+         logger.debug("converting_pdf_to_images", path=str(pdf_path))
+         images = convert_from_path(pdf_path, dpi=300)
+ 
+         logger.info("pdf_converted", pages=len(images))
+ 
+         # OCR each page
+         ocr_texts: list[str] = []
+         for i, image in enumerate(images):
+             logger.debug("processing_page_ocr", page=i)
+             text = pytesseract.image_to_string(image)
+             ocr_texts.append(text)
+ 
+         # Combine and build tree
+         full_text = "\n\n".join(ocr_texts)
+ 
+         if not full_text.strip():
+             logger.warning("ocr_no_text_found", path=str(pdf_path))
+             root = DocumentNode(id="root", level=0, header="Document")
+             return DocumentTree(
+                 title="Empty OCR Result",
+                 root=root,
+                 total_nodes=1,
+                 ingestion_tier=3,
+                 ingestion_method="ocr",
+             )
+ 
+         # Build tree from OCR text
+         return _build_tree_from_ocr(ocr_texts, pdf_path.stem)
+ 
+     except Exception as e:
+         raise OCRError(f"OCR processing failed: {e}") from e
+ 
+ 
+ def _build_tree_from_ocr(
+     page_texts: list[str],
+     title: str,
+ ) -> DocumentTree:
+     """
+     Build a document tree from OCR output.
+ 
+     Creates a simple page-based structure since OCR
+     doesn't preserve font information.
+     """
+     root = DocumentNode(
+         id="root",
+         level=0,
+         header=title,
+     )
+ 
+     for page_num, text in enumerate(page_texts, 1):
+         text = text.strip()
+         if not text:
+             continue
+ 
+         # Create a section per page
+         section = DocumentNode(
+             id=f"page_{page_num:03d}",
+             level=1,
+             header=f"Page {page_num}",
+             content=text,
+             page_num=page_num - 1,  # 0-indexed
+         )
+         root.children.append(section)
+ 
+     return DocumentTree(
+         title=title,
+         root=root,
+         total_nodes=len(root.children) + 1,
+         ingestion_tier=3,
+         ingestion_method="ocr",
+     )
+ 
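For illustration, a sketch of the tree _build_tree_from_ocr() produces; blank pages are skipped, so the child count can be smaller than the page count (inputs here are made up):

    tree = _build_tree_from_ocr(["First page text", "", "Third page text"], "scan")
    for node in tree.root.children:
        print(node.id, node.header, node.page_num)
    # page_001 Page 1 0
    # page_003 Page 3 2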
+ 
+ def has_extractable_text(pdf_path: Path | str) -> bool:
+     """
+     Check if a PDF has extractable text.
+ 
+     Args:
+         pdf_path: Path to the PDF file.
+ 
+     Returns:
+         True if text can be extracted, False if OCR is needed.
+     """
+     import fitz  # PyMuPDF
+ 
+     # Context manager guarantees the document is closed even if a
+     # page raises mid-iteration.
+     with fitz.open(Path(pdf_path)) as doc:
+         for page in doc:
+             if page.get_text().strip():
+                 return True
+ 
+     return False
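Putting the pieces together, a hedged sketch of the tier-3 decision as this module frames it ("scan.pdf" is a placeholder path):

    from rnsr.ingestion.ocr_fallback import has_extractable_text, try_ocr_ingestion

    pdf = "scan.pdf"  # placeholder
    if has_extractable_text(pdf):
        ...  # text layer present: proceed with the normal (non-OCR) pipeline
    else:
        tree = try_ocr_ingestion(pdf)  # TIER 3: OCR-generated text layer
        print(tree.title, tree.total_nodes, tree.ingestion_method)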