rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
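
The hunk below corresponds to rnsr/ingestion/vision_retrieval.py (+965 lines), the new vision-based retrieval module.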
@@ -0,0 +1,965 @@
+"""
+Vision-Based Retrieval - OCR-Free Document Analysis
+
+This module implements vision-based document retrieval that works directly
+on PDF page images without requiring text extraction or OCR.
+
+Inspired by PageIndex's Vision-based Vectorless RAG:
+"OCR-free, vision-only RAG with PageIndex's reasoning-native retrieval
+workflow that works directly over PDF page images."
+
+Key Features:
+1. Page image extraction from PDFs
+2. Vision LLM integration (GPT-4V, Gemini Vision)
+3. Page-level navigation using visual reasoning
+4. Hybrid text+vision mode for charts/diagrams
+5. Image caching for performance
+
+Use Cases:
+- Scanned documents where OCR quality is poor
+- Documents with complex layouts, charts, or diagrams
+- Image-heavy documents (presentations, reports with graphics)
+- Documents where visual structure provides context
+"""
+
+from __future__ import annotations
+
+import base64
+import hashlib
+import io
+import os
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Callable
+
+import structlog
+
+logger = structlog.get_logger(__name__)
+
+
+# =============================================================================
+# Configuration
+# =============================================================================
+
+
+@dataclass
+class VisionConfig:
+    """Configuration for vision-based retrieval."""
+
+    # Vision model settings
+    vision_model: str = "gemini-2.5-flash"  # Model with vision support
+    provider: str = "gemini"  # gemini, openai
+
+    # Image settings
+    image_dpi: int = 150  # DPI for page rendering
+    max_image_size: int = 2048  # Max dimension in pixels
+    image_format: str = "PNG"  # PNG or JPEG
+
+    # Caching
+    enable_cache: bool = True
+    cache_dir: str = ".rnsr_cache/vision"
+
+    # Navigation settings
+    max_pages_per_batch: int = 5  # Pages to evaluate per LLM call
+    page_selection_threshold: float = 0.3  # Min relevance for selection
+
+
+# =============================================================================
+# Page Image Extractor
+# =============================================================================
+
+
+class PageImageExtractor:
+    """
+    Extracts page images from PDF documents.
+
+    Uses PyMuPDF (fitz) for high-quality page rendering.
+    """
+
+    def __init__(self, config: VisionConfig):
+        self.config = config
+        self._cache_dir = Path(config.cache_dir)
+
+        if config.enable_cache:
+            self._cache_dir.mkdir(parents=True, exist_ok=True)
+
+    def extract_page_image(
+        self,
+        pdf_path: Path | str,
+        page_num: int,
+    ) -> bytes:
+        """
+        Extract a single page as an image.
+
+        Args:
+            pdf_path: Path to PDF file.
+            page_num: Page number (0-indexed).
+
+        Returns:
+            Image bytes (PNG or JPEG).
+        """
+        pdf_path = Path(pdf_path)
+
+        # Check cache
+        cache_key = self._get_cache_key(pdf_path, page_num)
+        if self.config.enable_cache:
+            cached = self._load_from_cache(cache_key)
+            if cached:
+                return cached
+
+        try:
+            import fitz  # PyMuPDF
+        except ImportError:
+            raise ImportError("PyMuPDF not installed. Install with: pip install pymupdf")
+
+        doc = fitz.open(pdf_path)
+
+        if page_num >= len(doc):
+            doc.close()
+            raise ValueError(f"Page {page_num} does not exist (document has {len(doc)} pages)")
+
+        page = doc[page_num]
+
+        # Calculate zoom for target DPI
+        zoom = self.config.image_dpi / 72  # 72 is default PDF DPI
+        mat = fitz.Matrix(zoom, zoom)
+
+        # Render page to pixmap
+        pix = page.get_pixmap(matrix=mat)
+
+        # Resize if too large
+        if max(pix.width, pix.height) > self.config.max_image_size:
+            scale = self.config.max_image_size / max(pix.width, pix.height)
+            new_width = int(pix.width * scale)
+            new_height = int(pix.height * scale)
+
+            # Re-render at correct size
+            zoom = zoom * scale
+            mat = fitz.Matrix(zoom, zoom)
+            pix = page.get_pixmap(matrix=mat)
+
+        # Convert to image bytes
+        if self.config.image_format.upper() == "PNG":
+            image_bytes = pix.tobytes("png")
+        else:
+            image_bytes = pix.tobytes("jpeg")
+
+        doc.close()
+
+        # Cache the result
+        if self.config.enable_cache:
+            self._save_to_cache(cache_key, image_bytes)
+
+        logger.debug(
+            "page_image_extracted",
+            page=page_num,
+            width=pix.width,
+            height=pix.height,
+        )
+
+        return image_bytes
+
+    def extract_all_pages(
+        self,
+        pdf_path: Path | str,
+        max_pages: int | None = None,
+    ) -> list[bytes]:
+        """
+        Extract all pages as images.
+
+        Args:
+            pdf_path: Path to PDF file.
+            max_pages: Optional limit on number of pages.
+
+        Returns:
+            List of image bytes.
+        """
+        pdf_path = Path(pdf_path)
+
+        try:
+            import fitz
+        except ImportError:
+            raise ImportError("PyMuPDF not installed. Install with: pip install pymupdf")
+
+        doc = fitz.open(pdf_path)
+        num_pages = min(len(doc), max_pages) if max_pages else len(doc)
+        doc.close()
+
+        images = []
+        for i in range(num_pages):
+            images.append(self.extract_page_image(pdf_path, i))
+
+        logger.info("all_pages_extracted", count=len(images))
+        return images
+
+    def get_page_count(self, pdf_path: Path | str) -> int:
+        """Get the number of pages in a PDF."""
+        try:
+            import fitz
+        except ImportError:
+            raise ImportError("PyMuPDF not installed. Install with: pip install pymupdf")
+
+        doc = fitz.open(pdf_path)
+        count = len(doc)
+        doc.close()
+        return count
+
+    def _get_cache_key(self, pdf_path: Path, page_num: int) -> str:
+        """Generate a cache key for a page."""
+        # Include file modification time for cache invalidation
+        mtime = pdf_path.stat().st_mtime
+        key_data = f"{pdf_path}:{page_num}:{mtime}:{self.config.image_dpi}"
+        return hashlib.md5(key_data.encode()).hexdigest()
+
+    def _load_from_cache(self, cache_key: str) -> bytes | None:
+        """Load an image from cache."""
+        cache_path = self._cache_dir / f"{cache_key}.{self.config.image_format.lower()}"
+        if cache_path.exists():
+            return cache_path.read_bytes()
+        return None
+
+    def _save_to_cache(self, cache_key: str, image_bytes: bytes) -> None:
+        """Save an image to cache."""
+        cache_path = self._cache_dir / f"{cache_key}.{self.config.image_format.lower()}"
+        cache_path.write_bytes(image_bytes)
+
+
+# =============================================================================
+# Vision LLM Integration
+# =============================================================================
+
+
+class VisionLLM:
+    """
+    Vision LLM wrapper supporting multiple providers.
+
+    Provides a unified interface for vision-based reasoning.
+    """

+    def __init__(self, config: VisionConfig):
+        self.config = config
+        self._client: Any = None
+
+    def _get_client(self) -> Any:
+        """Get or create the vision LLM client."""
+        if self._client is not None:
+            return self._client
+
+        if self.config.provider == "gemini":
+            self._client = self._create_gemini_client()
+        elif self.config.provider == "openai":
+            self._client = self._create_openai_client()
+        else:
+            raise ValueError(f"Unknown vision provider: {self.config.provider}")
+
+        return self._client
+
+    def _create_gemini_client(self) -> Any:
+        """Create a Gemini vision client."""
+        try:
+            from google import genai
+        except ImportError:
+            raise ImportError("google-genai not installed. Install with: pip install google-genai")
+
+        api_key = os.getenv("GOOGLE_API_KEY")
+        if not api_key:
+            raise ValueError("GOOGLE_API_KEY environment variable not set")
+
+        return genai.Client(api_key=api_key)
+
+    def _create_openai_client(self) -> Any:
+        """Create an OpenAI vision client."""
+        try:
+            from openai import OpenAI
+        except ImportError:
+            raise ImportError("openai not installed. Install with: pip install openai")
+
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            raise ValueError("OPENAI_API_KEY environment variable not set")
+
+        return OpenAI(api_key=api_key)
+
+    def analyze_image(
+        self,
+        image_bytes: bytes,
+        prompt: str,
+    ) -> str:
+        """
+        Analyze a single image with a prompt.
+
+        Args:
+            image_bytes: Image as bytes.
+            prompt: Question or instruction about the image.
+
+        Returns:
+            LLM response.
+        """
+        client = self._get_client()
+
+        if self.config.provider == "gemini":
+            return self._analyze_gemini(client, image_bytes, prompt)
+        elif self.config.provider == "openai":
+            return self._analyze_openai(client, image_bytes, prompt)
+        else:
+            raise ValueError(f"Unknown provider: {self.config.provider}")
+
+    def analyze_multiple_images(
+        self,
+        images: list[bytes],
+        prompt: str,
+    ) -> str:
+        """
+        Analyze multiple images together.
+
+        Args:
+            images: List of image bytes.
+            prompt: Question about all images.
+
+        Returns:
+            LLM response.
+        """
+        client = self._get_client()
+
+        if self.config.provider == "gemini":
+            return self._analyze_gemini_multi(client, images, prompt)
+        elif self.config.provider == "openai":
+            return self._analyze_openai_multi(client, images, prompt)
+        else:
+            raise ValueError(f"Unknown provider: {self.config.provider}")
+
+    def _analyze_gemini(
+        self,
+        client: Any,
+        image_bytes: bytes,
+        prompt: str,
+    ) -> str:
+        """Analyze image using Gemini."""
+        from google.genai import types
+
+        # Create image part
+        image_part = types.Part.from_bytes(
+            data=image_bytes,
+            mime_type=f"image/{self.config.image_format.lower()}",
+        )
+
+        response = client.models.generate_content(
+            model=self.config.vision_model,
+            contents=[prompt, image_part],
+        )
+
+        return response.text or ""
+
+    def _analyze_gemini_multi(
+        self,
+        client: Any,
+        images: list[bytes],
+        prompt: str,
+    ) -> str:
+        """Analyze multiple images using Gemini."""
+        from google.genai import types
+
+        # Create content parts
+        parts = [prompt]
+        for i, img_bytes in enumerate(images):
+            parts.append(f"\n\n[Page {i+1}]")
+            parts.append(types.Part.from_bytes(
+                data=img_bytes,
+                mime_type=f"image/{self.config.image_format.lower()}",
+            ))
+
+        response = client.models.generate_content(
+            model=self.config.vision_model,
+            contents=parts,
+        )
+
+        return response.text or ""
+
+    def _analyze_openai(
+        self,
+        client: Any,
+        image_bytes: bytes,
+        prompt: str,
+    ) -> str:
+        """Analyze image using OpenAI."""
+        # Encode image as base64
+        b64_image = base64.b64encode(image_bytes).decode("utf-8")
+        mime_type = f"image/{self.config.image_format.lower()}"
+
+        response = client.chat.completions.create(
+            model=self.config.vision_model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": prompt},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:{mime_type};base64,{b64_image}",
+                            },
+                        },
+                    ],
+                }
+            ],
+            max_tokens=4096,
+        )
+
+        return response.choices[0].message.content or ""
+
+    def _analyze_openai_multi(
+        self,
+        client: Any,
+        images: list[bytes],
+        prompt: str,
+    ) -> str:
+        """Analyze multiple images using OpenAI."""
+        mime_type = f"image/{self.config.image_format.lower()}"
+
+        content = [{"type": "text", "text": prompt}]
+
+        for i, img_bytes in enumerate(images):
+            b64_image = base64.b64encode(img_bytes).decode("utf-8")
+            content.append({
+                "type": "text",
+                "text": f"\n[Page {i+1}]",
+            })
+            content.append({
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:{mime_type};base64,{b64_image}",
+                },
+            })
+
+        response = client.chat.completions.create(
+            model=self.config.vision_model,
+            messages=[{"role": "user", "content": content}],
+            max_tokens=4096,
+        )
+
+        return response.choices[0].message.content or ""
+
+
+# =============================================================================
+# Vision-Based Navigator
+# =============================================================================
+
+
+@dataclass
+class VisionPage:
+    """Represents a page in the vision index."""
+    page_num: int
+    image_bytes: bytes
+    summary: str = ""
+    relevance_score: float = 0.0
+
+
+class VisionNavigator:
+    """
+    Vision-based document navigator.
+
+    Works directly on PDF page images without OCR or text extraction.
+    Uses vision LLM to understand and navigate document content.
+    """
+
+    def __init__(
+        self,
+        pdf_path: Path | str,
+        config: VisionConfig | None = None,
+    ):
+        self.pdf_path = Path(pdf_path)
+        self.config = config or VisionConfig()
+
+        # Components
+        self.extractor = PageImageExtractor(self.config)
+        self.vision_llm = VisionLLM(self.config)
+
+        # State
+        self.page_count = self.extractor.get_page_count(self.pdf_path)
+        self.page_summaries: dict[int, str] = {}
+        self.selected_pages: list[int] = []
+
+    def navigate(
+        self,
+        question: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        """
+        Navigate document visually to answer a question.
+
+        Args:
+            question: The user's question.
+            metadata: Optional metadata (e.g., multiple choice options).
+
+        Returns:
+            Dict with answer, confidence, selected_pages.
+        """
+        logger.info(
+            "vision_navigation_started",
+            question=question[:100],
+            pages=self.page_count,
+        )
+
+        trace = []
+
+        # Phase 1: Page selection using vision
+        relevant_pages = self._select_relevant_pages(question, trace)
+        self.selected_pages = relevant_pages
+
+        if not relevant_pages:
+            return {
+                "answer": "Could not identify relevant pages from visual analysis.",
+                "confidence": 0.0,
+                "selected_pages": [],
+                "trace": trace,
+            }
+
+        # Phase 2: Deep analysis of selected pages
+        answer, confidence = self._analyze_selected_pages(
+            question,
+            relevant_pages,
+            metadata,
+            trace,
+        )
+
+        logger.info(
+            "vision_navigation_complete",
+            selected_pages=relevant_pages,
+            confidence=confidence,
+        )
+
+        return {
+            "answer": answer,
+            "confidence": confidence,
+            "selected_pages": relevant_pages,
+            "trace": trace,
+        }
+
+    def _select_relevant_pages(
+        self,
+        question: str,
+        trace: list[dict],
+    ) -> list[int]:
+        """Select pages that are likely relevant to the question."""
+        relevant = []
+
+        # Process pages in batches
+        batch_size = self.config.max_pages_per_batch
+
+        for batch_start in range(0, self.page_count, batch_size):
+            batch_end = min(batch_start + batch_size, self.page_count)
+            batch_pages = list(range(batch_start, batch_end))
+
+            # Extract images for this batch
+            images = [
+                self.extractor.extract_page_image(self.pdf_path, p)
+                for p in batch_pages
+            ]
+
+            # Ask vision LLM to evaluate pages
+            evaluation_prompt = f"""You are evaluating document pages for relevance to a question.
+
+Question: {question}
+
+For each page shown (numbered starting from {batch_start + 1}), estimate its relevance:
+- Does this page contain information that could help answer the question?
+- What key content is visible on this page?
+
+OUTPUT FORMAT (JSON):
+{{
+  "pages": [
+    {{"page_num": {batch_start + 1}, "relevance": 0.0-1.0, "summary": "brief description"}}
+  ]
+}}
+
+Respond with JSON only:"""
+
+            try:
+                import json
+
+                response = self.vision_llm.analyze_multiple_images(
+                    images,
+                    evaluation_prompt,
+                )
+
+                # Parse response
+                json_match = __import__("re").search(r'\{[\s\S]*\}', response)
+                if json_match:
+                    result = json.loads(json_match.group())
+
+                    for page_info in result.get("pages", []):
+                        page_num = page_info.get("page_num", 0) - 1  # Convert to 0-indexed
+                        relevance = page_info.get("relevance", 0)
+                        summary = page_info.get("summary", "")
+
+                        if page_num >= 0 and page_num < self.page_count:
+                            self.page_summaries[page_num] = summary
+
+                            if relevance >= self.config.page_selection_threshold:
+                                relevant.append(page_num)
+
+                trace.append({
+                    "action": "page_selection",
+                    "batch": f"{batch_start}-{batch_end}",
+                    "selected": [p for p in relevant if batch_start <= p < batch_end],
+                })
+
+            except Exception as e:
+                logger.warning("page_evaluation_failed", error=str(e))
+                # Fallback: include all pages in batch
+                relevant.extend(batch_pages)
+
+        # Sort by page number
+        relevant.sort()
+
+        return relevant[:10]  # Limit to 10 most relevant pages
+
+    def _analyze_selected_pages(
+        self,
+        question: str,
+        pages: list[int],
+        metadata: dict[str, Any] | None,
+        trace: list[dict],
+    ) -> tuple[str, float]:
+        """Perform deep analysis on selected pages."""
+        # Extract images for selected pages
+        images = [
+            self.extractor.extract_page_image(self.pdf_path, p)
+            for p in pages
+        ]
+
+        # Build analysis prompt
+        page_descriptions = "\n".join(
+            f"Page {p+1}: {self.page_summaries.get(p, 'No summary')}"
+            for p in pages
+        )
+
+        options = metadata.get("options") if metadata else None
+        if options:
+            options_text = "\n".join(f"{chr(65+i)}. {opt}" for i, opt in enumerate(options))
+            analysis_prompt = f"""Based on the document pages shown, answer this multiple-choice question.
+
+Question: {question}
+
+Options:
+{options_text}
+
+Page summaries:
+{page_descriptions}
+
+Instructions:
+1. Carefully examine ALL page images
+2. Find evidence that supports one of the options
+3. Respond with the letter and full option text
+
+Your answer (e.g., "A. [option text]"):"""
+        else:
+            analysis_prompt = f"""Based on the document pages shown, answer the question.
+
+Question: {question}
+
+Page summaries:
+{page_descriptions}
+
+Instructions:
+1. Carefully examine ALL page images
+2. Find specific information that answers the question
+3. Cite the page number(s) where you found the answer
+
+Answer:"""
+
+        try:
+            response = self.vision_llm.analyze_multiple_images(images, analysis_prompt)
+
+            trace.append({
+                "action": "deep_analysis",
+                "pages": pages,
+                "response_length": len(response),
+            })
+
+            # Estimate confidence based on response quality
+            confidence = 0.7 if len(response) > 50 else 0.5
+
+            # Normalize multiple choice answer
+            if options:
+                response = self._normalize_mc_answer(response, options)
+                confidence = 0.8
+
+            return response.strip(), confidence
+
+        except Exception as e:
+            logger.error("page_analysis_failed", error=str(e))
+            return f"Error analyzing pages: {str(e)}", 0.0
+
+    def _normalize_mc_answer(self, answer: str, options: list) -> str:
+        """Normalize multiple choice answer."""
+        answer_lower = answer.lower().strip()
+
+        for i, opt in enumerate(options):
+            letter = chr(65 + i)
+            if answer_lower.startswith(f"{letter.lower()}.") or opt.lower() in answer_lower:
+                return opt
+
+        return answer
+
+
+# =============================================================================
+# Hybrid Text+Vision Navigator
+# =============================================================================
+
+
+class HybridVisionNavigator:
+    """
+    Hybrid navigator combining text-based tree navigation with vision analysis.
+
+    Best of both worlds:
+    - Use tree-based ToT navigation for structured content
+    - Use vision analysis for charts, diagrams, complex layouts
+
+    The hybrid approach detects when vision analysis would be beneficial:
+    - Pages with low text extraction quality
+    - Pages with images/charts
+    - Pages where text structure is unclear
+    """
+
+    def __init__(
+        self,
+        pdf_path: Path | str,
+        skeleton: dict | None = None,
+        kv_store: Any = None,
+        vision_config: VisionConfig | None = None,
+    ):
+        self.pdf_path = Path(pdf_path)
+        self.skeleton = skeleton
+        self.kv_store = kv_store
+        self.vision_config = vision_config or VisionConfig()
+
+        # Vision components
+        self.vision_nav = VisionNavigator(pdf_path, self.vision_config)
+
+        # Determine when to use vision
+        self._vision_pages: set[int] = set()
+        self._analyze_pages_for_vision_need()
+
+    def _analyze_pages_for_vision_need(self) -> None:
+        """Identify pages that would benefit from vision analysis."""
+        if self.skeleton is None:
+            # No text structure - use vision for all
+            self._vision_pages = set(range(self.vision_nav.page_count))
+            return
+
+        # Check each page for:
+        # 1. Low text content
+        # 2. Images/figures mentioned in text
+        # 3. Tables referenced but not parsed
+
+        try:
+            import fitz
+            doc = fitz.open(self.pdf_path)
+
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+
+                # Check for images
+                images = page.get_images()
+                if len(images) > 2:
+                    self._vision_pages.add(page_num)
+                    continue
+
+                # Check text density
+                text = page.get_text()
+                if len(text.strip()) < 100:  # Low text content
+                    self._vision_pages.add(page_num)
+                    continue
+
+                # Check for table indicators
+                if "table" in text.lower() or "figure" in text.lower():
+                    self._vision_pages.add(page_num)
+
+            doc.close()
+
+        except Exception as e:
+            logger.warning("vision_need_analysis_failed", error=str(e))
+
+    def navigate(
+        self,
+        question: str,
+        metadata: dict[str, Any] | None = None,
+    ) -> dict[str, Any]:
+        """
+        Navigate using hybrid text+vision approach.
+
+        1. First try text-based navigation
+        2. If text mentions charts/figures, or low confidence, use vision
+        3. Combine results for final answer
+        """
+        results = {
+            "text_result": None,
+            "vision_result": None,
+            "combined_answer": None,
+            "confidence": 0.0,
+            "method_used": "text",
+        }
+
+        # Try text-based navigation first
+        if self.skeleton and self.kv_store:
+            try:
+                from rnsr.agent.rlm_navigator import run_rlm_navigator
+
+                text_result = run_rlm_navigator(
+                    question,
+                    self.skeleton,
+                    self.kv_store,
+                    metadata=metadata,
+                )
+                results["text_result"] = text_result
+
+                # Check if vision would help
+                needs_vision = self._should_use_vision(text_result, question)
+
+                if not needs_vision:
+                    results["combined_answer"] = text_result.get("answer")
+                    results["confidence"] = text_result.get("confidence", 0.5)
+                    results["method_used"] = "text"
+                    return results
+
+            except Exception as e:
+                logger.warning("text_navigation_failed", error=str(e))
+
+        # Use vision navigation
+        vision_result = self.vision_nav.navigate(question, metadata)
+        results["vision_result"] = vision_result
+
+        # Combine results
+        if results["text_result"] and vision_result.get("confidence", 0) > 0.3:
+            # Both methods produced results - combine
+            results["combined_answer"] = self._combine_answers(
+                results["text_result"].get("answer"),
+                vision_result.get("answer"),
+                question,
+            )
+            results["confidence"] = max(
+                results["text_result"].get("confidence", 0),
+                vision_result.get("confidence", 0),
+            )
+            results["method_used"] = "hybrid"
+        else:
+            # Use vision result
+            results["combined_answer"] = vision_result.get("answer")
+            results["confidence"] = vision_result.get("confidence", 0)
+            results["method_used"] = "vision"
+
+        return results
+
+    def _should_use_vision(
+        self,
+        text_result: dict[str, Any],
+        question: str,
+    ) -> bool:
+        """Determine if vision analysis should be used."""
+        # Low confidence from text
+        if text_result.get("confidence", 0) < 0.5:
+            return True
+
+        # Question mentions visual elements
+        visual_keywords = ["chart", "graph", "figure", "diagram", "image", "table", "picture"]
+        question_lower = question.lower()
+        if any(kw in question_lower for kw in visual_keywords):
+            return True
+
+        # Answer mentions visual elements
+        answer = str(text_result.get("answer", "")).lower()
+        if any(kw in answer for kw in visual_keywords):
+            return True
+
+        return False
+
+    def _combine_answers(
+        self,
+        text_answer: str | None,
+        vision_answer: str | None,
+        question: str,
+    ) -> str:
+        """Combine text and vision answers."""
+        if not text_answer:
+            return vision_answer or "No answer found"
+        if not vision_answer:
+            return text_answer
+
+        # If answers are similar, use text (usually more precise)
+        if text_answer.lower().strip() == vision_answer.lower().strip():
+            return text_answer
+
+        # Use LLM to combine
+        try:
+            from rnsr.llm import get_llm
+            llm = get_llm()
+
+            prompt = f"""Two methods analyzed a document to answer a question.
+Both methods found relevant information. Combine their answers.
+
+Question: {question}
+
+Text-based answer: {text_answer}
+
+Vision-based answer: {vision_answer}
+
+Combined answer (choose the most accurate and complete one, or merge if complementary):"""
+
+            response = llm.complete(prompt)
+            return str(response).strip()
+
+        except Exception as e:
+            logger.warning("answer_combination_failed", error=str(e))
+            # Fallback: return text answer
+            return text_answer
+
+
+# =============================================================================
+# Factory Functions
+# =============================================================================
+
+
+def create_vision_navigator(
+    pdf_path: Path | str,
+    config: VisionConfig | None = None,
+) -> VisionNavigator:
+    """
+    Create a vision-based navigator.
+
+    Args:
+        pdf_path: Path to PDF file.
+        config: Optional vision configuration.
+
+    Returns:
+        VisionNavigator instance.
+
+    Example:
+        from rnsr.ingestion.vision_retrieval import create_vision_navigator
+
+        nav = create_vision_navigator("scanned_document.pdf")
+        result = nav.navigate("What is the total amount?")
+        print(result["answer"])
+    """
+    return VisionNavigator(pdf_path, config)
+
+
+def create_hybrid_navigator(
+    pdf_path: Path | str,
+    skeleton: dict | None = None,
+    kv_store: Any = None,
+    vision_config: VisionConfig | None = None,
+) -> HybridVisionNavigator:
+    """
+    Create a hybrid text+vision navigator.
+
+    Args:
+        pdf_path: Path to PDF file.
+        skeleton: Optional skeleton index for text navigation.
+        kv_store: Optional KV store for text content.
+        vision_config: Optional vision configuration.
+
+    Returns:
+        HybridVisionNavigator instance.
+    """
+    return HybridVisionNavigator(pdf_path, skeleton, kv_store, vision_config)