rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,733 @@
1
+ """
2
+ Comprehensive Benchmark Suite
3
+
4
+ Evaluates RNSR against different configurations and approaches:
5
+ 1. Standard Navigator (ToT-based)
6
+ 2. RLM Navigator (pre-filtering + deep recursion + verification)
7
+ 3. Vision Navigator (OCR-free page image analysis)
8
+ 4. Hybrid Navigator (text + vision)
9
+
10
+ Benchmarks inspired by:
11
+ - FinanceBench: Financial document QA (PageIndex achieved 98.7%)
12
+ - OOLONG: Long context aggregation tasks
13
+ - Custom Multi-hop: Complex relational queries
14
+
15
+ Usage:
16
+ from rnsr.benchmarks.comprehensive_benchmark import run_comprehensive_benchmark
17
+
18
+ results = run_comprehensive_benchmark(
19
+ pdf_paths=["contract.pdf", "report.pdf"],
20
+ benchmark_type="all", # or "financebench", "custom", etc.
21
+ )
22
+ results.print_report()
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import json
28
+ import time
29
+ from dataclasses import dataclass, field
30
+ from datetime import datetime, timezone
31
+ from pathlib import Path
32
+ from typing import Any, Callable, Literal
33
+
34
+ import structlog
35
+
36
+ logger = structlog.get_logger(__name__)
37
+
38
+
39
+ # =============================================================================
40
+ # Benchmark Test Cases
41
+ # =============================================================================
42
+
43
+
44
@dataclass
class BenchmarkTestCase:
    """A single test case for benchmarking."""

    # Unique identifier for the test case (e.g. "fin_001").
    id: str
    # The natural-language question posed to the navigator under test.
    question: str
    # Exact expected answer; evaluated as a case-insensitive substring
    # of the navigator's answer when provided.
    expected_answer: str | None = None
    # Keywords used for coverage-based scoring when no exact answer is set.
    expected_keywords: list[str] = field(default_factory=list)
    # Free-form grouping label (e.g. "financial", "aggregation", "multi_hop").
    category: str = "general"
    difficulty: Literal["easy", "medium", "hard"] = "medium"
    # True when answering requires connecting multiple document sections.
    requires_multi_hop: bool = False
    # True when answering requires collecting information spread across
    # the document.
    requires_aggregation: bool = False
    # Extra options forwarded verbatim to the navigator call.
    metadata: dict[str, Any] = field(default_factory=dict)
57
+
58
+
59
@dataclass
class BenchmarkResult:
    """Result of a single benchmark test."""

    # ID of the BenchmarkTestCase this result answers.
    test_case_id: str
    # Navigator method that produced the answer ("standard", "rlm", ...).
    method: str
    # Answer text returned by the navigator (empty string on failure).
    answer: str
    # Whether the answer passed expected-answer / keyword evaluation.
    is_correct: bool
    # Navigator-reported confidence (set to 0.0 when the run failed).
    confidence: float
    # Wall-clock latency of the query in milliseconds (0 when the run
    # raised before timing completed).
    latency_ms: float
    tokens_used: int = 0
    # Number of navigation iterations taken, when the method reports it.
    iterations: int = 0
    # Error message when the test case raised an exception, else None.
    error: str | None = None
    # Raw navigation trace, when the method provides one.
    trace: list[dict] = field(default_factory=list)
73
+
74
+
75
@dataclass
class MethodResults:
    """Aggregated results for a single method."""

    method: str
    total_tests: int = 0
    correct: int = 0
    accuracy: float = 0.0
    avg_latency_ms: float = 0.0
    avg_confidence: float = 0.0
    results: list[BenchmarkResult] = field(default_factory=list)

    def compute_stats(self) -> None:
        """Recompute the aggregate fields from ``results`` in place."""
        outcomes = self.results
        if not outcomes:
            # Nothing recorded yet; leave the zeroed defaults untouched.
            return

        count = len(outcomes)
        hits = sum(1 for outcome in outcomes if outcome.is_correct)

        self.total_tests = count
        self.correct = hits
        self.accuracy = hits / count if count > 0 else 0

        # Failed runs carry latency_ms == 0 and are excluded from the mean.
        timed = [outcome.latency_ms for outcome in outcomes if outcome.latency_ms > 0]
        self.avg_latency_ms = sum(timed) / len(timed) if timed else 0

        scores = [outcome.confidence for outcome in outcomes]
        self.avg_confidence = sum(scores) / len(scores) if scores else 0
101
+
102
+
103
@dataclass
class ComprehensiveBenchmarkReport:
    """Complete benchmark report covering every navigator method that ran."""

    timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    benchmark_type: str = "comprehensive"
    document_count: int = 0
    test_case_count: int = 0

    # Per-method aggregates; None when a method was not benchmarked.
    standard_navigator: MethodResults | None = None
    rlm_navigator: MethodResults | None = None
    vision_navigator: MethodResults | None = None
    hybrid_navigator: MethodResults | None = None

    # Comparison summary filled in by the runner.
    best_method: str = ""
    best_accuracy: float = 0.0

    def print_report(self) -> None:
        """Write a human-readable summary table to stdout."""
        bar = "=" * 70
        rule = "-" * 70

        print("\n" + bar)
        print("COMPREHENSIVE RNSR BENCHMARK REPORT")
        print(bar)
        print(f"Timestamp: {self.timestamp}")
        print(f"Documents: {self.document_count}")
        print(f"Test Cases: {self.test_case_count}")
        print()

        labelled = [
            ("Standard Navigator", self.standard_navigator),
            ("RLM Navigator", self.rlm_navigator),
            ("Vision Navigator", self.vision_navigator),
            ("Hybrid Navigator", self.hybrid_navigator),
        ]

        print(rule)
        print(f"{'Method':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Confidence':<12}")
        print(rule)

        for label, stats in labelled:
            if not stats:
                # Method was not run; omit its row entirely.
                continue
            accuracy = f"{stats.accuracy:.1%}"
            latency = f"{stats.avg_latency_ms:.0f}ms"
            confidence = f"{stats.avg_confidence:.2f}"
            print(f"{label:<25} {accuracy:<12} {latency:<15} {confidence:<12}")

        print(rule)
        print(f"\nBest Method: {self.best_method} ({self.best_accuracy:.1%} accuracy)")
        print(bar)

    def to_json(self, path: Path | str) -> None:
        """Serialize the report (aggregates only, no traces) to *path* as JSON.

        Parent directories are created as needed.
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)

        per_method = {
            "standard_navigator": self.standard_navigator,
            "rlm_navigator": self.rlm_navigator,
            "vision_navigator": self.vision_navigator,
            "hybrid_navigator": self.hybrid_navigator,
        }

        data = {
            "timestamp": self.timestamp,
            "benchmark_type": self.benchmark_type,
            "document_count": self.document_count,
            "test_case_count": self.test_case_count,
            "best_method": self.best_method,
            "best_accuracy": self.best_accuracy,
            # Only methods that actually ran are serialized.
            "methods": {
                key: {
                    "total_tests": stats.total_tests,
                    "correct": stats.correct,
                    "accuracy": stats.accuracy,
                    "avg_latency_ms": stats.avg_latency_ms,
                    "avg_confidence": stats.avg_confidence,
                }
                for key, stats in per_method.items()
                if stats
            },
        }

        with open(target, "w") as f:
            json.dump(data, f, indent=2)

        logger.info("benchmark_report_saved", path=str(target))
188
+
189
+
190
+ # =============================================================================
191
+ # Standard Benchmark Test Suites
192
+ # =============================================================================
193
+
194
+
195
def get_financebench_cases() -> list[BenchmarkTestCase]:
    """
    Financial-document QA cases in the style of FinanceBench.

    PageIndex achieved 98.7% accuracy on FinanceBench; these questions
    probe the same kind of financial-report reasoning, several of them
    requiring multi-hop reasoning.
    """
    specs: list[dict[str, Any]] = [
        {
            "id": "fin_001",
            "question": "What was the total revenue in the most recent fiscal year?",
            "expected_keywords": ["revenue", "fiscal", "year", "total"],
            "difficulty": "medium",
        },
        {
            "id": "fin_002",
            "question": "What is the year-over-year growth in net income?",
            "expected_keywords": ["growth", "net income", "year-over-year"],
            "difficulty": "hard",
            "requires_multi_hop": True,
        },
        {
            "id": "fin_003",
            "question": "What are the key risk factors mentioned in the report?",
            "expected_keywords": ["risk", "factors"],
            "difficulty": "medium",
            "requires_aggregation": True,
        },
        {
            "id": "fin_004",
            "question": "Compare the gross margin between Q1 and Q4.",
            "expected_keywords": ["gross margin", "Q1", "Q4", "compare"],
            "difficulty": "hard",
            "requires_multi_hop": True,
        },
        {
            "id": "fin_005",
            "question": "What is the company's debt-to-equity ratio?",
            "expected_keywords": ["debt", "equity", "ratio"],
            "difficulty": "medium",
        },
    ]
    # Every FinanceBench-style case shares the "financial" category.
    return [BenchmarkTestCase(category="financial", **spec) for spec in specs]
242
+
243
+
244
def get_oolong_style_cases() -> list[BenchmarkTestCase]:
    """
    Test cases in the style of the OOLONG long-context benchmark.

    Answering these requires reasoning over, and aggregating from, many
    parts of a single document rather than one localized passage.
    """
    specs: list[dict[str, Any]] = [
        {
            "id": "ool_001",
            "question": "List all the parties mentioned in this agreement.",
            "expected_keywords": ["parties", "agreement"],
            "category": "aggregation",
            "difficulty": "hard",
            "requires_aggregation": True,
        },
        {
            "id": "ool_002",
            "question": "How many sections are there in total?",
            "expected_keywords": ["sections", "total"],
            "category": "structure",
            "difficulty": "easy",
        },
        {
            "id": "ool_003",
            "question": "What are all the obligations of Party A?",
            "expected_keywords": ["obligations", "party"],
            "category": "aggregation",
            "difficulty": "hard",
            "requires_aggregation": True,
        },
        {
            "id": "ool_004",
            "question": "Summarize the key terms across all sections.",
            "expected_keywords": ["key terms", "summarize", "sections"],
            "category": "aggregation",
            "difficulty": "hard",
            "requires_aggregation": True,
        },
    ]
    return [BenchmarkTestCase(**spec) for spec in specs]
284
+
285
+
286
def get_multi_hop_cases() -> list[BenchmarkTestCase]:
    """
    Multi-hop reasoning test cases.

    Each question can only be answered by connecting information found
    in multiple, distinct sections of the document.
    """
    # (id, question, expected keywords) — shared attributes applied below.
    questions = [
        (
            "mh_001",
            "If the termination clause is triggered, what penalties apply according to the payment terms?",
            ["termination", "penalties", "payment"],
        ),
        (
            "mh_002",
            "How do the warranties in Section 5 relate to the limitations in Section 8?",
            ["warranties", "limitations", "section"],
        ),
        (
            "mh_003",
            "What happens to the IP rights if the contract is terminated for cause?",
            ["IP", "intellectual property", "terminated", "cause"],
        ),
    ]
    return [
        BenchmarkTestCase(
            id=case_id,
            question=question,
            expected_keywords=keywords,
            category="multi_hop",
            difficulty="hard",
            requires_multi_hop=True,
        )
        for case_id, question, keywords in questions
    ]
318
+
319
+
320
+ # =============================================================================
321
+ # Benchmark Runner
322
+ # =============================================================================
323
+
324
+
325
class ComprehensiveBenchmarkRunner:
    """
    Runs comprehensive benchmarks across all navigator types.

    Each configured method ("standard", "rlm", "vision", "hybrid") is run
    against every (document, test case) pair; `run()` collects per-method
    aggregates into a ComprehensiveBenchmarkReport.
    """

    def __init__(
        self,
        pdf_paths: list[Path | str],
        test_cases: list[BenchmarkTestCase] | None = None,
        methods: list[str] | None = None,
    ):
        """Store the documents, test cases, and methods to benchmark.

        Args:
            pdf_paths: PDF documents to run every test case against.
            test_cases: Test cases to run; defaults to the combined
                FinanceBench / OOLONG-style / multi-hop suites.
            methods: Subset of ["standard", "rlm", "vision", "hybrid"];
                defaults to all four.
        """
        self.pdf_paths = [Path(p) for p in pdf_paths]
        self.test_cases = test_cases or self._get_default_cases()
        self.methods = methods or ["standard", "rlm", "vision", "hybrid"]

        # Indexes (built on first use): pdf path -> (skeleton, kv_store),
        # so each document is ingested at most once across all methods.
        self._indexes: dict[str, tuple[dict, Any]] = {}

    def _get_default_cases(self) -> list[BenchmarkTestCase]:
        """Get default test cases: all three built-in suites combined."""
        cases = []
        cases.extend(get_financebench_cases())
        cases.extend(get_oolong_style_cases())
        cases.extend(get_multi_hop_cases())
        return cases

    def _get_or_build_index(self, pdf_path: Path) -> tuple[dict, Any]:
        """Get or build index for a PDF (memoized in ``self._indexes``)."""
        key = str(pdf_path)
        if key in self._indexes:
            return self._indexes[key]

        # Imported lazily so this module can be imported without pulling
        # in the full ingestion stack.
        from rnsr import ingest_document, build_skeleton_index

        result = ingest_document(pdf_path)
        skeleton, kv_store = build_skeleton_index(result.tree)
        self._indexes[key] = (skeleton, kv_store)
        return skeleton, kv_store

    def run(self) -> ComprehensiveBenchmarkReport:
        """Run all benchmarks.

        Returns:
            ComprehensiveBenchmarkReport with per-method aggregates and
            the best-performing method (by accuracy) filled in.
        """
        report = ComprehensiveBenchmarkReport(
            document_count=len(self.pdf_paths),
            test_case_count=len(self.test_cases),
        )

        logger.info(
            "comprehensive_benchmark_start",
            documents=len(self.pdf_paths),
            test_cases=len(self.test_cases),
            methods=self.methods,
        )

        # Run each method
        if "standard" in self.methods:
            report.standard_navigator = self._run_method(
                "standard",
                self._run_standard_navigator,
            )

        if "rlm" in self.methods:
            report.rlm_navigator = self._run_method(
                "rlm",
                self._run_rlm_navigator,
            )

        if "vision" in self.methods:
            report.vision_navigator = self._run_method(
                "vision",
                self._run_vision_navigator,
            )

        if "hybrid" in self.methods:
            report.hybrid_navigator = self._run_method(
                "hybrid",
                self._run_hybrid_navigator,
            )

        # Determine best method by accuracy (ties keep the earlier entry).
        best_accuracy = 0.0
        best_method = ""
        for name, result in [
            ("Standard Navigator", report.standard_navigator),
            ("RLM Navigator", report.rlm_navigator),
            ("Vision Navigator", report.vision_navigator),
            ("Hybrid Navigator", report.hybrid_navigator),
        ]:
            if result and result.accuracy > best_accuracy:
                best_accuracy = result.accuracy
                best_method = name

        report.best_method = best_method
        report.best_accuracy = best_accuracy

        logger.info("comprehensive_benchmark_complete", best_method=best_method)

        return report

    def _run_method(
        self,
        method_name: str,
        runner_fn: Callable,
    ) -> MethodResults:
        """Run a single method across all test cases.

        Args:
            method_name: Label recorded on each result ("standard", ...).
            runner_fn: Callable of (pdf_path, test_case) -> BenchmarkResult.

        Returns:
            MethodResults with aggregate stats computed. A test case that
            raises is recorded as an incorrect result carrying the error.
        """
        results = MethodResults(method=method_name)

        logger.info("running_method", method=method_name)

        for pdf_path in self.pdf_paths:
            if not pdf_path.exists():
                logger.warning("pdf_not_found", path=str(pdf_path))
                continue

            for test_case in self.test_cases:
                try:
                    result = runner_fn(pdf_path, test_case)
                    results.results.append(result)
                except Exception as e:
                    # A failing navigator must not abort the whole benchmark;
                    # record the failure and continue with the next case.
                    logger.warning(
                        "test_case_failed",
                        method=method_name,
                        test_case=test_case.id,
                        error=str(e),
                    )
                    results.results.append(BenchmarkResult(
                        test_case_id=test_case.id,
                        method=method_name,
                        answer="",
                        is_correct=False,
                        confidence=0.0,
                        latency_ms=0,
                        error=str(e),
                    ))

        results.compute_stats()
        return results

    def _run_standard_navigator(
        self,
        pdf_path: Path,
        test_case: BenchmarkTestCase,
    ) -> BenchmarkResult:
        """Run standard ToT-based navigator on one test case."""
        from rnsr.agent import run_navigator

        skeleton, kv_store = self._get_or_build_index(pdf_path)

        start_time = time.time()
        result = run_navigator(
            test_case.question,
            skeleton,
            kv_store,
            metadata=test_case.metadata,
        )
        latency_ms = (time.time() - start_time) * 1000

        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.0)

        is_correct = self._evaluate_answer(answer, test_case)

        return BenchmarkResult(
            test_case_id=test_case.id,
            method="standard",
            answer=answer,
            is_correct=is_correct,
            confidence=confidence,
            latency_ms=latency_ms,
            # Trace length stands in for the iteration count here.
            iterations=len(result.get("trace", [])),
            trace=result.get("trace", []),
        )

    def _run_rlm_navigator(
        self,
        pdf_path: Path,
        test_case: BenchmarkTestCase,
    ) -> BenchmarkResult:
        """Run RLM navigator with full features on one test case."""
        from rnsr.agent.rlm_navigator import RLMConfig, run_rlm_navigator

        skeleton, kv_store = self._get_or_build_index(pdf_path)

        # Benchmark the RLM navigator with all features enabled.
        config = RLMConfig(
            enable_pre_filtering=True,
            enable_verification=True,
            max_recursion_depth=3,
        )

        start_time = time.time()
        result = run_rlm_navigator(
            test_case.question,
            skeleton,
            kv_store,
            config=config,
            metadata=test_case.metadata,
        )
        latency_ms = (time.time() - start_time) * 1000

        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.0)

        is_correct = self._evaluate_answer(answer, test_case)

        return BenchmarkResult(
            test_case_id=test_case.id,
            method="rlm",
            answer=answer,
            is_correct=is_correct,
            confidence=confidence,
            latency_ms=latency_ms,
            iterations=result.get("iteration", 0),
            trace=result.get("trace", []),
        )

    def _run_vision_navigator(
        self,
        pdf_path: Path,
        test_case: BenchmarkTestCase,
    ) -> BenchmarkResult:
        """Run vision-based (page-image) navigator on one test case."""
        from rnsr.ingestion.vision_retrieval import create_vision_navigator

        # Vision navigation works from page images, so no text index is built.
        navigator = create_vision_navigator(pdf_path)

        start_time = time.time()
        result = navigator.navigate(
            test_case.question,
            metadata=test_case.metadata,
        )
        latency_ms = (time.time() - start_time) * 1000

        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.0)

        is_correct = self._evaluate_answer(answer, test_case)

        return BenchmarkResult(
            test_case_id=test_case.id,
            method="vision",
            answer=answer,
            is_correct=is_correct,
            confidence=confidence,
            latency_ms=latency_ms,
            trace=result.get("trace", []),
        )

    def _run_hybrid_navigator(
        self,
        pdf_path: Path,
        test_case: BenchmarkTestCase,
    ) -> BenchmarkResult:
        """Run hybrid text+vision navigator on one test case."""
        from rnsr.ingestion.vision_retrieval import create_hybrid_navigator

        skeleton, kv_store = self._get_or_build_index(pdf_path)
        navigator = create_hybrid_navigator(pdf_path, skeleton, kv_store)

        start_time = time.time()
        result = navigator.navigate(
            test_case.question,
            metadata=test_case.metadata,
        )
        latency_ms = (time.time() - start_time) * 1000

        # The hybrid navigator merges text and vision answers under a
        # different key than the single-mode navigators.
        answer = result.get("combined_answer", "")
        confidence = result.get("confidence", 0.0)

        is_correct = self._evaluate_answer(answer, test_case)

        return BenchmarkResult(
            test_case_id=test_case.id,
            method="hybrid",
            answer=answer,
            is_correct=is_correct,
            confidence=confidence,
            latency_ms=latency_ms,
        )

    def _evaluate_answer(
        self,
        answer: str,
        test_case: BenchmarkTestCase,
    ) -> bool:
        """Evaluate if an answer is correct.

        Checks in order: case-insensitive expected-answer substring,
        then keyword coverage, then a minimum-length heuristic.
        """
        if not answer:
            return False

        answer_lower = answer.lower()

        # Check expected answer if provided (case-insensitive substring).
        if test_case.expected_answer:
            if test_case.expected_answer.lower() in answer_lower:
                return True

        # Check keywords
        if test_case.expected_keywords:
            matches = sum(
                1 for kw in test_case.expected_keywords
                if kw.lower() in answer_lower
            )
            # Require a strict majority of the keywords (more than half).
            required = len(test_case.expected_keywords) // 2 + 1
            return matches >= required

        # Default: consider non-empty as potentially correct
        return len(answer) > 20
631
+
632
+
633
+ # =============================================================================
634
+ # Convenience Functions
635
+ # =============================================================================
636
+
637
+
638
def run_comprehensive_benchmark(
    pdf_paths: list[Path | str],
    benchmark_type: Literal["all", "financebench", "oolong", "multihop", "custom"] = "all",
    custom_test_cases: list[BenchmarkTestCase] | None = None,
    methods: list[str] | None = None,
    output_path: Path | str | None = None,
) -> ComprehensiveBenchmarkReport:
    """
    Run a comprehensive benchmark across all navigator types.

    Args:
        pdf_paths: PDF files to benchmark against.
        benchmark_type: Which built-in suite(s) to run, or "custom".
        custom_test_cases: Test cases used when ``benchmark_type`` is
            "custom".
        methods: Methods to benchmark, any of ["standard", "rlm",
            "vision", "hybrid"]; defaults to all.
        output_path: Optional path; when given, the report is also saved
            there as JSON.

    Returns:
        ComprehensiveBenchmarkReport with all results.

    Example:
        from rnsr.benchmarks.comprehensive_benchmark import run_comprehensive_benchmark

        results = run_comprehensive_benchmark(
            pdf_paths=["financial_report.pdf"],
            benchmark_type="financebench",
            methods=["standard", "rlm"],
        )
        results.print_report()
    """
    # Select the test suite(s) for the requested benchmark type.
    if benchmark_type == "custom":
        test_cases = custom_test_cases or []
    else:
        suite_loaders = {
            "all": [get_financebench_cases, get_oolong_style_cases, get_multi_hop_cases],
            "financebench": [get_financebench_cases],
            "oolong": [get_oolong_style_cases],
            "multihop": [get_multi_hop_cases],
        }
        # Unknown types yield no loaders, hence an empty case list.
        test_cases = [
            case
            for loader in suite_loaders.get(benchmark_type, [])
            for case in loader()
        ]

    # Run benchmark
    runner = ComprehensiveBenchmarkRunner(
        pdf_paths=pdf_paths,
        test_cases=test_cases,
        methods=methods,
    )

    report = runner.run()
    report.benchmark_type = benchmark_type

    # Save if path provided
    if output_path:
        report.to_json(output_path)

    return report
700
+
701
+
702
def quick_benchmark(
    pdf_path: Path | str,
    question: str,
) -> dict[str, Any]:
    """
    Quickly benchmark a single question against every navigator method.

    Args:
        pdf_path: Path to PDF file.
        question: Question to ask.

    Returns:
        Mapping of method name ("standard", "rlm", "vision", "hybrid")
        to that method's first BenchmarkResult, or None for methods that
        produced no results.
    """
    runner = ComprehensiveBenchmarkRunner(
        pdf_paths=[pdf_path],
        test_cases=[BenchmarkTestCase(id="quick", question=question)],
    )

    report = runner.run()

    labelled = (
        ("standard", report.standard_navigator),
        ("rlm", report.rlm_navigator),
        ("vision", report.vision_navigator),
        ("hybrid", report.hybrid_navigator),
    )
    return {
        label: (stats.results[0] if stats else None)
        for label, stats in labelled
    }