rnsr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,733 @@
"""
Comprehensive Benchmark Suite

Evaluates RNSR against different configurations and approaches:
1. Standard Navigator (ToT-based)
2. RLM Navigator (pre-filtering + deep recursion + verification)
3. Vision Navigator (OCR-free page image analysis)
4. Hybrid Navigator (text + vision)

Benchmarks inspired by:
- FinanceBench: Financial document QA (PageIndex achieved 98.7%)
- OOLONG: Long context aggregation tasks
- Custom Multi-hop: Complex relational queries

Usage:
    from rnsr.benchmarks.comprehensive_benchmark import run_comprehensive_benchmark

    results = run_comprehensive_benchmark(
        pdf_paths=["contract.pdf", "report.pdf"],
        benchmark_type="all",  # or "financebench", "custom", etc.
    )
    results.print_report()
"""

from __future__ import annotations

import json
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Callable, Literal

import structlog

logger = structlog.get_logger(__name__)


# =============================================================================
# Benchmark Test Cases
# =============================================================================


@dataclass
class BenchmarkTestCase:
    """A single test case for benchmarking."""

    id: str
    question: str
    expected_answer: str | None = None
    expected_keywords: list[str] = field(default_factory=list)
    category: str = "general"
    difficulty: Literal["easy", "medium", "hard"] = "medium"
    requires_multi_hop: bool = False
    requires_aggregation: bool = False
    metadata: dict[str, Any] = field(default_factory=dict)
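

# Illustrative usage (editor's sketch, not part of the published module):
# a custom case combines keyword expectations with the difficulty flags.
#
#     case = BenchmarkTestCase(
#         id="custom_001",
#         question="What is the notice period for termination?",
#         expected_keywords=["notice", "termination"],
#         difficulty="medium",
#         requires_multi_hop=False,
#     )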


@dataclass
class BenchmarkResult:
    """Result of a single benchmark test."""

    test_case_id: str
    method: str
    answer: str
    is_correct: bool
    confidence: float
    latency_ms: float
    tokens_used: int = 0
    iterations: int = 0
    error: str | None = None
    trace: list[dict] = field(default_factory=list)


@dataclass
class MethodResults:
    """Aggregated results for a single method."""

    method: str
    total_tests: int = 0
    correct: int = 0
    accuracy: float = 0.0
    avg_latency_ms: float = 0.0
    avg_confidence: float = 0.0
    results: list[BenchmarkResult] = field(default_factory=list)

    def compute_stats(self) -> None:
        """Compute aggregate statistics."""
        if not self.results:
            return

        self.total_tests = len(self.results)
        self.correct = sum(1 for r in self.results if r.is_correct)
        self.accuracy = self.correct / self.total_tests if self.total_tests > 0 else 0.0

        latencies = [r.latency_ms for r in self.results if r.latency_ms > 0]
        self.avg_latency_ms = sum(latencies) / len(latencies) if latencies else 0.0

        confidences = [r.confidence for r in self.results]
        self.avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
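

# Illustrative check (editor's sketch, not part of the published module):
# with three results of which two are correct, compute_stats() yields
# accuracy 2/3 (~0.667), and avg_latency_ms averages only nonzero latencies.
#
#     mr = MethodResults(method="standard")
#     mr.results = [
#         BenchmarkResult("t1", "standard", "a", True, 0.9, 120.0),
#         BenchmarkResult("t2", "standard", "b", True, 0.8, 80.0),
#         BenchmarkResult("t3", "standard", "", False, 0.0, 0.0),
#     ]
#     mr.compute_stats()  # accuracy == 2/3, avg_latency_ms == 100.0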


@dataclass
class ComprehensiveBenchmarkReport:
    """Complete benchmark report with all methods."""

    timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    benchmark_type: str = "comprehensive"
    document_count: int = 0
    test_case_count: int = 0

    # Results by method
    standard_navigator: MethodResults | None = None
    rlm_navigator: MethodResults | None = None
    vision_navigator: MethodResults | None = None
    hybrid_navigator: MethodResults | None = None

    # Comparison summary
    best_method: str = ""
    best_accuracy: float = 0.0

    def print_report(self) -> None:
        """Print a formatted report."""
        print("\n" + "=" * 70)
        print("COMPREHENSIVE RNSR BENCHMARK REPORT")
        print("=" * 70)
        print(f"Timestamp: {self.timestamp}")
        print(f"Documents: {self.document_count}")
        print(f"Test Cases: {self.test_case_count}")
        print()

        methods = [
            ("Standard Navigator", self.standard_navigator),
            ("RLM Navigator", self.rlm_navigator),
            ("Vision Navigator", self.vision_navigator),
            ("Hybrid Navigator", self.hybrid_navigator),
        ]

        print("-" * 70)
        print(f"{'Method':<25} {'Accuracy':<12} {'Avg Latency':<15} {'Confidence':<12}")
        print("-" * 70)

        for name, result in methods:
            if result:
                acc = f"{result.accuracy:.1%}"
                lat = f"{result.avg_latency_ms:.0f}ms"
                conf = f"{result.avg_confidence:.2f}"
                print(f"{name:<25} {acc:<12} {lat:<15} {conf:<12}")

        print("-" * 70)
        print(f"\nBest Method: {self.best_method} ({self.best_accuracy:.1%} accuracy)")
        print("=" * 70)

    def to_json(self, path: Path | str) -> None:
        """Save report to JSON."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "timestamp": self.timestamp,
            "benchmark_type": self.benchmark_type,
            "document_count": self.document_count,
            "test_case_count": self.test_case_count,
            "best_method": self.best_method,
            "best_accuracy": self.best_accuracy,
            "methods": {},
        }

        for name, result in [
            ("standard_navigator", self.standard_navigator),
            ("rlm_navigator", self.rlm_navigator),
            ("vision_navigator", self.vision_navigator),
            ("hybrid_navigator", self.hybrid_navigator),
        ]:
            if result:
                data["methods"][name] = {
                    "total_tests": result.total_tests,
                    "correct": result.correct,
                    "accuracy": result.accuracy,
                    "avg_latency_ms": result.avg_latency_ms,
                    "avg_confidence": result.avg_confidence,
                }

        with open(path, "w") as f:
            json.dump(data, f, indent=2)

        logger.info("benchmark_report_saved", path=str(path))
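

# Illustrative output shape for to_json() (editor's sketch; the keys follow
# the dict built above, the values here are made up):
#
#     {
#       "timestamp": "2024-01-01T00:00:00+00:00",
#       "benchmark_type": "comprehensive",
#       "document_count": 1,
#       "test_case_count": 12,
#       "best_method": "RLM Navigator",
#       "best_accuracy": 0.75,
#       "methods": {
#         "rlm_navigator": {
#           "total_tests": 12, "correct": 9, "accuracy": 0.75,
#           "avg_latency_ms": 1840.0, "avg_confidence": 0.82
#         }
#       }
#     }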


# =============================================================================
# Standard Benchmark Test Suites
# =============================================================================


def get_financebench_cases() -> list[BenchmarkTestCase]:
    """
    FinanceBench-style test cases.

    PageIndex achieved 98.7% accuracy on FinanceBench.
    These are financial document QA questions requiring multi-hop reasoning.
    """
    return [
        BenchmarkTestCase(
            id="fin_001",
            question="What was the total revenue in the most recent fiscal year?",
            expected_keywords=["revenue", "fiscal", "year", "total"],
            category="financial",
            difficulty="medium",
        ),
        BenchmarkTestCase(
            id="fin_002",
            question="What is the year-over-year growth in net income?",
            expected_keywords=["growth", "net income", "year-over-year"],
            category="financial",
            difficulty="hard",
            requires_multi_hop=True,
        ),
        BenchmarkTestCase(
            id="fin_003",
            question="What are the key risk factors mentioned in the report?",
            expected_keywords=["risk", "factors"],
            category="financial",
            difficulty="medium",
            requires_aggregation=True,
        ),
        BenchmarkTestCase(
            id="fin_004",
            question="Compare the gross margin between Q1 and Q4.",
            expected_keywords=["gross margin", "Q1", "Q4", "compare"],
            category="financial",
            difficulty="hard",
            requires_multi_hop=True,
        ),
        BenchmarkTestCase(
            id="fin_005",
            question="What is the company's debt-to-equity ratio?",
            expected_keywords=["debt", "equity", "ratio"],
            category="financial",
            difficulty="medium",
        ),
    ]


def get_oolong_style_cases() -> list[BenchmarkTestCase]:
    """
    OOLONG-style test cases.

    OOLONG tests long context reasoning and aggregation.
    These require processing many parts of a document.
    """
    return [
        BenchmarkTestCase(
            id="ool_001",
            question="List all the parties mentioned in this agreement.",
            expected_keywords=["parties", "agreement"],
            category="aggregation",
            difficulty="hard",
            requires_aggregation=True,
        ),
        BenchmarkTestCase(
            id="ool_002",
            question="How many sections are there in total?",
            expected_keywords=["sections", "total"],
            category="structure",
            difficulty="easy",
        ),
        BenchmarkTestCase(
            id="ool_003",
            question="What are all the obligations of Party A?",
            expected_keywords=["obligations", "party"],
            category="aggregation",
            difficulty="hard",
            requires_aggregation=True,
        ),
        BenchmarkTestCase(
            id="ool_004",
            question="Summarize the key terms across all sections.",
            expected_keywords=["key terms", "summarize", "sections"],
            category="aggregation",
            difficulty="hard",
            requires_aggregation=True,
        ),
    ]


def get_multi_hop_cases() -> list[BenchmarkTestCase]:
    """
    Multi-hop reasoning test cases.

    These require connecting information from multiple sections.
    """
    return [
        BenchmarkTestCase(
            id="mh_001",
            question="If the termination clause is triggered, what penalties apply according to the payment terms?",
            expected_keywords=["termination", "penalties", "payment"],
            category="multi_hop",
            difficulty="hard",
            requires_multi_hop=True,
        ),
        BenchmarkTestCase(
            id="mh_002",
            question="How do the warranties in Section 5 relate to the limitations in Section 8?",
            expected_keywords=["warranties", "limitations", "section"],
            category="multi_hop",
            difficulty="hard",
            requires_multi_hop=True,
        ),
        BenchmarkTestCase(
            id="mh_003",
            question="What happens to the IP rights if the contract is terminated for cause?",
            expected_keywords=["IP", "intellectual property", "terminated", "cause"],
            category="multi_hop",
            difficulty="hard",
            requires_multi_hop=True,
        ),
    ]


# =============================================================================
# Benchmark Runner
# =============================================================================


class ComprehensiveBenchmarkRunner:
    """Runs comprehensive benchmarks across all navigator types."""

    def __init__(
        self,
        pdf_paths: list[Path | str],
        test_cases: list[BenchmarkTestCase] | None = None,
        methods: list[str] | None = None,
    ):
        self.pdf_paths = [Path(p) for p in pdf_paths]
        self.test_cases = test_cases or self._get_default_cases()
        self.methods = methods or ["standard", "rlm", "vision", "hybrid"]

        # Indexes (built on first use)
        self._indexes: dict[str, tuple[dict, Any]] = {}

    def _get_default_cases(self) -> list[BenchmarkTestCase]:
        """Get default test cases."""
        cases = []
        cases.extend(get_financebench_cases())
        cases.extend(get_oolong_style_cases())
        cases.extend(get_multi_hop_cases())
        return cases

    def _get_or_build_index(self, pdf_path: Path) -> tuple[dict, Any]:
        """Get or build index for a PDF."""
        key = str(pdf_path)
        if key in self._indexes:
            return self._indexes[key]

        from rnsr import ingest_document, build_skeleton_index

        result = ingest_document(pdf_path)
        skeleton, kv_store = build_skeleton_index(result.tree)
        self._indexes[key] = (skeleton, kv_store)
        return skeleton, kv_store
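

# Illustrative behaviour (editor's sketch): the index cache is keyed by path,
# so every method benchmarked against the same PDF reuses a single ingestion
# pass.
#
#     runner = ComprehensiveBenchmarkRunner(pdf_paths=["report.pdf"])
#     runner._get_or_build_index(Path("report.pdf"))  # ingests and caches
#     runner._get_or_build_index(Path("report.pdf"))  # cache hit, no re-ingest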

    def run(self) -> ComprehensiveBenchmarkReport:
        """Run all benchmarks."""
        report = ComprehensiveBenchmarkReport(
            document_count=len(self.pdf_paths),
            test_case_count=len(self.test_cases),
        )

        logger.info(
            "comprehensive_benchmark_start",
            documents=len(self.pdf_paths),
            test_cases=len(self.test_cases),
            methods=self.methods,
        )

        # Run each method
        if "standard" in self.methods:
            report.standard_navigator = self._run_method(
                "standard",
                self._run_standard_navigator,
            )

        if "rlm" in self.methods:
            report.rlm_navigator = self._run_method(
                "rlm",
                self._run_rlm_navigator,
            )

        if "vision" in self.methods:
            report.vision_navigator = self._run_method(
                "vision",
                self._run_vision_navigator,
            )

        if "hybrid" in self.methods:
            report.hybrid_navigator = self._run_method(
                "hybrid",
                self._run_hybrid_navigator,
            )

        # Determine best method
        best_accuracy = 0.0
        best_method = ""
        for name, result in [
            ("Standard Navigator", report.standard_navigator),
            ("RLM Navigator", report.rlm_navigator),
            ("Vision Navigator", report.vision_navigator),
            ("Hybrid Navigator", report.hybrid_navigator),
        ]:
            if result and result.accuracy > best_accuracy:
                best_accuracy = result.accuracy
                best_method = name

        report.best_method = best_method
        report.best_accuracy = best_accuracy

        logger.info("comprehensive_benchmark_complete", best_method=best_method)

        return report

    def _run_method(
        self,
        method_name: str,
        runner_fn: Callable,
    ) -> MethodResults:
        """Run a single method across all test cases."""
        results = MethodResults(method=method_name)

        logger.info("running_method", method=method_name)

        for pdf_path in self.pdf_paths:
            if not pdf_path.exists():
                logger.warning("pdf_not_found", path=str(pdf_path))
                continue

            for test_case in self.test_cases:
                try:
                    result = runner_fn(pdf_path, test_case)
                    results.results.append(result)
                except Exception as e:
                    logger.warning(
                        "test_case_failed",
                        method=method_name,
                        test_case=test_case.id,
                        error=str(e),
                    )
                    results.results.append(BenchmarkResult(
                        test_case_id=test_case.id,
                        method=method_name,
                        answer="",
                        is_correct=False,
                        confidence=0.0,
                        latency_ms=0,
                        error=str(e),
                    ))

        results.compute_stats()
        return results

    def _run_standard_navigator(
        self,
        pdf_path: Path,
        test_case: BenchmarkTestCase,
    ) -> BenchmarkResult:
        """Run standard ToT-based navigator."""
        from rnsr.agent import run_navigator

        skeleton, kv_store = self._get_or_build_index(pdf_path)

        start_time = time.time()
        result = run_navigator(
            test_case.question,
            skeleton,
            kv_store,
            metadata=test_case.metadata,
        )
        latency_ms = (time.time() - start_time) * 1000

        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.0)

        is_correct = self._evaluate_answer(answer, test_case)

        return BenchmarkResult(
            test_case_id=test_case.id,
            method="standard",
            answer=answer,
            is_correct=is_correct,
            confidence=confidence,
            latency_ms=latency_ms,
            iterations=len(result.get("trace", [])),
            trace=result.get("trace", []),
        )

    def _run_rlm_navigator(
        self,
        pdf_path: Path,
        test_case: BenchmarkTestCase,
    ) -> BenchmarkResult:
        """Run RLM navigator with full features."""
        from rnsr.agent.rlm_navigator import RLMConfig, run_rlm_navigator

        skeleton, kv_store = self._get_or_build_index(pdf_path)

        config = RLMConfig(
            enable_pre_filtering=True,
            enable_verification=True,
            max_recursion_depth=3,
        )

        start_time = time.time()
        result = run_rlm_navigator(
            test_case.question,
            skeleton,
            kv_store,
            config=config,
            metadata=test_case.metadata,
        )
        latency_ms = (time.time() - start_time) * 1000

        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.0)

        is_correct = self._evaluate_answer(answer, test_case)

        return BenchmarkResult(
            test_case_id=test_case.id,
            method="rlm",
            answer=answer,
            is_correct=is_correct,
            confidence=confidence,
            latency_ms=latency_ms,
            iterations=result.get("iteration", 0),
            trace=result.get("trace", []),
        )
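
    # Editor's note: both helpers above consume the navigator result as a
    # plain mapping with "answer", "confidence", and "trace" keys (plus
    # "iteration" for the RLM run). Illustrative shape only, an assumption
    # drawn from the .get() calls above rather than a documented contract:
    #
    #     result = {"answer": "...", "confidence": 0.8, "trace": [], "iteration": 2}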

    def _run_vision_navigator(
        self,
        pdf_path: Path,
        test_case: BenchmarkTestCase,
    ) -> BenchmarkResult:
        """Run vision-based navigator."""
        from rnsr.ingestion.vision_retrieval import create_vision_navigator

        navigator = create_vision_navigator(pdf_path)

        start_time = time.time()
        result = navigator.navigate(
            test_case.question,
            metadata=test_case.metadata,
        )
        latency_ms = (time.time() - start_time) * 1000

        answer = result.get("answer", "")
        confidence = result.get("confidence", 0.0)

        is_correct = self._evaluate_answer(answer, test_case)

        return BenchmarkResult(
            test_case_id=test_case.id,
            method="vision",
            answer=answer,
            is_correct=is_correct,
            confidence=confidence,
            latency_ms=latency_ms,
            trace=result.get("trace", []),
        )

    def _run_hybrid_navigator(
        self,
        pdf_path: Path,
        test_case: BenchmarkTestCase,
    ) -> BenchmarkResult:
        """Run hybrid text+vision navigator."""
        from rnsr.ingestion.vision_retrieval import create_hybrid_navigator

        skeleton, kv_store = self._get_or_build_index(pdf_path)
        navigator = create_hybrid_navigator(pdf_path, skeleton, kv_store)

        start_time = time.time()
        result = navigator.navigate(
            test_case.question,
            metadata=test_case.metadata,
        )
        latency_ms = (time.time() - start_time) * 1000

        answer = result.get("combined_answer", "")
        confidence = result.get("confidence", 0.0)

        is_correct = self._evaluate_answer(answer, test_case)

        return BenchmarkResult(
            test_case_id=test_case.id,
            method="hybrid",
            answer=answer,
            is_correct=is_correct,
            confidence=confidence,
            latency_ms=latency_ms,
        )

    def _evaluate_answer(
        self,
        answer: str,
        test_case: BenchmarkTestCase,
    ) -> bool:
        """Evaluate whether an answer is correct."""
        if not answer:
            return False

        answer_lower = answer.lower()

        # Check the expected answer if provided
        if test_case.expected_answer:
            if test_case.expected_answer.lower() in answer_lower:
                return True

        # Check keywords
        if test_case.expected_keywords:
            matches = sum(
                1 for kw in test_case.expected_keywords
                if kw.lower() in answer_lower
            )
            # Require a strict majority of the keywords (more than half)
            required = len(test_case.expected_keywords) // 2 + 1
            return matches >= required

        # Default: treat a substantive answer (over 20 characters) as
        # potentially correct
        return len(answer) > 20
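

# Illustrative check of the keyword-majority rule (editor's sketch): with
# four expected keywords, // 2 + 1 == 3, so an answer matching only two of
# them is scored incorrect.
#
#     case = BenchmarkTestCase(
#         id="ex", question="q",
#         expected_keywords=["gross margin", "Q1", "Q4", "compare"],
#     )
#     # Matches "gross margin" and "q1" only -> 2 < 3 -> False
#     ComprehensiveBenchmarkRunner(pdf_paths=[])._evaluate_answer(
#         "Gross margin rose in Q1.", case)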


# =============================================================================
# Convenience Functions
# =============================================================================


def run_comprehensive_benchmark(
    pdf_paths: list[Path | str],
    benchmark_type: Literal["all", "financebench", "oolong", "multihop", "custom"] = "all",
    custom_test_cases: list[BenchmarkTestCase] | None = None,
    methods: list[str] | None = None,
    output_path: Path | str | None = None,
) -> ComprehensiveBenchmarkReport:
    """
    Run a comprehensive benchmark across all navigator types.

    Args:
        pdf_paths: List of PDF files to benchmark.
        benchmark_type: Type of benchmark to run.
        custom_test_cases: Custom test cases (for 'custom' type).
        methods: List of methods to benchmark ['standard', 'rlm', 'vision', 'hybrid'].
        output_path: Path to save JSON report.

    Returns:
        ComprehensiveBenchmarkReport with all results.

    Example:
        from rnsr.benchmarks.comprehensive_benchmark import run_comprehensive_benchmark

        results = run_comprehensive_benchmark(
            pdf_paths=["financial_report.pdf"],
            benchmark_type="financebench",
            methods=["standard", "rlm"],
        )
        results.print_report()
    """
    # Get test cases based on type
    if benchmark_type == "all":
        test_cases = []
        test_cases.extend(get_financebench_cases())
        test_cases.extend(get_oolong_style_cases())
        test_cases.extend(get_multi_hop_cases())
    elif benchmark_type == "financebench":
        test_cases = get_financebench_cases()
    elif benchmark_type == "oolong":
        test_cases = get_oolong_style_cases()
    elif benchmark_type == "multihop":
        test_cases = get_multi_hop_cases()
    elif benchmark_type == "custom":
        test_cases = custom_test_cases or []
    else:
        test_cases = []

    # Run benchmark
    runner = ComprehensiveBenchmarkRunner(
        pdf_paths=pdf_paths,
        test_cases=test_cases,
        methods=methods,
    )

    report = runner.run()
    report.benchmark_type = benchmark_type

    # Save if path provided
    if output_path:
        report.to_json(output_path)

    return report


def quick_benchmark(
    pdf_path: Path | str,
    question: str,
) -> dict[str, Any]:
    """
    Quickly benchmark a single question across all methods.

    Args:
        pdf_path: Path to PDF file.
        question: Question to ask.

    Returns:
        Dict with results from each method.
    """
    test_case = BenchmarkTestCase(
        id="quick",
        question=question,
    )

    runner = ComprehensiveBenchmarkRunner(
        pdf_paths=[pdf_path],
        test_cases=[test_case],
    )

    report = runner.run()

    return {
        "standard": report.standard_navigator.results[0] if report.standard_navigator else None,
        "rlm": report.rlm_navigator.results[0] if report.rlm_navigator else None,
        "vision": report.vision_navigator.results[0] if report.vision_navigator else None,
        "hybrid": report.hybrid_navigator.results[0] if report.hybrid_navigator else None,
    }
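

# Illustrative usage of quick_benchmark (editor's sketch; "contract.pdf" is a
# placeholder path):
#
#     from rnsr.benchmarks.comprehensive_benchmark import quick_benchmark
#
#     out = quick_benchmark("contract.pdf", "Who are the parties?")
#     for method, res in out.items():
#         if res:
#             print(method, res.answer, f"{res.latency_ms:.0f}ms")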