rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/benchmarks/quality.py
@@ -0,0 +1,321 @@
+"""
+Quality Benchmarks
+
+Measures retrieval and answer quality:
+- Precision/Recall for retrieved nodes
+- Answer relevance (semantic similarity)
+- Context coverage
+- Faithfulness (answer grounded in retrieved content)
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any
+
+import structlog
+
+logger = structlog.get_logger(__name__)
+
+
+@dataclass
+class RetrievalMetrics:
+    """Metrics for a single retrieval evaluation."""
+
+    question: str
+    expected_nodes: list[str]  # Ground truth node IDs
+    retrieved_nodes: list[str]  # Actually retrieved node IDs
+
+    @property
+    def precision(self) -> float:
+        """Fraction of retrieved nodes that are relevant."""
+        if not self.retrieved_nodes:
+            return 0.0
+        relevant = set(self.expected_nodes) & set(self.retrieved_nodes)
+        return len(relevant) / len(self.retrieved_nodes)
+
+    @property
+    def recall(self) -> float:
+        """Fraction of relevant nodes that were retrieved."""
+        if not self.expected_nodes:
+            return 1.0  # No expected = perfect recall
+        relevant = set(self.expected_nodes) & set(self.retrieved_nodes)
+        return len(relevant) / len(self.expected_nodes)
+
+    @property
+    def f1(self) -> float:
+        """Harmonic mean of precision and recall."""
+        p, r = self.precision, self.recall
+        if p + r == 0:
+            return 0.0
+        return 2 * (p * r) / (p + r)
+
+    @property
+    def hit_rate(self) -> float:
+        """Whether at least one relevant node was retrieved."""
+        return 1.0 if set(self.expected_nodes) & set(self.retrieved_nodes) else 0.0
+
+
+@dataclass
+class AnswerMetrics:
+    """Metrics for answer quality."""
+
+    question: str
+    expected_answer: str
+    generated_answer: str
+    semantic_similarity: float = 0.0  # Cosine similarity of embeddings
+    rouge_l: float = 0.0  # ROUGE-L score
+    contains_expected: bool = False  # Whether key info is present
+
+    def to_dict(self) -> dict[str, Any]:
+        return {
+            "question": self.question,
+            "semantic_similarity": self.semantic_similarity,
+            "rouge_l": self.rouge_l,
+            "contains_expected": self.contains_expected,
+        }
+
+
+@dataclass
+class QualityMetrics:
+    """Aggregated quality metrics across all test cases."""
+
+    retrieval_metrics: list[RetrievalMetrics] = field(default_factory=list)
+    answer_metrics: list[AnswerMetrics] = field(default_factory=list)
+
+    @property
+    def avg_precision(self) -> float:
+        if not self.retrieval_metrics:
+            return 0.0
+        return sum(m.precision for m in self.retrieval_metrics) / len(self.retrieval_metrics)
+
+    @property
+    def avg_recall(self) -> float:
+        if not self.retrieval_metrics:
+            return 0.0
+        return sum(m.recall for m in self.retrieval_metrics) / len(self.retrieval_metrics)
+
+    @property
+    def avg_f1(self) -> float:
+        if not self.retrieval_metrics:
+            return 0.0
+        return sum(m.f1 for m in self.retrieval_metrics) / len(self.retrieval_metrics)
+
+    @property
+    def hit_rate(self) -> float:
+        if not self.retrieval_metrics:
+            return 0.0
+        return sum(m.hit_rate for m in self.retrieval_metrics) / len(self.retrieval_metrics)
+
+    @property
+    def avg_semantic_similarity(self) -> float:
+        if not self.answer_metrics:
+            return 0.0
+        return sum(m.semantic_similarity for m in self.answer_metrics) / len(self.answer_metrics)
+
+    def summary(self) -> dict[str, float]:
+        return {
+            "precision": self.avg_precision,
+            "recall": self.avg_recall,
+            "f1": self.avg_f1,
+            "hit_rate": self.hit_rate,
+            "semantic_similarity": self.avg_semantic_similarity,
+            "test_cases": len(self.retrieval_metrics),
+        }
+
+    def __str__(self) -> str:
+        s = self.summary()
+        return (
+            f"=== Quality Metrics ===\n"
+            f"Precision: {s['precision']:.3f}\n"
+            f"Recall: {s['recall']:.3f}\n"
+            f"F1 Score: {s['f1']:.3f}\n"
+            f"Hit Rate: {s['hit_rate']:.3f}\n"
+            f"Semantic Similarity: {s['semantic_similarity']:.3f}\n"
+            f"Test Cases: {s['test_cases']}"
+        )
+
+
+@dataclass
+class TestCase:
+    """A single test case for quality evaluation."""
+
+    question: str
+    expected_nodes: list[str] = field(default_factory=list)
+    expected_answer: str = ""
+    keywords: list[str] = field(default_factory=list)  # Keywords that should appear
+
+
+@dataclass
+class QualityBenchmark:
+    """Quality benchmark configuration and results."""
+
+    test_cases: list[TestCase] = field(default_factory=list)
+    metrics: QualityMetrics = field(default_factory=QualityMetrics)
+
+    @classmethod
+    def from_json(cls, path: Path | str) -> "QualityBenchmark":
+        """Load test cases from JSON file."""
+        path = Path(path)
+        with open(path) as f:
+            data = json.load(f)
+
+        test_cases = [
+            TestCase(
+                question=tc["question"],
+                expected_nodes=tc.get("expected_nodes", []),
+                expected_answer=tc.get("expected_answer", ""),
+                keywords=tc.get("keywords", []),
+            )
+            for tc in data.get("test_cases", [])
+        ]
+
+        return cls(test_cases=test_cases)
+
+    def to_json(self, path: Path | str) -> None:
+        """Save results to JSON file."""
+        path = Path(path)
+        data = {
+            "test_cases": [
+                {
+                    "question": tc.question,
+                    "expected_nodes": tc.expected_nodes,
+                    "expected_answer": tc.expected_answer,
+                    "keywords": tc.keywords,
+                }
+                for tc in self.test_cases
+            ],
+            "metrics": self.metrics.summary(),
+        }
+        with open(path, "w") as f:
+            json.dump(data, f, indent=2)
+
+
+def compute_semantic_similarity(text1: str, text2: str) -> float:
+    """
+    Compute semantic similarity between two texts using embeddings.
+
+    Returns cosine similarity (typically in [0, 1] for natural-language text),
+    or 0.0 if sentence-transformers is not installed.
+    """
+    try:
+        from sentence_transformers import SentenceTransformer
+        import numpy as np
+
+        model = SentenceTransformer("all-MiniLM-L6-v2")
+        embeddings = model.encode([text1, text2])
+
+        # Cosine similarity
+        similarity = np.dot(embeddings[0], embeddings[1]) / (
+            np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
+        )
+
+        return float(similarity)
+    except ImportError:
+        logger.warning("sentence-transformers not installed, returning 0")
+        return 0.0
+
+
+def compute_rouge_l(reference: str, candidate: str) -> float:
+    """
+    Compute ROUGE-L score (longest common subsequence).
+
+    Returns F1-based ROUGE-L score.
+    """
+    def lcs_length(x: list[str], y: list[str]) -> int:
+        """Compute length of the longest common subsequence of two token lists."""
+        m, n = len(x), len(y)
+        dp = [[0] * (n + 1) for _ in range(m + 1)]
+
+        for i in range(1, m + 1):
+            for j in range(1, n + 1):
+                if x[i - 1] == y[j - 1]:
+                    dp[i][j] = dp[i - 1][j - 1] + 1
+                else:
+                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
+
+        return dp[m][n]
+
+    # Tokenize (simple word-level)
+    ref_tokens = reference.lower().split()
+    cand_tokens = candidate.lower().split()
+
+    if not ref_tokens or not cand_tokens:
+        return 0.0
+
+    # Word-level LCS, so precision and recall stay in [0, 1]
+    lcs = lcs_length(ref_tokens, cand_tokens)
+
+    precision = lcs / len(cand_tokens)
+    recall = lcs / len(ref_tokens)
+
+    if precision + recall == 0:
+        return 0.0
+
+    return 2 * precision * recall / (precision + recall)
+
+
+def evaluate_retrieval(
+    skeleton: dict,
+    kv_store: Any,
+    test_cases: list[TestCase],
+    compute_answer_quality: bool = True,
+) -> QualityMetrics:
+    """
+    Evaluate retrieval quality against test cases.
+
+    Args:
+        skeleton: Skeleton index
+        kv_store: KV store with content
+        test_cases: List of test cases with ground truth
+        compute_answer_quality: Whether to compute answer metrics (slower)
+
+    Returns:
+        QualityMetrics with all evaluation results
+    """
+    from rnsr.agent import run_navigator
+
+    metrics = QualityMetrics()
+
+    for i, tc in enumerate(test_cases):
+        logger.info("evaluating_test_case", index=i + 1, total=len(test_cases))
+
+        # Run the navigator
+        result = run_navigator(
+            question=tc.question,
+            skeleton=skeleton,
+            kv_store=kv_store,
+            max_iterations=20,
+        )
+
+        # Extract retrieved nodes
+        retrieved_nodes = result.get("nodes_visited", [])
+
+        # Retrieval metrics
+        retrieval = RetrievalMetrics(
+            question=tc.question,
+            expected_nodes=tc.expected_nodes,
+            retrieved_nodes=retrieved_nodes,
+        )
+        metrics.retrieval_metrics.append(retrieval)
+
+        # Answer quality metrics
+        if compute_answer_quality and tc.expected_answer:
+            generated = result.get("answer", "")
+
+            answer = AnswerMetrics(
+                question=tc.question,
+                expected_answer=tc.expected_answer,
+                generated_answer=generated,
+                semantic_similarity=compute_semantic_similarity(
+                    tc.expected_answer, generated
+                ),
+                rouge_l=compute_rouge_l(tc.expected_answer, generated),
+                contains_expected=all(
+                    kw.lower() in generated.lower()
+                    for kw in tc.keywords
+                ) if tc.keywords else True,
+            )
+            metrics.answer_metrics.append(answer)
+
+    return metrics
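
For orientation, here is a minimal usage sketch of the quality metrics defined above. It is an editorial example rather than part of the packaged file: the question, node IDs, and answer strings are invented purely to illustrate how RetrievalMetrics and compute_rouge_l behave.

# Editorial example (not in the package): toy inputs for the metrics above.
from rnsr.benchmarks.quality import RetrievalMetrics, compute_rouge_l

m = RetrievalMetrics(
    question="What was total revenue in 2023?",   # hypothetical question
    expected_nodes=["sec-3.1", "tbl-revenue"],    # hypothetical ground-truth node IDs
    retrieved_nodes=["sec-3.1", "sec-3.2"],       # hypothetical retrieval output
)
# One of two retrieved nodes is relevant and one of two expected nodes was found,
# so precision = recall = f1 = 0.5 and hit_rate = 1.0.
print(m.precision, m.recall, m.f1, m.hit_rate)

# ROUGE-L (LCS-based F1) between an expected and a generated answer.
print(compute_rouge_l("total revenue was $4.2B", "revenue was $4.2B in 2023"))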
rnsr/benchmarks/runner.py
@@ -0,0 +1,298 @@
+"""
+Benchmark Runner
+
+Orchestrates performance and quality benchmarks,
+compares RNSR against baseline approaches.
+"""
+
+from __future__ import annotations
+
+import json
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Literal
+
+import structlog
+
+from rnsr.benchmarks.performance import (
+    BenchmarkResult,
+    PerformanceBenchmark,
+    run_end_to_end_benchmark,
+)
+from rnsr.benchmarks.quality import (
+    QualityBenchmark,
+    QualityMetrics,
+    TestCase,
+    evaluate_retrieval,
+)
+
+logger = structlog.get_logger(__name__)
+
+
+@dataclass
+class BenchmarkConfig:
+    """Configuration for benchmark runs."""
+
+    # Files to benchmark
+    pdf_paths: list[Path] = field(default_factory=list)
+
+    # Test questions
+    questions: list[str] = field(default_factory=list)
+
+    # Quality test cases (optional)
+    quality_test_cases: list[TestCase] = field(default_factory=list)
+
+    # Settings
+    iterations: int = 1
+    warmup: bool = True
+    compute_quality: bool = True
+    compare_baseline: bool = False
+
+    # Output
+    output_dir: Path = field(default_factory=lambda: Path("benchmark_results"))
+
+    @classmethod
+    def from_json(cls, path: Path | str) -> "BenchmarkConfig":
+        """Load config from JSON file."""
+        path = Path(path)
+        with open(path) as f:
+            data = json.load(f)
+
+        return cls(
+            pdf_paths=[Path(p) for p in data.get("pdf_paths", [])],
+            questions=data.get("questions", []),
+            quality_test_cases=[
+                TestCase(
+                    question=tc["question"],
+                    expected_nodes=tc.get("expected_nodes", []),
+                    expected_answer=tc.get("expected_answer", ""),
+                    keywords=tc.get("keywords", []),
+                )
+                for tc in data.get("quality_test_cases", [])
+            ],
+            iterations=data.get("iterations", 1),
+            warmup=data.get("warmup", True),
+            compute_quality=data.get("compute_quality", True),
+            compare_baseline=data.get("compare_baseline", False),
+            output_dir=Path(data.get("output_dir", "benchmark_results")),
+        )
+
+
+@dataclass
+class ComparisonResult:
+    """Comparison between RNSR and baseline."""
+
+    method: Literal["rnsr", "baseline_chunk", "baseline_semantic"]
+    performance: PerformanceBenchmark | None = None
+    quality: QualityMetrics | None = None
+
+    def summary(self) -> dict[str, Any]:
+        return {
+            "method": self.method,
+            "performance": self.performance.summary() if self.performance else None,
+            "quality": self.quality.summary() if self.quality else None,
+        }
+
+
+@dataclass
+class BenchmarkReport:
+    """Complete benchmark report."""
+
+    timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
+    config: BenchmarkConfig | None = None
+
+    # Results per file
+    file_results: dict[str, dict[str, Any]] = field(default_factory=dict)
+
+    # Aggregated results
+    rnsr_performance: PerformanceBenchmark | None = None
+    rnsr_quality: QualityMetrics | None = None
+
+    # Comparison (if enabled)
+    comparisons: list[ComparisonResult] = field(default_factory=list)
+
+    def to_json(self, path: Path | str) -> None:
+        """Save report to JSON."""
+        path = Path(path)
+        path.parent.mkdir(parents=True, exist_ok=True)
+
+        data = {
+            "timestamp": self.timestamp,
+            "file_results": self.file_results,
+            "rnsr_performance": self.rnsr_performance.summary() if self.rnsr_performance else None,
+            "rnsr_quality": self.rnsr_quality.summary() if self.rnsr_quality else None,
+            "comparisons": [c.summary() for c in self.comparisons],
+        }
+
+        with open(path, "w") as f:
+            json.dump(data, f, indent=2)
+
+        logger.info("benchmark_report_saved", path=str(path))
+
+    def print_summary(self) -> None:
+        """Print human-readable summary."""
+        print("\n" + "=" * 60)
+        print("RNSR BENCHMARK REPORT")
+        print("=" * 60)
+        print(f"Timestamp: {self.timestamp}")
+        print(f"Files benchmarked: {len(self.file_results)}")
+
+        if self.rnsr_performance:
+            print("\n--- Performance ---")
+            print(self.rnsr_performance)
+
+        if self.rnsr_quality:
+            print("\n--- Quality ---")
+            print(self.rnsr_quality)
+
+        if self.comparisons:
+            print("\n--- Comparison ---")
+            for comp in self.comparisons:
+                print(f"\n{comp.method}:")
+                if comp.performance:
+                    perf = comp.performance.summary()
+                    print(f" Query Latency: {perf.get('query_warm_ms', 'N/A')}ms")
+                if comp.quality:
+                    qual = comp.quality.summary()
+                    print(f" F1 Score: {qual.get('f1', 'N/A'):.3f}")
+
+        print("\n" + "=" * 60)
+
+
+class BenchmarkRunner:
+    """Runs benchmarks according to configuration."""
+
+    def __init__(self, config: BenchmarkConfig):
+        self.config = config
+        self.report = BenchmarkReport(config=config)
+
+    def run(self) -> BenchmarkReport:
+        """Run all configured benchmarks."""
+        logger.info("benchmark_runner_start", file_count=len(self.config.pdf_paths))
+
+        all_performance: list[PerformanceBenchmark] = []
+        all_quality: list[QualityMetrics] = []
+
+        for pdf_path in self.config.pdf_paths:
+            if not pdf_path.exists():
+                logger.warning("pdf_not_found", path=str(pdf_path))
+                continue
+
+            logger.info("benchmarking_file", file=pdf_path.name)
+
+            # Performance benchmark
+            perf = run_end_to_end_benchmark(
+                pdf_path,
+                self.config.questions,
+            )
+            all_performance.append(perf)
+
+            # Quality benchmark (if test cases provided)
+            quality = None
+            if self.config.compute_quality and self.config.quality_test_cases:
+                from rnsr.indexing import build_skeleton_index
+                from rnsr.ingestion import ingest_document
+
+                result = ingest_document(pdf_path)
+                skeleton, kv_store = build_skeleton_index(result.tree)
+
+                quality = evaluate_retrieval(
+                    skeleton,
+                    kv_store,
+                    self.config.quality_test_cases,
+                )
+                all_quality.append(quality)
+
+            # Store per-file results
+            self.report.file_results[pdf_path.name] = {
+                "performance": perf.summary(),
+                "quality": quality.summary() if quality else None,
+            }
+
+        # Aggregate results
+        if all_performance:
+            self.report.rnsr_performance = self._aggregate_performance(all_performance)
+
+        if all_quality:
+            self.report.rnsr_quality = self._aggregate_quality(all_quality)
+
+        # Run baseline comparison if enabled
+        if self.config.compare_baseline:
+            self._run_baseline_comparison()
+
+        logger.info("benchmark_runner_complete")
+
+        return self.report
+
+    def _aggregate_performance(
+        self,
+        results: list[PerformanceBenchmark],
+    ) -> PerformanceBenchmark:
+        """Aggregate multiple performance results."""
+        # For now, just return the last one
+        # TODO: Implement proper averaging
+        return results[-1] if results else PerformanceBenchmark()
+
+    def _aggregate_quality(
+        self,
+        results: list[QualityMetrics],
+    ) -> QualityMetrics:
+        """Aggregate multiple quality results."""
+        combined = QualityMetrics()
+        for r in results:
+            combined.retrieval_metrics.extend(r.retrieval_metrics)
+            combined.answer_metrics.extend(r.answer_metrics)
+        return combined

+    def _run_baseline_comparison(self) -> None:
+        """Run baseline chunking approach for comparison."""
+        logger.info("running_baseline_comparison")
+
+        # TODO: Implement baseline comparison
+        # This would use simple fixed-size chunking instead of RNSR
+
+        baseline = ComparisonResult(
+            method="baseline_chunk",
+            performance=None,  # Would run baseline benchmark
+            quality=None,  # Would evaluate baseline quality
+        )
+        self.report.comparisons.append(baseline)
+
+
+def run_full_benchmark(
+    pdf_paths: list[Path | str],
+    questions: list[str] | None = None,
+    output_dir: Path | str = "benchmark_results",
+) -> BenchmarkReport:
+    """
+    Convenience function to run a full benchmark.
+
+    Args:
+        pdf_paths: List of PDF files to benchmark
+        questions: Test questions (optional)
+        output_dir: Directory for results
+
+    Returns:
+        BenchmarkReport with all results
+    """
+    config = BenchmarkConfig(
+        pdf_paths=[Path(p) for p in pdf_paths],
+        questions=questions or [
+            "What is this document about?",
+            "What are the main sections?",
+            "What are the key findings?",
+        ],
+        output_dir=Path(output_dir),
+    )
+
+    runner = BenchmarkRunner(config)
+    report = runner.run()
+
+    # Save results
+    output_path = config.output_dir / f"benchmark_{int(time.time())}.json"
+    report.to_json(output_path)
+    report.print_summary()
+
+    return report
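
To close, a sketch of how the runner module above might be driven, either through run_full_benchmark or through a JSON config consumed by BenchmarkConfig.from_json. This is editorial, not part of the packaged file: the PDF path, questions, and config file name are placeholders, and the JSON keys simply mirror the ones from_json reads.

# Editorial example (not in the package): placeholder paths and questions.
from rnsr.benchmarks.runner import BenchmarkConfig, BenchmarkRunner, run_full_benchmark

# One-call form: benchmark a set of PDFs, save a timestamped JSON report,
# and print the summary.
report = run_full_benchmark(
    pdf_paths=["reports/annual_2023.pdf"],        # placeholder input file
    questions=["What are the key findings?"],
    output_dir="benchmark_results",
)

# Config-driven form. A config file read by BenchmarkConfig.from_json could look like:
# {
#   "pdf_paths": ["reports/annual_2023.pdf"],
#   "questions": ["What are the key findings?"],
#   "quality_test_cases": [
#     {"question": "What are the key findings?",
#      "expected_nodes": ["sec-5"],
#      "keywords": ["findings"]}
#   ],
#   "compare_baseline": false,
#   "output_dir": "benchmark_results"
# }
config = BenchmarkConfig.from_json("benchmark_config.json")   # placeholder path
report = BenchmarkRunner(config).run()
print(report.rnsr_quality)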