rnsr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/benchmarks/quality.py
@@ -0,0 +1,321 @@
"""
Quality Benchmarks

Measures retrieval and answer quality:
- Precision/Recall for retrieved nodes
- Answer relevance (semantic similarity)
- Context coverage
- Faithfulness (answer grounded in retrieved content)
"""

from __future__ import annotations

import json
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import structlog

logger = structlog.get_logger(__name__)


@dataclass
class RetrievalMetrics:
    """Metrics for a single retrieval evaluation."""

    question: str
    expected_nodes: list[str]  # Ground truth node IDs
    retrieved_nodes: list[str]  # Actually retrieved node IDs

    @property
    def precision(self) -> float:
        """Fraction of retrieved nodes that are relevant."""
        if not self.retrieved_nodes:
            return 0.0
        relevant = set(self.expected_nodes) & set(self.retrieved_nodes)
        return len(relevant) / len(self.retrieved_nodes)

    @property
    def recall(self) -> float:
        """Fraction of relevant nodes that were retrieved."""
        if not self.expected_nodes:
            return 1.0  # No expected = perfect recall
        relevant = set(self.expected_nodes) & set(self.retrieved_nodes)
        return len(relevant) / len(self.expected_nodes)

    @property
    def f1(self) -> float:
        """Harmonic mean of precision and recall."""
        p, r = self.precision, self.recall
        if p + r == 0:
            return 0.0
        return 2 * (p * r) / (p + r)

    @property
    def hit_rate(self) -> float:
        """Whether at least one relevant node was retrieved."""
        return 1.0 if set(self.expected_nodes) & set(self.retrieved_nodes) else 0.0


@dataclass
class AnswerMetrics:
    """Metrics for answer quality."""

    question: str
    expected_answer: str
    generated_answer: str
    semantic_similarity: float = 0.0  # Cosine similarity of embeddings
    rouge_l: float = 0.0  # ROUGE-L score
    contains_expected: bool = False  # Whether key info is present

    def to_dict(self) -> dict[str, Any]:
        return {
            "question": self.question,
            "semantic_similarity": self.semantic_similarity,
            "rouge_l": self.rouge_l,
            "contains_expected": self.contains_expected,
        }


@dataclass
class QualityMetrics:
    """Aggregated quality metrics across all test cases."""

    retrieval_metrics: list[RetrievalMetrics] = field(default_factory=list)
    answer_metrics: list[AnswerMetrics] = field(default_factory=list)

    @property
    def avg_precision(self) -> float:
        if not self.retrieval_metrics:
            return 0.0
        return sum(m.precision for m in self.retrieval_metrics) / len(self.retrieval_metrics)

    @property
    def avg_recall(self) -> float:
        if not self.retrieval_metrics:
            return 0.0
        return sum(m.recall for m in self.retrieval_metrics) / len(self.retrieval_metrics)

    @property
    def avg_f1(self) -> float:
        if not self.retrieval_metrics:
            return 0.0
        return sum(m.f1 for m in self.retrieval_metrics) / len(self.retrieval_metrics)

    @property
    def hit_rate(self) -> float:
        if not self.retrieval_metrics:
            return 0.0
        return sum(m.hit_rate for m in self.retrieval_metrics) / len(self.retrieval_metrics)

    @property
    def avg_semantic_similarity(self) -> float:
        if not self.answer_metrics:
            return 0.0
        return sum(m.semantic_similarity for m in self.answer_metrics) / len(self.answer_metrics)

    def summary(self) -> dict[str, float]:
        return {
            "precision": self.avg_precision,
            "recall": self.avg_recall,
            "f1": self.avg_f1,
            "hit_rate": self.hit_rate,
            "semantic_similarity": self.avg_semantic_similarity,
            "test_cases": len(self.retrieval_metrics),
        }

    def __str__(self) -> str:
        s = self.summary()
        return (
            f"=== Quality Metrics ===\n"
            f"Precision: {s['precision']:.3f}\n"
            f"Recall: {s['recall']:.3f}\n"
            f"F1 Score: {s['f1']:.3f}\n"
            f"Hit Rate: {s['hit_rate']:.3f}\n"
            f"Semantic Similarity: {s['semantic_similarity']:.3f}\n"
            f"Test Cases: {s['test_cases']}"
        )


@dataclass
class TestCase:
    """A single test case for quality evaluation."""

    question: str
    expected_nodes: list[str] = field(default_factory=list)
    expected_answer: str = ""
    keywords: list[str] = field(default_factory=list)  # Keywords that should appear


@dataclass
class QualityBenchmark:
    """Quality benchmark configuration and results."""

    test_cases: list[TestCase] = field(default_factory=list)
    metrics: QualityMetrics = field(default_factory=QualityMetrics)

    @classmethod
    def from_json(cls, path: Path | str) -> "QualityBenchmark":
        """Load test cases from JSON file."""
        path = Path(path)
        with open(path) as f:
            data = json.load(f)

        test_cases = [
            TestCase(
                question=tc["question"],
                expected_nodes=tc.get("expected_nodes", []),
                expected_answer=tc.get("expected_answer", ""),
                keywords=tc.get("keywords", []),
            )
            for tc in data.get("test_cases", [])
        ]

        return cls(test_cases=test_cases)

    def to_json(self, path: Path | str) -> None:
        """Save results to JSON file."""
        path = Path(path)
        data = {
            "test_cases": [
                {
                    "question": tc.question,
                    "expected_nodes": tc.expected_nodes,
                    "expected_answer": tc.expected_answer,
                    "keywords": tc.keywords,
                }
                for tc in self.test_cases
            ],
            "metrics": self.metrics.summary(),
        }
        with open(path, "w") as f:
            json.dump(data, f, indent=2)


def compute_semantic_similarity(text1: str, text2: str) -> float:
    """
    Compute semantic similarity between two texts using embeddings.

    Returns cosine similarity in range [0, 1].
    """
    try:
        from sentence_transformers import SentenceTransformer
        import numpy as np

        model = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = model.encode([text1, text2])

        # Cosine similarity
        similarity = np.dot(embeddings[0], embeddings[1]) / (
            np.linalg.norm(embeddings[0]) * np.linalg.norm(embeddings[1])
        )

        return float(similarity)
    except ImportError:
        logger.warning("sentence-transformers not installed, returning 0")
        return 0.0


def compute_rouge_l(reference: str, candidate: str) -> float:
    """
    Compute ROUGE-L score (longest common subsequence).

    Returns F1-based ROUGE-L score.
    """
    def lcs_length(x: list[str], y: list[str]) -> int:
        """Compute length of the longest common subsequence of two token lists."""
        m, n = len(x), len(y)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if x[i - 1] == y[j - 1]:
                    dp[i][j] = dp[i - 1][j - 1] + 1
                else:
                    dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])

        return dp[m][n]

    # Tokenize (simple word-level)
    ref_tokens = reference.lower().split()
    cand_tokens = candidate.lower().split()

    if not ref_tokens or not cand_tokens:
        return 0.0

    # LCS over the token sequences (not joined strings), so precision/recall stay in [0, 1]
    lcs = lcs_length(ref_tokens, cand_tokens)

    precision = lcs / len(cand_tokens)
    recall = lcs / len(ref_tokens)

    if precision + recall == 0:
        return 0.0

    return 2 * precision * recall / (precision + recall)


def evaluate_retrieval(
    skeleton: dict,
    kv_store: Any,
    test_cases: list[TestCase],
    compute_answer_quality: bool = True,
) -> QualityMetrics:
    """
    Evaluate retrieval quality against test cases.

    Args:
        skeleton: Skeleton index
        kv_store: KV store with content
        test_cases: List of test cases with ground truth
        compute_answer_quality: Whether to compute answer metrics (slower)

    Returns:
        QualityMetrics with all evaluation results
    """
    from rnsr.agent import run_navigator

    metrics = QualityMetrics()

    for i, tc in enumerate(test_cases):
        logger.info("evaluating_test_case", index=i + 1, total=len(test_cases))

        # Run the navigator
        result = run_navigator(
            question=tc.question,
            skeleton=skeleton,
            kv_store=kv_store,
            max_iterations=20,
        )

        # Extract retrieved nodes
        retrieved_nodes = result.get("nodes_visited", [])

        # Retrieval metrics
        retrieval = RetrievalMetrics(
            question=tc.question,
            expected_nodes=tc.expected_nodes,
            retrieved_nodes=retrieved_nodes,
        )
        metrics.retrieval_metrics.append(retrieval)

        # Answer quality metrics
        if compute_answer_quality and tc.expected_answer:
            generated = result.get("answer", "")

            answer = AnswerMetrics(
                question=tc.question,
                expected_answer=tc.expected_answer,
                generated_answer=generated,
                semantic_similarity=compute_semantic_similarity(
                    tc.expected_answer, generated
                ),
                rouge_l=compute_rouge_l(tc.expected_answer, generated),
                contains_expected=all(
                    kw.lower() in generated.lower()
                    for kw in tc.keywords
                ) if tc.keywords else True,
            )
            metrics.answer_metrics.append(answer)

    return metrics
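For orientation, the metric helpers above can be exercised on their own. A minimal sketch; the question text, node IDs, and file name are made up for illustration, not taken from the package:

from rnsr.benchmarks.quality import QualityBenchmark, RetrievalMetrics, compute_rouge_l

# Hypothetical node IDs, purely for illustration
m = RetrievalMetrics(
    question="What was FY2023 revenue?",
    expected_nodes=["sec-3.1", "tbl-7"],
    retrieved_nodes=["sec-3.1", "sec-4.2"],
)
print(m.precision, m.recall, m.f1, m.hit_rate)  # 0.5 0.5 0.5 1.0

print(compute_rouge_l("revenue grew 12 percent", "revenue grew by 12 percent"))

# Ground-truth cases can also be loaded from a JSON file shaped like
# {"test_cases": [{"question": "...", "expected_nodes": [...], "keywords": [...]}]}
# bench = QualityBenchmark.from_json("quality_cases.json")  # hypothetical path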
rnsr/benchmarks/runner.py
@@ -0,0 +1,298 @@
"""
Benchmark Runner

Orchestrates performance and quality benchmarks,
compares RNSR against baseline approaches.
"""

from __future__ import annotations

import json
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal

import structlog

from rnsr.benchmarks.performance import (
    BenchmarkResult,
    PerformanceBenchmark,
    run_end_to_end_benchmark,
)
from rnsr.benchmarks.quality import (
    QualityBenchmark,
    QualityMetrics,
    TestCase,
    evaluate_retrieval,
)

logger = structlog.get_logger(__name__)


@dataclass
class BenchmarkConfig:
    """Configuration for benchmark runs."""

    # Files to benchmark
    pdf_paths: list[Path] = field(default_factory=list)

    # Test questions
    questions: list[str] = field(default_factory=list)

    # Quality test cases (optional)
    quality_test_cases: list[TestCase] = field(default_factory=list)

    # Settings
    iterations: int = 1
    warmup: bool = True
    compute_quality: bool = True
    compare_baseline: bool = False

    # Output
    output_dir: Path = field(default_factory=lambda: Path("benchmark_results"))

    @classmethod
    def from_json(cls, path: Path | str) -> "BenchmarkConfig":
        """Load config from JSON file."""
        path = Path(path)
        with open(path) as f:
            data = json.load(f)

        return cls(
            pdf_paths=[Path(p) for p in data.get("pdf_paths", [])],
            questions=data.get("questions", []),
            quality_test_cases=[
                TestCase(
                    question=tc["question"],
                    expected_nodes=tc.get("expected_nodes", []),
                    expected_answer=tc.get("expected_answer", ""),
                    keywords=tc.get("keywords", []),
                )
                for tc in data.get("quality_test_cases", [])
            ],
            iterations=data.get("iterations", 1),
            warmup=data.get("warmup", True),
            compute_quality=data.get("compute_quality", True),
            compare_baseline=data.get("compare_baseline", False),
            output_dir=Path(data.get("output_dir", "benchmark_results")),
        )


@dataclass
class ComparisonResult:
    """Comparison between RNSR and baseline."""

    method: Literal["rnsr", "baseline_chunk", "baseline_semantic"]
    performance: PerformanceBenchmark | None = None
    quality: QualityMetrics | None = None

    def summary(self) -> dict[str, Any]:
        return {
            "method": self.method,
            "performance": self.performance.summary() if self.performance else None,
            "quality": self.quality.summary() if self.quality else None,
        }


@dataclass
class BenchmarkReport:
    """Complete benchmark report."""

    timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    config: BenchmarkConfig | None = None

    # Results per file
    file_results: dict[str, dict[str, Any]] = field(default_factory=dict)

    # Aggregated results
    rnsr_performance: PerformanceBenchmark | None = None
    rnsr_quality: QualityMetrics | None = None

    # Comparison (if enabled)
    comparisons: list[ComparisonResult] = field(default_factory=list)

    def to_json(self, path: Path | str) -> None:
        """Save report to JSON."""
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)

        data = {
            "timestamp": self.timestamp,
            "file_results": self.file_results,
            "rnsr_performance": self.rnsr_performance.summary() if self.rnsr_performance else None,
            "rnsr_quality": self.rnsr_quality.summary() if self.rnsr_quality else None,
            "comparisons": [c.summary() for c in self.comparisons],
        }

        with open(path, "w") as f:
            json.dump(data, f, indent=2)

        logger.info("benchmark_report_saved", path=str(path))

    def print_summary(self) -> None:
        """Print human-readable summary."""
        print("\n" + "=" * 60)
        print("RNSR BENCHMARK REPORT")
        print("=" * 60)
        print(f"Timestamp: {self.timestamp}")
        print(f"Files benchmarked: {len(self.file_results)}")

        if self.rnsr_performance:
            print("\n--- Performance ---")
            print(self.rnsr_performance)

        if self.rnsr_quality:
            print("\n--- Quality ---")
            print(self.rnsr_quality)

        if self.comparisons:
            print("\n--- Comparison ---")
            for comp in self.comparisons:
                print(f"\n{comp.method}:")
                if comp.performance:
                    perf = comp.performance.summary()
                    print(f"  Query Latency: {perf.get('query_warm_ms', 'N/A')}ms")
                if comp.quality:
                    qual = comp.quality.summary()
                    print(f"  F1 Score: {qual.get('f1', 'N/A'):.3f}")

        print("\n" + "=" * 60)


class BenchmarkRunner:
    """Runs benchmarks according to configuration."""

    def __init__(self, config: BenchmarkConfig):
        self.config = config
        self.report = BenchmarkReport(config=config)

    def run(self) -> BenchmarkReport:
        """Run all configured benchmarks."""
        logger.info("benchmark_runner_start", file_count=len(self.config.pdf_paths))

        all_performance: list[PerformanceBenchmark] = []
        all_quality: list[QualityMetrics] = []

        for pdf_path in self.config.pdf_paths:
            if not pdf_path.exists():
                logger.warning("pdf_not_found", path=str(pdf_path))
                continue

            logger.info("benchmarking_file", file=pdf_path.name)

            # Performance benchmark
            perf = run_end_to_end_benchmark(
                pdf_path,
                self.config.questions,
            )
            all_performance.append(perf)

            # Quality benchmark (if test cases provided)
            quality = None
            if self.config.compute_quality and self.config.quality_test_cases:
                from rnsr.indexing import build_skeleton_index
                from rnsr.ingestion import ingest_document

                result = ingest_document(pdf_path)
                skeleton, kv_store = build_skeleton_index(result.tree)

                quality = evaluate_retrieval(
                    skeleton,
                    kv_store,
                    self.config.quality_test_cases,
                )
                all_quality.append(quality)

            # Store per-file results
            self.report.file_results[pdf_path.name] = {
                "performance": perf.summary(),
                "quality": quality.summary() if quality else None,
            }

        # Aggregate results
        if all_performance:
            self.report.rnsr_performance = self._aggregate_performance(all_performance)

        if all_quality:
            self.report.rnsr_quality = self._aggregate_quality(all_quality)

        # Run baseline comparison if enabled
        if self.config.compare_baseline:
            self._run_baseline_comparison()

        logger.info("benchmark_runner_complete")

        return self.report

    def _aggregate_performance(
        self,
        results: list[PerformanceBenchmark],
    ) -> PerformanceBenchmark:
        """Aggregate multiple performance results."""
        # For now, just return the last one
        # TODO: Implement proper averaging
        return results[-1] if results else PerformanceBenchmark()

    def _aggregate_quality(
        self,
        results: list[QualityMetrics],
    ) -> QualityMetrics:
        """Aggregate multiple quality results."""
        combined = QualityMetrics()
        for r in results:
            combined.retrieval_metrics.extend(r.retrieval_metrics)
            combined.answer_metrics.extend(r.answer_metrics)
        return combined

    def _run_baseline_comparison(self) -> None:
        """Run baseline chunking approach for comparison."""
        logger.info("running_baseline_comparison")

        # TODO: Implement baseline comparison
        # This would use simple fixed-size chunking instead of RNSR

        baseline = ComparisonResult(
            method="baseline_chunk",
            performance=None,  # Would run baseline benchmark
            quality=None,  # Would evaluate baseline quality
        )
        self.report.comparisons.append(baseline)


def run_full_benchmark(
    pdf_paths: list[Path | str],
    questions: list[str] | None = None,
    output_dir: Path | str = "benchmark_results",
) -> BenchmarkReport:
    """
    Convenience function to run a full benchmark.

    Args:
        pdf_paths: List of PDF files to benchmark
        questions: Test questions (optional)
        output_dir: Directory for results

    Returns:
        BenchmarkReport with all results
    """
    config = BenchmarkConfig(
        pdf_paths=[Path(p) for p in pdf_paths],
        questions=questions or [
            "What is this document about?",
            "What are the main sections?",
            "What are the key findings?",
        ],
        output_dir=Path(output_dir),
    )

    runner = BenchmarkRunner(config)
    report = runner.run()

    # Save results
    output_path = config.output_dir / f"benchmark_{int(time.time())}.json"
    report.to_json(output_path)
    report.print_summary()

    return report