rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/agent/provenance.py
ADDED
|
@@ -0,0 +1,772 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Provenance and Citation System
|
|
3
|
+
|
|
4
|
+
Every answer should trace back to exact document evidence.
|
|
5
|
+
Provides structured citations with:
|
|
6
|
+
- Exact document location (doc_id, node_id, page_num)
|
|
7
|
+
- Exact quote with character spans
|
|
8
|
+
- Confidence score per citation
|
|
9
|
+
- Contradiction detection when sources disagree
|
|
10
|
+
|
|
11
|
+
Critical for legal, academic, and enterprise use cases where
|
|
12
|
+
answers must be verifiable.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import hashlib
|
|
18
|
+
import json
|
|
19
|
+
import re
|
|
20
|
+
from dataclasses import dataclass, field
|
|
21
|
+
from datetime import datetime
|
|
22
|
+
from enum import Enum
|
|
23
|
+
from typing import Any
|
|
24
|
+
from uuid import uuid4
|
|
25
|
+
|
|
26
|
+
import structlog
|
|
27
|
+
|
|
28
|
+
logger = structlog.get_logger(__name__)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# =============================================================================
|
|
32
|
+
# Citation Models
|
|
33
|
+
# =============================================================================
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class CitationStrength(str, Enum):
    """How strongly a citation supports a claim.

    Subclasses ``str`` so members compare equal to, and serialize as,
    their plain string values.
    """

    DIRECT = "direct"  # Explicitly states the claim
    SUPPORTING = "supporting"  # Implies or supports the claim
    CONTEXTUAL = "contextual"  # Provides background context
    WEAK = "weak"  # Tangentially related
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class ContradictionType(str, Enum):
    """Types of contradictions between sources.

    Subclasses ``str`` so members compare equal to, and serialize as,
    their plain string values.
    """

    DIRECT = "direct"  # Sources directly contradict
    TEMPORAL = "temporal"  # Different time periods
    PARTIAL = "partial"  # Partially contradictory
    INTERPRETATION = "interpretation"  # Different interpretations
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class Citation:
    """
    A structured citation linking an answer to document evidence.

    Provides complete traceability from answer to source: the exact
    document location, the quoted span with character offsets, the
    surrounding context, and confidence/relevance scores.
    """

    # Unique identifier, e.g. "cite_1a2b3c4d".
    id: str = field(default_factory=lambda: f"cite_{str(uuid4())[:8]}")

    # Document location
    doc_id: str = ""
    node_id: str = ""
    page_num: int | None = None

    # Exact quote with character offsets into the source content
    quote: str = ""
    span_start: int | None = None
    span_end: int | None = None

    # Context around the quote (text immediately before/after the span)
    context_before: str = ""
    context_after: str = ""

    # Relevance and confidence
    strength: CitationStrength = CitationStrength.SUPPORTING
    confidence: float = 0.5
    relevance_score: float = 0.5

    # Claim this citation supports
    claim: str = ""

    # Metadata
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; kept
    # for output compatibility (naive ISO timestamp without a UTC offset).
    extracted_at: str = field(default_factory=lambda: datetime.utcnow().isoformat())
    section_header: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Convert to a JSON-serializable dictionary."""
        return {
            "id": self.id,
            "doc_id": self.doc_id,
            "node_id": self.node_id,
            "page_num": self.page_num,
            "quote": self.quote,
            "span_start": self.span_start,
            "span_end": self.span_end,
            "context_before": self.context_before,
            "context_after": self.context_after,
            "strength": self.strength.value,
            "confidence": self.confidence,
            "relevance_score": self.relevance_score,
            "claim": self.claim,
            "section_header": self.section_header,
            "extracted_at": self.extracted_at,
        }

    def to_formatted_string(self, include_context: bool = False) -> str:
        """Format the citation for display.

        Args:
            include_context: If True, append the surrounding context
                (up to 50 characters on each side of the quote).

        Returns:
            A one-line location + quote string, optionally followed by
            a context line.
        """
        parts = []

        if self.doc_id:
            parts.append(f"[{self.doc_id}]")

        if self.section_header:
            parts.append(f"Section: {self.section_header}")

        # Fix: explicit None check so a page number of 0 is still displayed.
        if self.page_num is not None:
            parts.append(f"Page {self.page_num}")

        location = ", ".join(parts) if parts else "Unknown location"

        # Truncate very long quotes for display purposes only.
        quote_display = self.quote
        if len(quote_display) > 200:
            quote_display = quote_display[:200] + "..."

        result = f'{location}: "{quote_display}"'

        if include_context and (self.context_before or self.context_after):
            # Fix: always emit the "Context:" label and the [QUOTE] marker,
            # even when only one side of the context is present (previously a
            # lone context_after produced an unlabeled "[QUOTE]..." fragment).
            before = f"...{self.context_before[-50:]}" if self.context_before else ""
            after = f"{self.context_after[:50]}..." if self.context_after else ""
            result += f"\n  Context: {before}[QUOTE]{after}"

        return result
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class Contradiction:
    """A detected contradiction between two citations."""

    # Unique identifier, e.g. "contra_1a2b3c4d".
    id: str = field(default_factory=lambda: f"contra_{str(uuid4())[:8]}")

    # IDs of the two conflicting citations.
    citation_1_id: str = ""
    citation_2_id: str = ""

    # How the sources conflict (direct / temporal / partial / interpretation).
    type: ContradictionType = ContradictionType.PARTIAL

    # Human-readable explanation and optional resolution hint.
    description: str = ""
    resolution_suggestion: str = ""

    # Detector confidence in the contradiction, 0.0-1.0.
    confidence: float = 0.5

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dictionary (enum flattened to its value)."""
        payload: dict[str, Any] = {"id": self.id}
        payload["citation_1_id"] = self.citation_1_id
        payload["citation_2_id"] = self.citation_2_id
        payload["type"] = self.type.value
        payload["description"] = self.description
        payload["resolution_suggestion"] = self.resolution_suggestion
        payload["confidence"] = self.confidence
        return payload
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@dataclass
class ProvenanceRecord:
    """
    Complete provenance for an answer.

    Bundles the answer with every supporting citation, any detected
    contradictions, and an aggregate confidence score.
    """

    # Unique identifier, e.g. "prov_1a2b3c4d".
    id: str = field(default_factory=lambda: f"prov_{str(uuid4())[:8]}")

    # The answer being traced, plus the question that produced it.
    answer: str = ""
    question: str = ""

    # All citations supporting this answer.
    citations: list[Citation] = field(default_factory=list)

    # Contradictions detected among the citations.
    contradictions: list[Contradiction] = field(default_factory=list)

    # Overall confidence derived from the citations.
    aggregate_confidence: float = 0.0

    # Short human-readable summary of evidence quality.
    evidence_summary: str = ""

    # Creation timestamp (naive UTC ISO string).
    created_at: str = field(default_factory=lambda: datetime.utcnow().isoformat())

    def to_dict(self) -> dict[str, Any]:
        """Serialize the record, recursively flattening nested objects."""
        data: dict[str, Any] = {"id": self.id}
        data["answer"] = self.answer
        data["question"] = self.question
        data["citations"] = [cite.to_dict() for cite in self.citations]
        data["contradictions"] = [item.to_dict() for item in self.contradictions]
        data["aggregate_confidence"] = self.aggregate_confidence
        data["evidence_summary"] = self.evidence_summary
        data["created_at"] = self.created_at
        return data

    def to_markdown(self) -> str:
        """Render the record as a markdown report for documentation."""
        out: list[str] = []
        out += [
            "# Provenance Record",
            "",
            f"**Question:** {self.question}",
            "",
            f"**Answer:** {self.answer}",
            "",
            f"**Confidence:** {self.aggregate_confidence:.2%}",
            "",
            f"## Citations ({len(self.citations)})",
            "",
        ]

        for idx, cite in enumerate(self.citations, start=1):
            out.append(f"### Citation {idx}")
            out.append(f"- **Document:** {cite.doc_id}")
            out.append(f"- **Section:** {cite.section_header or cite.node_id}")
            if cite.page_num:
                out.append(f"- **Page:** {cite.page_num}")
            out.append(f"- **Strength:** {cite.strength.value}")
            out.append(f"- **Confidence:** {cite.confidence:.2%}")
            out.extend(["", f"> {cite.quote}", ""])

        if self.contradictions:
            out.extend([f"## Contradictions ({len(self.contradictions)})", ""])
            for item in self.contradictions:
                out.append(f"- **Type:** {item.type.value}")
                out.append(f"- **Description:** {item.description}")
                if item.resolution_suggestion:
                    out.append(f"- **Resolution:** {item.resolution_suggestion}")
                out.append("")

        return "\n".join(out)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
# =============================================================================
|
|
250
|
+
# Provenance Tracker
|
|
251
|
+
# =============================================================================
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
class ProvenanceTracker:
    """
    Tracks provenance for answers.

    Extracts citations from navigation results (variable store contents and
    navigation traces) and detects contradictions between the cited sources.
    """

    def __init__(
        self,
        kv_store: Any | None = None,
        skeleton: dict | None = None,
        min_quote_length: int = 20,
        context_window: int = 100,
    ) -> None:
        """
        Initialize the provenance tracker.

        Args:
            kv_store: KV store for retrieving full content.
            skeleton: Skeleton index for node metadata (node_id -> metadata dict).
            min_quote_length: Minimum quote length (characters) to consider.
            context_window: Characters of context captured around quotes.
        """
        self.kv_store = kv_store
        self.skeleton = skeleton or {}
        self.min_quote_length = min_quote_length
        self.context_window = context_window

    def extract_citations(
        self,
        answer: str,
        question: str,
        variables: dict[str, Any],
        trace: list[dict] | None = None,
    ) -> list[Citation]:
        """
        Extract citations from navigation results.

        Args:
            answer: The generated answer.
            question: The original question.
            variables: Variable store contents (contains retrieved content).
            trace: Navigation trace entries.

        Returns:
            List of Citation objects, deduplicated and sorted by descending
            relevance to the answer.
        """
        citations = []

        # Extract from variables (most reliable source)
        for var_name, var_data in variables.items():
            if isinstance(var_data, dict):
                citation = self._extract_citation_from_variable(
                    var_name, var_data, answer
                )
                if citation:
                    citations.append(citation)

        # Extract from trace if available
        if trace:
            trace_citations = self._extract_citations_from_trace(trace, answer)
            citations.extend(trace_citations)

        # Deduplicate (exact-quote duplicates only)
        citations = self._deduplicate_citations(citations)

        # Score relevance to answer
        for citation in citations:
            citation.relevance_score = self._score_relevance(
                citation.quote, answer
            )

        # Sort by relevance, highest first
        citations.sort(key=lambda c: -c.relevance_score)

        logger.info(
            "citations_extracted",
            count=len(citations),
            question=question[:50],
        )

        return citations

    def _extract_citation_from_variable(
        self,
        var_name: str,
        var_data: dict,
        answer: str,
    ) -> Citation | None:
        """Extract a citation from a stored variable.

        Returns None when the variable has no content long enough to quote,
        or when no relevant quote could be found.
        """
        content = var_data.get("content", "")

        if not content or len(content) < self.min_quote_length:
            return None

        # Find the most relevant quote from this content
        quote, span_start, span_end = self._find_best_quote(content, answer)

        if not quote:
            return None

        # Get context around quote (clamped to the start of content)
        context_before = content[max(0, span_start - self.context_window):span_start]
        context_after = content[span_end:span_end + self.context_window]

        # Determine citation strength
        strength = self._determine_strength(quote, answer)

        # Get node metadata; fall back to the variable name as the node id
        node_id = var_data.get("node_id", var_name)
        doc_id = var_data.get("doc_id", "")
        page_num = var_data.get("page_num")

        # Get section header from skeleton, if this node is indexed
        section_header = ""
        if node_id in self.skeleton:
            section_header = self.skeleton[node_id].get("header", "")

        return Citation(
            doc_id=doc_id,
            node_id=node_id,
            page_num=page_num,
            quote=quote,
            span_start=span_start,
            span_end=span_end,
            context_before=context_before,
            context_after=context_after,
            strength=strength,
            # Direct support earns a higher fixed confidence
            confidence=0.8 if strength == CitationStrength.DIRECT else 0.6,
            section_header=section_header,
        )

    def _extract_citations_from_trace(
        self,
        trace: list[dict],
        answer: str,
    ) -> list[Citation]:
        """Extract citations from navigation trace "read_content" entries."""
        citations = []

        for entry in trace:
            if entry.get("action") == "read_content":
                node_id = entry.get("node_id", "")
                content = entry.get("content", "")

                if content and len(content) >= self.min_quote_length:
                    quote, start, end = self._find_best_quote(content, answer)

                    if quote:
                        # Trace entries carry no doc_id/page metadata, so only
                        # node_id and the quote span are recorded here.
                        citations.append(Citation(
                            node_id=node_id,
                            quote=quote,
                            span_start=start,
                            span_end=end,
                            strength=self._determine_strength(quote, answer),
                        ))

        return citations

    def _find_best_quote(
        self,
        content: str,
        answer: str,
    ) -> tuple[str, int, int]:
        """Find the sentence in *content* most similar to *answer*.

        Similarity is plain bag-of-words overlap, normalized by the number
        of words in the answer.

        Returns:
            (quote, span_start, span_end); quote is "" if nothing scored > 0.
        """
        # Split answer into key phrases (lowercase bag of words)
        answer_words = set(answer.lower().split())

        # Find sentences in content (split on terminal punctuation + space)
        sentences = re.split(r'[.!?]\s+', content)

        best_quote = ""
        best_score = 0
        best_start = 0
        best_end = 0

        current_pos = 0
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < self.min_quote_length:
                # +2 approximates the punctuation + whitespace removed by the split
                current_pos += len(sentence) + 2
                continue

            # Score by word overlap with the answer
            sentence_words = set(sentence.lower().split())
            overlap = len(answer_words & sentence_words)
            score = overlap / max(len(answer_words), 1)

            if score > best_score:
                best_score = score
                best_quote = sentence
                # NOTE(review): find() can return -1 when current_pos has
                # drifted past the stripped sentence (the +2 advance is only
                # approximate); callers would then see span_start == -1 —
                # verify against real content.
                best_start = content.find(sentence, current_pos)
                best_end = best_start + len(sentence)

            current_pos += len(sentence) + 2

        return best_quote, best_start, best_end

    def _determine_strength(self, quote: str, answer: str) -> CitationStrength:
        """Classify how strongly a quote supports the answer.

        Thresholds are on the fraction of answer words present in the quote:
        > 0.5 direct, > 0.3 supporting, > 0.1 contextual, else weak.
        """
        quote_lower = quote.lower()
        answer_lower = answer.lower()

        # Check for direct overlap
        answer_words = set(answer_lower.split())
        quote_words = set(quote_lower.split())
        overlap = len(answer_words & quote_words) / max(len(answer_words), 1)

        if overlap > 0.5:
            return CitationStrength.DIRECT
        elif overlap > 0.3:
            return CitationStrength.SUPPORTING
        elif overlap > 0.1:
            return CitationStrength.CONTEXTUAL
        else:
            return CitationStrength.WEAK

    def _score_relevance(self, quote: str, answer: str) -> float:
        """Score relevance of quote to answer as word overlap / answer size."""
        if not quote or not answer:
            return 0.0

        quote_words = set(quote.lower().split())
        answer_words = set(answer.lower().split())

        if not answer_words:
            return 0.0

        overlap = len(quote_words & answer_words)
        return overlap / len(answer_words)

    def _deduplicate_citations(
        self,
        citations: list[Citation],
    ) -> list[Citation]:
        """Remove citations with byte-identical quotes, keeping first seen."""
        seen_quotes = set()
        unique = []

        for citation in citations:
            # Hash the quote (md5 used as a cheap fingerprint, not security)
            quote_hash = hashlib.md5(citation.quote.encode()).hexdigest()[:16]

            if quote_hash not in seen_quotes:
                seen_quotes.add(quote_hash)
                unique.append(citation)

        return unique

    def detect_contradictions(
        self,
        citations: list[Citation],
        llm_fn: Any | None = None,
    ) -> list[Contradiction]:
        """
        Detect contradictions between citations.

        Args:
            citations: List of citations to check.
            llm_fn: Optional LLM function for semantic contradiction detection.

        Returns:
            List of Contradiction objects (heuristic plus optional LLM results).
        """
        contradictions = []

        if len(citations) < 2:
            return contradictions

        # Simple heuristic-based detection over every citation pair
        for i, c1 in enumerate(citations):
            for c2 in citations[i + 1:]:
                contradiction = self._check_contradiction(c1, c2)
                if contradiction:
                    contradictions.append(contradiction)

        # Optional: LLM-based semantic detection
        if llm_fn and len(citations) >= 2:
            semantic_contradictions = self._detect_semantic_contradictions(
                citations, llm_fn
            )
            contradictions.extend(semantic_contradictions)

        return contradictions

    def _check_contradiction(
        self,
        c1: Citation,
        c2: Citation,
    ) -> Contradiction | None:
        """Check for contradiction between two citations using heuristics.

        Two heuristics: (1) one quote contains a negated phrase whose positive
        form appears (un-negated) in the other; (2) the quotes mention
        different sets of numbers.
        """
        q1 = c1.quote.lower()
        q2 = c2.quote.lower()

        # Look for negation patterns: (negated phrase, positive phrase)
        negation_pairs = [
            ("is not", "is"),
            ("was not", "was"),
            ("did not", "did"),
            ("cannot", "can"),
            ("never", "always"),
            ("false", "true"),
            ("incorrect", "correct"),
        ]

        for neg, pos in negation_pairs:
            # Fire only when the negation is exclusive to one side
            if (neg in q1 and pos in q2 and neg not in q2) or \
               (neg in q2 and pos in q1 and neg not in q1):
                return Contradiction(
                    citation_1_id=c1.id,
                    citation_2_id=c2.id,
                    type=ContradictionType.DIRECT,
                    description=f"Potential negation contradiction detected",
                    confidence=0.6,
                )

        # Look for number contradictions (optionally $-prefixed figures)
        nums_1 = re.findall(r'\$?[\d,]+\.?\d*', q1)
        nums_2 = re.findall(r'\$?[\d,]+\.?\d*', q2)

        if nums_1 and nums_2 and nums_1 != nums_2:
            # Could be contradictory numbers — low confidence by design,
            # since differing numbers may simply describe different things
            return Contradiction(
                citation_1_id=c1.id,
                citation_2_id=c2.id,
                type=ContradictionType.PARTIAL,
                description=f"Different numbers mentioned: {nums_1} vs {nums_2}",
                confidence=0.4,
            )

        return None

    def _detect_semantic_contradictions(
        self,
        citations: list[Citation],
        llm_fn: Any,
    ) -> list[Contradiction]:
        """Use an LLM to detect semantic contradictions among top citations.

        Failures (bad JSON, LLM errors) are logged and swallowed: this step
        is best-effort and returns [] rather than raising.
        """
        if len(citations) < 2:
            return []

        # Build prompt with top citations
        top_citations = citations[:5]  # Limit to avoid token overflow

        quotes_text = "\n".join([
            f"[{i+1}] {c.quote[:200]}"
            for i, c in enumerate(top_citations)
        ])

        prompt = f"""Analyze these quotes for contradictions:

{quotes_text}

Do any of these quotes contradict each other? If yes, specify which quotes (by number) and explain the contradiction.

Respond in JSON:
{{
  "contradictions": [
    {{"quote_1": 1, "quote_2": 2, "type": "direct|temporal|partial", "explanation": "..."}}
  ]
}}

If no contradictions, respond: {{"contradictions": []}}"""

        try:
            response = llm_fn(prompt)

            # Parse response: grab the first {...} span in the reply
            json_match = re.search(r'\{[\s\S]*\}', response)
            if not json_match:
                return []

            data = json.loads(json_match.group())

            contradictions = []
            for c in data.get("contradictions", []):
                # LLM reports 1-based quote numbers; convert to indices
                idx1 = c.get("quote_1", 1) - 1
                idx2 = c.get("quote_2", 2) - 1

                if 0 <= idx1 < len(top_citations) and 0 <= idx2 < len(top_citations):
                    contradictions.append(Contradiction(
                        citation_1_id=top_citations[idx1].id,
                        citation_2_id=top_citations[idx2].id,
                        type=ContradictionType(c.get("type", "partial")),
                        description=c.get("explanation", ""),
                        confidence=0.7,
                    ))

            return contradictions

        except Exception as e:
            logger.warning("semantic_contradiction_detection_failed", error=str(e))
            return []

    def create_provenance_record(
        self,
        answer: str,
        question: str,
        variables: dict[str, Any],
        trace: list[dict] | None = None,
        llm_fn: Any | None = None,
    ) -> ProvenanceRecord:
        """
        Create a complete provenance record for an answer.

        Args:
            answer: The generated answer.
            question: The original question.
            variables: Variable store contents.
            trace: Navigation trace.
            llm_fn: Optional LLM for contradiction detection.

        Returns:
            ProvenanceRecord with citations, contradictions, an aggregate
            confidence, and an evidence summary.
        """
        # Extract citations
        citations = self.extract_citations(answer, question, variables, trace)

        # Detect contradictions
        contradictions = self.detect_contradictions(citations, llm_fn)

        # Calculate aggregate confidence
        if citations:
            # Weighted average of confidence, weighted by relevance
            total_weight = sum(c.relevance_score for c in citations)
            if total_weight > 0:
                aggregate = sum(
                    c.confidence * c.relevance_score for c in citations
                ) / total_weight
            else:
                # All relevance scores are zero: fall back to a plain mean
                aggregate = sum(c.confidence for c in citations) / len(citations)

            # Reduce confidence if contradictions found
            if contradictions:
                aggregate *= 0.8
        else:
            aggregate = 0.0

        # Generate evidence summary
        summary = self._generate_evidence_summary(citations, contradictions)

        record = ProvenanceRecord(
            answer=answer,
            question=question,
            citations=citations,
            contradictions=contradictions,
            aggregate_confidence=aggregate,
            evidence_summary=summary,
        )

        logger.info(
            "provenance_record_created",
            citations=len(citations),
            contradictions=len(contradictions),
            confidence=aggregate,
        )

        return record

    def _generate_evidence_summary(
        self,
        citations: list[Citation],
        contradictions: list[Contradiction],
    ) -> str:
        """Generate a one-line human-readable summary of evidence quality."""
        if not citations:
            return "No supporting evidence found."

        direct = sum(1 for c in citations if c.strength == CitationStrength.DIRECT)
        supporting = sum(1 for c in citations if c.strength == CitationStrength.SUPPORTING)

        parts = []
        parts.append(f"Found {len(citations)} citation(s)")

        if direct:
            parts.append(f"{direct} directly supporting")
        if supporting:
            parts.append(f"{supporting} supporting")

        if contradictions:
            parts.append(f"WARNING: {len(contradictions)} contradiction(s) detected")

        return ". ".join(parts) + "."
|
|
737
|
+
|
|
738
|
+
|
|
739
|
+
# =============================================================================
|
|
740
|
+
# Convenience Functions
|
|
741
|
+
# =============================================================================
|
|
742
|
+
|
|
743
|
+
|
|
744
|
+
def create_citation(
    doc_id: str,
    node_id: str,
    quote: str,
    page_num: int | None = None,
    strength: str = "supporting",
) -> Citation:
    """Convenience constructor: build a Citation from minimal fields.

    Args:
        doc_id: Source document identifier.
        node_id: Node within the document.
        quote: Exact quoted text.
        page_num: Optional page number.
        strength: Citation strength name; must be a valid CitationStrength
            value ("direct", "supporting", "contextual", "weak").

    Returns:
        A Citation with the given fields; everything else defaulted.
    """
    kwargs = {
        "doc_id": doc_id,
        "node_id": node_id,
        "quote": quote,
        "page_num": page_num,
        "strength": CitationStrength(strength),
    }
    return Citation(**kwargs)
|
|
759
|
+
|
|
760
|
+
|
|
761
|
+
def format_citations_for_display(citations: list[Citation]) -> str:
    """Render a numbered markdown "Sources" list for end users.

    Each citation is shown via its ``to_formatted_string()`` output,
    separated by blank lines. Returns a placeholder message when the
    list is empty.
    """
    if not citations:
        return "No citations available."

    # Each entry carries its own trailing newline so the joined output
    # matches "header, blank line, entry, blank line, entry, ...".
    entries = (
        f"{index}. {cite.to_formatted_string()}\n"
        for index, cite in enumerate(citations, start=1)
    )
    return "\n".join(["**Sources:**", "", *entries])