rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/agent/provenance.py
@@ -0,0 +1,772 @@
"""
RNSR Provenance and Citation System

Every answer should trace back to exact document evidence.
Provides structured citations with:
- Exact document location (doc_id, node_id, page_num)
- Exact quote with character spans
- Confidence score per citation
- Contradiction detection when sources disagree

Critical for legal, academic, and enterprise use cases where
answers must be verifiable.
"""

from __future__ import annotations

import hashlib
import json
import re
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any
from uuid import uuid4

import structlog

logger = structlog.get_logger(__name__)


# =============================================================================
# Citation Models
# =============================================================================


class CitationStrength(str, Enum):
    """How strongly a citation supports a claim."""

    DIRECT = "direct"  # Explicitly states the claim
    SUPPORTING = "supporting"  # Implies or supports the claim
    CONTEXTUAL = "contextual"  # Provides background context
    WEAK = "weak"  # Tangentially related


class ContradictionType(str, Enum):
    """Types of contradictions between sources."""

    DIRECT = "direct"  # Sources directly contradict
    TEMPORAL = "temporal"  # Different time periods
    PARTIAL = "partial"  # Partially contradictory
    INTERPRETATION = "interpretation"  # Different interpretations


@dataclass
class Citation:
    """
    A structured citation linking an answer to document evidence.

    Provides complete traceability from answer to source.
    """

    id: str = field(default_factory=lambda: f"cite_{str(uuid4())[:8]}")

    # Document location
    doc_id: str = ""
    node_id: str = ""
    page_num: int | None = None

    # Exact quote
    quote: str = ""
    span_start: int | None = None
    span_end: int | None = None

    # Context around the quote
    context_before: str = ""
    context_after: str = ""

    # Relevance and confidence
    strength: CitationStrength = CitationStrength.SUPPORTING
    confidence: float = 0.5
    relevance_score: float = 0.5

    # Claim this citation supports
    claim: str = ""

    # Metadata
    extracted_at: str = field(default_factory=lambda: datetime.utcnow().isoformat())
    section_header: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary."""
        return {
            "id": self.id,
            "doc_id": self.doc_id,
            "node_id": self.node_id,
            "page_num": self.page_num,
            "quote": self.quote,
            "span_start": self.span_start,
            "span_end": self.span_end,
            "context_before": self.context_before,
            "context_after": self.context_after,
            "strength": self.strength.value,
            "confidence": self.confidence,
            "relevance_score": self.relevance_score,
            "claim": self.claim,
            "section_header": self.section_header,
            "extracted_at": self.extracted_at,
        }

    def to_formatted_string(self, include_context: bool = False) -> str:
        """Format citation for display."""
        parts = []

        if self.doc_id:
            parts.append(f"[{self.doc_id}]")

        if self.section_header:
            parts.append(f"Section: {self.section_header}")

        if self.page_num:
            parts.append(f"Page {self.page_num}")

        location = ", ".join(parts) if parts else "Unknown location"

        quote_display = self.quote
        if len(quote_display) > 200:
            quote_display = quote_display[:200] + "..."

        result = f'{location}: "{quote_display}"'

        if include_context and (self.context_before or self.context_after):
            result += f"\n Context: ...{self.context_before[-50:]}" if self.context_before else ""
            result += f"[QUOTE]{self.context_after[:50]}..." if self.context_after else ""

        return result


@dataclass
class Contradiction:
    """A detected contradiction between citations."""

    id: str = field(default_factory=lambda: f"contra_{str(uuid4())[:8]}")

    citation_1_id: str = ""
    citation_2_id: str = ""

    type: ContradictionType = ContradictionType.PARTIAL

    description: str = ""
    resolution_suggestion: str = ""

    confidence: float = 0.5

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary."""
        return {
            "id": self.id,
            "citation_1_id": self.citation_1_id,
            "citation_2_id": self.citation_2_id,
            "type": self.type.value,
            "description": self.description,
            "resolution_suggestion": self.resolution_suggestion,
            "confidence": self.confidence,
        }


@dataclass
class ProvenanceRecord:
    """
    Complete provenance for an answer.

    Links an answer to all supporting citations and any contradictions.
    """

    id: str = field(default_factory=lambda: f"prov_{str(uuid4())[:8]}")

    # The answer being traced
    answer: str = ""
    question: str = ""

    # All citations supporting this answer
    citations: list[Citation] = field(default_factory=list)

    # Detected contradictions
    contradictions: list[Contradiction] = field(default_factory=list)

    # Overall confidence based on citations
    aggregate_confidence: float = 0.0

    # Summary
    evidence_summary: str = ""

    # Metadata
    created_at: str = field(default_factory=lambda: datetime.utcnow().isoformat())

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary."""
        return {
            "id": self.id,
            "answer": self.answer,
            "question": self.question,
            "citations": [c.to_dict() for c in self.citations],
            "contradictions": [c.to_dict() for c in self.contradictions],
            "aggregate_confidence": self.aggregate_confidence,
            "evidence_summary": self.evidence_summary,
            "created_at": self.created_at,
        }

    def to_markdown(self) -> str:
        """Export as markdown for documentation."""
        lines = [
            f"# Provenance Record",
            f"",
            f"**Question:** {self.question}",
            f"",
            f"**Answer:** {self.answer}",
            f"",
            f"**Confidence:** {self.aggregate_confidence:.2%}",
            f"",
            f"## Citations ({len(self.citations)})",
            f"",
        ]

        for i, citation in enumerate(self.citations, 1):
            lines.append(f"### Citation {i}")
            lines.append(f"- **Document:** {citation.doc_id}")
            lines.append(f"- **Section:** {citation.section_header or citation.node_id}")
            if citation.page_num:
                lines.append(f"- **Page:** {citation.page_num}")
            lines.append(f"- **Strength:** {citation.strength.value}")
            lines.append(f"- **Confidence:** {citation.confidence:.2%}")
            lines.append(f"")
            lines.append(f"> {citation.quote}")
            lines.append(f"")

        if self.contradictions:
            lines.append(f"## Contradictions ({len(self.contradictions)})")
            lines.append(f"")
            for contra in self.contradictions:
                lines.append(f"- **Type:** {contra.type.value}")
                lines.append(f"- **Description:** {contra.description}")
                if contra.resolution_suggestion:
                    lines.append(f"- **Resolution:** {contra.resolution_suggestion}")
                lines.append(f"")

        return "\n".join(lines)


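# Illustrative sketch (not part of the packaged module): the dataclasses above
# can be assembled directly; every field value below is hypothetical.
#
#     citation = Citation(
#         doc_id="contract_2024",
#         node_id="sec_9_2",
#         page_num=12,
#         quote="The agreement terminates on 31 December 2025.",
#         strength=CitationStrength.DIRECT,
#         confidence=0.9,
#     )
#     record = ProvenanceRecord(
#         question="When does the agreement terminate?",
#         answer="The agreement terminates on 31 December 2025.",
#         citations=[citation],
#         aggregate_confidence=citation.confidence,
#     )
#     print(record.to_markdown())
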
# =============================================================================
# Provenance Tracker
# =============================================================================


class ProvenanceTracker:
    """
    Tracks provenance for answers.

    Extracts citations from navigation results and detects contradictions.
    """

    def __init__(
        self,
        kv_store: Any | None = None,
        skeleton: dict | None = None,
        min_quote_length: int = 20,
        context_window: int = 100,
    ):
        """
        Initialize the provenance tracker.

        Args:
            kv_store: KV store for retrieving full content.
            skeleton: Skeleton index for node metadata.
            min_quote_length: Minimum quote length to consider.
            context_window: Characters of context around quotes.
        """
        self.kv_store = kv_store
        self.skeleton = skeleton or {}
        self.min_quote_length = min_quote_length
        self.context_window = context_window

    def extract_citations(
        self,
        answer: str,
        question: str,
        variables: dict[str, Any],
        trace: list[dict] | None = None,
    ) -> list[Citation]:
        """
        Extract citations from navigation results.

        Args:
            answer: The generated answer.
            question: The original question.
            variables: Variable store contents (contains retrieved content).
            trace: Navigation trace entries.

        Returns:
            List of Citation objects.
        """
        citations = []

        # Extract from variables (most reliable source)
        for var_name, var_data in variables.items():
            if isinstance(var_data, dict):
                citation = self._extract_citation_from_variable(
                    var_name, var_data, answer
                )
                if citation:
                    citations.append(citation)

        # Extract from trace if available
        if trace:
            trace_citations = self._extract_citations_from_trace(trace, answer)
            citations.extend(trace_citations)

        # Deduplicate
        citations = self._deduplicate_citations(citations)

        # Score relevance to answer
        for citation in citations:
            citation.relevance_score = self._score_relevance(
                citation.quote, answer
            )

        # Sort by relevance
        citations.sort(key=lambda c: -c.relevance_score)

        logger.info(
            "citations_extracted",
            count=len(citations),
            question=question[:50],
        )

        return citations

    def _extract_citation_from_variable(
        self,
        var_name: str,
        var_data: dict,
        answer: str,
    ) -> Citation | None:
        """Extract citation from a stored variable."""
        content = var_data.get("content", "")

        if not content or len(content) < self.min_quote_length:
            return None

        # Find the most relevant quote from this content
        quote, span_start, span_end = self._find_best_quote(content, answer)

        if not quote:
            return None

        # Get context around quote
        context_before = content[max(0, span_start - self.context_window):span_start]
        context_after = content[span_end:span_end + self.context_window]

        # Determine citation strength
        strength = self._determine_strength(quote, answer)

        # Get node metadata
        node_id = var_data.get("node_id", var_name)
        doc_id = var_data.get("doc_id", "")
        page_num = var_data.get("page_num")

        # Get section header from skeleton
        section_header = ""
        if node_id in self.skeleton:
            section_header = self.skeleton[node_id].get("header", "")

        return Citation(
            doc_id=doc_id,
            node_id=node_id,
            page_num=page_num,
            quote=quote,
            span_start=span_start,
            span_end=span_end,
            context_before=context_before,
            context_after=context_after,
            strength=strength,
            confidence=0.8 if strength == CitationStrength.DIRECT else 0.6,
            section_header=section_header,
        )

    def _extract_citations_from_trace(
        self,
        trace: list[dict],
        answer: str,
    ) -> list[Citation]:
        """Extract citations from navigation trace."""
        citations = []

        for entry in trace:
            if entry.get("action") == "read_content":
                node_id = entry.get("node_id", "")
                content = entry.get("content", "")

                if content and len(content) >= self.min_quote_length:
                    quote, start, end = self._find_best_quote(content, answer)

                    if quote:
                        citations.append(Citation(
                            node_id=node_id,
                            quote=quote,
                            span_start=start,
                            span_end=end,
                            strength=self._determine_strength(quote, answer),
                        ))

        return citations

    def _find_best_quote(
        self,
        content: str,
        answer: str,
    ) -> tuple[str, int, int]:
        """Find the most relevant quote from content."""
        # Split answer into key phrases
        answer_words = set(answer.lower().split())

        # Find sentences in content
        sentences = re.split(r'[.!?]\s+', content)

        best_quote = ""
        best_score = 0
        best_start = 0
        best_end = 0

        current_pos = 0
        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < self.min_quote_length:
                current_pos += len(sentence) + 2
                continue

            # Score by word overlap
            sentence_words = set(sentence.lower().split())
            overlap = len(answer_words & sentence_words)
            score = overlap / max(len(answer_words), 1)

            if score > best_score:
                best_score = score
                best_quote = sentence
                best_start = content.find(sentence, current_pos)
                best_end = best_start + len(sentence)

            current_pos += len(sentence) + 2

        return best_quote, best_start, best_end

    def _determine_strength(self, quote: str, answer: str) -> CitationStrength:
        """Determine how strongly a quote supports the answer."""
        quote_lower = quote.lower()
        answer_lower = answer.lower()

        # Check for direct overlap
        answer_words = set(answer_lower.split())
        quote_words = set(quote_lower.split())
        overlap = len(answer_words & quote_words) / max(len(answer_words), 1)

        if overlap > 0.5:
            return CitationStrength.DIRECT
        elif overlap > 0.3:
            return CitationStrength.SUPPORTING
        elif overlap > 0.1:
            return CitationStrength.CONTEXTUAL
        else:
            return CitationStrength.WEAK

    def _score_relevance(self, quote: str, answer: str) -> float:
        """Score relevance of quote to answer."""
        if not quote or not answer:
            return 0.0

        quote_words = set(quote.lower().split())
        answer_words = set(answer.lower().split())

        if not answer_words:
            return 0.0

        overlap = len(quote_words & answer_words)
        return overlap / len(answer_words)

    def _deduplicate_citations(
        self,
        citations: list[Citation],
    ) -> list[Citation]:
        """Remove duplicate citations."""
        seen_quotes = set()
        unique = []

        for citation in citations:
            # Hash the quote
            quote_hash = hashlib.md5(citation.quote.encode()).hexdigest()[:16]

            if quote_hash not in seen_quotes:
                seen_quotes.add(quote_hash)
                unique.append(citation)

        return unique

    def detect_contradictions(
        self,
        citations: list[Citation],
        llm_fn: Any | None = None,
    ) -> list[Contradiction]:
        """
        Detect contradictions between citations.

        Args:
            citations: List of citations to check.
            llm_fn: Optional LLM function for semantic contradiction detection.

        Returns:
            List of Contradiction objects.
        """
        contradictions = []

        if len(citations) < 2:
            return contradictions

        # Simple heuristic-based detection
        for i, c1 in enumerate(citations):
            for c2 in citations[i + 1:]:
                contradiction = self._check_contradiction(c1, c2)
                if contradiction:
                    contradictions.append(contradiction)

        # Optional: LLM-based semantic detection
        if llm_fn and len(citations) >= 2:
            semantic_contradictions = self._detect_semantic_contradictions(
                citations, llm_fn
            )
            contradictions.extend(semantic_contradictions)

        return contradictions

    def _check_contradiction(
        self,
        c1: Citation,
        c2: Citation,
    ) -> Contradiction | None:
        """Check for contradiction between two citations using heuristics."""
        q1 = c1.quote.lower()
        q2 = c2.quote.lower()

        # Look for negation patterns
        negation_pairs = [
            ("is not", "is"),
            ("was not", "was"),
            ("did not", "did"),
            ("cannot", "can"),
            ("never", "always"),
            ("false", "true"),
            ("incorrect", "correct"),
        ]

        for neg, pos in negation_pairs:
            if (neg in q1 and pos in q2 and neg not in q2) or \
               (neg in q2 and pos in q1 and neg not in q1):
                return Contradiction(
                    citation_1_id=c1.id,
                    citation_2_id=c2.id,
                    type=ContradictionType.DIRECT,
                    description=f"Potential negation contradiction detected",
                    confidence=0.6,
                )

        # Look for number contradictions
        nums_1 = re.findall(r'\$?[\d,]+\.?\d*', q1)
        nums_2 = re.findall(r'\$?[\d,]+\.?\d*', q2)

        if nums_1 and nums_2 and nums_1 != nums_2:
            # Could be contradictory numbers
            return Contradiction(
                citation_1_id=c1.id,
                citation_2_id=c2.id,
                type=ContradictionType.PARTIAL,
                description=f"Different numbers mentioned: {nums_1} vs {nums_2}",
                confidence=0.4,
            )

        return None

    def _detect_semantic_contradictions(
        self,
        citations: list[Citation],
        llm_fn: Any,
    ) -> list[Contradiction]:
        """Use LLM to detect semantic contradictions."""
        if len(citations) < 2:
            return []

        # Build prompt with top citations
        top_citations = citations[:5]  # Limit to avoid token overflow

        quotes_text = "\n".join([
            f"[{i+1}] {c.quote[:200]}"
            for i, c in enumerate(top_citations)
        ])

        prompt = f"""Analyze these quotes for contradictions:

{quotes_text}

Do any of these quotes contradict each other? If yes, specify which quotes (by number) and explain the contradiction.

Respond in JSON:
{{
  "contradictions": [
    {{"quote_1": 1, "quote_2": 2, "type": "direct|temporal|partial", "explanation": "..."}}
  ]
}}

If no contradictions, respond: {{"contradictions": []}}"""

        try:
            response = llm_fn(prompt)

            # Parse response
            json_match = re.search(r'\{[\s\S]*\}', response)
            if not json_match:
                return []

            data = json.loads(json_match.group())

            contradictions = []
            for c in data.get("contradictions", []):
                idx1 = c.get("quote_1", 1) - 1
                idx2 = c.get("quote_2", 2) - 1

                if 0 <= idx1 < len(top_citations) and 0 <= idx2 < len(top_citations):
                    contradictions.append(Contradiction(
                        citation_1_id=top_citations[idx1].id,
                        citation_2_id=top_citations[idx2].id,
                        type=ContradictionType(c.get("type", "partial")),
                        description=c.get("explanation", ""),
                        confidence=0.7,
                    ))

            return contradictions

        except Exception as e:
            logger.warning("semantic_contradiction_detection_failed", error=str(e))
            return []

    def create_provenance_record(
        self,
        answer: str,
        question: str,
        variables: dict[str, Any],
        trace: list[dict] | None = None,
        llm_fn: Any | None = None,
    ) -> ProvenanceRecord:
        """
        Create a complete provenance record for an answer.

        Args:
            answer: The generated answer.
            question: The original question.
            variables: Variable store contents.
            trace: Navigation trace.
            llm_fn: Optional LLM for contradiction detection.

        Returns:
            ProvenanceRecord with citations and contradictions.
        """
        # Extract citations
        citations = self.extract_citations(answer, question, variables, trace)

        # Detect contradictions
        contradictions = self.detect_contradictions(citations, llm_fn)

        # Calculate aggregate confidence
        if citations:
            # Weighted average by relevance
            total_weight = sum(c.relevance_score for c in citations)
            if total_weight > 0:
                aggregate = sum(
                    c.confidence * c.relevance_score for c in citations
                ) / total_weight
            else:
                aggregate = sum(c.confidence for c in citations) / len(citations)

            # Reduce confidence if contradictions found
            if contradictions:
                aggregate *= 0.8
        else:
            aggregate = 0.0

        # Generate evidence summary
        summary = self._generate_evidence_summary(citations, contradictions)

        record = ProvenanceRecord(
            answer=answer,
            question=question,
            citations=citations,
            contradictions=contradictions,
            aggregate_confidence=aggregate,
            evidence_summary=summary,
        )

        logger.info(
            "provenance_record_created",
            citations=len(citations),
            contradictions=len(contradictions),
            confidence=aggregate,
        )

        return record

    def _generate_evidence_summary(
        self,
        citations: list[Citation],
        contradictions: list[Contradiction],
    ) -> str:
        """Generate a summary of evidence quality."""
        if not citations:
            return "No supporting evidence found."

        direct = sum(1 for c in citations if c.strength == CitationStrength.DIRECT)
        supporting = sum(1 for c in citations if c.strength == CitationStrength.SUPPORTING)

        parts = []
        parts.append(f"Found {len(citations)} citation(s)")

        if direct:
            parts.append(f"{direct} directly supporting")
        if supporting:
            parts.append(f"{supporting} supporting")

        if contradictions:
            parts.append(f"WARNING: {len(contradictions)} contradiction(s) detected")

        return ". ".join(parts) + "."


# =============================================================================
# Convenience Functions
# =============================================================================


def create_citation(
    doc_id: str,
    node_id: str,
    quote: str,
    page_num: int | None = None,
    strength: str = "supporting",
) -> Citation:
    """Create a citation with minimal parameters."""
    return Citation(
        doc_id=doc_id,
        node_id=node_id,
        quote=quote,
        page_num=page_num,
        strength=CitationStrength(strength),
    )


def format_citations_for_display(citations: list[Citation]) -> str:
    """Format citations for user display."""
    if not citations:
        return "No citations available."

    lines = ["**Sources:**", ""]

    for i, citation in enumerate(citations, 1):
        lines.append(f"{i}. {citation.to_formatted_string()}")
        lines.append("")

    return "\n".join(lines)
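The module is driven end to end through ProvenanceTracker.create_provenance_record. Below is a minimal usage sketch based only on the signatures above: the import path is inferred from the wheel's file layout, and the variable payload (document ID, node ID, page number, quoted content) is invented for illustration; llm_fn is left as None, so only the heuristic contradiction checks run.

from rnsr.agent.provenance import ProvenanceTracker, format_citations_for_display

# Hypothetical navigator output: each variable holds retrieved content plus
# location metadata, the shape _extract_citation_from_variable expects.
variables = {
    "v1": {
        "content": "Net revenue for fiscal 2023 was $4.2 billion, up 8% year over year.",
        "node_id": "sec_2_1",
        "doc_id": "acme_10k_2023",
        "page_num": 41,
    },
}

tracker = ProvenanceTracker(skeleton={"sec_2_1": {"header": "Results of Operations"}})
record = tracker.create_provenance_record(
    answer="Net revenue in fiscal 2023 was $4.2 billion.",
    question="What was net revenue in fiscal 2023?",
    variables=variables,
    trace=None,   # no navigation trace in this sketch
    llm_fn=None,  # heuristic contradiction checks only
)

print(record.aggregate_confidence)           # relevance-weighted citation confidence
print(record.to_markdown())                  # markdown provenance report
print(format_citations_for_display(record.citations))

Passing a callable as llm_fn would additionally route the top citations through the JSON contradiction prompt used by _detect_semantic_contradictions.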