rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,602 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Self-Reflection Loop
|
|
3
|
+
|
|
4
|
+
Implements iterative self-correction where the system:
|
|
5
|
+
1. Generates an initial answer
|
|
6
|
+
2. Critiques its own answer
|
|
7
|
+
3. If issues found, re-navigates with critique as context
|
|
8
|
+
4. Repeats until confident or max iterations
|
|
9
|
+
|
|
10
|
+
Based on self-reflection patterns from:
|
|
11
|
+
- Reflexion (Shinn et al.)
|
|
12
|
+
- Self-Refine (Madaan et al.)
|
|
13
|
+
- Constitutional AI principles
|
|
14
|
+
|
|
15
|
+
Key insight: LLMs can often identify problems in their own outputs
|
|
16
|
+
that they couldn't avoid in initial generation.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
import re
|
|
23
|
+
import time
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from datetime import datetime
|
|
26
|
+
from enum import Enum
|
|
27
|
+
from typing import Any, Callable
|
|
28
|
+
|
|
29
|
+
import structlog
|
|
30
|
+
|
|
31
|
+
logger = structlog.get_logger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# =============================================================================
|
|
35
|
+
# Self-Reflection Prompts
|
|
36
|
+
# =============================================================================
|
|
37
|
+
|
|
38
|
+
CRITIQUE_PROMPT = """You are a critical reviewer. Analyze this answer for potential issues.
|
|
39
|
+
|
|
40
|
+
QUESTION: {question}
|
|
41
|
+
|
|
42
|
+
ANSWER: {answer}
|
|
43
|
+
|
|
44
|
+
EVIDENCE USED:
|
|
45
|
+
{evidence}
|
|
46
|
+
|
|
47
|
+
Critically evaluate:
|
|
48
|
+
1. ACCURACY: Does the evidence actually support this answer?
|
|
49
|
+
2. COMPLETENESS: Is anything important missing?
|
|
50
|
+
3. CONTRADICTIONS: Does any evidence contradict the answer?
|
|
51
|
+
4. SPECIFICITY: Is the answer too vague or too specific?
|
|
52
|
+
5. ASSUMPTIONS: Are there unstated assumptions?
|
|
53
|
+
|
|
54
|
+
If you find issues, explain them clearly.
|
|
55
|
+
If the answer is good, say "NO ISSUES FOUND".
|
|
56
|
+
|
|
57
|
+
Respond in JSON:
|
|
58
|
+
{{
|
|
59
|
+
"has_issues": true/false,
|
|
60
|
+
"issues": [
|
|
61
|
+
{{"type": "accuracy|completeness|contradiction|specificity|assumption", "description": "...", "severity": "high|medium|low"}}
|
|
62
|
+
],
|
|
63
|
+
"confidence_in_critique": 0.0-1.0,
|
|
64
|
+
"suggested_improvements": ["..."],
|
|
65
|
+
"should_retry": true/false
|
|
66
|
+
}}"""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
REFINEMENT_PROMPT = """You are improving an answer based on feedback.
|
|
70
|
+
|
|
71
|
+
ORIGINAL QUESTION: {question}
|
|
72
|
+
|
|
73
|
+
PREVIOUS ANSWER: {previous_answer}
|
|
74
|
+
|
|
75
|
+
CRITIQUE/ISSUES FOUND:
|
|
76
|
+
{critique}
|
|
77
|
+
|
|
78
|
+
EVIDENCE AVAILABLE:
|
|
79
|
+
{evidence}
|
|
80
|
+
|
|
81
|
+
Generate an IMPROVED answer that addresses the identified issues.
|
|
82
|
+
Be specific and directly address each criticism.
|
|
83
|
+
|
|
84
|
+
Respond with ONLY the improved answer, no meta-commentary."""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
VERIFICATION_PROMPT = """Compare these two answers and determine which is better.
|
|
88
|
+
|
|
89
|
+
QUESTION: {question}
|
|
90
|
+
|
|
91
|
+
ANSWER A (Original):
|
|
92
|
+
{answer_a}
|
|
93
|
+
|
|
94
|
+
ANSWER B (Refined):
|
|
95
|
+
{answer_b}
|
|
96
|
+
|
|
97
|
+
Which answer is:
|
|
98
|
+
1. More accurate?
|
|
99
|
+
2. More complete?
|
|
100
|
+
3. Better supported by evidence?
|
|
101
|
+
|
|
102
|
+
Respond in JSON:
|
|
103
|
+
{{
|
|
104
|
+
"better_answer": "A" or "B",
|
|
105
|
+
"confidence": 0.0-1.0,
|
|
106
|
+
"reasoning": "..."
|
|
107
|
+
}}"""
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
# =============================================================================
|
|
111
|
+
# Data Models
|
|
112
|
+
# =============================================================================
|
|
113
|
+
|
|
114
|
+
class IssueType(str, Enum):
    """Types of issues that can be identified."""

    # Values mirror the "type" strings requested from the LLM in
    # CRITIQUE_PROMPT; the str mixin lets members compare equal to raw strings.
    ACCURACY = "accuracy"            # evidence does not support the answer
    COMPLETENESS = "completeness"    # something important is missing
    CONTRADICTION = "contradiction"  # evidence contradicts the answer
    SPECIFICITY = "specificity"      # answer too vague or too specific
    ASSUMPTION = "assumption"        # relies on unstated assumptions
    # NOTE(review): "hallucination" is not offered in CRITIQUE_PROMPT's type
    # list — presumably set by other components; confirm against callers.
    HALLUCINATION = "hallucination"
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class IssueSeverity(str, Enum):
    """Severity of identified issues."""

    # Values mirror the "severity" strings requested in CRITIQUE_PROMPT.
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
@dataclass
class Issue:
    """A single problem surfaced while critiquing an answer."""

    type: IssueType
    description: str
    severity: IssueSeverity

    def to_dict(self) -> dict[str, Any]:
        """Return a JSON-friendly dictionary view of this issue."""
        return dict(
            type=self.type.value,
            description=self.description,
            severity=self.severity.value,
        )
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass
class CritiqueResult:
    """Structured outcome of one self-critique pass."""

    has_issues: bool = False
    issues: list[Issue] = field(default_factory=list)
    confidence: float = 0.5
    suggested_improvements: list[str] = field(default_factory=list)
    should_retry: bool = False
    raw_response: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dictionary (raw_response is omitted)."""
        return dict(
            has_issues=self.has_issues,
            issues=[issue.to_dict() for issue in self.issues],
            confidence=self.confidence,
            suggested_improvements=self.suggested_improvements,
            should_retry=self.should_retry,
        )
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@dataclass
class ReflectionIteration:
    """Record of a single pass through the reflection loop."""

    iteration: int
    answer: str
    critique: CritiqueResult | None = None
    improved_answer: str | None = None
    improvement_accepted: bool = False
    duration_ms: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Serialize this iteration record to a plain dictionary."""
        critique_dict = self.critique.to_dict() if self.critique else None
        return dict(
            iteration=self.iteration,
            answer=self.answer,
            critique=critique_dict,
            improved_answer=self.improved_answer,
            improvement_accepted=self.improvement_accepted,
            duration_ms=self.duration_ms,
        )
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
@dataclass
class ReflectionResult:
    """Complete result of a self-reflection run."""

    original_answer: str = ""
    final_answer: str = ""
    question: str = ""

    # Iteration history
    iterations: list[ReflectionIteration] = field(default_factory=list)
    total_iterations: int = 0

    # Outcome
    improved: bool = False
    final_confidence: float = 0.0
    all_issues: list[Issue] = field(default_factory=list)

    # Timing
    total_duration_ms: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Serialize the full reflection history to a plain dictionary."""
        return dict(
            original_answer=self.original_answer,
            final_answer=self.final_answer,
            question=self.question,
            iterations=[step.to_dict() for step in self.iterations],
            total_iterations=self.total_iterations,
            improved=self.improved,
            final_confidence=self.final_confidence,
            all_issues=[issue.to_dict() for issue in self.all_issues],
            total_duration_ms=self.total_duration_ms,
        )
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# =============================================================================
|
|
231
|
+
# Self-Reflection Engine
|
|
232
|
+
# =============================================================================
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class SelfReflectionEngine:
    """
    Implements iterative self-correction for answers.

    Flow:
    1. Take initial answer
    2. Generate critique (what could be wrong?)
    3. If issues found, generate improved answer
    4. Verify improvement is actually better
    5. Repeat until confident or max iterations
    """

    def __init__(
        self,
        llm_fn: Callable[[str], str] | None = None,
        max_iterations: int = 3,
        min_confidence_threshold: float = 0.8,
        accept_improvement_threshold: float = 0.6,
        enable_verification: bool = True,
    ):
        """
        Initialize the self-reflection engine.

        Args:
            llm_fn: LLM function for critique and refinement.
            max_iterations: Maximum reflection iterations.
            min_confidence_threshold: Stop if critique confidence exceeds this.
            accept_improvement_threshold: Accept improvement if the
                verification confidence exceeds this.
            enable_verification: Verify improvements are actually better.
        """
        self.llm_fn = llm_fn
        self.max_iterations = max_iterations
        self.min_confidence_threshold = min_confidence_threshold
        self.accept_improvement_threshold = accept_improvement_threshold
        self.enable_verification = enable_verification

        # Learning: running count of issue types found across all reflections,
        # exposed via get_issue_stats().
        self._issue_stats: dict[str, int] = {}

    def set_llm_function(self, llm_fn: Callable[[str], str]) -> None:
        """Set the LLM function."""
        self.llm_fn = llm_fn

    def reflect(
        self,
        answer: str,
        question: str,
        evidence: str = "",
        navigate_fn: Callable[[str], str] | None = None,
    ) -> ReflectionResult:
        """
        Perform self-reflection on an answer.

        Args:
            answer: The initial answer to reflect on.
            question: The original question.
            evidence: Evidence that was used to generate the answer.
            navigate_fn: Optional function to re-navigate with new context.
                NOTE(review): currently unused by this implementation; kept
                for interface compatibility — confirm against callers.

        Returns:
            ReflectionResult with final answer and iteration history.
        """
        if self.llm_fn is None:
            # Without an LLM there is nothing to critique with; degrade
            # gracefully by returning the answer unchanged.
            logger.warning("no_llm_configured_for_reflection")
            return ReflectionResult(
                original_answer=answer,
                final_answer=answer,
                question=question,
            )

        start_time = time.time()

        result = ReflectionResult(
            original_answer=answer,
            final_answer=answer,
            question=question,
        )

        current_answer = answer

        for iteration in range(self.max_iterations):
            iter_start = time.time()

            logger.info(
                "reflection_iteration_started",
                iteration=iteration + 1,
                max=self.max_iterations,
            )

            # Step 1: Critique the current answer
            critique = self._critique(current_answer, question, evidence)

            iter_result = ReflectionIteration(
                iteration=iteration + 1,
                answer=current_answer,
                critique=critique,
            )

            # Collect issues for statistics
            for issue in critique.issues:
                self._issue_stats[issue.type.value] = \
                    self._issue_stats.get(issue.type.value, 0) + 1
                result.all_issues.append(issue)

            # Check if we should stop: either the critique found nothing,
            # or it found issues but does not recommend retrying.
            if not critique.has_issues or not critique.should_retry:
                logger.info(
                    "reflection_no_issues",
                    iteration=iteration + 1,
                    confidence=critique.confidence,
                )
                iter_result.duration_ms = (time.time() - iter_start) * 1000
                result.iterations.append(iter_result)
                break

            # Step 2: Generate improved answer
            improved_answer = self._refine(
                current_answer, question, critique, evidence
            )

            iter_result.improved_answer = improved_answer

            # Step 3: Verify improvement (optional)
            if self.enable_verification and improved_answer:
                is_better = self._verify_improvement(
                    question, current_answer, improved_answer
                )
                iter_result.improvement_accepted = is_better

                if is_better:
                    current_answer = improved_answer
                    result.improved = True
                    logger.info(
                        "improvement_accepted",
                        iteration=iteration + 1,
                    )
                else:
                    logger.info(
                        "improvement_rejected",
                        iteration=iteration + 1,
                    )
            elif improved_answer:
                # Verification disabled: accept the refinement as-is.
                current_answer = improved_answer
                iter_result.improvement_accepted = True
                result.improved = True

            iter_result.duration_ms = (time.time() - iter_start) * 1000
            result.iterations.append(iter_result)

            # Check confidence threshold
            if critique.confidence >= self.min_confidence_threshold:
                logger.info(
                    "confidence_threshold_reached",
                    confidence=critique.confidence,
                    threshold=self.min_confidence_threshold,
                )
                break

        result.final_answer = current_answer
        result.total_iterations = len(result.iterations)
        result.total_duration_ms = (time.time() - start_time) * 1000

        # Calculate final confidence from the last critique.
        if result.iterations and result.iterations[-1].critique:
            last_critique = result.iterations[-1].critique
            if not last_critique.has_issues:
                result.final_confidence = last_critique.confidence
            else:
                # Penalize 0.1 per unresolved issue, clamped at 0.0.
                # FIX: the previous formula could go negative when a critique
                # reported more than 10 issues.
                result.final_confidence = max(
                    0.0, 1.0 - (len(last_critique.issues) * 0.1)
                )
        else:
            result.final_confidence = 0.7  # Default

        logger.info(
            "reflection_complete",
            iterations=result.total_iterations,
            improved=result.improved,
            final_confidence=result.final_confidence,
            duration_ms=result.total_duration_ms,
        )

        return result

    def _critique(
        self,
        answer: str,
        question: str,
        evidence: str,
    ) -> CritiqueResult:
        """Generate a critique of the answer via the LLM."""
        prompt = CRITIQUE_PROMPT.format(
            question=question,
            answer=answer,
            # Truncate evidence to keep the prompt bounded.
            evidence=evidence[:2000] if evidence else "No specific evidence provided.",
        )

        try:
            response = self.llm_fn(prompt)
            return self._parse_critique(response)

        except Exception as e:
            # Best-effort: a failed critique must not abort reflection, so
            # report "no issues" with neutral confidence and keep the error.
            logger.warning("critique_failed", error=str(e))
            return CritiqueResult(
                has_issues=False,
                confidence=0.5,
                raw_response=str(e),
            )

    def _parse_critique(self, response: str) -> CritiqueResult:
        """Parse an LLM critique response into a CritiqueResult."""
        result = CritiqueResult(raw_response=response)

        # Fast path: the prompt instructs the model to emit this sentinel
        # when the answer is fine.
        if "NO ISSUES FOUND" in response.upper():
            result.has_issues = False
            result.confidence = 0.9
            return result

        # Parse JSON (take the outermost {...} span, tolerating prose
        # around it).
        try:
            json_match = re.search(r'\{[\s\S]*\}', response)
            if not json_match:
                return result

            data = json.loads(json_match.group())

            result.has_issues = data.get("has_issues", False)
            result.confidence = data.get("confidence_in_critique", 0.5)
            result.should_retry = data.get("should_retry", False)
            result.suggested_improvements = data.get("suggested_improvements", [])

            for issue_data in data.get("issues", []):
                try:
                    issue = Issue(
                        type=IssueType(issue_data.get("type", "accuracy")),
                        description=issue_data.get("description", ""),
                        severity=IssueSeverity(issue_data.get("severity", "medium")),
                    )
                    result.issues.append(issue)
                except ValueError:
                    # Unknown type/severity string — skip this issue entry.
                    pass

        except json.JSONDecodeError:
            # If JSON parsing fails, fall back to keyword heuristics.
            if any(word in response.lower() for word in ["issue", "problem", "incorrect", "missing"]):
                result.has_issues = True
                result.should_retry = True

        return result

    def _refine(
        self,
        answer: str,
        question: str,
        critique: CritiqueResult,
        evidence: str,
    ) -> str:
        """Generate an improved answer based on critique; "" on failure."""
        # Format the critique's issues and suggestions for the prompt.
        critique_text = []
        for issue in critique.issues:
            critique_text.append(f"- [{issue.severity.value.upper()}] {issue.type.value}: {issue.description}")

        if critique.suggested_improvements:
            critique_text.append("\nSuggested improvements:")
            for suggestion in critique.suggested_improvements:
                critique_text.append(f"- {suggestion}")

        prompt = REFINEMENT_PROMPT.format(
            question=question,
            previous_answer=answer,
            critique="\n".join(critique_text) if critique_text else "No specific issues identified.",
            evidence=evidence[:2000] if evidence else "Use your knowledge to improve the answer.",
        )

        try:
            response = self.llm_fn(prompt)
            return response.strip()

        except Exception as e:
            logger.warning("refinement_failed", error=str(e))
            return ""

    def _verify_improvement(
        self,
        question: str,
        original: str,
        improved: str,
    ) -> bool:
        """Verify that the improved answer is actually better."""
        if not improved:
            return False

        prompt = VERIFICATION_PROMPT.format(
            question=question,
            answer_a=original,
            answer_b=improved,
        )

        try:
            response = self.llm_fn(prompt)

            # Parse response
            json_match = re.search(r'\{[\s\S]*\}', response)
            if not json_match:
                # Unparseable verdict: default to accepting the improvement.
                return True

            data = json.loads(json_match.group())
            better = data.get("better_answer", "B")
            confidence = data.get("confidence", 0.5)

            # Accept B (improved) only if the judge is confident enough.
            return better == "B" and confidence >= self.accept_improvement_threshold

        except Exception as e:
            logger.warning("verification_failed", error=str(e))
            # Default to accepting improvement
            return True

    def get_issue_stats(self) -> dict[str, int]:
        """Get statistics on issues found across all reflections."""
        return dict(self._issue_stats)
|
|
555
|
+
|
|
556
|
+
|
|
557
|
+
# =============================================================================
|
|
558
|
+
# Convenience Functions
|
|
559
|
+
# =============================================================================
|
|
560
|
+
|
|
561
|
+
|
|
562
|
+
def reflect_on_answer(
    answer: str,
    question: str,
    evidence: str = "",
    llm_fn: Callable[[str], str] | None = None,
    max_iterations: int = 2,
) -> ReflectionResult:
    """
    Perform self-reflection on an answer.

    Simple interface for one-off reflection.

    Args:
        answer: The answer to reflect on.
        question: The original question.
        evidence: Evidence used.
        llm_fn: LLM function (uses default if not provided).
        max_iterations: Maximum iterations.

    Returns:
        ReflectionResult with final answer.
    """
    if llm_fn is None:
        # Fall back to the project's default LLM; if none is available,
        # return the answer untouched.
        try:
            from rnsr.llm import get_llm

            client = get_llm()

            def llm_fn(prompt: str) -> str:
                return str(client.complete(prompt))

        except Exception as exc:
            logger.warning("no_llm_available", error=str(exc))
            return ReflectionResult(
                original_answer=answer,
                final_answer=answer,
                question=question,
            )

    return SelfReflectionEngine(
        llm_fn=llm_fn,
        max_iterations=max_iterations,
    ).reflect(answer, question, evidence)
|