rnsr 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RNSR Relationship Validator (ToT Pattern)
|
|
3
|
+
|
|
4
|
+
Validates relationship candidates using Tree of Thoughts reasoning.
|
|
5
|
+
Same pattern as entity validation:
|
|
6
|
+
|
|
7
|
+
1. Pattern extraction provides grounded candidates
|
|
8
|
+
2. ToT evaluates each with probability + reasoning
|
|
9
|
+
3. Navigate for context if uncertain
|
|
10
|
+
4. Prevents hallucinated relationships
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import re
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import Any, TYPE_CHECKING
|
|
19
|
+
|
|
20
|
+
import structlog
|
|
21
|
+
|
|
22
|
+
from rnsr.extraction.models import Entity, Relationship, RelationType
|
|
23
|
+
from rnsr.extraction.relationship_patterns import RelationshipCandidate
|
|
24
|
+
from rnsr.llm import get_llm
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from rnsr.models import DocumentTree
|
|
28
|
+
|
|
29
|
+
logger = structlog.get_logger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Prompt template for Tree of Thoughts relationship validation.
#
# Rendered with str.format() using four placeholders: section_header,
# section_content, entities_formatted and candidates_formatted. The braces of
# the JSON example are doubled ({{ }}) so that format() emits them literally.
# The response parser in this module reads the "evaluations",
# "selected_relationships" and "needs_more_context" keys of the reply.
TOT_RELATIONSHIP_VALIDATION_PROMPT = """You are validating relationship candidates extracted from a document.

Current Section: {section_header}
Section Content:
---
{section_content}
---

Known Entities:
{entities_formatted}

Relationship Candidates to Validate:
{candidates_formatted}

EVALUATION TASK:
For each relationship candidate, determine if it represents a real, meaningful relationship.

RELATIONSHIP TYPES:
- MENTIONS: Section/document mentions an entity
- TEMPORAL_BEFORE: X occurred before Y
- TEMPORAL_AFTER: X occurred after Y
- CAUSAL: X caused/led to Y
- SUPPORTS: X supports claim in Y
- CONTRADICTS: X contradicts Y
- AFFILIATED_WITH: Person affiliated with organization
- PARTY_TO: Entity is party to document/case
- REFERENCES: References another document/section
- SUPERSEDES: X supersedes/replaces Y
- AMENDS: X amends/modifies Y

For each candidate, provide:
1. valid: true if this is a real, meaningful relationship
2. probability: 0.0-1.0 confidence score
3. relationship_type: Corrected type if pattern was wrong
4. reasoning: Brief explanation

OUTPUT FORMAT (JSON):
{{
"evaluations": [
{{
"candidate_id": 0,
"valid": true,
"probability": 0.85,
"relationship_type": "AFFILIATED_WITH",
"reasoning": "Evidence clearly shows John Smith is CEO of Acme Corp"
}},
{{
"candidate_id": 1,
"valid": false,
"probability": 0.2,
"reasoning": "Co-occurrence does not indicate actual relationship"
}}
],
"selected_relationships": [0],
"needs_more_context": []
}}

Rules:
- Only validate relationships with clear evidence in the text
- Co-occurrence alone is NOT sufficient - need explicit connection
- Set valid=false for weak or ambiguous connections
- Be conservative - uncertain relationships should be rejected

Respond ONLY with the JSON, no other text."""
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass
class RelationshipValidationResult:
    """Verdict recorded for a single relationship candidate."""

    # Index of the candidate this verdict belongs to.
    candidate_id: int
    # Confidence attached to the verdict.
    probability: float
    # True when the candidate was judged to be a real relationship.
    is_valid: bool
    # Relationship type reported with the verdict; None when none was given.
    relationship_type: str | None = None
    # Short explanation accompanying the verdict.
    reasoning: str = ""
    # NOTE(review): never set to True in this module - presumably reserved
    # for navigation-assisted validation; confirm against callers.
    used_navigation: bool = False
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@dataclass
class RelationshipBatchResult:
    """Aggregated outcome of validating a group of relationship candidates."""

    # One verdict per evaluated candidate.
    evaluations: list[RelationshipValidationResult] = field(default_factory=list)
    # Candidate indices chosen to become relationships.
    selected_relationships: list[int] = field(default_factory=list)
    # Candidate indices reported as needing additional context.
    needs_more_context: list[int] = field(default_factory=list)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class RelationshipValidator:
    """
    Tree of Thoughts relationship validator.

    Sends pattern-extracted relationship candidates to an LLM in batches and
    collects, for each candidate, a validity verdict, a probability and a
    short reasoning string. When no LLM is available, or the LLM call fails,
    candidates are judged by their pattern confidence alone.

    NOTE(review): ``selection_threshold`` and ``rejection_threshold`` are
    stored on the instance but are not applied anywhere in this class;
    selection follows the ``selected_relationships`` list returned by the
    model. Confirm the intended semantics before wiring them in.
    """

    # Minimum pattern confidence accepted by the no-LLM fallback.
    _FALLBACK_MIN_CONFIDENCE = 0.7

    def __init__(
        self,
        llm: Any | None = None,
        selection_threshold: float = 0.6,
        rejection_threshold: float = 0.3,
        max_candidates_per_batch: int = 15,
    ):
        """
        Initialize the relationship validator.

        Args:
            llm: LLM instance. When None, one is obtained lazily via
                ``get_llm()`` on first use.
            selection_threshold: Probability threshold for accepting.
            rejection_threshold: Probability threshold for rejecting.
            max_candidates_per_batch: Max candidates per LLM call.
        """
        self.llm = llm
        self.selection_threshold = selection_threshold
        self.rejection_threshold = rejection_threshold
        self.max_candidates_per_batch = max_candidates_per_batch

        self._llm_initialized = False

    def _get_llm(self) -> Any:
        """Return the LLM, creating it with ``get_llm()`` on first use."""
        if self.llm is None and not self._llm_initialized:
            self.llm = get_llm()
            # Set only after a successful call so a failed creation is retried.
            self._llm_initialized = True
        return self.llm

    def validate_candidates(
        self,
        candidates: list[RelationshipCandidate],
        entities: list[Entity],
        section_header: str,
        section_content: str,
    ) -> RelationshipBatchResult:
        """
        Validate relationship candidates using ToT reasoning.

        Args:
            candidates: Pre-extracted relationship candidates.
            entities: Known entities in the section.
            section_header: Section header for context.
            section_content: Section content.

        Returns:
            RelationshipBatchResult whose indices refer to positions in
            ``candidates``. Empty when ``candidates`` is empty.
        """
        if not candidates:
            return RelationshipBatchResult()

        if self._get_llm() is None:
            return self._accept_high_confidence(candidates)

        # A batch size of 0 makes range() raise and a negative one silently
        # skips every candidate, so fall back to one candidate per call.
        batch_size = max(1, self.max_candidates_per_batch)

        combined = RelationshipBatchResult()
        for batch_offset in range(0, len(candidates), batch_size):
            batch_result = self._validate_batch(
                candidates=candidates[batch_offset:batch_offset + batch_size],
                batch_offset=batch_offset,
                entities=entities,
                section_header=section_header,
                section_content=section_content,
            )
            combined.evaluations.extend(batch_result.evaluations)
            combined.selected_relationships.extend(batch_result.selected_relationships)
            combined.needs_more_context.extend(batch_result.needs_more_context)

        return combined

    def _validate_batch(
        self,
        candidates: list[RelationshipCandidate],
        batch_offset: int,
        entities: list[Entity],
        section_header: str,
        section_content: str,
    ) -> RelationshipBatchResult:
        """
        Validate one batch of candidates with a single LLM call.

        Args:
            candidates: The candidates belonging to this batch.
            batch_offset: Index of the first batch candidate in the full list;
                candidates are numbered from this value in the prompt.
            entities: Known entities (only the first 20 are sent).
            section_header: Section header for context.
            section_content: Section content (only the first 2000 characters
                are sent).

        Returns:
            The parsed LLM verdicts, or the pattern-confidence fallback when
            no LLM is available or the call/parsing raises.
        """
        llm = self._get_llm()
        if llm is None:
            return self._accept_high_confidence(candidates, offset=batch_offset)

        if entities:
            entities_formatted = "\n".join(
                f"- [{e.id}] {e.canonical_name} ({e.type.value})"
                for e in entities[:20]  # Limit prompt size
            )
        else:
            entities_formatted = "(no entities)"

        candidates_formatted = "\n".join(
            f"[{i + batch_offset}] {c.source_text} --[{c.relationship_type}]--> {c.target_text}\n"
            f" Evidence: \"{c.evidence[:100]}...\"\n"
            f" Pattern: {c.pattern_name}, Confidence: {c.confidence:.2f}"
            for i, c in enumerate(candidates)
        )

        prompt = TOT_RELATIONSHIP_VALIDATION_PROMPT.format(
            section_header=section_header,
            section_content=section_content[:2000],
            entities_formatted=entities_formatted,
            candidates_formatted=candidates_formatted,
        )

        try:
            response = llm.complete(prompt)
            response_text = response if isinstance(response, str) else str(response)
            return self._parse_validation_response(
                response_text, len(candidates), batch_offset
            )
        except Exception as e:
            # Best-effort boundary: any LLM or parsing failure degrades to the
            # pattern-confidence fallback instead of aborting extraction.
            logger.warning("relationship_validation_failed", error=str(e))
            return self._accept_high_confidence(candidates, offset=batch_offset)

    @staticmethod
    def _extract_json_object(text: str) -> dict | None:
        """Return the first JSON object embedded in ``text``, or None."""
        decoder = json.JSONDecoder()
        start = text.find("{")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed
            # Not a decodable object here; try the next opening brace so that
            # leading or trailing chatter containing braces is tolerated.
            start = text.find("{", start + 1)
        return None

    @staticmethod
    def _coerce_index(value: Any, lower: int, upper: int) -> int | None:
        """Return ``value`` as an int if it lies in ``[lower, upper)``, else None."""
        if isinstance(value, bool):
            # bool is a subclass of int but is never a valid candidate index.
            return None
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        elif isinstance(value, str):
            try:
                value = int(value.strip())
            except ValueError:
                return None
        if not isinstance(value, int):
            return None
        return value if lower <= value < upper else None

    @staticmethod
    def _coerce_bool(value: Any) -> bool:
        """Interpret a JSON ``valid`` field; anything ambiguous counts as False."""
        if isinstance(value, bool):
            return value
        if isinstance(value, str):
            # bool("false") would be True, so compare the text explicitly.
            return value.strip().lower() == "true"
        return False

    @staticmethod
    def _collect_indices(raw: Any, lower: int, upper: int) -> list[int]:
        """Return the in-range indices from ``raw``, de-duplicated in order."""
        if not isinstance(raw, list):
            return []
        indices: list[int] = []
        for item in raw:
            idx = RelationshipValidator._coerce_index(item, lower, upper)
            if idx is not None and idx not in indices:
                indices.append(idx)
        return indices

    def _parse_validation_response(
        self,
        response_text: str,
        candidate_count: int,
        batch_offset: int,
    ) -> RelationshipBatchResult:
        """
        Parse the LLM's JSON reply into a RelationshipBatchResult.

        Args:
            response_text: Raw text returned by the LLM.
            candidate_count: Number of candidates in the batch.
            batch_offset: Index of the first batch candidate in the full list.

        Returns:
            The parsed result. Entries whose candidate id falls outside
            ``[batch_offset, batch_offset + candidate_count)`` are dropped so
            that one batch cannot overwrite another batch's verdicts. An
            empty result is returned when no JSON object can be decoded.
        """
        result = RelationshipBatchResult()

        data = self._extract_json_object(response_text)
        if data is None:
            logger.warning("relationship_validation_unparseable_response")
            return result

        lower = batch_offset
        upper = batch_offset + candidate_count

        raw_evaluations = data.get("evaluations", [])
        if not isinstance(raw_evaluations, list):
            raw_evaluations = []

        for eval_data in raw_evaluations:
            if not isinstance(eval_data, dict):
                continue

            candidate_id = self._coerce_index(
                eval_data.get("candidate_id"), lower, upper
            )
            if candidate_id is None:
                # Missing or out-of-batch id: cannot be attributed safely.
                continue

            try:
                probability = float(eval_data.get("probability", 0.5))
            except (TypeError, ValueError):
                continue
            # Clamp to the documented 0.0-1.0 range (NaN collapses to 0.0).
            probability = min(1.0, max(0.0, probability))

            relationship_type = eval_data.get("relationship_type")
            if not isinstance(relationship_type, str) or not relationship_type.strip():
                relationship_type = None

            reasoning = eval_data.get("reasoning", "")
            if not isinstance(reasoning, str):
                reasoning = "" if reasoning is None else str(reasoning)

            result.evaluations.append(RelationshipValidationResult(
                candidate_id=candidate_id,
                probability=probability,
                is_valid=self._coerce_bool(eval_data.get("valid", False)),
                relationship_type=relationship_type,
                reasoning=reasoning,
            ))

        result.selected_relationships = self._collect_indices(
            data.get("selected_relationships", []), lower, upper
        )
        result.needs_more_context = self._collect_indices(
            data.get("needs_more_context", []), lower, upper
        )

        return result

    def _accept_high_confidence(
        self,
        candidates: list[RelationshipCandidate],
        offset: int = 0,
    ) -> RelationshipBatchResult:
        """
        Accept only high confidence candidates (fallback without an LLM).

        Args:
            candidates: Candidates to judge by pattern confidence.
            offset: Added to each position to form the candidate id.

        Returns:
            A result with one evaluation per candidate; those with
            ``confidence >= 0.7`` are marked valid and selected.
        """
        result = RelationshipBatchResult()

        for idx, candidate in enumerate(candidates, start=offset):
            is_valid = candidate.confidence >= self._FALLBACK_MIN_CONFIDENCE

            result.evaluations.append(RelationshipValidationResult(
                candidate_id=idx,
                probability=candidate.confidence,
                is_valid=is_valid,
                relationship_type=candidate.relationship_type,
                reasoning=f"Pattern: {candidate.pattern_name}" if is_valid else "Low confidence",
            ))

            if is_valid:
                result.selected_relationships.append(idx)

        return result

    def candidates_to_relationships(
        self,
        candidates: list[RelationshipCandidate],
        validation_result: RelationshipBatchResult,
        node_id: str,
        doc_id: str,
    ) -> list[Relationship]:
        """
        Convert validated candidates to Relationship objects.

        Args:
            candidates: The full candidate list that was validated.
            validation_result: Verdicts whose indices refer to ``candidates``.
            node_id: Node the candidates were extracted from; used as the
                source when a candidate has no source entity id.
            doc_id: Document id stored on each relationship.

        Returns:
            One Relationship per selected index that is in range, has an
            evaluation and is marked valid. Each index is used at most once.
        """
        relationships = []

        eval_by_id = {e.candidate_id: e for e in validation_result.evaluations}
        seen: set[int] = set()

        for idx in validation_result.selected_relationships:
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            # Reject negative indices too: candidates[-1] would silently
            # resolve to the wrong candidate.
            if not 0 <= idx < len(candidates):
                continue
            if idx in seen:
                continue
            seen.add(idx)

            candidate = candidates[idx]
            evaluation = eval_by_id.get(idx)

            if not evaluation or not evaluation.is_valid:
                continue

            # Prefer the type corrected by the validator over the pattern's.
            rel_type_str = evaluation.relationship_type or candidate.relationship_type
            rel_type = self._map_relationship_type(rel_type_str)

            # Fall back to the node / a doc-scoped text key when the candidate
            # is not linked to a known entity.
            source_id = candidate.source_entity_id or node_id
            source_type = "entity" if candidate.source_entity_id else "node"
            target_id = candidate.target_entity_id or f"{doc_id}:{candidate.target_text}"
            target_type = "entity" if candidate.target_entity_id else "node"

            relationship = Relationship(
                type=rel_type,
                source_id=source_id,
                source_type=source_type,
                target_id=target_id,
                target_type=target_type,
                confidence=evaluation.probability,
                evidence=candidate.evidence,
                doc_id=doc_id,
                node_id=node_id,
                metadata={
                    "grounded": True,
                    "tot_validated": True,
                    "tot_probability": evaluation.probability,
                    "tot_reasoning": evaluation.reasoning,
                    "pattern": candidate.pattern_name,
                },
            )
            relationships.append(relationship)

        return relationships

    def _map_relationship_type(self, type_str: str) -> RelationType:
        """
        Map a type string to a RelationType enum member.

        The lower-cased string is first tried as an enum value; otherwise the
        upper-cased name is looked up in an alias table. Unknown, empty or
        non-string input maps to ``RelationType.MENTIONS``.
        """
        if isinstance(type_str, RelationType):
            return type_str
        if not isinstance(type_str, str):
            return RelationType.MENTIONS

        normalized = type_str.strip().upper()

        try:
            return RelationType(normalized.lower())
        except ValueError:
            mapping = {
                "TEMPORAL_BEFORE": RelationType.TEMPORAL_BEFORE,
                "TEMPORAL_AFTER": RelationType.TEMPORAL_AFTER,
                "TEMPORAL": RelationType.TEMPORAL_BEFORE,
                "CAUSAL": RelationType.CAUSAL,
                "SUPPORTS": RelationType.SUPPORTS,
                "CONTRADICTS": RelationType.CONTRADICTS,
                "AFFILIATED_WITH": RelationType.AFFILIATED_WITH,
                "PARTY_TO": RelationType.PARTY_TO,
                "REFERENCES": RelationType.REFERENCES,
                "SUPERSEDES": RelationType.SUPERSEDES,
                "AMENDS": RelationType.AMENDS,
            }
            return mapping.get(normalized, RelationType.MENTIONS)
|