rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,392 @@
1
+ """
2
+ RNSR Relationship Validator (ToT Pattern)
3
+
4
+ Validates relationship candidates using Tree of Thoughts reasoning.
5
+ Same pattern as entity validation:
6
+
7
+ 1. Pattern extraction provides grounded candidates
8
+ 2. ToT evaluates each with probability + reasoning
9
+ 3. Navigate for context if uncertain
10
+ 4. Prevents hallucinated relationships
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import re
17
+ from dataclasses import dataclass, field
18
+ from typing import Any, TYPE_CHECKING
19
+
20
+ import structlog
21
+
22
+ from rnsr.extraction.models import Entity, Relationship, RelationType
23
+ from rnsr.extraction.relationship_patterns import RelationshipCandidate
24
+ from rnsr.llm import get_llm
25
+
26
+ if TYPE_CHECKING:
27
+ from rnsr.models import DocumentTree
28
+
29
# Module-level structlog logger; events are emitted as
# logger.warning("<event_name>", key=value).
logger = structlog.get_logger(__name__)


# ToT prompt for relationship validation.
#
# Rendered by RelationshipValidator._validate_batch with str.format(), which
# supplies four named fields: section_header, section_content,
# entities_formatted and candidates_formatted.
#
# The doubled braces in the JSON example are str.format() escapes: they come
# out as single literal braces in the rendered prompt. Any new literal brace
# added to this text must be doubled the same way or .format() will fail.
#
# The reply shape requested here (evaluations / selected_relationships /
# needs_more_context) is the shape _parse_validation_response reads back.
TOT_RELATIONSHIP_VALIDATION_PROMPT = """You are validating relationship candidates extracted from a document.

Current Section: {section_header}
Section Content:
---
{section_content}
---

Known Entities:
{entities_formatted}

Relationship Candidates to Validate:
{candidates_formatted}

EVALUATION TASK:
For each relationship candidate, determine if it represents a real, meaningful relationship.

RELATIONSHIP TYPES:
- MENTIONS: Section/document mentions an entity
- TEMPORAL_BEFORE: X occurred before Y
- TEMPORAL_AFTER: X occurred after Y
- CAUSAL: X caused/led to Y
- SUPPORTS: X supports claim in Y
- CONTRADICTS: X contradicts Y
- AFFILIATED_WITH: Person affiliated with organization
- PARTY_TO: Entity is party to document/case
- REFERENCES: References another document/section
- SUPERSEDES: X supersedes/replaces Y
- AMENDS: X amends/modifies Y

For each candidate, provide:
1. valid: true if this is a real, meaningful relationship
2. probability: 0.0-1.0 confidence score
3. relationship_type: Corrected type if pattern was wrong
4. reasoning: Brief explanation

OUTPUT FORMAT (JSON):
{{
"evaluations": [
{{
"candidate_id": 0,
"valid": true,
"probability": 0.85,
"relationship_type": "AFFILIATED_WITH",
"reasoning": "Evidence clearly shows John Smith is CEO of Acme Corp"
}},
{{
"candidate_id": 1,
"valid": false,
"probability": 0.2,
"reasoning": "Co-occurrence does not indicate actual relationship"
}}
],
"selected_relationships": [0],
"needs_more_context": []
}}

Rules:
- Only validate relationships with clear evidence in the text
- Co-occurrence alone is NOT sufficient - need explicit connection
- Set valid=false for weak or ambiguous connections
- Be conservative - uncertain relationships should be rejected

Respond ONLY with the JSON, no other text."""
97
+
98
+
99
@dataclass
class RelationshipValidationResult:
    """Verdict reached for one relationship candidate.

    Holds the candidate's index together with the validity flag, the
    probability score and the free-text reasoning attached to that verdict.
    """

    # Index identifying which candidate this verdict belongs to.
    candidate_id: int
    # Confidence score attached to the verdict.
    probability: float
    # True when the candidate was judged to be a real relationship.
    is_valid: bool
    # Relationship type name for the candidate; None when none was given.
    relationship_type: str | None = None
    # Short explanation of the verdict; empty when none was given.
    reasoning: str = ""
    # NOTE(review): nothing visible in this module sets this to True;
    # presumably reserved for a context-navigation step - confirm.
    used_navigation: bool = False
109
+
110
+
111
@dataclass
class RelationshipBatchResult:
    """Outcome of validating a group of relationship candidates.

    All three lists default to fresh empty lists, so separate instances
    never share state.
    """

    # One verdict per candidate that was evaluated.
    evaluations: list[RelationshipValidationResult] = field(default_factory=list)
    # Indices of the candidates chosen for acceptance.
    selected_relationships: list[int] = field(default_factory=list)
    # Indices of the candidates flagged as needing more context.
    needs_more_context: list[int] = field(default_factory=list)
118
+
119
+
120
class RelationshipValidator:
    """
    Tree of Thoughts relationship validator.

    Sends pattern-extracted relationship candidates to an LLM in batches,
    parses a per-candidate verdict (validity, probability, reasoning) out of
    the JSON reply, and converts the accepted candidates into ``Relationship``
    objects. When no LLM is available, or the LLM call fails, candidates are
    accepted on pattern confidence alone.

    NOTE(review): ``rejection_threshold`` is stored and ``needs_more_context``
    is collected, but no method of this class acts on either of them.
    """

    # Pattern confidence a candidate needs to be accepted by the no-LLM fallback.
    FALLBACK_MIN_CONFIDENCE = 0.7
    # Limits applied when rendering the prompt, to keep it bounded.
    MAX_PROMPT_ENTITIES = 20
    MAX_SECTION_CHARS = 2000
    MAX_EVIDENCE_CHARS = 100

    def __init__(
        self,
        llm: Any | None = None,
        selection_threshold: float = 0.6,
        rejection_threshold: float = 0.3,
        max_candidates_per_batch: int = 15,
    ):
        """
        Initialize the relationship validator.

        Args:
            llm: LLM instance exposing ``complete(prompt)``. When None, one is
                obtained lazily from ``get_llm()`` on first use.
            selection_threshold: Minimum probability a valid evaluation needs
                for its candidate to be turned into a relationship.
            rejection_threshold: Probability threshold for rejecting. Stored
                only; not consulted by this class.
            max_candidates_per_batch: Max candidates per LLM call. Values
                below 1 are treated as 1.
        """
        self.llm = llm
        self.selection_threshold = selection_threshold
        self.rejection_threshold = rejection_threshold
        self.max_candidates_per_batch = max_candidates_per_batch

        self._llm_initialized = False

    def _get_llm(self) -> Any:
        """Return the configured LLM, creating it via ``get_llm()`` on first use."""
        if self.llm is None and not self._llm_initialized:
            self.llm = get_llm()
            # Set only after get_llm() returned, so a raising call is retried
            # next time while a None result is not requested again.
            self._llm_initialized = True
        return self.llm

    def validate_candidates(
        self,
        candidates: list[RelationshipCandidate],
        entities: list[Entity],
        section_header: str,
        section_content: str,
    ) -> RelationshipBatchResult:
        """
        Validate relationship candidates using ToT reasoning.

        Args:
            candidates: Pre-extracted relationship candidates.
            entities: Known entities in the section.
            section_header: Section header for context.
            section_content: Section content.

        Returns:
            RelationshipBatchResult whose candidate indices refer to positions
            in ``candidates`` (not positions within a batch).
        """
        if not candidates:
            return RelationshipBatchResult()

        if self._get_llm() is None:
            return self._accept_high_confidence(candidates)

        # range() raises on a zero step and yields nothing for a negative one,
        # which would silently validate no candidates at all.
        batch_size = max(1, self.max_candidates_per_batch)

        all_results = RelationshipBatchResult()
        for start in range(0, len(candidates), batch_size):
            batch_result = self._validate_batch(
                candidates=candidates[start:start + batch_size],
                batch_offset=start,
                entities=entities,
                section_header=section_header,
                section_content=section_content,
            )

            all_results.evaluations.extend(batch_result.evaluations)
            all_results.selected_relationships.extend(batch_result.selected_relationships)
            all_results.needs_more_context.extend(batch_result.needs_more_context)

        return all_results

    def _validate_batch(
        self,
        candidates: list[RelationshipCandidate],
        batch_offset: int,
        entities: list[Entity],
        section_header: str,
        section_content: str,
    ) -> RelationshipBatchResult:
        """
        Validate one batch with a single LLM call.

        Args:
            candidates: The candidates in this batch.
            batch_offset: Index of the batch's first candidate in the full
                candidate list; candidates are labelled with it in the prompt.
            entities: Known entities (only the first MAX_PROMPT_ENTITIES are shown).
            section_header: Section header for context.
            section_content: Section content (truncated to MAX_SECTION_CHARS).

        Returns:
            The parsed LLM verdicts, or the pattern-confidence fallback when
            the LLM call or the parsing raises.
        """
        entities_formatted = "\n".join(
            f"- [{e.id}] {e.canonical_name} ({e.type.value})"
            for e in entities[:self.MAX_PROMPT_ENTITIES]
        ) if entities else "(no entities)"

        # Candidates are labelled with their index in the full list, so the IDs
        # the LLM echoes back need no translation. The "..." after the evidence
        # is appended unconditionally, even when nothing was cut off.
        candidates_formatted = "\n".join(
            f"[{i + batch_offset}] {c.source_text} --[{c.relationship_type}]--> {c.target_text}\n"
            f" Evidence: \"{c.evidence[:self.MAX_EVIDENCE_CHARS]}...\"\n"
            f" Pattern: {c.pattern_name}, Confidence: {c.confidence:.2f}"
            for i, c in enumerate(candidates)
        )

        prompt = TOT_RELATIONSHIP_VALIDATION_PROMPT.format(
            section_header=section_header,
            section_content=section_content[:self.MAX_SECTION_CHARS],
            entities_formatted=entities_formatted,
            candidates_formatted=candidates_formatted,
        )

        try:
            # Go through _get_llm() so this method also works when called
            # before validate_candidates() has initialised the LLM.
            response = self._get_llm().complete(prompt)
            response_text = response if isinstance(response, str) else str(response)

            return self._parse_validation_response(response_text, len(candidates), batch_offset)

        except Exception as e:
            # Deliberately broad: any LLM/provider failure degrades to the
            # pattern-confidence fallback instead of aborting extraction.
            logger.warning("relationship_validation_failed", error=str(e))
            return self._accept_high_confidence(candidates, offset=batch_offset)

    @staticmethod
    def _as_bool(value: Any) -> bool:
        """Interpret a JSON ``valid`` field, treating the string "false" as False."""
        if isinstance(value, str):
            return value.strip().lower() == "true"
        return bool(value)

    @staticmethod
    def _batch_indices(raw: Any, valid_ids: range) -> list[int]:
        """Return the unique integer entries of ``raw`` that lie in ``valid_ids``, in order."""
        if not isinstance(raw, list):
            return []
        # bool is a subclass of int; True/False must not pass as indices 1/0.
        return list(dict.fromkeys(
            item for item in raw
            if isinstance(item, int) and not isinstance(item, bool) and item in valid_ids
        ))

    def _parse_validation_response(
        self,
        response_text: str,
        candidate_count: int,
        batch_offset: int,
    ) -> RelationshipBatchResult:
        """
        Parse the LLM's JSON reply for one batch.

        Args:
            response_text: Raw LLM reply; the span from the first ``{`` to the
                last ``}`` is parsed as JSON.
            candidate_count: Number of candidates in the batch.
            batch_offset: Index of the batch's first candidate.

        Returns:
            A RelationshipBatchResult containing only entries whose candidate
            index lies in ``[batch_offset, batch_offset + candidate_count)``.
            Empty when the reply contains no parseable JSON object.
        """
        result = RelationshipBatchResult()

        json_match = re.search(r'\{[\s\S]*\}', response_text)
        if not json_match:
            logger.warning("relationship_validation_unparseable", reason="no_json_object")
            return result

        try:
            data = json.loads(json_match.group())
        except json.JSONDecodeError as e:
            logger.warning(
                "relationship_validation_unparseable",
                reason="invalid_json",
                error=str(e),
            )
            return result

        # The only IDs the prompt offered for this batch.
        valid_ids = range(batch_offset, batch_offset + candidate_count)

        evaluations = data.get("evaluations")
        for eval_data in evaluations if isinstance(evaluations, list) else []:
            if not isinstance(eval_data, dict):
                continue

            # An entry without a usable ID cannot be attributed to a candidate;
            # defaulting it to 0 would pin the verdict on the wrong one.
            candidate_id = eval_data.get("candidate_id")
            if (
                not isinstance(candidate_id, int)
                or isinstance(candidate_id, bool)
                or candidate_id not in valid_ids
            ):
                continue

            try:
                probability = float(eval_data.get("probability", 0.5))
            except (TypeError, ValueError):
                continue
            # Keep the score a probability; NaN collapses to 0.0 here.
            probability = min(1.0, max(0.0, probability))

            raw_type = eval_data.get("relationship_type")
            relationship_type = (
                raw_type if isinstance(raw_type, str) and raw_type.strip() else None
            )

            result.evaluations.append(RelationshipValidationResult(
                candidate_id=candidate_id,
                probability=probability,
                is_valid=self._as_bool(eval_data.get("valid", False)),
                relationship_type=relationship_type,
                reasoning=str(eval_data.get("reasoning") or ""),
            ))

        result.selected_relationships = self._batch_indices(
            data.get("selected_relationships"), valid_ids
        )
        result.needs_more_context = self._batch_indices(
            data.get("needs_more_context"), valid_ids
        )

        return result

    def _accept_high_confidence(
        self,
        candidates: list[RelationshipCandidate],
        offset: int = 0,
    ) -> RelationshipBatchResult:
        """
        Accept only high confidence candidates (fallback).

        Args:
            candidates: Candidates to judge by pattern confidence alone.
            offset: Added to each position to form the candidate index.

        Returns:
            A result with one evaluation per candidate; those at or above
            FALLBACK_MIN_CONFIDENCE are marked valid and selected.
        """
        result = RelationshipBatchResult()

        for idx, candidate in enumerate(candidates, start=offset):
            is_valid = candidate.confidence >= self.FALLBACK_MIN_CONFIDENCE

            result.evaluations.append(RelationshipValidationResult(
                candidate_id=idx,
                probability=candidate.confidence,
                is_valid=is_valid,
                relationship_type=candidate.relationship_type,
                reasoning=f"Pattern: {candidate.pattern_name}" if is_valid else "Low confidence",
            ))

            if is_valid:
                result.selected_relationships.append(idx)

        return result

    def candidates_to_relationships(
        self,
        candidates: list[RelationshipCandidate],
        validation_result: RelationshipBatchResult,
        node_id: str,
        doc_id: str,
    ) -> list[Relationship]:
        """
        Convert validated candidates to Relationship objects.

        A selected index produces a relationship only when it is a valid
        position in ``candidates``, has an evaluation marked valid, and that
        evaluation's probability reaches ``selection_threshold``. Each index
        is converted at most once.

        Args:
            candidates: The full candidate list that was validated.
            validation_result: Result returned for those candidates.
            node_id: ID of the node the candidates came from; used as the
                source when a candidate has no source entity.
            doc_id: ID of the containing document.

        Returns:
            The relationships built from the accepted candidates.
        """
        relationships: list[Relationship] = []

        eval_by_id = {e.candidate_id: e for e in validation_result.evaluations}
        seen: set[int] = set()

        for idx in validation_result.selected_relationships:
            # A negative index would silently pick a candidate from the end of
            # the list, so both bounds are checked.
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            if not 0 <= idx < len(candidates) or idx in seen:
                continue
            seen.add(idx)

            evaluation = eval_by_id.get(idx)
            if evaluation is None or not evaluation.is_valid:
                continue
            if evaluation.probability < self.selection_threshold:
                continue

            candidate = candidates[idx]

            # Prefer the evaluation's (possibly corrected) type over the pattern's.
            rel_type_str = evaluation.relationship_type or candidate.relationship_type
            rel_type = self._map_relationship_type(rel_type_str)

            # Fall back to the node / a doc-scoped text key when a side of the
            # relationship was not linked to an entity.
            source_id = candidate.source_entity_id or node_id
            source_type = "entity" if candidate.source_entity_id else "node"
            target_id = candidate.target_entity_id or f"{doc_id}:{candidate.target_text}"
            target_type = "entity" if candidate.target_entity_id else "node"

            relationships.append(Relationship(
                type=rel_type,
                source_id=source_id,
                source_type=source_type,
                target_id=target_id,
                target_type=target_type,
                confidence=evaluation.probability,
                evidence=candidate.evidence,
                doc_id=doc_id,
                node_id=node_id,
                metadata={
                    "grounded": True,
                    "tot_validated": True,
                    "tot_probability": evaluation.probability,
                    "tot_reasoning": evaluation.reasoning,
                    "pattern": candidate.pattern_name,
                },
            ))

        return relationships

    def _map_relationship_type(self, type_str: str) -> RelationType:
        """
        Map a type string to a RelationType enum member.

        The string is trimmed and runs of spaces/hyphens become underscores.
        It is tried first as an enum value (lower-cased), then as one of the
        known upper-case names. Anything else, including None or a non-string,
        maps to RelationType.MENTIONS.
        """
        if isinstance(type_str, RelationType):
            return type_str
        if not isinstance(type_str, str):
            return RelationType.MENTIONS

        normalized = re.sub(r"[\s\-]+", "_", type_str.strip())

        try:
            return RelationType(normalized.lower())
        except ValueError:
            mapping = {
                "TEMPORAL_BEFORE": RelationType.TEMPORAL_BEFORE,
                "TEMPORAL_AFTER": RelationType.TEMPORAL_AFTER,
                "TEMPORAL": RelationType.TEMPORAL_BEFORE,
                "CAUSAL": RelationType.CAUSAL,
                "SUPPORTS": RelationType.SUPPORTS,
                "CONTRADICTS": RelationType.CONTRADICTS,
                "AFFILIATED_WITH": RelationType.AFFILIATED_WITH,
                "PARTY_TO": RelationType.PARTY_TO,
                "REFERENCES": RelationType.REFERENCES,
                "SUPERSEDES": RelationType.SUPERSEDES,
                "AMENDS": RelationType.AMENDS,
            }
            return mapping.get(normalized.upper(), RelationType.MENTIONS)