rnsr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,308 @@
1
+ """
2
+ Variable Store - Pointer-Based Content Management
3
+
4
+ Implements the Variable Stitching pattern for efficient context management.
5
+
6
+ CRITICAL DESIGN:
7
+ - Agent stores findings as $POINTER_NAME (e.g., "$LIABILITY_CLAUSE")
8
+ - Full content stored externally in this VariableStore
9
+ - LLM context contains ONLY pointers until synthesis
10
+ - Pointers resolved to full text ONLY at final synthesis step
11
+
12
+ Why This Matters:
13
+ - Prevents context pollution during navigation
14
+ - Allows comparison of multiple sections efficiently
15
+ - Enables true multi-hop reasoning without context overflow
16
+
17
+ Usage:
18
+ store = VariableStore()
19
+
20
+ # During navigation - store finding as pointer
21
+ store.assign("$LIABILITY_CLAUSE", content, source_node_id)
22
+
23
+ # Agent context contains only: "Found: $LIABILITY_CLAUSE"
24
+ # NOT the full 2000-word clause text
25
+
26
+ # At synthesis - resolve pointers
27
+ full_text = store.resolve("$LIABILITY_CLAUSE")
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import hashlib
33
+ import re
34
+ from datetime import datetime, timezone
35
+ from typing import Any
36
+
37
+ import structlog
38
+
39
+ from rnsr.exceptions import AgentError
40
+ from rnsr.models import StoredVariable
41
+
42
+ logger = structlog.get_logger(__name__)
43
+
44
+
45
class VariableStore:
    """
    Pointer-based variable storage for agent context management.

    Full content is kept inside this store while the agent context only
    carries ``$POINTER`` names; pointers are turned back into text on demand.

    Attributes:
        _content: Dictionary of pointer name -> stored text.
        _metadata: Dictionary of pointer name -> StoredVariable.
    """

    # Pattern for valid pointer names: $UPPER_CASE_NAME (whole string)
    POINTER_PATTERN = re.compile(r"^\$[A-Z][A-Z0-9_]*$")

    # Same grammar, unanchored, used to locate pointers embedded in text
    _POINTER_IN_TEXT = re.compile(r"\$[A-Z][A-Z0-9_]*")

    def __init__(self):
        """Initialize an empty variable store."""
        self._content: dict[str, str] = {}
        self._metadata: dict[str, StoredVariable] = {}

        logger.debug("variable_store_initialized")

    def assign(
        self,
        pointer: str,
        content: str,
        source_node_id: str = "",
    ) -> StoredVariable:
        """
        Store content under a pointer name.

        Assigning to a pointer that already exists overwrites both its
        content and its metadata.

        Args:
            pointer: Variable pointer (e.g., "$LIABILITY_CLAUSE").
                Must match pattern $UPPER_CASE_NAME.
            content: Full text content to store.
            source_node_id: ID of the source node (for traceability).

        Returns:
            StoredVariable with metadata.

        Raises:
            AgentError: If pointer format is invalid.

        Example:
            store.assign("$PAYMENT_TERMS", "Payment due in 30 days...", "node_123")
        """
        # fullmatch, not match: with match() the trailing "$" anchor also
        # accepts a name followed by a newline (e.g. "$FOO\n").
        if not self.POINTER_PATTERN.fullmatch(pointer):
            raise AgentError(
                f"Invalid pointer format: {pointer}. "
                "Must match $UPPER_CASE_NAME pattern."
            )

        # Short fingerprint: first 16 hex characters of the SHA-256 digest
        content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]

        meta = StoredVariable(
            pointer=pointer,
            source_node_id=source_node_id,
            content_hash=content_hash,
            char_count=len(content),
            created_at=datetime.now(timezone.utc).isoformat(),
        )

        # Keep the two maps in lock-step; delete() and clear() rely on it
        self._content[pointer] = content
        self._metadata[pointer] = meta

        logger.info(
            "variable_assigned",
            pointer=pointer,
            source=source_node_id,
            chars=len(content),
        )

        return meta

    def resolve(self, pointer: str) -> str | None:
        """
        Resolve a pointer to its full content.

        Args:
            pointer: Variable pointer (e.g., "$LIABILITY_CLAUSE").

        Returns:
            Full text content, or None if not found (a warning is logged).

        Example:
            content = store.resolve("$PAYMENT_TERMS")
        """
        content = self._content.get(pointer)

        if content is None:
            logger.warning("variable_not_found", pointer=pointer)
        else:
            logger.debug("variable_resolved", pointer=pointer)

        return content

    def resolve_many(self, pointers: list[str]) -> dict[str, str | None]:
        """
        Resolve multiple pointers at once.

        Unlike resolve(), this does not log anything for missing pointers.

        Args:
            pointers: List of pointer names.

        Returns:
            Dictionary mapping pointer -> content (or None).
        """
        return {p: self._content.get(p) for p in pointers}

    def resolve_all_in_text(self, text: str) -> str:
        """
        Find and resolve all pointers in a text string.

        The text is scanned once, left to right. Each pointer is matched at
        its full length, so "$SECTION_A" never rewrites the beginning of
        "$SECTION_AB", and text inserted by a substitution is not scanned
        again. Unknown pointers are left in place and logged.

        Args:
            text: Text containing $POINTER references.

        Returns:
            Text with pointers replaced by their content.

        Example:
            text = "Compare $SECTION_A with $SECTION_B"
            resolved = store.resolve_all_in_text(text)
            # Returns: "Compare [full content A] with [full content B]"
        """

        def _substitute(match: re.Match[str]) -> str:
            pointer = match.group(0)
            content = self._content.get(pointer)
            if content is None:
                logger.warning("unresolved_pointer", pointer=pointer)
                return pointer
            return content

        # A callable replacement is inserted verbatim, so backslashes in the
        # stored content are not interpreted as regex escapes.
        return self._POINTER_IN_TEXT.sub(_substitute, text)

    def list_variables(self) -> list[StoredVariable]:
        """
        List all stored variables with metadata.

        Returns:
            List of StoredVariable objects.
        """
        return list(self._metadata.values())

    def list_pointers(self) -> list[str]:
        """
        Get all pointer names.

        Returns:
            List of pointer strings.
        """
        return list(self._content.keys())

    def get_metadata(self, pointer: str) -> StoredVariable | None:
        """Get metadata for a stored variable, or None if it is unknown."""
        return self._metadata.get(pointer)

    def exists(self, pointer: str) -> bool:
        """Check if a pointer exists."""
        return pointer in self._content

    def delete(self, pointer: str) -> bool:
        """
        Delete a stored variable.

        Returns:
            True if deleted, False if not found.
        """
        if pointer in self._content:
            del self._content[pointer]
            del self._metadata[pointer]
            logger.debug("variable_deleted", pointer=pointer)
            return True
        return False

    def clear(self) -> int:
        """
        Clear all stored variables.

        Returns:
            Number of variables cleared.
        """
        count = len(self._content)
        self._content.clear()
        self._metadata.clear()
        logger.info("variable_store_cleared", count=count)
        return count

    def count(self) -> int:
        """Get the number of stored variables."""
        return len(self._content)

    def total_chars(self) -> int:
        """Get total character count across all stored content."""
        return sum(len(c) for c in self._content.values())

    def summary(self) -> dict[str, Any]:
        """
        Get a summary of store contents for agent context.

        This summary can be included in agent context to show
        what variables are available without including content.

        Returns:
            Summary dict with one entry per variable (pointer, source,
            chars), the variable count, the total stored characters and
            a usage hint.
        """
        variables = [
            {
                "pointer": meta.pointer,
                "source": meta.source_node_id,
                "chars": meta.char_count,
            }
            for meta in self._metadata.values()
        ]

        return {
            "stored_variables": variables,
            "count": len(variables),
            "total_chars": self.total_chars(),
            "hint": "Use resolve_variable(pointer) to get content during synthesis",
        }
269
+
270
+
271
def generate_pointer_name(header: str, prefix: str = "") -> str:
    """
    Generate a valid pointer name from a section header.

    Both ``prefix`` and ``header`` are upper-cased and every run of
    characters outside A-Z / 0-9 is collapsed to a single underscore, so
    the result always has the $UPPER_CASE_NAME shape regardless of the
    casing or punctuation of the inputs.

    Args:
        header: Section header text.
        prefix: Optional prefix (e.g., "SEC" for sections).

    Returns:
        Valid pointer name like $LIABILITY_CLAUSE, or "$UNNAMED" when
        neither argument contributes any usable character.

    Example:
        generate_pointer_name("Liability Clause") -> "$LIABILITY_CLAUSE"
        generate_pointer_name("Section 3.2", prefix="S") -> "$S_SECTION_3_2"
        generate_pointer_name("Terms", prefix="sec") -> "$SEC_TERMS"
    """

    def _normalize(raw: str) -> str:
        # Upper-case, squash anything non-alphanumeric, trim edge underscores
        return re.sub(r"[^A-Z0-9]+", "_", raw.upper()).strip("_")

    # Join only the non-empty parts so an empty header cannot leave a
    # dangling "PREFIX_" behind.
    parts = (_normalize(prefix), _normalize(header))
    name = "_".join(part for part in parts if part)

    # Ensure valid start (letter, not number)
    if name and name[0].isdigit():
        name = "N" + name

    # Truncate if too long; drop an underscore exposed by the cut
    if len(name) > 30:
        name = name[:30].rstrip("_")

    # Add $ prefix
    return f"${name}" if name else "$UNNAMED"
@@ -0,0 +1,118 @@
1
+ """
2
+ RNSR Benchmarking Suite
3
+
4
+ Measures:
5
+ 1. Performance: Ingestion speed, query latency, memory usage
6
+ 2. Quality: Retrieval accuracy, answer relevance (requires ground truth)
7
+ 3. Comparison: RNSR vs baseline chunking approaches
8
+ 4. Standard Benchmarks: HotpotQA, MuSiQue, BEIR, RAGAS
9
+ 5. Comprehensive: All navigator types (standard, RLM, vision, hybrid)
10
+
11
+ Standard RAG Benchmarks:
12
+ - HotpotQA: Multi-hop question answering (EMNLP 2018)
13
+ - MuSiQue: Compositional multi-hop QA (TACL 2022)
14
+ - BEIR: Information retrieval benchmark (NeurIPS 2021)
15
+ - RAGAS: RAG evaluation metrics (faithfulness, relevance, etc.)
16
+
17
+ Comprehensive Benchmark (PageIndex/RLM-inspired):
18
+ - FinanceBench-style: Financial document QA
19
+ - OOLONG-style: Long context aggregation
20
+ - Multi-hop: Complex relational queries
21
+ """
22
+
23
+ from rnsr.benchmarks.performance import (
24
+ PerformanceBenchmark,
25
+ BenchmarkResult,
26
+ run_ingestion_benchmark,
27
+ run_query_benchmark,
28
+ )
29
+ from rnsr.benchmarks.quality import (
30
+ QualityBenchmark,
31
+ QualityMetrics,
32
+ evaluate_retrieval,
33
+ )
34
+ from rnsr.benchmarks.runner import (
35
+ BenchmarkRunner,
36
+ BenchmarkConfig,
37
+ run_full_benchmark,
38
+ )
39
+ from rnsr.benchmarks.standard_benchmarks import (
40
+ # Baselines
41
+ NaiveChunkRAG,
42
+ SemanticChunkRAG,
43
+ BaselineResult,
44
+ # Benchmark datasets
45
+ BenchmarkLoader,
46
+ BenchmarkDataset,
47
+ BenchmarkQuestion,
48
+ # RAGAS metrics
49
+ RAGASEvaluator,
50
+ RAGASMetrics,
51
+ # Multi-hop metrics
52
+ MultiHopMetrics,
53
+ evaluate_multihop,
54
+ # Comparison
55
+ compare_rnsr_vs_baseline,
56
+ )
57
+ from rnsr.benchmarks.evaluation_suite import (
58
+ EvaluationSuite,
59
+ EvaluationConfig,
60
+ EvaluationReport,
61
+ RNSRBenchmarkAdapter,
62
+ )
63
+ from rnsr.benchmarks.comprehensive_benchmark import (
64
+ # Comprehensive benchmark for all navigator types
65
+ ComprehensiveBenchmarkRunner,
66
+ ComprehensiveBenchmarkReport,
67
+ BenchmarkTestCase,
68
+ MethodResults,
69
+ run_comprehensive_benchmark,
70
+ quick_benchmark,
71
+ # Standard test suites
72
+ get_financebench_cases,
73
+ get_oolong_style_cases,
74
+ get_multi_hop_cases,
75
+ )
76
+
77
+ __all__ = [
78
+ # Performance
79
+ "PerformanceBenchmark",
80
+ "BenchmarkResult",
81
+ "run_ingestion_benchmark",
82
+ "run_query_benchmark",
83
+ # Quality
84
+ "QualityBenchmark",
85
+ "QualityMetrics",
86
+ "evaluate_retrieval",
87
+ # Runner
88
+ "BenchmarkRunner",
89
+ "BenchmarkConfig",
90
+ "run_full_benchmark",
91
+ # Standard Benchmarks
92
+ "NaiveChunkRAG",
93
+ "SemanticChunkRAG",
94
+ "BaselineResult",
95
+ "BenchmarkLoader",
96
+ "BenchmarkDataset",
97
+ "BenchmarkQuestion",
98
+ "RAGASEvaluator",
99
+ "RAGASMetrics",
100
+ "MultiHopMetrics",
101
+ "evaluate_multihop",
102
+ "compare_rnsr_vs_baseline",
103
+ # Evaluation Suite
104
+ "EvaluationSuite",
105
+ "EvaluationConfig",
106
+ "EvaluationReport",
107
+ "RNSRBenchmarkAdapter",
108
+ # Comprehensive Benchmark (State-of-the-Art)
109
+ "ComprehensiveBenchmarkRunner",
110
+ "ComprehensiveBenchmarkReport",
111
+ "BenchmarkTestCase",
112
+ "MethodResults",
113
+ "run_comprehensive_benchmark",
114
+ "quick_benchmark",
115
+ "get_financebench_cases",
116
+ "get_oolong_style_cases",
117
+ "get_multi_hop_cases",
118
+ ]