rnsr-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. rnsr/__init__.py +118 -0
  2. rnsr/__main__.py +242 -0
  3. rnsr/agent/__init__.py +218 -0
  4. rnsr/agent/cross_doc_navigator.py +767 -0
  5. rnsr/agent/graph.py +1557 -0
  6. rnsr/agent/llm_cache.py +575 -0
  7. rnsr/agent/navigator_api.py +497 -0
  8. rnsr/agent/provenance.py +772 -0
  9. rnsr/agent/query_clarifier.py +617 -0
  10. rnsr/agent/reasoning_memory.py +736 -0
  11. rnsr/agent/repl_env.py +709 -0
  12. rnsr/agent/rlm_navigator.py +2108 -0
  13. rnsr/agent/self_reflection.py +602 -0
  14. rnsr/agent/variable_store.py +308 -0
  15. rnsr/benchmarks/__init__.py +118 -0
  16. rnsr/benchmarks/comprehensive_benchmark.py +733 -0
  17. rnsr/benchmarks/evaluation_suite.py +1210 -0
  18. rnsr/benchmarks/finance_bench.py +147 -0
  19. rnsr/benchmarks/pdf_merger.py +178 -0
  20. rnsr/benchmarks/performance.py +321 -0
  21. rnsr/benchmarks/quality.py +321 -0
  22. rnsr/benchmarks/runner.py +298 -0
  23. rnsr/benchmarks/standard_benchmarks.py +995 -0
  24. rnsr/client.py +560 -0
  25. rnsr/document_store.py +394 -0
  26. rnsr/exceptions.py +74 -0
  27. rnsr/extraction/__init__.py +172 -0
  28. rnsr/extraction/candidate_extractor.py +357 -0
  29. rnsr/extraction/entity_extractor.py +581 -0
  30. rnsr/extraction/entity_linker.py +825 -0
  31. rnsr/extraction/grounded_extractor.py +722 -0
  32. rnsr/extraction/learned_types.py +599 -0
  33. rnsr/extraction/models.py +232 -0
  34. rnsr/extraction/relationship_extractor.py +600 -0
  35. rnsr/extraction/relationship_patterns.py +511 -0
  36. rnsr/extraction/relationship_validator.py +392 -0
  37. rnsr/extraction/rlm_extractor.py +589 -0
  38. rnsr/extraction/rlm_unified_extractor.py +990 -0
  39. rnsr/extraction/tot_validator.py +610 -0
  40. rnsr/extraction/unified_extractor.py +342 -0
  41. rnsr/indexing/__init__.py +60 -0
  42. rnsr/indexing/knowledge_graph.py +1128 -0
  43. rnsr/indexing/kv_store.py +313 -0
  44. rnsr/indexing/persistence.py +323 -0
  45. rnsr/indexing/semantic_retriever.py +237 -0
  46. rnsr/indexing/semantic_search.py +320 -0
  47. rnsr/indexing/skeleton_index.py +395 -0
  48. rnsr/ingestion/__init__.py +161 -0
  49. rnsr/ingestion/chart_parser.py +569 -0
  50. rnsr/ingestion/document_boundary.py +662 -0
  51. rnsr/ingestion/font_histogram.py +334 -0
  52. rnsr/ingestion/header_classifier.py +595 -0
  53. rnsr/ingestion/hierarchical_cluster.py +515 -0
  54. rnsr/ingestion/layout_detector.py +356 -0
  55. rnsr/ingestion/layout_model.py +379 -0
  56. rnsr/ingestion/ocr_fallback.py +177 -0
  57. rnsr/ingestion/pipeline.py +936 -0
  58. rnsr/ingestion/semantic_fallback.py +417 -0
  59. rnsr/ingestion/table_parser.py +799 -0
  60. rnsr/ingestion/text_builder.py +460 -0
  61. rnsr/ingestion/tree_builder.py +402 -0
  62. rnsr/ingestion/vision_retrieval.py +965 -0
  63. rnsr/ingestion/xy_cut.py +555 -0
  64. rnsr/llm.py +733 -0
  65. rnsr/models.py +167 -0
  66. rnsr/py.typed +2 -0
  67. rnsr-0.1.0.dist-info/METADATA +592 -0
  68. rnsr-0.1.0.dist-info/RECORD +72 -0
  69. rnsr-0.1.0.dist-info/WHEEL +5 -0
  70. rnsr-0.1.0.dist-info/entry_points.txt +2 -0
  71. rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
  72. rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/indexing/semantic_retriever.py
@@ -0,0 +1,237 @@
+ """
+ Semantic Retriever - Query-driven node selection using embeddings.
+
+ This module provides retrieval-based navigation that reduces complexity
+ from O(N) (evaluating all nodes) to O(log N) (retrieving top-k relevant nodes).
+ """
+
+ from __future__ import annotations
+
+ import structlog
+
+ from rnsr.models import SkeletonNode
+
+ logger = structlog.get_logger(__name__)
+
+
+ class SemanticRetriever:
+     """
+     Retrieves the most relevant nodes given a query using semantic search.
+
+     Uses embeddings and cosine similarity to rank nodes by relevance,
+     reducing search complexity from O(N) to O(log N).
+     """
+
+     def __init__(
+         self,
+         skeleton_nodes: dict[str, SkeletonNode],
+         llm_provider: str = "gemini",
+     ):
+         """
+         Initialize the retriever.
+
+         Args:
+             skeleton_nodes: Dictionary of node_id -> SkeletonNode.
+             llm_provider: LLM provider for embeddings.
+         """
+         self.skeleton_nodes = skeleton_nodes
+         self.llm_provider = llm_provider
+         self._index = None
+         self._embed_model = None
+
+         logger.info(
+             "semantic_retriever_initialized",
+             total_nodes=len(skeleton_nodes),
+             provider=llm_provider,
+         )
+
+     def _initialize_index(self) -> None:
+         """Initialize the vector index lazily."""
+         if self._index is not None:
+             return
+
+         try:
+             from llama_index.core import VectorStoreIndex
+             from llama_index.embeddings.gemini import GeminiEmbedding
+             from rnsr.indexing.skeleton_index import create_llama_index_nodes
+
+             # Create embedding model
+             self._embed_model = GeminiEmbedding(
+                 model_name="models/text-embedding-004"
+             )
+
+             # Create index nodes (summaries only!)
+             llama_nodes = create_llama_index_nodes(self.skeleton_nodes)
+
+             # Build vector index
+             self._index = VectorStoreIndex(
+                 nodes=llama_nodes,
+                 embed_model=self._embed_model,
+             )
+
+             logger.info(
+                 "vector_index_built",
+                 nodes_indexed=len(llama_nodes),
+             )
+
+         except ImportError as e:
+             logger.warning(
+                 "vector_index_unavailable",
+                 error=str(e),
+                 fallback="will_use_bm25",
+             )
+             self._index = None
+
+     def retrieve(
+         self,
+         query: str,
+         top_k: int = 5,
+         parent_id: str | None = None,
+     ) -> list[SkeletonNode]:
+         """
+         Retrieve the most relevant nodes for a query.
+
+         Args:
+             query: The question or search query.
+             top_k: Number of results to return.
+             parent_id: Optional parent node to restrict search to children.
+
+         Returns:
+             List of SkeletonNode objects ranked by relevance.
+         """
+         # Initialize index on first use
+         self._initialize_index()
+
+         # Filter candidates if parent_id specified
+         candidates = self._get_candidates(parent_id)
+
+         if self._index is not None:
+             # Use vector search
+             return self._retrieve_vector(query, top_k, candidates)
+         else:
+             # Fallback to BM25/keyword search
+             return self._retrieve_bm25(query, top_k, candidates)
+
+     def _get_candidates(
+         self,
+         parent_id: str | None,
+     ) -> dict[str, SkeletonNode]:
+         """Get candidate nodes to search over."""
+         if parent_id is None:
+             return self.skeleton_nodes
+
+         # Filter to children of parent
+         parent = self.skeleton_nodes.get(parent_id)
+         if parent is None:
+             return self.skeleton_nodes
+
+         return {
+             cid: self.skeleton_nodes[cid]
+             for cid in parent.child_ids
+             if cid in self.skeleton_nodes
+         }
+
+     def _retrieve_vector(
+         self,
+         query: str,
+         top_k: int,
+         candidates: dict[str, SkeletonNode],
+     ) -> list[SkeletonNode]:
+         """Retrieve using vector similarity."""
+         try:
+             if self._index is None:
+                 return self._retrieve_bm25(query, top_k, candidates)
+
+             retriever = self._index.as_retriever(similarity_top_k=top_k * 2)
+             results = retriever.retrieve(query)
+
+             # Filter to candidates and return SkeletonNodes
+             relevant = []
+             for result in results:
+                 # Access metadata safely
+                 node_id = result.node.metadata.get("node_id") if hasattr(result.node, "metadata") else None
+                 if node_id and node_id in candidates:
+                     relevant.append(candidates[node_id])
+                     if len(relevant) >= top_k:
+                         break
+
+             logger.info(
+                 "vector_retrieval_complete",
+                 query_words=len(query.split()),
+                 results=len(relevant),
+             )
+
+             return relevant
+
+         except Exception as e:
+             logger.warning(
+                 "vector_retrieval_failed",
+                 error=str(e),
+                 fallback="bm25",
+             )
+             return self._retrieve_bm25(query, top_k, candidates)
+
+     def _retrieve_bm25(
+         self,
+         query: str,
+         top_k: int,
+         candidates: dict[str, SkeletonNode],
+     ) -> list[SkeletonNode]:
+         """Fallback retrieval using keyword-overlap matching (BM25-style)."""
+         query_terms = set(query.lower().split())
+
+         # Score each candidate by term overlap
+         scores = []
+         for node_id, node in candidates.items():
+             # Combine header and summary for matching
+             text = f"{node.header or ''} {node.summary}".lower()
+             text_terms = text.split()
+
+             # Count matching terms
+             matches = sum(1 for term in query_terms if term in text_terms)
+
+             # Boost for header matches
+             header_matches = sum(
+                 1 for term in query_terms
+                 if term in (node.header or "").lower()
+             )
+
+             score = matches + (header_matches * 2)
+
+             if score > 0:
+                 scores.append((score, node))
+
+         # Sort by score descending
+         scores.sort(reverse=True, key=lambda x: x[0])
+
+         results = [node for score, node in scores[:top_k]]
+
+         logger.info(
+             "bm25_retrieval_complete",
+             query_terms=len(query_terms),
+             candidates=len(candidates),
+             results=len(results),
+         )
+
+         return results
+
+
+ def create_retriever(
+     skeleton_nodes: dict[str, SkeletonNode],
+     llm_provider: str = "gemini",
+ ) -> SemanticRetriever:
+     """
+     Convenience function to create a semantic retriever.
+
+     Args:
+         skeleton_nodes: Dictionary of node_id -> SkeletonNode.
+         llm_provider: LLM provider for embeddings.
+
+     Returns:
+         SemanticRetriever instance.
+     """
+     return SemanticRetriever(skeleton_nodes, llm_provider)
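
A minimal usage sketch for the retriever above (illustrative, not part of the packaged diff: `skeleton_nodes` is assumed to come from the ingestion pipeline, the node ID `"sec-4"` is hypothetical, and `GOOGLE_API_KEY` must be set for Gemini embeddings; without an index, retrieve() silently falls back to keyword matching):

    from rnsr.indexing.semantic_retriever import create_retriever

    # skeleton_nodes: dict[str, SkeletonNode], e.g. built by the ingestion pipeline
    retriever = create_retriever(skeleton_nodes, llm_provider="gemini")

    # Search the whole tree
    hits = retriever.retrieve("What were the Q3 revenue figures?", top_k=5)
    for node in hits:
        print(node.header, "-", node.summary[:80])

    # Restrict the search to the children of a known section node
    section_hits = retriever.retrieve(
        "operating expenses",
        top_k=3,
        parent_id="sec-4",  # hypothetical node ID
    )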
rnsr/indexing/semantic_search.py
@@ -0,0 +1,320 @@
+ """
+ Semantic Search for Skeleton Index
+
+ Provides O(log N) retrieval using vector similarity search on node summaries.
+ Falls back to full exploration if needed.
+
+ Usage:
+     searcher = SemanticSearcher(skeleton_nodes, kv_store)
+     relevant_nodes = searcher.search(query, top_k=5)
+
+     # Or get all nodes ranked by relevance
+     all_ranked = searcher.rank_all_nodes(query)
+ """
+
+ from __future__ import annotations
+
+ from typing import Any
+
+ import structlog
+
+ from rnsr.exceptions import IndexingError
+ from rnsr.indexing.kv_store import KVStore
+ from rnsr.models import SkeletonNode
+
+ logger = structlog.get_logger(__name__)
+
+
+ class SemanticSearcher:
+     """
+     Semantic search over skeleton node summaries.
+
+     Uses vector embeddings for O(log N) retrieval instead of
+     evaluating all nodes with expensive LLM calls.
+
+     Attributes:
+         skeleton_nodes: Dictionary of node_id -> SkeletonNode
+         kv_store: KV store for full content retrieval
+         index: LlamaIndex VectorStoreIndex (built lazily)
+         embedder: Embedding model instance
+     """
+
+     def __init__(
+         self,
+         skeleton_nodes: dict[str, SkeletonNode],
+         kv_store: KVStore,
+         embed_model: str = "text-embedding-3-small",
+         provider: str | None = None,
+     ):
+         """
+         Initialize semantic searcher.
+
+         Args:
+             skeleton_nodes: Skeleton nodes to search over
+             kv_store: KV store for content retrieval
+             embed_model: Embedding model name
+             provider: "openai", "gemini", or None for auto-detect
+         """
+         self.skeleton_nodes = skeleton_nodes
+         self.kv_store = kv_store
+         self.embed_model_name = embed_model
+         self.provider = provider
+
+         self._index = None
+         self._embedder = None
+         self._node_map: dict[str, SkeletonNode] = {}
+
+         logger.info(
+             "semantic_searcher_initialized",
+             nodes=len(skeleton_nodes),
+             embed_model=embed_model,
+         )
+
+     def _build_index(self) -> None:
+         """Build vector index lazily on first search."""
+         if self._index is not None:
+             return
+
+         try:
+             from llama_index.core import VectorStoreIndex
+             from llama_index.core.schema import TextNode
+
+             # Get embedding model
+             embed_model = self._get_embedding_model()
+
+             # Create text nodes from skeleton summaries
+             text_nodes = []
+             for node_id, skel in self.skeleton_nodes.items():
+                 # Skip nodes with no content
+                 if not skel.summary or len(skel.summary.strip()) < 10:
+                     continue
+
+                 # Create text node with summary
+                 text = f"{skel.header or ''}\n{skel.summary}".strip()
+
+                 text_node = TextNode(
+                     text=text,
+                     id_=node_id,
+                     metadata={
+                         "node_id": node_id,
+                         "level": skel.level,
+                         "header": skel.header,
+                         "has_children": len(skel.child_ids) > 0,
+                         "child_ids": skel.child_ids,
+                     },
+                 )
+                 text_nodes.append(text_node)
+                 self._node_map[node_id] = skel
+
+             # Build index
+             self._index = VectorStoreIndex(
+                 nodes=text_nodes,
+                 embed_model=embed_model,
+                 show_progress=False,
+             )
+
+             logger.info(
+                 "vector_index_built",
+                 nodes_indexed=len(text_nodes),
+                 embed_model=self.embed_model_name,
+             )
+
+         except ImportError as e:
+             logger.warning(
+                 "llama_index_not_available",
+                 error=str(e),
+             )
+             raise IndexingError(
+                 "LlamaIndex not installed. "
+                 "Install with: pip install llama-index llama-index-embeddings-openai"
+             ) from e
+
+     def _get_embedding_model(self) -> Any:
+         """Get embedding model based on provider."""
+         import os
+
+         # Auto-detect provider
+         provider = self.provider
+         if provider is None:
+             if os.getenv("OPENAI_API_KEY"):
+                 provider = "openai"
+             elif os.getenv("GOOGLE_API_KEY"):
+                 provider = "gemini"
+             else:
+                 logger.warning("no_embedding_api_key_found")
+                 raise IndexingError("No API key found for embeddings")
+
+         provider = provider.lower()
+
+         try:
+             if provider == "openai":
+                 from llama_index.embeddings.openai import OpenAIEmbedding
+
+                 logger.info("using_openai_embeddings", model=self.embed_model_name)
+                 return OpenAIEmbedding(model=self.embed_model_name)
+
+             elif provider == "gemini":
+                 from llama_index.embeddings.gemini import GeminiEmbedding
+
+                 logger.info("using_gemini_embeddings")
+                 return GeminiEmbedding(model_name="models/text-embedding-004")
+
+             else:
+                 raise IndexingError(f"Unsupported provider: {provider}")
+
+         except ImportError as e:
+             raise IndexingError(
+                 f"Failed to import {provider} embeddings. "
+                 f"Install with: pip install llama-index-embeddings-{provider}"
+             ) from e
+
+     def search(
+         self,
+         query: str,
+         top_k: int = 5,
+         similarity_threshold: float = 0.0,
+     ) -> list[tuple[SkeletonNode, float]]:
+         """
+         Search for relevant nodes using semantic similarity.
+
+         Args:
+             query: Search query (user question)
+             top_k: Number of results to return
+             similarity_threshold: Minimum similarity score (0-1)
+
+         Returns:
+             List of (SkeletonNode, similarity_score) tuples, sorted by relevance
+         """
+         # Build index if not already built
+         if self._index is None:
+             self._build_index()
+
+         # Index should be built now, but check again for type safety
+         if self._index is None:
+             logger.error("index_build_failed")
+             return []
+
+         # Query the index
+         retriever = self._index.as_retriever(similarity_top_k=top_k)
+         results = retriever.retrieve(query)
+
+         # Convert to skeleton nodes with scores
+         node_scores = []
+         for result in results:
+             node_id = result.node.id_
+             if node_id in self._node_map:
+                 # LlamaIndex similarity scores are already normalized 0-1
+                 score = result.score if result.score is not None else 0.0
+                 if score >= similarity_threshold:
+                     node_scores.append((self._node_map[node_id], score))
+
+         logger.info(
+             "semantic_search_complete",
+             query_len=len(query),
+             results=len(node_scores),
+             top_score=node_scores[0][1] if node_scores else 0,
+         )
+
+         return node_scores
+
+     def rank_all_nodes(
+         self,
+         query: str,
+         filter_leaves_only: bool = False,
+     ) -> list[tuple[SkeletonNode, float]]:
+         """
+         Rank ALL nodes by relevance to query.
+
+         Useful when everything must be explored, but in priority order.
+         Much faster than LLM-based Tree of Thoughts evaluation.
+
+         Args:
+             query: Search query
+             filter_leaves_only: If True, only return leaf nodes
+
+         Returns:
+             All nodes ranked by similarity score
+         """
+         # Get all nodes (use high top_k)
+         all_ranked = self.search(query, top_k=len(self.skeleton_nodes))
+
+         if filter_leaves_only:
+             all_ranked = [
+                 (node, score)
+                 for node, score in all_ranked
+                 if len(node.child_ids) == 0
+             ]
+
+         logger.info(
+             "all_nodes_ranked",
+             total=len(all_ranked),
+             leaves_only=filter_leaves_only,
+         )
+
+         return all_ranked
+
+     def search_and_expand(
+         self,
+         query: str,
+         top_k: int = 10,
+         max_explore: int = 20,
+     ) -> list[str]:
+         """
+         Adaptive search: find the most relevant nodes via semantic search
+         (O(log N)), capping the number of candidates at max_explore.
+
+         This keeps exploration efficient while bounding its cost.
+
+         Args:
+             query: Search query
+             top_k: Number of nodes to explore
+             max_explore: Upper bound on the number of node IDs returned
+
+         Returns:
+             List of node IDs to explore, in priority order
+         """
+         # Get top candidates via semantic search, capped at max_explore
+         top_results = self.search(query, top_k=min(top_k, max_explore))
+         node_ids = [node.node_id for node, score in top_results]
+
+         logger.info(
+             "adaptive_search",
+             initial_nodes=len(node_ids),
+             max_explore=max_explore,
+         )
+
+         return node_ids
+
+
+ def create_semantic_searcher(
+     skeleton_nodes: dict[str, SkeletonNode],
+     kv_store: KVStore,
+     provider: str | None = None,
+ ) -> SemanticSearcher | None:
+     """
+     Create a semantic searcher if embeddings are available.
+
+     Args:
+         skeleton_nodes: Skeleton nodes to search
+         kv_store: KV store for content
+         provider: "openai", "gemini", or None for auto-detect
+
+     Returns:
+         SemanticSearcher instance, or None if embeddings unavailable
+     """
+     try:
+         searcher = SemanticSearcher(
+             skeleton_nodes=skeleton_nodes,
+             kv_store=kv_store,
+             provider=provider,
+         )
+         return searcher
+     except IndexingError as e:
+         logger.warning(
+             "semantic_search_unavailable",
+             error=str(e),
+             fallback="Will use Tree of Thoughts evaluation",
+         )
+         return None
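
A corresponding sketch for the searcher (again illustrative: the query strings are made up, and an `OPENAI_API_KEY` or `GOOGLE_API_KEY` is assumed so a provider can be auto-detected; per the docstring, `create_semantic_searcher` returns None when embeddings are unavailable):

    from rnsr.indexing.semantic_search import create_semantic_searcher

    searcher = create_semantic_searcher(skeleton_nodes, kv_store)
    if searcher is None:
        # No embedding backend available; callers fall back to
        # Tree of Thoughts evaluation instead.
        ...
    else:
        # Scored top-5, dropping weak matches below 0.3 similarity
        results = searcher.search("termination clauses", top_k=5, similarity_threshold=0.3)
        for node, score in results:
            print(f"{score:.2f}  {node.header}")

        # Node IDs to explore, in priority order, capped at max_explore
        node_ids = searcher.search_and_expand("indemnification", top_k=10, max_explore=20)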