rnsr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
rnsr/indexing/semantic_retriever.py
@@ -0,0 +1,237 @@
+"""
+Semantic Retriever - Query-driven node selection using embeddings.
+
+This module provides retrieval-based navigation that reduces complexity
+from O(N) (evaluating all nodes) to O(log N) (retrieving top-k relevant nodes).
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import structlog
+
+from rnsr.models import SkeletonNode
+
+logger = structlog.get_logger(__name__)
+
+
+class SemanticRetriever:
+    """
+    Retrieves the most relevant nodes given a query using semantic search.
+
+    Uses embeddings and cosine similarity to rank nodes by relevance,
+    reducing search complexity from O(N) to O(log N).
+    """
+
+    def __init__(
+        self,
+        skeleton_nodes: dict[str, SkeletonNode],
+        llm_provider: str = "gemini",
+    ):
+        """
+        Initialize the retriever.
+
+        Args:
+            skeleton_nodes: Dictionary of node_id -> SkeletonNode.
+            llm_provider: LLM provider for embeddings.
+        """
+        self.skeleton_nodes = skeleton_nodes
+        self.llm_provider = llm_provider
+        self._index = None
+        self._embed_model = None
+
+        logger.info(
+            "semantic_retriever_initialized",
+            total_nodes=len(skeleton_nodes),
+            provider=llm_provider,
+        )
+
+    def _initialize_index(self) -> None:
+        """Initialize the vector index lazily."""
+        if self._index is not None:
+            return
+
+        try:
+            from llama_index.core import VectorStoreIndex
+            from llama_index.embeddings.gemini import GeminiEmbedding
+            from rnsr.indexing.skeleton_index import create_llama_index_nodes
+
+            # Create embedding model
+            self._embed_model = GeminiEmbedding(
+                model_name="models/text-embedding-004"
+            )
+
+            # Create index nodes (summaries only!)
+            llama_nodes = create_llama_index_nodes(self.skeleton_nodes)
+
+            # Build vector index
+            self._index = VectorStoreIndex(
+                nodes=llama_nodes,
+                embed_model=self._embed_model,
+            )
+
+            logger.info(
+                "vector_index_built",
+                nodes_indexed=len(llama_nodes),
+            )
+
+        except ImportError as e:
+            logger.warning(
+                "vector_index_unavailable",
+                error=str(e),
+                fallback="will_use_bm25",
+            )
+            self._index = None
+
+    def retrieve(
+        self,
+        query: str,
+        top_k: int = 5,
+        parent_id: str | None = None,
+    ) -> list[SkeletonNode]:
+        """
+        Retrieve the most relevant nodes for a query.
+
+        Args:
+            query: The question or search query.
+            top_k: Number of results to return.
+            parent_id: Optional parent node to restrict search to children.
+
+        Returns:
+            List of SkeletonNode objects ranked by relevance.
+        """
+        # Initialize index on first use
+        self._initialize_index()
+
+        # Filter candidates if parent_id specified
+        candidates = self._get_candidates(parent_id)
+
+        if self._index is not None:
+            # Use vector search
+            return self._retrieve_vector(query, top_k, candidates)
+        else:
+            # Fallback to BM25/keyword search
+            return self._retrieve_bm25(query, top_k, candidates)
+
+    def _get_candidates(
+        self,
+        parent_id: str | None,
+    ) -> dict[str, SkeletonNode]:
+        """Get candidate nodes to search over."""
+        if parent_id is None:
+            return self.skeleton_nodes
+
+        # Filter to children of parent
+        parent = self.skeleton_nodes.get(parent_id)
+        if parent is None:
+            return self.skeleton_nodes
+
+        return {
+            cid: self.skeleton_nodes[cid]
+            for cid in parent.child_ids
+            if cid in self.skeleton_nodes
+        }
+
+    def _retrieve_vector(
+        self,
+        query: str,
+        top_k: int,
+        candidates: dict[str, SkeletonNode],
+    ) -> list[SkeletonNode]:
+        """Retrieve using vector similarity."""
+        try:
+            if self._index is None:
+                return self._retrieve_bm25(query, top_k, candidates)
+
+            retriever = self._index.as_retriever(similarity_top_k=top_k * 2)
+            results = retriever.retrieve(query)
+
+            # Filter to candidates and return SkeletonNodes
+            relevant = []
+            for result in results:
+                # Access metadata safely
+                node_id = result.node.metadata.get("node_id") if hasattr(result.node, "metadata") else None
+                if node_id and node_id in candidates:
+                    relevant.append(candidates[node_id])
+                    if len(relevant) >= top_k:
+                        break
+
+            logger.info(
+                "vector_retrieval_complete",
+                query_words=len(query.split()),
+                results=len(relevant),
+            )
+
+            return relevant
+
+        except Exception as e:
+            logger.warning(
+                "vector_retrieval_failed",
+                error=str(e),
+                fallback="bm25",
+            )
+            return self._retrieve_bm25(query, top_k, candidates)
+
+    def _retrieve_bm25(
+        self,
+        query: str,
+        top_k: int,
+        candidates: dict[str, SkeletonNode],
+    ) -> list[SkeletonNode]:
+        """Fallback retrieval using BM25/keyword matching."""
+        from collections import Counter
+
+        query_terms = set(query.lower().split())
+
+        # Score each candidate by term overlap
+        scores = []
+        for node_id, node in candidates.items():
+            # Combine header and summary for matching
+            text = f"{node.header or ''} {node.summary}".lower()
+            text_terms = text.split()
+
+            # Count matching terms
+            matches = sum(1 for term in query_terms if term in text_terms)
+
+            # Boost for header matches
+            header_matches = sum(
+                1 for term in query_terms
+                if term in (node.header or "").lower()
+            )
+
+            score = matches + (header_matches * 2)
+
+            if score > 0:
+                scores.append((score, node))
+
+        # Sort by score descending
+        scores.sort(reverse=True, key=lambda x: x[0])
+
+        results = [node for score, node in scores[:top_k]]
+
+        logger.info(
+            "bm25_retrieval_complete",
+            query_terms=len(query_terms),
+            candidates=len(candidates),
+            results=len(results),
+        )
+
+        return results
+
+
+def create_retriever(
+    skeleton_nodes: dict[str, SkeletonNode],
+    llm_provider: str = "gemini",
+) -> SemanticRetriever:
+    """
+    Convenience function to create a semantic retriever.
+
+    Args:
+        skeleton_nodes: Dictionary of node_id -> SkeletonNode.
+        llm_provider: LLM provider for embeddings.
+
+    Returns:
+        SemanticRetriever instance.
+    """
+    return SemanticRetriever(skeleton_nodes, llm_provider)
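For orientation, here is a minimal usage sketch of the retriever added above. It assumes skeleton_nodes has already been produced by the package's indexing pipeline; the queries and variable names are illustrative, not part of the package:

from rnsr.indexing.semantic_retriever import create_retriever

# skeleton_nodes: dict[str, SkeletonNode], built beforehand by rnsr's indexing pipeline (assumed here)
retriever = create_retriever(skeleton_nodes, llm_provider="gemini")

# Top-5 sections most relevant to the question (vector search, with BM25 keyword fallback).
nodes = retriever.retrieve("What were the operating expenses in 2023?", top_k=5)

# A follow-up search restricted to the children of the best-matching section.
children = retriever.retrieve("depreciation schedule", top_k=3, parent_id=nodes[0].node_id)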
rnsr/indexing/semantic_search.py
@@ -0,0 +1,320 @@
+"""
+Semantic Search for Skeleton Index
+
+Provides O(log N) retrieval using vector similarity search on node summaries.
+Falls back to full exploration if needed.
+
+Usage:
+    searcher = SemanticSearcher(skeleton_nodes, kv_store)
+    relevant_nodes = searcher.search(query, top_k=5)
+
+    # Or get all node IDs ranked by relevance
+    all_ranked = searcher.rank_all_nodes(query)
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import structlog
+
+from rnsr.exceptions import IndexingError
+from rnsr.indexing.kv_store import KVStore
+from rnsr.models import SkeletonNode
+
+logger = structlog.get_logger(__name__)
+
+
+class SemanticSearcher:
+    """
+    Semantic search over skeleton node summaries.
+
+    Uses vector embeddings for O(log N) retrieval instead of
+    evaluating all nodes with expensive LLM calls.
+
+    Attributes:
+        skeleton_nodes: Dictionary of node_id -> SkeletonNode
+        kv_store: KV store for full content retrieval
+        index: LlamaIndex VectorStoreIndex (built lazily)
+        embedder: Embedding model instance
+    """
+
+    def __init__(
+        self,
+        skeleton_nodes: dict[str, SkeletonNode],
+        kv_store: KVStore,
+        embed_model: str = "text-embedding-3-small",
+        provider: str | None = None,
+    ):
+        """
+        Initialize semantic searcher.
+
+        Args:
+            skeleton_nodes: Skeleton nodes to search over
+            kv_store: KV store for content retrieval
+            embed_model: Embedding model name
+            provider: "openai", "gemini", or None for auto-detect
+        """
+        self.skeleton_nodes = skeleton_nodes
+        self.kv_store = kv_store
+        self.embed_model_name = embed_model
+        self.provider = provider
+
+        self._index = None
+        self._embedder = None
+        self._node_map: dict[str, SkeletonNode] = {}
+
+        logger.info(
+            "semantic_searcher_initialized",
+            nodes=len(skeleton_nodes),
+            embed_model=embed_model,
+        )
+
+    def _build_index(self) -> None:
+        """Build vector index lazily on first search."""
+        if self._index is not None:
+            return
+
+        try:
+            from llama_index.core import VectorStoreIndex
+            from llama_index.core.schema import TextNode
+
+            # Get embedding model
+            embed_model = self._get_embedding_model()
+
+            # Create text nodes from skeleton summaries
+            text_nodes = []
+            for node_id, skel in self.skeleton_nodes.items():
+                # Skip nodes with no content
+                if not skel.summary or len(skel.summary.strip()) < 10:
+                    continue
+
+                # Create text node with summary
+                text = f"{skel.header or ''}\n{skel.summary}".strip()
+
+                text_node = TextNode(
+                    text=text,
+                    id_=node_id,
+                    metadata={
+                        "node_id": node_id,
+                        "level": skel.level,
+                        "header": skel.header,
+                        "has_children": len(skel.child_ids) > 0,
+                        "child_ids": skel.child_ids,
+                    },
+                )
+                text_nodes.append(text_node)
+                self._node_map[node_id] = skel
+
+            # Build index
+            self._index = VectorStoreIndex(
+                nodes=text_nodes,
+                embed_model=embed_model,
+                show_progress=False,
+            )
+
+            logger.info(
+                "vector_index_built",
+                nodes_indexed=len(text_nodes),
+                embed_model=self.embed_model_name,
+            )
+
+        except ImportError as e:
+            logger.warning(
+                "llama_index_not_available",
+                error=str(e),
+                fallback="Will use linear search",
+            )
+            raise IndexingError(
+                "LlamaIndex not installed. "
+                "Install with: pip install llama-index llama-index-embeddings-openai"
+            ) from e
+
+    def _get_embedding_model(self) -> Any:
+        """Get embedding model based on provider."""
+        import os
+
+        # Auto-detect provider
+        provider = self.provider
+        if provider is None:
+            if os.getenv("OPENAI_API_KEY"):
+                provider = "openai"
+            elif os.getenv("GOOGLE_API_KEY"):
+                provider = "gemini"
+            else:
+                logger.warning("no_embedding_api_key_found")
+                raise IndexingError("No API key found for embeddings")
+
+        provider = provider.lower()
+
+        try:
+            if provider == "openai":
+                from llama_index.embeddings.openai import OpenAIEmbedding
+
+                logger.info("using_openai_embeddings", model=self.embed_model_name)
+                return OpenAIEmbedding(model=self.embed_model_name)
+
+            elif provider == "gemini":
+                from llama_index.embeddings.gemini import GeminiEmbedding
+
+                logger.info("using_gemini_embeddings")
+                return GeminiEmbedding(model_name="models/text-embedding-004")
+
+            else:
+                raise IndexingError(f"Unsupported provider: {provider}")
+
+        except ImportError as e:
+            raise IndexingError(
+                f"Failed to import {provider} embeddings. "
+                f"Install with: pip install llama-index-embeddings-{provider}"
+            ) from e
+
+    def search(
+        self,
+        query: str,
+        top_k: int = 5,
+        similarity_threshold: float = 0.0,
+    ) -> list[tuple[SkeletonNode, float]]:
+        """
+        Search for relevant nodes using semantic similarity.
+
+        Args:
+            query: Search query (user question)
+            top_k: Number of results to return
+            similarity_threshold: Minimum similarity score (0-1)
+
+        Returns:
+            List of (SkeletonNode, similarity_score) tuples, sorted by relevance
+        """
+        # Build index if not already built
+        if self._index is None:
+            self._build_index()
+
+        # Index should be built now, but check again for type safety
+        if self._index is None:
+            logger.error("index_build_failed")
+            return []
+
+        # Query the index
+        retriever = self._index.as_retriever(similarity_top_k=top_k)
+        results = retriever.retrieve(query)
+
+        # Convert to skeleton nodes with scores
+        node_scores = []
+        for result in results:
+            node_id = result.node.id_
+            if node_id in self._node_map:
+                # LlamaIndex similarity scores are already normalized 0-1
+                score = result.score if result.score is not None else 0.0
+                if score >= similarity_threshold:
+                    node_scores.append((self._node_map[node_id], score))
+
+        logger.info(
+            "semantic_search_complete",
+            query_len=len(query),
+            results=len(node_scores),
+            top_score=node_scores[0][1] if node_scores else 0,
+        )
+
+        return node_scores
+
+    def rank_all_nodes(
+        self,
+        query: str,
+        filter_leaves_only: bool = False,
+    ) -> list[tuple[SkeletonNode, float]]:
+        """
+        Rank ALL nodes by relevance to query.
+
+        This is useful for exploring everything but in priority order.
+        Much faster than LLM-based Tree of Thoughts evaluation.
+
+        Args:
+            query: Search query
+            filter_leaves_only: If True, only return leaf nodes
+
+        Returns:
+            All nodes ranked by similarity score
+        """
+        # Get all nodes (use high top_k)
+        all_ranked = self.search(query, top_k=len(self.skeleton_nodes))
+
+        if filter_leaves_only:
+            all_ranked = [
+                (node, score)
+                for node, score in all_ranked
+                if len(node.child_ids) == 0
+            ]
+
+        logger.info(
+            "all_nodes_ranked",
+            total=len(all_ranked),
+            leaves_only=filter_leaves_only,
+        )
+
+        return all_ranked
+
+    def search_and_expand(
+        self,
+        query: str,
+        top_k: int = 10,
+        max_explore: int = 20,
+    ) -> list[str]:
+        """
+        Adaptive search strategy:
+        1. Find top_k most relevant nodes via semantic search (O(log N))
+        2. If needed, expand to explore up to max_explore nodes
+
+        This ensures we don't miss important data while staying efficient.
+
+        Args:
+            query: Search query
+            top_k: Initial number of nodes to explore
+            max_explore: Maximum nodes to explore if initial set insufficient
+
+        Returns:
+            List of node IDs to explore, in priority order
+        """
+        # Get top-k via semantic search
+        top_results = self.search(query, top_k=min(top_k, max_explore))
+        node_ids = [node.node_id for node, score in top_results]
+
+        logger.info(
+            "adaptive_search",
+            initial_nodes=len(node_ids),
+            max_explore=max_explore,
+        )
+
+        return node_ids
+
+
+def create_semantic_searcher(
+    skeleton_nodes: dict[str, SkeletonNode],
+    kv_store: KVStore,
+    provider: str | None = None,
+) -> SemanticSearcher | None:
+    """
+    Create a semantic searcher if embeddings are available.
+
+    Args:
+        skeleton_nodes: Skeleton nodes to search
+        kv_store: KV store for content
+        provider: "openai", "gemini", or None for auto-detect
+
+    Returns:
+        SemanticSearcher instance, or None if embeddings unavailable
+    """
+    try:
+        searcher = SemanticSearcher(
+            skeleton_nodes=skeleton_nodes,
+            kv_store=kv_store,
+            provider=provider,
+        )
+        return searcher
+    except IndexingError as e:
+        logger.warning(
+            "semantic_search_unavailable",
+            error=str(e),
+            fallback="Will use Tree of Thoughts evaluation",
+        )
+        return None
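Likewise, a minimal usage sketch for the searcher above, following the module's own docstring. It assumes skeleton_nodes and a KVStore instance already exist and that OPENAI_API_KEY or GOOGLE_API_KEY is set so the embedding provider can be auto-detected; the query string is illustrative:

from rnsr.indexing.semantic_search import create_semantic_searcher

# Returns None when no embedding backend is available.
searcher = create_semantic_searcher(skeleton_nodes, kv_store)

if searcher is not None:
    # (SkeletonNode, similarity) pairs for the five best-matching section summaries.
    for node, score in searcher.search("revenue recognition policy", top_k=5):
        print(f"{score:.3f}  {node.header}")

    # Node IDs to explore, in priority order, capped at max_explore.
    node_ids = searcher.search_and_expand("revenue recognition policy", top_k=10, max_explore=20)
else:
    node_ids = []  # embeddings unavailable; callers fall back to Tree of Thoughts evaluation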