claude-code-workflow 6.2.4 → 6.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ccw/dist/core/lite-scanner-complete.d.ts.map +1 -1
- package/ccw/dist/core/lite-scanner-complete.js +4 -1
- package/ccw/dist/core/lite-scanner-complete.js.map +1 -1
- package/ccw/dist/core/lite-scanner.d.ts.map +1 -1
- package/ccw/dist/core/lite-scanner.js +4 -1
- package/ccw/dist/core/lite-scanner.js.map +1 -1
- package/ccw/dist/core/routes/claude-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/claude-routes.js +3 -5
- package/ccw/dist/core/routes/claude-routes.js.map +1 -1
- package/ccw/dist/core/routes/cli-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/cli-routes.js +2 -1
- package/ccw/dist/core/routes/cli-routes.js.map +1 -1
- package/ccw/dist/core/routes/codexlens-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/codexlens-routes.js +31 -6
- package/ccw/dist/core/routes/codexlens-routes.js.map +1 -1
- package/ccw/dist/core/routes/rules-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/rules-routes.js +4 -3
- package/ccw/dist/core/routes/rules-routes.js.map +1 -1
- package/ccw/dist/core/routes/skills-routes.d.ts.map +1 -1
- package/ccw/dist/core/routes/skills-routes.js +124 -6
- package/ccw/dist/core/routes/skills-routes.js.map +1 -1
- package/ccw/dist/tools/cli-executor.d.ts +4 -1
- package/ccw/dist/tools/cli-executor.d.ts.map +1 -1
- package/ccw/dist/tools/cli-executor.js +54 -2
- package/ccw/dist/tools/cli-executor.js.map +1 -1
- package/ccw/dist/tools/codex-lens.d.ts +20 -3
- package/ccw/dist/tools/codex-lens.d.ts.map +1 -1
- package/ccw/dist/tools/codex-lens.js +166 -37
- package/ccw/dist/tools/codex-lens.js.map +1 -1
- package/ccw/package.json +1 -1
- package/ccw/src/core/lite-scanner-complete.ts +5 -1
- package/ccw/src/core/lite-scanner.ts +5 -1
- package/ccw/src/core/routes/claude-routes.ts +3 -5
- package/ccw/src/core/routes/cli-routes.ts +2 -1
- package/ccw/src/core/routes/codexlens-routes.ts +34 -6
- package/ccw/src/core/routes/rules-routes.ts +4 -3
- package/ccw/src/core/routes/skills-routes.ts +144 -6
- package/ccw/src/templates/dashboard-js/components/mcp-manager.js +7 -12
- package/ccw/src/templates/dashboard-js/i18n.js +167 -5
- package/ccw/src/templates/dashboard-js/views/claude-manager.js +18 -4
- package/ccw/src/templates/dashboard-js/views/cli-manager.js +5 -3
- package/ccw/src/templates/dashboard-js/views/codexlens-manager.js +790 -25
- package/ccw/src/templates/dashboard-js/views/rules-manager.js +35 -6
- package/ccw/src/templates/dashboard-js/views/skills-manager.js +385 -21
- package/ccw/src/tools/cli-executor.ts +70 -2
- package/ccw/src/tools/codex-lens.ts +183 -35
- package/codex-lens/pyproject.toml +66 -48
- package/codex-lens/src/codexlens/__pycache__/config.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/cli/__pycache__/embedding_manager.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/cli/__pycache__/model_manager.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/cli/embedding_manager.py +3 -3
- package/codex-lens/src/codexlens/cli/model_manager.py +24 -2
- package/codex-lens/src/codexlens/search/__pycache__/hybrid_search.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/search/hybrid_search.py +313 -313
- package/codex-lens/src/codexlens/semantic/__init__.py +76 -39
- package/codex-lens/src/codexlens/semantic/__pycache__/__init__.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/embedder.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/gpu_support.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/__pycache__/ollama_backend.cpython-313.pyc +0 -0
- package/codex-lens/src/codexlens/semantic/embedder.py +244 -185
- package/codex-lens/src/codexlens/semantic/gpu_support.py +192 -0
- package/package.json +1 -1
@@ -1,313 +1,313 @@

Every line of `package/codex-lens/src/codexlens/search/hybrid_search.py` (313 lines) is removed and re-added with identical text in this hunk, consistent with a whitespace- or line-ending-only rewrite. The file content is shown once below rather than twice:

```python
"""Hybrid search engine orchestrating parallel exact/fuzzy/vector searches with RRF fusion.

Coordinates multiple search backends in parallel using ThreadPoolExecutor and combines
results via Reciprocal Rank Fusion (RRF) algorithm.
"""

from __future__ import annotations

import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Dict, List, Optional

from codexlens.entities import SearchResult
from codexlens.search.ranking import reciprocal_rank_fusion, tag_search_source
from codexlens.storage.dir_index import DirIndexStore


class HybridSearchEngine:
    """Hybrid search engine with parallel execution and RRF fusion.

    Orchestrates searches across exact FTS, fuzzy FTS, and optional vector backends,
    executing them in parallel and fusing results via Reciprocal Rank Fusion.

    Attributes:
        logger: Python logger instance
        default_weights: Default RRF weights for each source
    """

    # Default RRF weights (exact: 40%, fuzzy: 30%, vector: 30%)
    DEFAULT_WEIGHTS = {
        "exact": 0.4,
        "fuzzy": 0.3,
        "vector": 0.3,
    }

    def __init__(self, weights: Optional[Dict[str, float]] = None):
        """Initialize hybrid search engine.

        Args:
            weights: Optional custom RRF weights (default: DEFAULT_WEIGHTS)
        """
        self.logger = logging.getLogger(__name__)
        self.weights = weights or self.DEFAULT_WEIGHTS.copy()

    def search(
        self,
        index_path: Path,
        query: str,
        limit: int = 20,
        enable_fuzzy: bool = True,
        enable_vector: bool = False,
        pure_vector: bool = False,
    ) -> List[SearchResult]:
        """Execute hybrid search with parallel retrieval and RRF fusion.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string (for FTS) or natural language query (for vector)
            limit: Maximum results to return after fusion
            enable_fuzzy: Enable fuzzy FTS search (default True)
            enable_vector: Enable vector search (default False)
            pure_vector: If True, only use vector search without FTS fallback (default False)

        Returns:
            List of SearchResult objects sorted by fusion score

        Examples:
            >>> engine = HybridSearchEngine()
            >>> # Hybrid search (exact + fuzzy + vector)
            >>> results = engine.search(Path("project/_index.db"), "authentication",
            ...                         enable_vector=True)
            >>> # Pure vector search (semantic only)
            >>> results = engine.search(Path("project/_index.db"),
            ...                         "how to authenticate users",
            ...                         enable_vector=True, pure_vector=True)
            >>> for r in results[:5]:
            ...     print(f"{r.path}: {r.score:.3f}")
        """
        # Determine which backends to use
        backends = {}

        if pure_vector:
            # Pure vector mode: only use vector search, no FTS fallback
            if enable_vector:
                backends["vector"] = True
            else:
                # Invalid configuration: pure_vector=True but enable_vector=False
                self.logger.warning(
                    "pure_vector=True requires enable_vector=True. "
                    "Falling back to exact search. "
                    "To use pure vector search, enable vector search mode."
                )
                backends["exact"] = True
        else:
            # Hybrid mode: always include exact search as baseline
            backends["exact"] = True
            if enable_fuzzy:
                backends["fuzzy"] = True
            if enable_vector:
                backends["vector"] = True

        # Execute parallel searches
        results_map = self._search_parallel(index_path, query, backends, limit)

        # Provide helpful message if pure-vector mode returns no results
        if pure_vector and enable_vector and len(results_map.get("vector", [])) == 0:
            self.logger.warning(
                "Pure vector search returned no results. "
                "This usually means embeddings haven't been generated. "
                "Run: codexlens embeddings-generate %s",
                index_path.parent if index_path.name == "_index.db" else index_path
            )

        # Apply RRF fusion
        # Filter weights to only active backends
        active_weights = {
            source: weight
            for source, weight in self.weights.items()
            if source in results_map
        }

        fused_results = reciprocal_rank_fusion(results_map, active_weights)

        # Apply final limit
        return fused_results[:limit]

    def _search_parallel(
        self,
        index_path: Path,
        query: str,
        backends: Dict[str, bool],
        limit: int,
    ) -> Dict[str, List[SearchResult]]:
        """Execute parallel searches across enabled backends.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string
            backends: Dictionary of backend name to enabled flag
            limit: Results limit per backend

        Returns:
            Dictionary mapping source name to results list
        """
        results_map: Dict[str, List[SearchResult]] = {}

        # Use ThreadPoolExecutor for parallel I/O-bound searches
        with ThreadPoolExecutor(max_workers=len(backends)) as executor:
            # Submit search tasks
            future_to_source = {}

            if backends.get("exact"):
                future = executor.submit(
                    self._search_exact, index_path, query, limit
                )
                future_to_source[future] = "exact"

            if backends.get("fuzzy"):
                future = executor.submit(
                    self._search_fuzzy, index_path, query, limit
                )
                future_to_source[future] = "fuzzy"

            if backends.get("vector"):
                future = executor.submit(
                    self._search_vector, index_path, query, limit
                )
                future_to_source[future] = "vector"

            # Collect results as they complete
            for future in as_completed(future_to_source):
                source = future_to_source[future]
                try:
                    results = future.result()
                    # Tag results with source for debugging
                    tagged_results = tag_search_source(results, source)
                    results_map[source] = tagged_results
                    self.logger.debug(
                        "Got %d results from %s search", len(results), source
                    )
                except Exception as exc:
                    self.logger.error("Search failed for %s: %s", source, exc)
                    results_map[source] = []

        return results_map

    def _search_exact(
        self, index_path: Path, query: str, limit: int
    ) -> List[SearchResult]:
        """Execute exact FTS search using unicode61 tokenizer.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string
            limit: Maximum results

        Returns:
            List of SearchResult objects
        """
        try:
            with DirIndexStore(index_path) as store:
                return store.search_fts_exact(query, limit=limit)
        except Exception as exc:
            self.logger.debug("Exact search error: %s", exc)
            return []

    def _search_fuzzy(
        self, index_path: Path, query: str, limit: int
    ) -> List[SearchResult]:
        """Execute fuzzy FTS search using trigram/extended unicode61 tokenizer.

        Args:
            index_path: Path to _index.db file
            query: FTS5 query string
            limit: Maximum results

        Returns:
            List of SearchResult objects
        """
        try:
            with DirIndexStore(index_path) as store:
                return store.search_fts_fuzzy(query, limit=limit)
        except Exception as exc:
            self.logger.debug("Fuzzy search error: %s", exc)
            return []

    def _search_vector(
        self, index_path: Path, query: str, limit: int
    ) -> List[SearchResult]:
        """Execute vector similarity search using semantic embeddings.

        Args:
            index_path: Path to _index.db file
            query: Natural language query string
            limit: Maximum results

        Returns:
            List of SearchResult objects ordered by semantic similarity
        """
        try:
            # Check if semantic chunks table exists
            import sqlite3
            try:
                with sqlite3.connect(index_path) as conn:
                    cursor = conn.execute(
                        "SELECT name FROM sqlite_master WHERE type='table' AND name='semantic_chunks'"
                    )
                    has_semantic_table = cursor.fetchone() is not None
            except sqlite3.Error as e:
                self.logger.error("Database check failed in vector search: %s", e)
                return []

            if not has_semantic_table:
                self.logger.info(
                    "No embeddings found in index. "
                    "Generate embeddings with: codexlens embeddings-generate %s",
                    index_path.parent if index_path.name == "_index.db" else index_path
                )
                return []

            # Initialize embedder and vector store
            from codexlens.semantic.embedder import get_embedder
            from codexlens.semantic.vector_store import VectorStore

            vector_store = VectorStore(index_path)

            # Check if vector store has data
            if vector_store.count_chunks() == 0:
                self.logger.info(
                    "Vector store is empty (0 chunks). "
                    "Generate embeddings with: codexlens embeddings-generate %s",
                    index_path.parent if index_path.name == "_index.db" else index_path
                )
                return []

            # Auto-detect embedding dimension and select appropriate profile
            detected_dim = vector_store.dimension
            if detected_dim is None:
                self.logger.info("Vector store dimension unknown, using default profile")
                profile = "code"  # Default fallback
            elif detected_dim == 384:
                profile = "fast"
            elif detected_dim == 768:
                profile = "code"
            elif detected_dim == 1024:
                profile = "multilingual"  # or balanced, both are 1024
            else:
                profile = "code"  # Default fallback

            # Use cached embedder (singleton) for performance
            embedder = get_embedder(profile=profile)

            # Generate query embedding
            query_embedding = embedder.embed_single(query)

            # Search for similar chunks
            results = vector_store.search_similar(
                query_embedding=query_embedding,
                top_k=limit,
                min_score=0.0,  # Return all results, let RRF handle filtering
                return_full_content=True,
            )

            self.logger.debug("Vector search found %d results", len(results))
            return results

        except ImportError as exc:
            self.logger.debug("Semantic dependencies not available: %s", exc)
            return []
        except Exception as exc:
            self.logger.error("Vector search error: %s", exc)
            return []
```
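The `reciprocal_rank_fusion` helper imported from `codexlens.search.ranking` is not part of this diff. For reference only, here is a minimal sketch of weighted Reciprocal Rank Fusion in its standard form, where each source contributes `weight / (k + rank)` per result and `k` is commonly 60; the keying of results by path, the constant, and the function name are assumptions for illustration, not the package's actual implementation:

```python
# Minimal sketch of weighted Reciprocal Rank Fusion (RRF), assuming results
# can be keyed by a stable identifier (here: a path string). This mirrors the
# standard formula score(d) = sum over sources s of w_s / (k + rank_s(d)).
from typing import Dict, List

K = 60  # conventional RRF smoothing constant (assumed, not from the diff)


def rrf_fuse(results_map: Dict[str, List[str]],
             weights: Dict[str, float],
             k: int = K) -> List[str]:
    """Fuse per-source ranked lists of document keys into one ranked list."""
    scores: Dict[str, float] = {}
    for source, ranked_keys in results_map.items():
        w = weights.get(source, 0.0)
        for rank, key in enumerate(ranked_keys, start=1):
            # Each appearance adds a rank-discounted, source-weighted score.
            scores[key] = scores.get(key, 0.0) + w / (k + rank)
    return sorted(scores, key=scores.get, reverse=True)


# Example: "auth.py" appears in both lists, so its contributions accumulate
# and it outranks documents that only one backend returned.
fused = rrf_fuse(
    {"exact": ["auth.py", "login.py"], "vector": ["session.py", "auth.py"]},
    {"exact": 0.4, "vector": 0.3},
)
print(fused)  # ['auth.py', 'login.py', 'session.py']
```

The smoothing constant `k` damps the gap between adjacent ranks, which is why RRF fuses FTS and vector backends robustly even though their raw scores live on incomparable scales.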