signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +130 -4
- signalwire_agents/agent_server.py +438 -32
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +18 -0
- signalwire_agents/cli/build_search.py +1367 -0
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +1225 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +809 -0
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +959 -2166
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +707 -0
- signalwire_agents/core/data_map.py +487 -0
- signalwire_agents/core/function_result.py +1150 -1
- signalwire_agents/core/logging_config.py +376 -0
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +287 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +368 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1134 -0
- signalwire_agents/core/security/session_manager.py +174 -86
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +200 -0
- signalwire_agents/core/skill_manager.py +244 -0
- signalwire_agents/core/swaig_function.py +33 -9
- signalwire_agents/core/swml_builder.py +212 -12
- signalwire_agents/core/swml_handler.py +43 -13
- signalwire_agents/core/swml_renderer.py +123 -297
- signalwire_agents/core/swml_service.py +277 -260
- signalwire_agents/prefabs/concierge.py +6 -2
- signalwire_agents/prefabs/info_gatherer.py +149 -33
- signalwire_agents/prefabs/receptionist.py +14 -22
- signalwire_agents/prefabs/survey.py +6 -2
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +137 -0
- signalwire_agents/search/document_processor.py +1223 -0
- signalwire_agents/search/index_builder.py +804 -0
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +752 -0
- signalwire_agents/search/query_processor.py +502 -0
- signalwire_agents/search/search_engine.py +1264 -0
- signalwire_agents/search/search_service.py +574 -0
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +23 -0
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/__init__.py +12 -0
- signalwire_agents/skills/datasphere/skill.py +310 -0
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +10 -0
- signalwire_agents/skills/datetime/skill.py +126 -0
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +10 -0
- signalwire_agents/skills/joke/skill.py +109 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +10 -0
- signalwire_agents/skills/math/skill.py +105 -0
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +10 -0
- signalwire_agents/skills/native_vector_search/skill.py +820 -0
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +459 -0
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +10 -0
- signalwire_agents/skills/web_search/skill.py +739 -0
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/wikipedia_search/skill.py +210 -0
- signalwire_agents/utils/__init__.py +14 -0
- signalwire_agents/utils/schema_utils.py +111 -44
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
- signalwire_agents-1.0.7.dist-info/METADATA +992 -0
- signalwire_agents-1.0.7.dist-info/RECORD +142 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
- signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents-0.1.6.data/data/schema.json +0 -5611
- signalwire_agents-0.1.6.dist-info/METADATA +0 -199
- signalwire_agents-0.1.6.dist-info/RECORD +0 -34
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0

@@ -0,0 +1,1264 @@
"""
Copyright (c) 2025 SignalWire

This file is part of the SignalWire AI Agents SDK.

Licensed under the MIT License.
See LICENSE file in the project root for full license information.
"""

import sqlite3
import json
import logging
from typing import List, Dict, Any, Optional, Union

try:
    import numpy as np
    from sklearn.metrics.pairwise import cosine_similarity
    NDArray = np.ndarray
except ImportError:
    np = None
    cosine_similarity = None
    NDArray = Any  # Fallback type for when numpy is not available

logger = logging.getLogger(__name__)

class SearchEngine:
    """Hybrid search engine for vector and keyword search"""

    def __init__(self, backend: str = 'sqlite', index_path: Optional[str] = None,
                 connection_string: Optional[str] = None, collection_name: Optional[str] = None,
                 model=None):
        """
        Initialize search engine

        Args:
            backend: Storage backend ('sqlite' or 'pgvector')
            index_path: Path to .swsearch file (for sqlite backend)
            connection_string: PostgreSQL connection string (for pgvector backend)
            collection_name: Collection name (for pgvector backend)
            model: Optional sentence transformer model
        """
        self.backend = backend
        self.model = model

        if backend == 'sqlite':
            if not index_path:
                raise ValueError("index_path is required for sqlite backend")
            self.index_path = index_path
            self.config = self._load_config()
            self.embedding_dim = int(self.config.get('embedding_dimensions', 768))
            self._backend = None  # SQLite uses direct connection
        elif backend == 'pgvector':
            if not connection_string or not collection_name:
                raise ValueError("connection_string and collection_name are required for pgvector backend")
            from .pgvector_backend import PgVectorSearchBackend
            self._backend = PgVectorSearchBackend(connection_string, collection_name)
            self.config = self._backend.config
            self.embedding_dim = int(self.config.get('embedding_dimensions', 768))
        else:
            raise ValueError(f"Invalid backend '{backend}'. Must be 'sqlite' or 'pgvector'")

    def _load_config(self) -> Dict[str, str]:
        """Load index configuration"""
        try:
            conn = sqlite3.connect(self.index_path)
            cursor = conn.cursor()
            cursor.execute("SELECT key, value FROM config")
            config = dict(cursor.fetchall())
            conn.close()
            return config
        except Exception as e:
            logger.error(f"Error loading config from {self.index_path}: {e}")
            return {}

    def search(self, query_vector: List[float], enhanced_text: str,
               count: int = 3, similarity_threshold: float = 0.0,
               tags: Optional[List[str]] = None,
               keyword_weight: Optional[float] = None,
               original_query: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Perform improved search with fast filtering and vector re-ranking

        Strategy:
        1. Fast candidate collection (filename, metadata, keywords)
        2. Vector re-ranking on candidates only
        3. Fallback to full vector search if few candidates

        Args:
            query_vector: Embedding vector for the query
            enhanced_text: Processed query text for keyword search
            count: Number of results to return
            similarity_threshold: Minimum similarity score
            tags: Filter by tags
            keyword_weight: Optional manual weight for keyword vs vector
            original_query: Original query for exact matching

        Returns:
            List of search results with scores and metadata
        """

        # Use pgvector backend if available
        if self.backend == 'pgvector':
            return self._backend.search(query_vector, enhanced_text, count, similarity_threshold, tags, keyword_weight)

        # Check for numpy/sklearn availability
        if not np or not cosine_similarity:
            logger.warning("NumPy or scikit-learn not available. Using keyword search only.")
            return self._keyword_search_only(enhanced_text, count, tags, original_query)

        # Convert query vector to numpy array
        try:
            query_array = np.array(query_vector).reshape(1, -1)
        except Exception as e:
            logger.error(f"Error converting query vector: {e}")
            return self._keyword_search_only(enhanced_text, count, tags, original_query)

        # HYBRID APPROACH: Search vector AND metadata in parallel
        # Stage 1: Run both search types simultaneously
        search_multiplier = 3

        # Vector search (semantic similarity - primary ranking signal)
        vector_results = self._vector_search(query_array, count * search_multiplier)

        # Metadata/keyword searches (confirmation signals and backfill)
        filename_results = self._filename_search(original_query or enhanced_text, count * search_multiplier)
        metadata_results = self._metadata_search(original_query or enhanced_text, count * search_multiplier)
        keyword_results = self._keyword_search(enhanced_text, count * search_multiplier, original_query)

        logger.debug(f"Parallel search: vector={len(vector_results)}, filename={len(filename_results)}, "
                     f"metadata={len(metadata_results)}, keyword={len(keyword_results)}")

        # Stage 2: Merge all results into candidate pool
        candidates = {}

        # Add vector results first (primary signal)
        for result in vector_results:
            chunk_id = result['id']
            candidates[chunk_id] = result
            candidates[chunk_id]['vector_score'] = result['score']
            candidates[chunk_id]['vector_distance'] = 1 - result['score']
            candidates[chunk_id]['sources'] = {'vector': True}
            candidates[chunk_id]['source_scores'] = {'vector': result['score']}

        # Add metadata/keyword results (secondary signals that boost or backfill)
        for result_set, source_type, source_weight in [(filename_results, 'filename', 2.0),
                                                        (metadata_results, 'metadata', 1.5),
                                                        (keyword_results, 'keyword', 1.0)]:
            for result in result_set:
                chunk_id = result['id']
                if chunk_id not in candidates:
                    # New candidate from metadata/keyword (no vector match)
                    candidates[chunk_id] = result
                    candidates[chunk_id]['sources'] = {source_type: True}
                    candidates[chunk_id]['source_scores'] = {source_type: result['score'] * source_weight}
                else:
                    # Exists in vector results - add metadata/keyword as confirmation signal
                    candidates[chunk_id]['sources'][source_type] = True
                    candidates[chunk_id]['source_scores'][source_type] = result['score'] * source_weight

        # Stage 3: Score and rank all candidates
        final_results = []
        for chunk_id, candidate in candidates.items():
            # Calculate final score combining all signals
            score = self._calculate_combined_score(candidate, similarity_threshold)
            candidate['final_score'] = score
            final_results.append(candidate)

        # Sort by final score
        final_results.sort(key=lambda x: x['final_score'], reverse=True)

        # Filter by tags if specified
        if tags:
            final_results = [r for r in final_results
                             if any(tag in r['metadata'].get('tags', []) for tag in tags)]

        # Apply distance threshold as final filter (soft threshold already applied in scoring)
        if similarity_threshold > 0:
            final_results = [r for r in final_results
                             if r.get('vector_distance', 0) <= similarity_threshold * 1.5
                             or 'vector' not in r.get('sources', {})]

        # Boost exact matches if we have the original query
        if original_query:
            final_results = self._boost_exact_matches(final_results, original_query)
            # Re-sort after boosting
            final_results.sort(key=lambda x: x['final_score'], reverse=True)

        # Apply diversity penalties to prevent single-file dominance
        final_results = self._apply_diversity_penalties(final_results, count)

        # Ensure 'score' field exists for CLI compatibility
        for r in final_results:
            if 'score' not in r:
                r['score'] = r.get('final_score', 0.0)

        return final_results[:count]

    def _keyword_search_only(self, enhanced_text: str, count: int,
                             tags: Optional[List[str]] = None, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
        """Fallback to keyword search only when vector search is unavailable"""
        keyword_results = self._keyword_search(enhanced_text, count, original_query)

        if tags:
            keyword_results = self._filter_by_tags(keyword_results, tags)

        return keyword_results[:count]

    def _vector_search(self, query_vector: Union[NDArray, Any], count: int) -> List[Dict[str, Any]]:
        """Perform vector similarity search"""
        if not np or not cosine_similarity:
            return []

        try:
            conn = sqlite3.connect(self.index_path)
            cursor = conn.cursor()

            # Get all embeddings (for small datasets, this is fine)
            # For large datasets, we'd use FAISS or similar
            cursor.execute('''
                SELECT id, content, embedding, filename, section, tags, metadata
                FROM chunks
                WHERE embedding IS NOT NULL AND embedding != ''
            ''')

            results = []
            for row in cursor.fetchall():
                chunk_id, content, embedding_blob, filename, section, tags_json, metadata_json = row

                if not embedding_blob:
                    continue

                try:
                    # Convert embedding back to numpy array
                    embedding = np.frombuffer(embedding_blob, dtype=np.float32).reshape(1, -1)

                    # Calculate similarity
                    similarity = cosine_similarity(query_vector, embedding)[0][0]

                    results.append({
                        'id': chunk_id,
                        'content': content,
                        'score': float(similarity),
                        'metadata': {
                            'filename': filename,
                            'section': section,
                            'tags': json.loads(tags_json) if tags_json else [],
                            'metadata': json.loads(metadata_json) if metadata_json else {}
                        },
                        'search_type': 'vector'
                    })
                except Exception as e:
                    logger.warning(f"Error processing embedding for chunk {chunk_id}: {e}")
                    continue

            conn.close()

            # Sort by similarity score
            results.sort(key=lambda x: x['score'], reverse=True)
            return results[:count]

        except Exception as e:
            logger.error(f"Error in vector search: {e}")
            return []

    def _keyword_search(self, enhanced_text: str, count: int, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
        """Perform full-text search"""
        try:
            conn = sqlite3.connect(self.index_path)
            cursor = conn.cursor()

            # Escape FTS5 special characters
            escaped_text = self._escape_fts_query(enhanced_text)

            # FTS5 search
            cursor.execute('''
                SELECT c.id, c.content, c.filename, c.section, c.tags, c.metadata,
                       chunks_fts.rank
                FROM chunks_fts
                JOIN chunks c ON chunks_fts.rowid = c.id
                WHERE chunks_fts MATCH ?
                ORDER BY chunks_fts.rank
                LIMIT ?
            ''', (escaped_text, count))

            results = []
            for row in cursor.fetchall():
                chunk_id, content, filename, section, tags_json, metadata_json, rank = row

                # Convert FTS rank to similarity score (higher rank = lower score)
                # FTS5 rank is negative, so we convert it to a positive similarity score
                score = 1.0 / (1.0 + abs(rank))

                results.append({
                    'id': chunk_id,
                    'content': content,
                    'score': float(score),
                    'metadata': {
                        'filename': filename,
                        'section': section,
                        'tags': json.loads(tags_json) if tags_json else [],
                        'metadata': json.loads(metadata_json) if metadata_json else {}
                    },
                    'search_type': 'keyword'
                })

            conn.close()

            # If FTS returns no results, try fallback LIKE search
            if not results:
                logger.debug(f"FTS returned no results for '{enhanced_text}', trying fallback search")
                return self._fallback_search(enhanced_text, count)

            return results

        except Exception as e:
            logger.error(f"Error in keyword search: {e}")
            # Fallback to simple LIKE search
            return self._fallback_search(enhanced_text, count)

    def _escape_fts_query(self, query: str) -> str:
        """Escape special characters for FTS5 queries"""
        # FTS5 special characters that need escaping
        special_chars = ['"', "'", '(', ')', '*', '-', '+', ':', '^']

        escaped = query
        for char in special_chars:
            escaped = escaped.replace(char, f'\\{char}')

        return escaped

    def _fallback_search(self, enhanced_text: str, count: int) -> List[Dict[str, Any]]:
        """Fallback search using LIKE when FTS fails"""
        try:
            conn = sqlite3.connect(self.index_path)
            cursor = conn.cursor()

            # Simple LIKE search with word boundaries
            search_terms = enhanced_text.lower().split()
            like_conditions = []
            params = []

            for term in search_terms[:5]:  # Limit to 5 terms to avoid too complex queries
                # Search for term with word boundaries (space or punctuation)
                like_conditions.append("""
                    (LOWER(processed_content) LIKE ?
                     OR LOWER(processed_content) LIKE ?
                     OR LOWER(processed_content) LIKE ?
                     OR LOWER(processed_content) LIKE ?)
                """)
                params.extend([
                    f"% {term} %",  # space on both sides
                    f"{term} %",    # at beginning
                    f"% {term}",    # at end
                    f"{term}"       # exact match
                ])

            if not like_conditions:
                return []

            # Also search in original content
            content_conditions = []
            for term in search_terms[:5]:
                content_conditions.append("""
                    (LOWER(content) LIKE ?
                     OR LOWER(content) LIKE ?
                     OR LOWER(content) LIKE ?
                     OR LOWER(content) LIKE ?)
                """)
                params.extend([
                    f"% {term} %",  # with spaces
                    f"{term} %",    # at beginning
                    f"% {term}",    # at end
                    f"{term}"       # exact match
                ])

            query = f'''
                SELECT id, content, filename, section, tags, metadata
                FROM chunks
                WHERE ({" OR ".join(like_conditions)})
                   OR ({" OR ".join(content_conditions)})
                LIMIT ?
            '''
            params.append(count)

            cursor.execute(query, params)

            results = []
            for row in cursor.fetchall():
                chunk_id, content, filename, section, tags_json, metadata_json = row

                # Simple scoring based on term matches with word boundaries
                content_lower = content.lower()
                # Check for whole word matches
                word_matches = 0
                for term in search_terms:
                    term_lower = term.lower()
                    # Check word boundaries
                    if (f" {term_lower} " in f" {content_lower} " or
                            content_lower.startswith(f"{term_lower} ") or
                            content_lower.endswith(f" {term_lower}") or
                            content_lower == term_lower):
                        word_matches += 1
                score = word_matches / len(search_terms) if search_terms else 0.0

                results.append({
                    'id': chunk_id,
                    'content': content,
                    'score': float(score),
                    'metadata': {
                        'filename': filename,
                        'section': section,
                        'tags': json.loads(tags_json) if tags_json else [],
                        'metadata': json.loads(metadata_json) if metadata_json else {}
                    },
                    'search_type': 'fallback'
                })

            conn.close()

            # Sort by score
            results.sort(key=lambda x: x['score'], reverse=True)

            return results

        except Exception as e:
            logger.error(f"Error in fallback search: {e}")
            return []

    def _merge_results(self, vector_results: List[Dict], keyword_results: List[Dict],
                       vector_weight: Optional[float] = None,
                       keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
        """Merge and rank vector and keyword search results"""
        # Use provided weights or defaults
        if vector_weight is None:
            vector_weight = 0.7
        if keyword_weight is None:
            keyword_weight = 0.3

        # Create a combined list with weighted scores
        combined = {}

        # Add vector results with weight
        for result in vector_results:
            chunk_id = result['id']
            combined[chunk_id] = result.copy()
            combined[chunk_id]['vector_score'] = result['score']
            combined[chunk_id]['keyword_score'] = 0.0

        # Add keyword results with weight
        for result in keyword_results:
            chunk_id = result['id']
            if chunk_id in combined:
                combined[chunk_id]['keyword_score'] = result['score']
            else:
                combined[chunk_id] = result.copy()
                combined[chunk_id]['vector_score'] = 0.0
                combined[chunk_id]['keyword_score'] = result['score']

        # Calculate combined score (weighted average)
        for chunk_id, result in combined.items():
            vector_score = result.get('vector_score', 0.0)
            keyword_score = result.get('keyword_score', 0.0)
            result['score'] = (vector_score * vector_weight + keyword_score * keyword_weight)

            # Add debug info
            result['metadata']['search_scores'] = {
                'vector': vector_score,
                'keyword': keyword_score,
                'combined': result['score']
            }

        # Sort by combined score
        sorted_results = sorted(combined.values(), key=lambda x: x['score'], reverse=True)
        return sorted_results

    def _filter_by_tags(self, results: List[Dict], required_tags: List[str]) -> List[Dict[str, Any]]:
        """Filter results by required tags"""
        filtered = []
        for result in results:
            result_tags = result['metadata'].get('tags', [])
            if any(tag in result_tags for tag in required_tags):
                filtered.append(result)
        return filtered

    def _boost_exact_matches(self, results: List[Dict[str, Any]], original_query: str) -> List[Dict[str, Any]]:
        """Boost scores for results that contain exact matches of the original query"""
        if not original_query:
            return results

        # Extract key phrases to look for
        query_lower = original_query.lower()

        for result in results:
            content_lower = result['content'].lower()
            filename_lower = result['metadata'].get('filename', '').lower()

            # Boost for exact phrase match in content
            if query_lower in content_lower:
                result['score'] *= 2.0  # Double score for exact match

            # Boost for matches in filenames that suggest relevance
            if any(term in filename_lower for term in ['example', 'sample', 'demo', 'tutorial', 'guide']):
                if 'example' in query_lower or 'sample' in query_lower or 'code' in query_lower:
                    result['score'] *= 1.5

            # Boost for "getting started" type queries
            if 'getting started' in query_lower and 'start' in content_lower:
                result['score'] *= 1.5

        return results

    def _filename_search(self, query: str, count: int) -> List[Dict[str, Any]]:
        """Search for query in filenames with term coverage scoring"""
        try:
            conn = sqlite3.connect(self.index_path)
            cursor = conn.cursor()

            query_lower = query.lower()
            terms = query_lower.split()

            # First try exact phrase match
            cursor.execute('''
                SELECT DISTINCT id, content, filename, section, tags, metadata
                FROM chunks
                WHERE LOWER(filename) LIKE ?
                LIMIT ?
            ''', (f'%{query_lower}%', count))

            results = []
            seen_ids = set()

            # Process exact matches
            for row in cursor.fetchall():
                chunk_id, content, filename, section, tags_json, metadata_json = row
                seen_ids.add(chunk_id)

                # High score for exact phrase match
                filename_lower = filename.lower()
                basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower
                if query_lower in basename:
                    score = 3.0  # Exact match in basename (increased weight)
                else:
                    score = 2.0  # Exact match in path

                results.append({
                    'id': chunk_id,
                    'content': content,
                    'score': float(score),
                    'metadata': {
                        'filename': filename,
                        'section': section,
                        'tags': json.loads(tags_json) if tags_json else [],
                        'metadata': json.loads(metadata_json) if metadata_json else {}
                    },
                    'search_type': 'filename',
                    'match_coverage': 1.0  # Exact match = 100% coverage
                })

            # Then search for files containing ANY of the terms
            if terms and len(results) < count * 3:  # Get more candidates
                # Build OR query for any term match
                conditions = []
                params = []
                for term in terms:
                    conditions.append("LOWER(filename) LIKE ?")
                    params.append(f'%{term}%')

                sql = f'''
                    SELECT DISTINCT id, content, filename, section, tags, metadata
                    FROM chunks
                    WHERE ({' OR '.join(conditions)})
                    AND id NOT IN ({','.join(['?' for _ in seen_ids]) if seen_ids else '0'})
                    LIMIT ?
                '''
                if seen_ids:
                    params.extend(seen_ids)
                params.append(count * 3)

                cursor.execute(sql, params)

                for row in cursor.fetchall():
                    chunk_id, content, filename, section, tags_json, metadata_json = row

                    # Enhanced scoring based on term coverage
                    filename_lower = filename.lower()
                    basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower

                    # Count matches in basename vs full path
                    basename_matches = sum(1 for term in terms if term in basename)
                    path_matches = sum(1 for term in terms if term in filename_lower)

                    # Calculate term coverage (what % of query terms are matched)
                    term_coverage = path_matches / len(terms) if terms else 0
                    basename_coverage = basename_matches / len(terms) if terms else 0

                    # Check for substring bonus (e.g., "code_examples" contains both terms together)
                    substring_bonus = 0
                    if len(terms) > 1:
                        # Check if terms appear consecutively
                        for i in range(len(terms) - 1):
                            if f"{terms[i]}_{terms[i+1]}" in filename_lower or f"{terms[i]}{terms[i+1]}" in filename_lower:
                                substring_bonus = 0.3
                                break

                    # Score based on coverage with exponential boost for more matches
                    if basename_coverage > 0:
                        # Exponential scoring for basename matches
                        score = basename_coverage ** 1.5 + substring_bonus
                    else:
                        # Lower score for path-only matches
                        score = (term_coverage * 0.5) ** 1.5 + substring_bonus

                    results.append({
                        'id': chunk_id,
                        'content': content,
                        'score': float(score),
                        'metadata': {
                            'filename': filename,
                            'section': section,
                            'tags': json.loads(tags_json) if tags_json else [],
                            'metadata': json.loads(metadata_json) if metadata_json else {}
                        },
                        'search_type': 'filename',
                        'match_coverage': term_coverage
                    })

            conn.close()

            # Sort by score and return top results
            results.sort(key=lambda x: x['score'], reverse=True)
            return results[:count]

        except Exception as e:
            logger.error(f"Error in filename search: {e}")
            return []

    def _metadata_search(self, query: str, count: int) -> List[Dict[str, Any]]:
        """Search in all metadata fields (tags, sections, category, product, source)"""
        try:
            conn = sqlite3.connect(self.index_path)
            cursor = conn.cursor()

            query_lower = query.lower()
            terms = query_lower.split()
            results = []
            seen_ids = set()

            # First, try to use the metadata_text column if it exists
            try:
                # Check if metadata_text column exists
                cursor.execute("PRAGMA table_info(chunks)")
                columns = [col[1] for col in cursor.fetchall()]
                has_metadata_text = 'metadata_text' in columns
            except:
                has_metadata_text = False

            if has_metadata_text:
                # Use the new metadata_text column for efficient searching
                # Build conditions for each term
                conditions = []
                for term in terms:
                    conditions.append(f"metadata_text LIKE '%{term}%'")

                if conditions:
                    query_sql = f'''
                        SELECT id, content, filename, section, tags, metadata
                        FROM chunks
                        WHERE {' AND '.join(conditions)}
                        LIMIT ?
                    '''
                    cursor.execute(query_sql, (count * 10,))

                    for row in cursor.fetchall():
                        chunk_id, content, filename, section, tags_json, metadata_json = row

                        if chunk_id in seen_ids:
                            continue

                        # Parse metadata
                        metadata = json.loads(metadata_json) if metadata_json else {}
                        tags = json.loads(tags_json) if tags_json else []

                        # Calculate score based on how many terms match
                        score = 0
                        for term in terms:
                            # Check metadata values
                            metadata_str = json.dumps(metadata).lower()
                            if term in metadata_str:
                                score += 1.5
                            # Check tags
                            if any(term in str(tag).lower() for tag in tags):
                                score += 1.0
                            # Check section
                            if section and term in section.lower():
                                score += 0.8

                        if score > 0:
                            seen_ids.add(chunk_id)
                            results.append({
                                'id': chunk_id,
                                'content': content,
                                'score': score,
                                'metadata': {
                                    'filename': filename,
                                    'section': section,
                                    'tags': tags,
                                    'metadata': metadata
                                },
                                'search_type': 'metadata'
                            })

            # Fallback: search for JSON metadata embedded in content
            # This ensures backwards compatibility
            if len(results) < count:
                # Build specific conditions for known patterns
                specific_conditions = []

                # Look for specific high-value patterns first
                if 'code' in terms and 'examples' in terms:
                    specific_conditions.append('content LIKE \'%"category": "Code Examples"%\'')
                if 'sdk' in terms:
                    specific_conditions.append('content LIKE \'%"product": "%\' || \'SDK\' || \'%"%\'')

                # General term search in JSON content
                for term in terms:
                    specific_conditions.append(f"content LIKE '%\"{term}%'")

                if specific_conditions:
                    # Limit conditions to avoid too broad search
                    conditions_to_use = specific_conditions[:10]
                    query_sql = f'''
                        SELECT id, content, filename, section, tags, metadata
                        FROM chunks
                        WHERE ({' OR '.join(conditions_to_use)})
                        AND id NOT IN ({','.join(str(id) for id in seen_ids) if seen_ids else '0'})
                        LIMIT ?
                    '''
                    cursor.execute(query_sql, (count * 5,))

                    rows = cursor.fetchall()

                    for row in rows:
                        chunk_id, content, filename, section, tags_json, metadata_json = row

                        if chunk_id in seen_ids:
                            continue

                        # Try to extract metadata from JSON content
                        json_metadata = {}
                        try:
                            # Look for metadata in JSON structure
                            if '"metadata":' in content:
                                import re
                                # More robust regex to extract nested JSON object
                                # This handles nested braces properly
                                start = content.find('"metadata":')
                                if start != -1:
                                    # Find the opening brace
                                    brace_start = content.find('{', start)
                                    if brace_start != -1:
                                        # Count braces to find matching closing brace
                                        brace_count = 0
                                        i = brace_start
                                        while i < len(content):
                                            if content[i] == '{':
                                                brace_count += 1
                                            elif content[i] == '}':
                                                brace_count -= 1
                                                if brace_count == 0:
                                                    # Found matching closing brace
                                                    metadata_str = content[brace_start:i+1]
                                                    json_metadata = json.loads(metadata_str)
                                                    break
                                            i += 1
                        except:
                            pass

                        # Calculate score based on matches
                        score = 0
                        fields_matched = 0

                        # Check JSON metadata extracted from content
                        if json_metadata:
                            # Check category - count how many terms match
                            category = json_metadata.get('category', '').lower()
                            if category:
                                category_matches = sum(1 for term in terms if term in category)
                                if category_matches > 0:
                                    score += 1.8 * (category_matches / len(terms) if terms else 1)
                                    fields_matched += 1

                            # Check product - count how many terms match
                            product = json_metadata.get('product', '').lower()
                            if product:
                                product_matches = sum(1 for term in terms if term in product)
                                if product_matches > 0:
                                    score += 1.5 * (product_matches / len(terms) if terms else 1)
                                    fields_matched += 1

                            # Check source
                            source = json_metadata.get('source', '').lower()
                            if source:
                                source_matches = sum(1 for term in terms if term in source)
                                if source_matches > 0:
                                    score += 1.2 * (source_matches / len(terms) if terms else 1)
                                    fields_matched += 1

                            # Also check tags from JSON metadata
                            json_tags = json_metadata.get('tags', [])
                            if json_tags:
                                tags_str = str(json_tags).lower()
                                tag_matches = sum(1 for term in terms if term in tags_str)
                                if tag_matches > 0:
                                    score += 1.3 * (tag_matches / len(terms) if terms else 1)
                                    fields_matched += 1

                        if score > 0:
                            seen_ids.add(chunk_id)
                            results.append({
                                'id': chunk_id,
                                'content': content,
                                'score': float(score),
                                'metadata': {
                                    'filename': filename,
                                    'section': section,
                                    'tags': json.loads(tags_json) if tags_json else [],
                                    'metadata': json.loads(metadata_json) if metadata_json else {}
                                },
                                'search_type': 'metadata',
                                'fields_matched': fields_matched
                            })
                            logger.debug(f"Metadata match: {filename} - score={score:.2f}, fields_matched={fields_matched}, json_metadata={json_metadata}")

            # Also get chunks with regular metadata
            cursor.execute('''
                SELECT id, content, filename, section, tags, metadata
                FROM chunks
                WHERE (tags IS NOT NULL AND tags != '')
                   OR (metadata IS NOT NULL AND metadata != '{}')
                   OR (section IS NOT NULL AND section != '')
                LIMIT ?
            ''', (count * 10,))  # Get more to search through

            for row in cursor.fetchall():
                chunk_id, content, filename, section, tags_json, metadata_json = row

                if chunk_id in seen_ids:
                    continue

                # Parse metadata
                tags = json.loads(tags_json) if tags_json else []
                metadata = json.loads(metadata_json) if metadata_json else {}

                # Flatten nested metadata if present
                if 'metadata' in metadata:
                    # Handle double-nested metadata from some indexes
                    nested_meta = metadata['metadata']
                    metadata.update(nested_meta)

                # Initialize scoring components
                score_components = {
                    'tags': 0,
                    'section': 0,
                    'category': 0,
                    'product': 0,
                    'source': 0,
                    'description': 0
                }

                # Check tags
                if tags:
                    tag_matches = 0
                    for tag in tags:
                        tag_lower = tag.lower()
                        # Full query match in tag
                        if query_lower in tag_lower:
                            tag_matches += 2.0
                        else:
                            # Individual term matches
                            term_matches = sum(1 for term in terms if term in tag_lower)
                            tag_matches += term_matches * 0.5

                    if tag_matches > 0:
                        score_components['tags'] = min(1.0, tag_matches / len(tags))

                # Check section
                if section and section.lower() != 'none':
                    section_lower = section.lower()
                    if query_lower in section_lower:
                        score_components['section'] = 1.0
                    else:
                        term_matches = sum(1 for term in terms if term in section_lower)
                        score_components['section'] = (term_matches / len(terms)) * 0.8 if terms else 0

                # Check category field
                category = metadata.get('category', '')
                if category:
                    category_lower = category.lower()
                    if query_lower in category_lower:
                        score_components['category'] = 1.0
                    else:
                        term_matches = sum(1 for term in terms if term in category_lower)
                        score_components['category'] = (term_matches / len(terms)) * 0.9 if terms else 0

                # Check product field
                product = metadata.get('product', '')
                if product:
                    product_lower = product.lower()
                    if query_lower in product_lower:
                        score_components['product'] = 1.0
                    else:
                        term_matches = sum(1 for term in terms if term in product_lower)
                        score_components['product'] = (term_matches / len(terms)) * 0.8 if terms else 0

                # Check source field (original filename)
                source = metadata.get('source', '')
                if source:
                    source_lower = source.lower()
                    if query_lower in source_lower:
                        score_components['source'] = 1.0
                    else:
                        term_matches = sum(1 for term in terms if term in source_lower)
                        score_components['source'] = (term_matches / len(terms)) * 0.7 if terms else 0

                # Check description or title fields
                description = metadata.get('description', metadata.get('title', ''))
                if description:
                    desc_lower = description.lower()
                    if query_lower in desc_lower:
                        score_components['description'] = 0.8
                    else:
                        term_matches = sum(1 for term in terms if term in desc_lower)
                        score_components['description'] = (term_matches / len(terms)) * 0.6 if terms else 0

                # Calculate total score with weights
                weights = {
                    'category': 1.8,    # Strong signal
                    'product': 1.5,     # Strong signal
                    'tags': 1.3,        # Good signal
                    'source': 1.2,      # Good signal
                    'section': 1.0,     # Moderate signal
                    'description': 0.8  # Weaker signal
                }

                total_score = sum(score_components[field] * weights.get(field, 1.0)
                                  for field in score_components)

                # Track match coverage
                fields_matched = sum(1 for score in score_components.values() if score > 0)
                match_coverage = sum(1 for term in terms if any(
                    term in str(field_value).lower()
                    for field_value in [tags, section, category, product, source, description]
                    if field_value
                )) / len(terms) if terms else 0

                if total_score > 0:
                    results.append({
                        'id': chunk_id,
                        'content': content,
                        'score': float(total_score),
                        'metadata': {
                            'filename': filename,
                            'section': section,
                            'tags': tags,
                            'metadata': metadata,
                            'category': category,
                            'product': product,
                            'source': source
                        },
                        'search_type': 'metadata',
                        'metadata_matches': score_components,
                        'fields_matched': fields_matched,
                        'match_coverage': match_coverage
                    })
                    seen_ids.add(chunk_id)

            conn.close()

            # Sort by score and return top results
            results.sort(key=lambda x: x['score'], reverse=True)
            return results[:count]

        except Exception as e:
            logger.error(f"Error in metadata search: {e}")
            return []

    def _add_vector_scores_to_candidates(self, candidates: Dict[str, Dict], query_vector: NDArray,
                                         similarity_threshold: float):
        """Add vector similarity scores to existing candidates"""
        if not candidates or not np:
            return

        try:
            conn = sqlite3.connect(self.index_path)
            cursor = conn.cursor()

            # Get embeddings for candidate chunks only
            chunk_ids = list(candidates.keys())
            placeholders = ','.join(['?' for _ in chunk_ids])

            cursor.execute(f'''
                SELECT id, embedding
                FROM chunks
                WHERE id IN ({placeholders}) AND embedding IS NOT NULL AND embedding != ''
            ''', chunk_ids)

            for row in cursor.fetchall():
                chunk_id, embedding_blob = row

                if not embedding_blob:
                    continue

                try:
                    # Convert embedding back to numpy array
                    embedding = np.frombuffer(embedding_blob, dtype=np.float32).reshape(1, -1)

                    # Calculate similarity
                    similarity = cosine_similarity(query_vector, embedding)[0][0]
                    distance = 1 - similarity

                    # Add vector scores to candidate
                    candidates[chunk_id]['vector_score'] = float(similarity)
                    candidates[chunk_id]['vector_distance'] = float(distance)
                    candidates[chunk_id]['sources']['vector_rerank'] = True

                except Exception as e:
                    logger.debug(f"Error processing embedding for chunk {chunk_id}: {e}")
                    continue

            conn.close()

        except Exception as e:
            logger.error(f"Error in vector re-ranking: {e}")

    def _calculate_combined_score(self, candidate: Dict, similarity_threshold: float) -> float:
        """Calculate final score with hybrid vector + metadata weighting

        Hybrid approach:
        - Vector score is the primary ranking signal (semantic similarity)
        - Metadata/keyword matches provide confirmation boost
        - Multiple signal types indicate high relevance (confirmation bonus)
        - Special boost for 'code' tag matches when query contains code-related terms
        """
        sources = candidate.get('sources', {})
        source_scores = candidate.get('source_scores', {})

        # Vector score is PRIMARY
        if 'vector_score' in candidate:
            vector_score = candidate['vector_score']
            base_score = vector_score

            # Metadata/keyword matches provide confirmation boost
            if len(sources) > 1:
                # Has both vector AND metadata/keyword matches - strong confirmation signal
                keyword_signals = sum(source_scores.get(k, 0) for k in ['keyword', 'filename', 'metadata'])
                if keyword_signals > 0:
                    # Normalize and apply boost (up to 30% for strong confirmation)
                    keyword_boost = min(0.3, keyword_signals * 0.15)
                    base_score = vector_score * (1.0 + keyword_boost)

                # Additional boost if multiple signal types confirm (2+ sources)
                num_metadata_sources = sum(1 for s in ['keyword', 'filename', 'metadata'] if s in sources)
                if num_metadata_sources >= 2:
                    # Multiple confirmation signals - very high confidence
                    base_score *= 1.1

            # Check for code-related tags to boost code examples
            tags = candidate.get('metadata', {}).get('tags', [])
            if 'code' in tags:
                # This chunk contains code - boost if query is code-related
                # (metadata search would have found it if query mentioned code/example/python/etc)
                if 'metadata' in sources or 'keyword' in sources:
                    # Query matched code-related metadata - apply code boost
                    base_score *= 1.2
        else:
            # No vector score - this is a keyword-only result (backfill)
            # Use keyword scores but penalize for lack of semantic match
            base_score = sum(source_scores.values()) * 0.6  # 40% penalty for no vector

            # Still boost code chunks if metadata matched
            tags = candidate.get('metadata', {}).get('tags', [])
            if 'code' in tags and 'metadata' in sources:
                base_score *= 1.15

        return base_score

    def _apply_diversity_penalties(self, results: List[Dict], target_count: int) -> List[Dict]:
        """Apply penalties to prevent single-file dominance while maintaining quality"""
        if not results:
            return results

        # Track file occurrences
        file_counts = {}
        penalized_results = []

        # Define penalty multipliers
        occurrence_penalties = {
            1: 1.0,   # First chunk: no penalty
            2: 0.85,  # Second chunk: 15% penalty
            3: 0.7,   # Third chunk: 30% penalty
            4: 0.5,   # Fourth chunk: 50% penalty
        }

        for result in results:
            filename = result['metadata']['filename']

            # Get current count for this file
            current_count = file_counts.get(filename, 0) + 1
            file_counts[filename] = current_count

            # Apply penalty based on occurrence
            penalty = occurrence_penalties.get(current_count, 0.4)  # 60% penalty for 5+ chunks

            # Create a copy to avoid modifying original
            penalized_result = result.copy()
            penalized_result['diversity_penalty'] = penalty
            penalized_result['final_score'] = result.get('final_score', result.get('score', 0)) * penalty

            penalized_results.append(penalized_result)

        # Re-sort by penalized scores
        penalized_results.sort(key=lambda x: x['final_score'], reverse=True)

        # Ensure minimum diversity if we have enough results
        if len(penalized_results) > target_count:
            unique_files = len(set(r['metadata']['filename'] for r in penalized_results[:target_count]))

            # If top results are too homogeneous (e.g., all from 1-2 files)
            if unique_files < min(3, target_count):
                # Try to inject some diversity
                selected = penalized_results[:target_count]
                seen_files = set(r['metadata']['filename'] for r in selected)

                # Look for high-quality results from other files
                for result in penalized_results[target_count:]:
                    if result['metadata']['filename'] not in seen_files:
                        # If it's reasonably good (within 50% of top score), include it
                        if result['final_score'] > 0.5 * selected[0]['final_score']:
                            # Replace the lowest scoring result from an over-represented file
                            for i in range(len(selected) - 1, -1, -1):
                                if file_counts[selected[i]['metadata']['filename']] > 2:
                                    selected[i] = result
                                    seen_files.add(result['metadata']['filename'])
                                    break

                penalized_results[:target_count] = selected

        return penalized_results

    def _apply_match_type_diversity(self, results: List[Dict], target_count: int) -> List[Dict]:
        """Ensure diversity of match types in final results

        Ensures we have a mix of:
        - Vector-only matches (semantic similarity, good for code examples)
        - Keyword-only matches (exact term matches)
        - Hybrid matches (both vector + keyword/metadata)
        """
        if not results or len(results) <= target_count:
            return results

        # Categorize results by match type
        vector_only = []
        keyword_only = []
        hybrid = []

        for result in results:
            sources = result.get('sources', {})
            has_vector = 'vector' in sources
            has_keyword = any(k in sources for k in ['keyword', 'filename', 'metadata'])

            if has_vector and not has_keyword:
                vector_only.append(result)
            elif has_keyword and not has_vector:
                keyword_only.append(result)
            else:
                hybrid.append(result)

        # Build diverse result set
        # Target distribution: 40% hybrid, 40% vector-only, 20% keyword-only
        # This ensures we include semantic matches (code examples) even if keywords don't match
        diversified = []

        # Take top hybrid matches first (best overall)
        hybrid_target = max(1, int(target_count * 0.4))
        diversified.extend(hybrid[:hybrid_target])

        # Ensure we have vector-only matches (critical for code examples)
        vector_target = max(1, int(target_count * 0.4))
        diversified.extend(vector_only[:vector_target])

        # Add keyword-only matches
        keyword_target = max(1, int(target_count * 0.2))
        diversified.extend(keyword_only[:keyword_target])

        # Fill remaining slots with best remaining results regardless of type
        remaining_slots = target_count - len(diversified)
        if remaining_slots > 0:
            # Get all unused results
            used_ids = set(r['id'] for r in diversified)
            unused = [r for r in results if r['id'] not in used_ids]
            diversified.extend(unused[:remaining_slots])

        # Sort by final score to maintain quality ordering
        diversified.sort(key=lambda x: x['final_score'], reverse=True)

        return diversified

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the search index"""
        # Use pgvector backend if available
        if self.backend == 'pgvector':
            return self._backend.get_stats()

        # Original SQLite implementation
        conn = sqlite3.connect(self.index_path)
        cursor = conn.cursor()

        try:
            # Get total chunks
            cursor.execute("SELECT COUNT(*) FROM chunks")
            total_chunks = cursor.fetchone()[0]

            # Get total files
            cursor.execute("SELECT COUNT(DISTINCT filename) FROM chunks")
            total_files = cursor.fetchone()[0]

            # Get average chunk size
            cursor.execute("SELECT AVG(LENGTH(content)) FROM chunks")
            avg_chunk_size = cursor.fetchone()[0] or 0

            # Get file types
            cursor.execute("""
                SELECT
                    CASE
                        WHEN filename LIKE '%.md' THEN 'markdown'
                        WHEN filename LIKE '%.py' THEN 'python'
                        WHEN filename LIKE '%.txt' THEN 'text'
                        WHEN filename LIKE '%.pdf' THEN 'pdf'
                        WHEN filename LIKE '%.docx' THEN 'docx'
                        ELSE 'other'
                    END as file_type,
                    COUNT(DISTINCT filename) as count
                FROM chunks
                GROUP BY file_type
            """)
            file_types = dict(cursor.fetchall())

            # Get languages
            cursor.execute("SELECT language, COUNT(*) FROM chunks GROUP BY language")
            languages = dict(cursor.fetchall())

            return {
                'total_chunks': total_chunks,
                'total_files': total_files,
                'avg_chunk_size': int(avg_chunk_size),
                'file_types': file_types,
                'languages': languages,
                'config': self.config
            }

        finally:
            conn.close()