signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registries.
- signalwire_agents/__init__.py +99 -15
- signalwire_agents/agent_server.py +248 -60
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +9 -0
- signalwire_agents/cli/build_search.py +951 -41
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/dokku.py +2320 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +2636 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +566 -2366
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +845 -2916
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +418 -0
- signalwire_agents/core/data_map.py +3 -15
- signalwire_agents/core/function_result.py +116 -44
- signalwire_agents/core/logging_config.py +162 -18
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +280 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +460 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1142 -0
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +84 -1
- signalwire_agents/core/skill_manager.py +62 -20
- signalwire_agents/core/swaig_function.py +18 -5
- signalwire_agents/core/swml_builder.py +207 -11
- signalwire_agents/core/swml_handler.py +27 -21
- signalwire_agents/core/swml_renderer.py +123 -312
- signalwire_agents/core/swml_service.py +171 -203
- signalwire_agents/mcp_gateway/__init__.py +29 -0
- signalwire_agents/mcp_gateway/gateway_service.py +564 -0
- signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
- signalwire_agents/mcp_gateway/session_manager.py +218 -0
- signalwire_agents/prefabs/concierge.py +0 -3
- signalwire_agents/prefabs/faq_bot.py +0 -3
- signalwire_agents/prefabs/info_gatherer.py +0 -3
- signalwire_agents/prefabs/receptionist.py +0 -3
- signalwire_agents/prefabs/survey.py +0 -3
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +490 -31
- signalwire_agents/search/index_builder.py +307 -37
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +748 -0
- signalwire_agents/search/query_processor.py +162 -31
- signalwire_agents/search/search_engine.py +916 -35
- signalwire_agents/search/search_service.py +376 -53
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +14 -2
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/skill.py +84 -3
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +9 -0
- signalwire_agents/skills/datetime/skill.py +20 -7
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +9 -0
- signalwire_agents/skills/joke/skill.py +21 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +9 -0
- signalwire_agents/skills/math/skill.py +18 -4
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +9 -0
- signalwire_agents/skills/native_vector_search/skill.py +569 -101
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +395 -40
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +9 -0
- signalwire_agents/skills/web_search/skill.py +586 -112
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
- signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
- signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents/skills/wikipedia/__init__.py +0 -9
- signalwire_agents-0.1.13.data/data/schema.json +0 -5611
- signalwire_agents-0.1.13.dist-info/RECORD +0 -67
- signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
signalwire_agents/search/search_engine.py

@@ -26,11 +26,38 @@ logger = logging.getLogger(__name__)
 class SearchEngine:
     """Hybrid search engine for vector and keyword search"""
 
-    def __init__(self,
-
+    def __init__(self, backend: str = 'sqlite', index_path: Optional[str] = None,
+                 connection_string: Optional[str] = None, collection_name: Optional[str] = None,
+                 model=None):
+        """
+        Initialize search engine
+
+        Args:
+            backend: Storage backend ('sqlite' or 'pgvector')
+            index_path: Path to .swsearch file (for sqlite backend)
+            connection_string: PostgreSQL connection string (for pgvector backend)
+            collection_name: Collection name (for pgvector backend)
+            model: Optional sentence transformer model
+        """
+        self.backend = backend
         self.model = model
-
-
+
+        if backend == 'sqlite':
+            if not index_path:
+                raise ValueError("index_path is required for sqlite backend")
+            self.index_path = index_path
+            self.config = self._load_config()
+            self.embedding_dim = int(self.config.get('embedding_dimensions', 768))
+            self._backend = None  # SQLite uses direct connection
+        elif backend == 'pgvector':
+            if not connection_string or not collection_name:
+                raise ValueError("connection_string and collection_name are required for pgvector backend")
+            from .pgvector_backend import PgVectorSearchBackend
+            self._backend = PgVectorSearchBackend(connection_string, collection_name)
+            self.config = self._backend.config
+            self.embedding_dim = int(self.config.get('embedding_dimensions', 768))
+        else:
+            raise ValueError(f"Invalid backend '{backend}'. Must be 'sqlite' or 'pgvector'")
 
     def _load_config(self) -> Dict[str, str]:
         """Load index configuration"""
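As a quick orientation (not part of the diff): a minimal sketch of how the new two-backend constructor above might be called. The file path, connection string, and collection name are invented placeholders.

```python
from signalwire_agents.search.search_engine import SearchEngine

# Local .swsearch index on disk (SQLite backend) -- hypothetical path
local_engine = SearchEngine(backend='sqlite', index_path='docs.swsearch')

# Shared PostgreSQL + pgvector backend -- hypothetical DSN and collection name
pg_engine = SearchEngine(
    backend='pgvector',
    connection_string='postgresql://user:pass@localhost:5432/knowledge',
    collection_name='docs',
)
```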
@@ -46,58 +73,132 @@ class SearchEngine:
         return {}
 
     def search(self, query_vector: List[float], enhanced_text: str,
-               count: int = 3,
-               tags: Optional[List[str]] = None
+               count: int = 3, similarity_threshold: float = 0.0,
+               tags: Optional[List[str]] = None,
+               keyword_weight: Optional[float] = None,
+               original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """
-        Perform
+        Perform improved search with fast filtering and vector re-ranking
+
+        Strategy:
+        1. Fast candidate collection (filename, metadata, keywords)
+        2. Vector re-ranking on candidates only
+        3. Fallback to full vector search if few candidates
 
         Args:
             query_vector: Embedding vector for the query
             enhanced_text: Processed query text for keyword search
             count: Number of results to return
-
+            similarity_threshold: Minimum similarity score
             tags: Filter by tags
+            keyword_weight: Optional manual weight for keyword vs vector
+            original_query: Original query for exact matching
 
         Returns:
             List of search results with scores and metadata
         """
 
+        # Use pgvector backend if available
+        if self.backend == 'pgvector':
+            return self._backend.search(query_vector, enhanced_text, count, similarity_threshold, tags, keyword_weight)
+
+        # Check for numpy/sklearn availability
         if not np or not cosine_similarity:
             logger.warning("NumPy or scikit-learn not available. Using keyword search only.")
-            return self._keyword_search_only(enhanced_text, count, tags)
+            return self._keyword_search_only(enhanced_text, count, tags, original_query)
 
         # Convert query vector to numpy array
         try:
             query_array = np.array(query_vector).reshape(1, -1)
         except Exception as e:
             logger.error(f"Error converting query vector: {e}")
-            return self._keyword_search_only(enhanced_text, count, tags)
+            return self._keyword_search_only(enhanced_text, count, tags, original_query)
 
-        #
-
+        # HYBRID APPROACH: Search vector AND metadata in parallel
+        # Stage 1: Run both search types simultaneously
+        search_multiplier = 3
+
+        # Vector search (semantic similarity - primary ranking signal)
+        vector_results = self._vector_search(query_array, count * search_multiplier)
+
+        # Metadata/keyword searches (confirmation signals and backfill)
+        filename_results = self._filename_search(original_query or enhanced_text, count * search_multiplier)
+        metadata_results = self._metadata_search(original_query or enhanced_text, count * search_multiplier)
+        keyword_results = self._keyword_search(enhanced_text, count * search_multiplier, original_query)
+
+        logger.debug(f"Parallel search: vector={len(vector_results)}, filename={len(filename_results)}, "
+                     f"metadata={len(metadata_results)}, keyword={len(keyword_results)}")
+
+        # Stage 2: Merge all results into candidate pool
+        candidates = {}
+
+        # Add vector results first (primary signal)
+        for result in vector_results:
+            chunk_id = result['id']
+            candidates[chunk_id] = result
+            candidates[chunk_id]['vector_score'] = result['score']
+            candidates[chunk_id]['vector_distance'] = 1 - result['score']
+            candidates[chunk_id]['sources'] = {'vector': True}
+            candidates[chunk_id]['source_scores'] = {'vector': result['score']}
+
+        # Add metadata/keyword results (secondary signals that boost or backfill)
+        for result_set, source_type, source_weight in [(filename_results, 'filename', 2.0),
+                                                        (metadata_results, 'metadata', 1.5),
+                                                        (keyword_results, 'keyword', 1.0)]:
+            for result in result_set:
+                chunk_id = result['id']
+                if chunk_id not in candidates:
+                    # New candidate from metadata/keyword (no vector match)
+                    candidates[chunk_id] = result
+                    candidates[chunk_id]['sources'] = {source_type: True}
+                    candidates[chunk_id]['source_scores'] = {source_type: result['score'] * source_weight}
+                else:
+                    # Exists in vector results - add metadata/keyword as confirmation signal
+                    candidates[chunk_id]['sources'][source_type] = True
+                    candidates[chunk_id]['source_scores'][source_type] = result['score'] * source_weight
 
-        #
-
+        # Stage 3: Score and rank all candidates
+        final_results = []
+        for chunk_id, candidate in candidates.items():
+            # Calculate final score combining all signals
+            score = self._calculate_combined_score(candidate, similarity_threshold)
+            candidate['final_score'] = score
+            final_results.append(candidate)
 
-        #
-
+        # Sort by final score
+        final_results.sort(key=lambda x: x['final_score'], reverse=True)
 
         # Filter by tags if specified
         if tags:
-
+            final_results = [r for r in final_results
+                             if any(tag in r['metadata'].get('tags', []) for tag in tags)]
+
+        # Apply distance threshold as final filter (soft threshold already applied in scoring)
+        if similarity_threshold > 0:
+            final_results = [r for r in final_results
+                             if r.get('vector_distance', 0) <= similarity_threshold * 1.5
+                             or 'vector' not in r.get('sources', {})]
 
-        #
-
-
-
-
+        # Boost exact matches if we have the original query
+        if original_query:
+            final_results = self._boost_exact_matches(final_results, original_query)
+            # Re-sort after boosting
+            final_results.sort(key=lambda x: x['final_score'], reverse=True)
 
-
+        # Apply diversity penalties to prevent single-file dominance
+        final_results = self._apply_diversity_penalties(final_results, count)
+
+        # Ensure 'score' field exists for CLI compatibility
+        for r in final_results:
+            if 'score' not in r:
+                r['score'] = r.get('final_score', 0.0)
+
+        return final_results[:count]
 
     def _keyword_search_only(self, enhanced_text: str, count: int,
-                             tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+                             tags: Optional[List[str]] = None, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """Fallback to keyword search only when vector search is unavailable"""
-        keyword_results = self._keyword_search(enhanced_text, count)
+        keyword_results = self._keyword_search(enhanced_text, count, original_query)
 
         if tags:
             keyword_results = self._filter_by_tags(keyword_results, tags)
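A hedged sketch of calling the reworked `search()` with its new parameters; the dummy embedding, query strings, and threshold are illustrative only and assume the SQLite engine constructed in the earlier example.

```python
# The real query vector comes from the same sentence-transformer model that
# built the index; a zero vector of the default dimensionality stands in here.
query_vector = [0.0] * 768

results = local_engine.search(
    query_vector,
    enhanced_text="swml transfer agent example",   # processed query text
    count=5,
    similarity_threshold=0.3,
    tags=["code"],                                  # optional tag filter
    original_query="SWML transfer code example",    # enables exact-match boosting
)
for r in results:
    print(r["metadata"]["filename"], round(r["score"], 3))
```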
@@ -161,7 +262,7 @@ class SearchEngine:
             logger.error(f"Error in vector search: {e}")
             return []
 
-    def _keyword_search(self, enhanced_text: str, count: int) -> List[Dict[str, Any]]:
+    def _keyword_search(self, enhanced_text: str, count: int, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """Perform full-text search"""
         try:
             conn = sqlite3.connect(self.index_path)
@@ -203,6 +304,12 @@ class SearchEngine:
                 })
 
             conn.close()
+
+            # If FTS returns no results, try fallback LIKE search
+            if not results:
+                logger.debug(f"FTS returned no results for '{enhanced_text}', trying fallback search")
+                return self._fallback_search(enhanced_text, count)
+
             return results
 
         except Exception as e:
@@ -227,35 +334,74 @@ class SearchEngine:
             conn = sqlite3.connect(self.index_path)
             cursor = conn.cursor()
 
-            # Simple LIKE search
+            # Simple LIKE search with word boundaries
             search_terms = enhanced_text.lower().split()
             like_conditions = []
             params = []
 
             for term in search_terms[:5]:  # Limit to 5 terms to avoid too complex queries
-
-
+                # Search for term with word boundaries (space or punctuation)
+                like_conditions.append("""
+                    (LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?)
+                """)
+                params.extend([
+                    f"% {term} %",  # space on both sides
+                    f"{term} %",    # at beginning
+                    f"% {term}",    # at end
+                    f"{term}"       # exact match
+                ])
 
             if not like_conditions:
                 return []
 
+            # Also search in original content
+            content_conditions = []
+            for term in search_terms[:5]:
+                content_conditions.append("""
+                    (LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?)
+                """)
+                params.extend([
+                    f"% {term} %",  # with spaces
+                    f"{term} %",    # at beginning
+                    f"% {term}",    # at end
+                    f"{term}"       # exact match
+                ])
+
             query = f'''
                 SELECT id, content, filename, section, tags, metadata
                 FROM chunks
-                WHERE {" OR ".join(like_conditions)}
+                WHERE ({" OR ".join(like_conditions)})
+                   OR ({" OR ".join(content_conditions)})
                 LIMIT ?
             '''
             params.append(count)
 
+
             cursor.execute(query, params)
 
             results = []
             for row in cursor.fetchall():
                 chunk_id, content, filename, section, tags_json, metadata_json = row
 
-                # Simple scoring based on term matches
+                # Simple scoring based on term matches with word boundaries
                 content_lower = content.lower()
-
+                # Check for whole word matches
+                word_matches = 0
+                for term in search_terms:
+                    term_lower = term.lower()
+                    # Check word boundaries
+                    if (f" {term_lower} " in f" {content_lower} " or
+                        content_lower.startswith(f"{term_lower} ") or
+                        content_lower.endswith(f" {term_lower}") or
+                        content_lower == term_lower):
+                        word_matches += 1
+                score = word_matches / len(search_terms) if search_terms else 0.0
 
                 results.append({
                     'id': chunk_id,
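The word-boundary matching added in this hunk can be shown in isolation. This standalone helper is a simplified rendering of the same idea (padding with spaces to approximate whole-word matches), not code copied from the package.

```python
def fraction_of_terms_matched(content: str, terms: list) -> float:
    """Share of query terms that appear as whole words in the chunk text."""
    padded = f" {content.lower()} "
    matches = sum(1 for term in terms if f" {term.lower()} " in padded)
    return matches / len(terms) if terms else 0.0

# "sql" matches as a whole word; "vector" does not appear at all -> 0.5
print(fraction_of_terms_matched("Connect to the SQLite index via sql", ["sql", "vector"]))
```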
@@ -274,14 +420,23 @@ class SearchEngine:
 
             # Sort by score
             results.sort(key=lambda x: x['score'], reverse=True)
+
             return results
 
         except Exception as e:
             logger.error(f"Error in fallback search: {e}")
             return []
 
-    def _merge_results(self, vector_results: List[Dict], keyword_results: List[Dict]
+    def _merge_results(self, vector_results: List[Dict], keyword_results: List[Dict],
+                       vector_weight: Optional[float] = None,
+                       keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
         """Merge and rank vector and keyword search results"""
+        # Use provided weights or defaults
+        if vector_weight is None:
+            vector_weight = 0.7
+        if keyword_weight is None:
+            keyword_weight = 0.3
+
         # Create a combined list with weighted scores
         combined = {}
 
@@ -303,8 +458,6 @@ class SearchEngine:
                 combined[chunk_id]['keyword_score'] = result['score']
 
         # Calculate combined score (weighted average)
-        vector_weight = 0.7
-        keyword_weight = 0.3
 
         for chunk_id, result in combined.items():
             vector_score = result.get('vector_score', 0.0)
@@ -331,8 +484,736 @@ class SearchEngine:
                 filtered.append(result)
         return filtered
 
+    def _boost_exact_matches(self, results: List[Dict[str, Any]], original_query: str) -> List[Dict[str, Any]]:
+        """Boost scores for results that contain exact matches of the original query"""
+        if not original_query:
+            return results
+
+        # Extract key phrases to look for
+        query_lower = original_query.lower()
+
+        for result in results:
+            content_lower = result['content'].lower()
+            filename_lower = result['metadata'].get('filename', '').lower()
+
+            # Boost for exact phrase match in content
+            if query_lower in content_lower:
+                result['score'] *= 2.0  # Double score for exact match
+
+            # Boost for matches in filenames that suggest relevance
+            if any(term in filename_lower for term in ['example', 'sample', 'demo', 'tutorial', 'guide']):
+                if 'example' in query_lower or 'sample' in query_lower or 'code' in query_lower:
+                    result['score'] *= 1.5
+
+            # Boost for "getting started" type queries
+            if 'getting started' in query_lower and 'start' in content_lower:
+                result['score'] *= 1.5
+
+        return results
+
+    def _filename_search(self, query: str, count: int) -> List[Dict[str, Any]]:
+        """Search for query in filenames with term coverage scoring"""
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            query_lower = query.lower()
+            terms = query_lower.split()
+
+            # First try exact phrase match
+            cursor.execute('''
+                SELECT DISTINCT id, content, filename, section, tags, metadata
+                FROM chunks
+                WHERE LOWER(filename) LIKE ?
+                LIMIT ?
+            ''', (f'%{query_lower}%', count))
+
+            results = []
+            seen_ids = set()
+
+            # Process exact matches
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json = row
+                seen_ids.add(chunk_id)
+
+                # High score for exact phrase match
+                filename_lower = filename.lower()
+                basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower
+                if query_lower in basename:
+                    score = 3.0  # Exact match in basename (increased weight)
+                else:
+                    score = 2.0  # Exact match in path
+
+                results.append({
+                    'id': chunk_id,
+                    'content': content,
+                    'score': float(score),
+                    'metadata': {
+                        'filename': filename,
+                        'section': section,
+                        'tags': json.loads(tags_json) if tags_json else [],
+                        'metadata': json.loads(metadata_json) if metadata_json else {}
+                    },
+                    'search_type': 'filename',
+                    'match_coverage': 1.0  # Exact match = 100% coverage
+                })
+
+            # Then search for files containing ANY of the terms
+            if terms and len(results) < count * 3:  # Get more candidates
+                # Build OR query for any term match
+                conditions = []
+                params = []
+                for term in terms:
+                    conditions.append("LOWER(filename) LIKE ?")
+                    params.append(f'%{term}%')
+
+                sql = f'''
+                    SELECT DISTINCT id, content, filename, section, tags, metadata
+                    FROM chunks
+                    WHERE ({' OR '.join(conditions)})
+                    AND id NOT IN ({','.join(['?' for _ in seen_ids]) if seen_ids else '0'})
+                    LIMIT ?
+                '''
+                if seen_ids:
+                    params.extend(seen_ids)
+                params.append(count * 3)
+
+                cursor.execute(sql, params)
+
+                for row in cursor.fetchall():
+                    chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                    # Enhanced scoring based on term coverage
+                    filename_lower = filename.lower()
+                    basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower
+
+                    # Count matches in basename vs full path
+                    basename_matches = sum(1 for term in terms if term in basename)
+                    path_matches = sum(1 for term in terms if term in filename_lower)
+
+                    # Calculate term coverage (what % of query terms are matched)
+                    term_coverage = path_matches / len(terms) if terms else 0
+                    basename_coverage = basename_matches / len(terms) if terms else 0
+
+                    # Check for substring bonus (e.g., "code_examples" contains both terms together)
+                    substring_bonus = 0
+                    if len(terms) > 1:
+                        # Check if terms appear consecutively
+                        for i in range(len(terms) - 1):
+                            if f"{terms[i]}_{terms[i+1]}" in filename_lower or f"{terms[i]}{terms[i+1]}" in filename_lower:
+                                substring_bonus = 0.3
+                                break
+
+                    # Score based on coverage with exponential boost for more matches
+                    if basename_coverage > 0:
+                        # Exponential scoring for basename matches
+                        score = basename_coverage ** 1.5 + substring_bonus
+                    else:
+                        # Lower score for path-only matches
+                        score = (term_coverage * 0.5) ** 1.5 + substring_bonus
+
+                    results.append({
+                        'id': chunk_id,
+                        'content': content,
+                        'score': float(score),
+                        'metadata': {
+                            'filename': filename,
+                            'section': section,
+                            'tags': json.loads(tags_json) if tags_json else [],
+                            'metadata': json.loads(metadata_json) if metadata_json else {}
+                        },
+                        'search_type': 'filename',
+                        'match_coverage': term_coverage
+                    })
+
+            conn.close()
+
+            # Sort by score and return top results
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:count]
+
+        except Exception as e:
+            logger.error(f"Error in filename search: {e}")
+            return []
+
+    def _metadata_search(self, query: str, count: int) -> List[Dict[str, Any]]:
+        """Search in all metadata fields (tags, sections, category, product, source)"""
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            query_lower = query.lower()
+            terms = query_lower.split()
+            results = []
+            seen_ids = set()
+
+            # First, try to use the metadata_text column if it exists
+            try:
+                # Check if metadata_text column exists
+                cursor.execute("PRAGMA table_info(chunks)")
+                columns = [col[1] for col in cursor.fetchall()]
+                has_metadata_text = 'metadata_text' in columns
+            except:
+                has_metadata_text = False
+
+            if has_metadata_text:
+                # Use the new metadata_text column for efficient searching
+                # Build conditions for each term
+                conditions = []
+                for term in terms:
+                    conditions.append(f"metadata_text LIKE '%{term}%'")
+
+                if conditions:
+                    query_sql = f'''
+                        SELECT id, content, filename, section, tags, metadata
+                        FROM chunks
+                        WHERE {' AND '.join(conditions)}
+                        LIMIT ?
+                    '''
+                    cursor.execute(query_sql, (count * 10,))
+
+                    for row in cursor.fetchall():
+                        chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                        if chunk_id in seen_ids:
+                            continue
+
+                        # Parse metadata
+                        metadata = json.loads(metadata_json) if metadata_json else {}
+                        tags = json.loads(tags_json) if tags_json else []
+
+                        # Calculate score based on how many terms match
+                        score = 0
+                        for term in terms:
+                            # Check metadata values
+                            metadata_str = json.dumps(metadata).lower()
+                            if term in metadata_str:
+                                score += 1.5
+                            # Check tags
+                            if any(term in str(tag).lower() for tag in tags):
+                                score += 1.0
+                            # Check section
+                            if section and term in section.lower():
+                                score += 0.8
+
+                        if score > 0:
+                            seen_ids.add(chunk_id)
+                            results.append({
+                                'id': chunk_id,
+                                'content': content,
+                                'score': score,
+                                'metadata': {
+                                    'filename': filename,
+                                    'section': section,
+                                    'tags': tags,
+                                    'metadata': metadata
+                                },
+                                'search_type': 'metadata'
+                            })
+
+            # Fallback: search for JSON metadata embedded in content
+            # This ensures backwards compatibility
+            if len(results) < count:
+                # Build specific conditions for known patterns
+                specific_conditions = []
+
+                # Look for specific high-value patterns first
+                if 'code' in terms and 'examples' in terms:
+                    specific_conditions.append('content LIKE \'%"category": "Code Examples"%\'')
+                if 'sdk' in terms:
+                    specific_conditions.append('content LIKE \'%"product": "%\' || \'SDK\' || \'%"%\'')
+
+                # General term search in JSON content
+                for term in terms:
+                    specific_conditions.append(f"content LIKE '%\"{term}%'")
+
+                if specific_conditions:
+                    # Limit conditions to avoid too broad search
+                    conditions_to_use = specific_conditions[:10]
+                    query_sql = f'''
+                        SELECT id, content, filename, section, tags, metadata
+                        FROM chunks
+                        WHERE ({' OR '.join(conditions_to_use)})
+                        AND id NOT IN ({','.join(str(id) for id in seen_ids) if seen_ids else '0'})
+                        LIMIT ?
+                    '''
+                    cursor.execute(query_sql, (count * 5,))
+
+                    rows = cursor.fetchall()
+
+                    for row in rows:
+                        chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                        if chunk_id in seen_ids:
+                            continue
+
+                        # Try to extract metadata from JSON content
+                        json_metadata = {}
+                        try:
+                            # Look for metadata in JSON structure
+                            if '"metadata":' in content:
+                                import re
+                                # More robust regex to extract nested JSON object
+                                # This handles nested braces properly
+                                start = content.find('"metadata":')
+                                if start != -1:
+                                    # Find the opening brace
+                                    brace_start = content.find('{', start)
+                                    if brace_start != -1:
+                                        # Count braces to find matching closing brace
+                                        brace_count = 0
+                                        i = brace_start
+                                        while i < len(content):
+                                            if content[i] == '{':
+                                                brace_count += 1
+                                            elif content[i] == '}':
+                                                brace_count -= 1
+                                                if brace_count == 0:
+                                                    # Found matching closing brace
+                                                    metadata_str = content[brace_start:i+1]
+                                                    json_metadata = json.loads(metadata_str)
+                                                    break
+                                            i += 1
+                        except:
+                            pass
+
+                        # Calculate score based on matches
+                        score = 0
+                        fields_matched = 0
+
+                        # Check JSON metadata extracted from content
+                        if json_metadata:
+                            # Check category - count how many terms match
+                            category = json_metadata.get('category', '').lower()
+                            if category:
+                                category_matches = sum(1 for term in terms if term in category)
+                                if category_matches > 0:
+                                    score += 1.8 * (category_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Check product - count how many terms match
+                            product = json_metadata.get('product', '').lower()
+                            if product:
+                                product_matches = sum(1 for term in terms if term in product)
+                                if product_matches > 0:
+                                    score += 1.5 * (product_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Check source
+                            source = json_metadata.get('source', '').lower()
+                            if source:
+                                source_matches = sum(1 for term in terms if term in source)
+                                if source_matches > 0:
+                                    score += 1.2 * (source_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Also check tags from JSON metadata
+                            json_tags = json_metadata.get('tags', [])
+                            if json_tags:
+                                tags_str = str(json_tags).lower()
+                                tag_matches = sum(1 for term in terms if term in tags_str)
+                                if tag_matches > 0:
+                                    score += 1.3 * (tag_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                        if score > 0:
+                            seen_ids.add(chunk_id)
+                            results.append({
+                                'id': chunk_id,
+                                'content': content,
+                                'score': float(score),
+                                'metadata': {
+                                    'filename': filename,
+                                    'section': section,
+                                    'tags': json.loads(tags_json) if tags_json else [],
+                                    'metadata': json.loads(metadata_json) if metadata_json else {}
+                                },
+                                'search_type': 'metadata',
+                                'fields_matched': fields_matched
+                            })
+                            logger.debug(f"Metadata match: {filename} - score={score:.2f}, fields_matched={fields_matched}, json_metadata={json_metadata}")
+
+            # Also get chunks with regular metadata
+            cursor.execute('''
+                SELECT id, content, filename, section, tags, metadata
+                FROM chunks
+                WHERE (tags IS NOT NULL AND tags != '')
+                   OR (metadata IS NOT NULL AND metadata != '{}')
+                   OR (section IS NOT NULL AND section != '')
+                LIMIT ?
+            ''', (count * 10,))  # Get more to search through
+
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                if chunk_id in seen_ids:
+                    continue
+
+                # Parse metadata
+                tags = json.loads(tags_json) if tags_json else []
+                metadata = json.loads(metadata_json) if metadata_json else {}
+
+                # Flatten nested metadata if present
+                if 'metadata' in metadata:
+                    # Handle double-nested metadata from some indexes
+                    nested_meta = metadata['metadata']
+                    metadata.update(nested_meta)
+
+                # Initialize scoring components
+                score_components = {
+                    'tags': 0,
+                    'section': 0,
+                    'category': 0,
+                    'product': 0,
+                    'source': 0,
+                    'description': 0
+                }
+
+                # Check tags
+                if tags:
+                    tag_matches = 0
+                    for tag in tags:
+                        tag_lower = tag.lower()
+                        # Full query match in tag
+                        if query_lower in tag_lower:
+                            tag_matches += 2.0
+                        else:
+                            # Individual term matches
+                            term_matches = sum(1 for term in terms if term in tag_lower)
+                            tag_matches += term_matches * 0.5
+
+                    if tag_matches > 0:
+                        score_components['tags'] = min(1.0, tag_matches / len(tags))
+
+                # Check section
+                if section and section.lower() != 'none':
+                    section_lower = section.lower()
+                    if query_lower in section_lower:
+                        score_components['section'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in section_lower)
+                        score_components['section'] = (term_matches / len(terms)) * 0.8 if terms else 0
+
+                # Check category field
+                category = metadata.get('category', '')
+                if category:
+                    category_lower = category.lower()
+                    if query_lower in category_lower:
+                        score_components['category'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in category_lower)
+                        score_components['category'] = (term_matches / len(terms)) * 0.9 if terms else 0
+
+                # Check product field
+                product = metadata.get('product', '')
+                if product:
+                    product_lower = product.lower()
+                    if query_lower in product_lower:
+                        score_components['product'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in product_lower)
+                        score_components['product'] = (term_matches / len(terms)) * 0.8 if terms else 0
+
+                # Check source field (original filename)
+                source = metadata.get('source', '')
+                if source:
+                    source_lower = source.lower()
+                    if query_lower in source_lower:
+                        score_components['source'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in source_lower)
+                        score_components['source'] = (term_matches / len(terms)) * 0.7 if terms else 0
+
+                # Check description or title fields
+                description = metadata.get('description', metadata.get('title', ''))
+                if description:
+                    desc_lower = description.lower()
+                    if query_lower in desc_lower:
+                        score_components['description'] = 0.8
+                    else:
+                        term_matches = sum(1 for term in terms if term in desc_lower)
+                        score_components['description'] = (term_matches / len(terms)) * 0.6 if terms else 0
+
+                # Calculate total score with weights
+                weights = {
+                    'category': 1.8,     # Strong signal
+                    'product': 1.5,      # Strong signal
+                    'tags': 1.3,         # Good signal
+                    'source': 1.2,       # Good signal
+                    'section': 1.0,      # Moderate signal
+                    'description': 0.8   # Weaker signal
+                }
+
+                total_score = sum(score_components[field] * weights.get(field, 1.0)
+                                  for field in score_components)
+
+                # Track match coverage
+                fields_matched = sum(1 for score in score_components.values() if score > 0)
+                match_coverage = sum(1 for term in terms if any(
+                    term in str(field_value).lower()
+                    for field_value in [tags, section, category, product, source, description]
+                    if field_value
+                )) / len(terms) if terms else 0
+
+                if total_score > 0:
+                    results.append({
+                        'id': chunk_id,
+                        'content': content,
+                        'score': float(total_score),
+                        'metadata': {
+                            'filename': filename,
+                            'section': section,
+                            'tags': tags,
+                            'metadata': metadata,
+                            'category': category,
+                            'product': product,
+                            'source': source
+                        },
+                        'search_type': 'metadata',
+                        'metadata_matches': score_components,
+                        'fields_matched': fields_matched,
+                        'match_coverage': match_coverage
+                    })
+                    seen_ids.add(chunk_id)
+
+            conn.close()
+
+            # Sort by score and return top results
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:count]
+
+        except Exception as e:
+            logger.error(f"Error in metadata search: {e}")
+            return []
+
+    def _add_vector_scores_to_candidates(self, candidates: Dict[str, Dict], query_vector: NDArray,
+                                         similarity_threshold: float):
+        """Add vector similarity scores to existing candidates"""
+        if not candidates or not np:
+            return
+
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            # Get embeddings for candidate chunks only
+            chunk_ids = list(candidates.keys())
+            placeholders = ','.join(['?' for _ in chunk_ids])
+
+            cursor.execute(f'''
+                SELECT id, embedding
+                FROM chunks
+                WHERE id IN ({placeholders}) AND embedding IS NOT NULL AND embedding != ''
+            ''', chunk_ids)
+
+            for row in cursor.fetchall():
+                chunk_id, embedding_blob = row
+
+                if not embedding_blob:
+                    continue
+
+                try:
+                    # Convert embedding back to numpy array
+                    embedding = np.frombuffer(embedding_blob, dtype=np.float32).reshape(1, -1)
+
+                    # Calculate similarity
+                    similarity = cosine_similarity(query_vector, embedding)[0][0]
+                    distance = 1 - similarity
+
+                    # Add vector scores to candidate
+                    candidates[chunk_id]['vector_score'] = float(similarity)
+                    candidates[chunk_id]['vector_distance'] = float(distance)
+                    candidates[chunk_id]['sources']['vector_rerank'] = True
+
+                except Exception as e:
+                    logger.debug(f"Error processing embedding for chunk {chunk_id}: {e}")
+                    continue
+
+            conn.close()
+
+        except Exception as e:
+            logger.error(f"Error in vector re-ranking: {e}")
+
+    def _calculate_combined_score(self, candidate: Dict, similarity_threshold: float) -> float:
+        """Calculate final score with hybrid vector + metadata weighting
+
+        Hybrid approach:
+        - Vector score is the primary ranking signal (semantic similarity)
+        - Metadata/keyword matches provide confirmation boost
+        - Multiple signal types indicate high relevance (confirmation bonus)
+        - Special boost for 'code' tag matches when query contains code-related terms
+        """
+        sources = candidate.get('sources', {})
+        source_scores = candidate.get('source_scores', {})
+
+        # Vector score is PRIMARY
+        if 'vector_score' in candidate:
+            vector_score = candidate['vector_score']
+            base_score = vector_score
+
+            # Metadata/keyword matches provide confirmation boost
+            if len(sources) > 1:
+                # Has both vector AND metadata/keyword matches - strong confirmation signal
+                keyword_signals = sum(source_scores.get(k, 0) for k in ['keyword', 'filename', 'metadata'])
+                if keyword_signals > 0:
+                    # Normalize and apply boost (up to 30% for strong confirmation)
+                    keyword_boost = min(0.3, keyword_signals * 0.15)
+                    base_score = vector_score * (1.0 + keyword_boost)
+
+                    # Additional boost if multiple signal types confirm (2+ sources)
+                    num_metadata_sources = sum(1 for s in ['keyword', 'filename', 'metadata'] if s in sources)
+                    if num_metadata_sources >= 2:
+                        # Multiple confirmation signals - very high confidence
+                        base_score *= 1.1
+
+            # Check for code-related tags to boost code examples
+            tags = candidate.get('metadata', {}).get('tags', [])
+            if 'code' in tags:
+                # This chunk contains code - boost if query is code-related
+                # (metadata search would have found it if query mentioned code/example/python/etc)
+                if 'metadata' in sources or 'keyword' in sources:
+                    # Query matched code-related metadata - apply code boost
+                    base_score *= 1.2
+        else:
+            # No vector score - this is a keyword-only result (backfill)
+            # Use keyword scores but penalize for lack of semantic match
+            base_score = sum(source_scores.values()) * 0.6  # 40% penalty for no vector
+
+            # Still boost code chunks if metadata matched
+            tags = candidate.get('metadata', {}).get('tags', [])
+            if 'code' in tags and 'metadata' in sources:
+                base_score *= 1.15
+
+        return base_score
+
+    def _apply_diversity_penalties(self, results: List[Dict], target_count: int) -> List[Dict]:
+        """Apply penalties to prevent single-file dominance while maintaining quality"""
+        if not results:
+            return results
+
+        # Track file occurrences
+        file_counts = {}
+        penalized_results = []
+
+        # Define penalty multipliers
+        occurrence_penalties = {
+            1: 1.0,    # First chunk: no penalty
+            2: 0.85,   # Second chunk: 15% penalty
+            3: 0.7,    # Third chunk: 30% penalty
+            4: 0.5,    # Fourth chunk: 50% penalty
+        }
+
+        for result in results:
+            filename = result['metadata']['filename']
+
+            # Get current count for this file
+            current_count = file_counts.get(filename, 0) + 1
+            file_counts[filename] = current_count
+
+            # Apply penalty based on occurrence
+            penalty = occurrence_penalties.get(current_count, 0.4)  # 60% penalty for 5+ chunks
+
+            # Create a copy to avoid modifying original
+            penalized_result = result.copy()
+            penalized_result['diversity_penalty'] = penalty
+            penalized_result['final_score'] = result.get('final_score', result.get('score', 0)) * penalty
+
+            penalized_results.append(penalized_result)
+
+        # Re-sort by penalized scores
+        penalized_results.sort(key=lambda x: x['final_score'], reverse=True)
+
+        # Ensure minimum diversity if we have enough results
+        if len(penalized_results) > target_count:
+            unique_files = len(set(r['metadata']['filename'] for r in penalized_results[:target_count]))
+
+            # If top results are too homogeneous (e.g., all from 1-2 files)
+            if unique_files < min(3, target_count):
+                # Try to inject some diversity
+                selected = penalized_results[:target_count]
+                seen_files = set(r['metadata']['filename'] for r in selected)
+
+                # Look for high-quality results from other files
+                for result in penalized_results[target_count:]:
+                    if result['metadata']['filename'] not in seen_files:
+                        # If it's reasonably good (within 50% of top score), include it
+                        if result['final_score'] > 0.5 * selected[0]['final_score']:
+                            # Replace the lowest scoring result from an over-represented file
+                            for i in range(len(selected) - 1, -1, -1):
+                                if file_counts[selected[i]['metadata']['filename']] > 2:
+                                    selected[i] = result
+                                    seen_files.add(result['metadata']['filename'])
+                                    break
+
+                penalized_results[:target_count] = selected
+
+        return penalized_results
+
+    def _apply_match_type_diversity(self, results: List[Dict], target_count: int) -> List[Dict]:
+        """Ensure diversity of match types in final results
+
+        Ensures we have a mix of:
+        - Vector-only matches (semantic similarity, good for code examples)
+        - Keyword-only matches (exact term matches)
+        - Hybrid matches (both vector + keyword/metadata)
+        """
+        if not results or len(results) <= target_count:
+            return results
+
+        # Categorize results by match type
+        vector_only = []
+        keyword_only = []
+        hybrid = []
+
+        for result in results:
+            sources = result.get('sources', {})
+            has_vector = 'vector' in sources
+            has_keyword = any(k in sources for k in ['keyword', 'filename', 'metadata'])
+
+            if has_vector and not has_keyword:
+                vector_only.append(result)
+            elif has_keyword and not has_vector:
+                keyword_only.append(result)
+            else:
+                hybrid.append(result)
+
+        # Build diverse result set
+        # Target distribution: 40% hybrid, 40% vector-only, 20% keyword-only
+        # This ensures we include semantic matches (code examples) even if keywords don't match
+        diversified = []
+
+        # Take top hybrid matches first (best overall)
+        hybrid_target = max(1, int(target_count * 0.4))
+        diversified.extend(hybrid[:hybrid_target])
+
+        # Ensure we have vector-only matches (critical for code examples)
+        vector_target = max(1, int(target_count * 0.4))
+        diversified.extend(vector_only[:vector_target])
+
+        # Add keyword-only matches
+        keyword_target = max(1, int(target_count * 0.2))
+        diversified.extend(keyword_only[:keyword_target])
+
+        # Fill remaining slots with best remaining results regardless of type
+        remaining_slots = target_count - len(diversified)
+        if remaining_slots > 0:
+            # Get all unused results
+            used_ids = set(r['id'] for r in diversified)
+            unused = [r for r in results if r['id'] not in used_ids]
+            diversified.extend(unused[:remaining_slots])
+
+        # Sort by final score to maintain quality ordering
+        diversified.sort(key=lambda x: x['final_score'], reverse=True)
+
+        return diversified
+
     def get_stats(self) -> Dict[str, Any]:
         """Get statistics about the search index"""
+        # Use pgvector backend if available
+        if self.backend == 'pgvector':
+            return self._backend.get_stats()
+
+        # Original SQLite implementation
         conn = sqlite3.connect(self.index_path)
         cursor = conn.cursor()
 
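To make the hybrid scoring in `_calculate_combined_score` concrete, here is a back-of-the-envelope walk-through for one hypothetical candidate; the numbers are invented for illustration and follow the boost rules shown in the hunk above.

```python
# Candidate found by vector search (0.80) and confirmed by filename and
# keyword matches, with a 'code' tag on the chunk.
vector_score = 0.80
source_scores = {'filename': 0.9 * 2.0, 'keyword': 0.5 * 1.0}  # 1.8 + 0.5 = 2.3

keyword_signals = sum(source_scores.values())      # 2.3
keyword_boost = min(0.3, keyword_signals * 0.15)   # capped at 0.30
score = vector_score * (1.0 + keyword_boost)       # 0.80 * 1.30 = 1.04
score *= 1.1   # two metadata-type sources confirm the match
score *= 1.2   # 'code' tag plus a keyword/metadata hit -> code boost
print(round(score, 3))  # ~1.373, before diversity penalties are applied
```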