signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (143)
  1. signalwire_agents/__init__.py +99 -15
  2. signalwire_agents/agent_server.py +248 -60
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +9 -0
  5. signalwire_agents/cli/build_search.py +951 -41
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/dokku.py +2320 -0
  13. signalwire_agents/cli/execution/__init__.py +10 -0
  14. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  15. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  16. signalwire_agents/cli/init_project.py +2636 -0
  17. signalwire_agents/cli/output/__init__.py +10 -0
  18. signalwire_agents/cli/output/output_formatter.py +255 -0
  19. signalwire_agents/cli/output/swml_dump.py +186 -0
  20. signalwire_agents/cli/simulation/__init__.py +10 -0
  21. signalwire_agents/cli/simulation/data_generation.py +374 -0
  22. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  23. signalwire_agents/cli/simulation/mock_env.py +282 -0
  24. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  25. signalwire_agents/cli/test_swaig.py +566 -2366
  26. signalwire_agents/cli/types.py +81 -0
  27. signalwire_agents/core/__init__.py +2 -2
  28. signalwire_agents/core/agent/__init__.py +12 -0
  29. signalwire_agents/core/agent/config/__init__.py +12 -0
  30. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  31. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  32. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  33. signalwire_agents/core/agent/prompt/manager.py +306 -0
  34. signalwire_agents/core/agent/routing/__init__.py +9 -0
  35. signalwire_agents/core/agent/security/__init__.py +9 -0
  36. signalwire_agents/core/agent/swml/__init__.py +9 -0
  37. signalwire_agents/core/agent/tools/__init__.py +15 -0
  38. signalwire_agents/core/agent/tools/decorator.py +97 -0
  39. signalwire_agents/core/agent/tools/registry.py +210 -0
  40. signalwire_agents/core/agent_base.py +845 -2916
  41. signalwire_agents/core/auth_handler.py +233 -0
  42. signalwire_agents/core/config_loader.py +259 -0
  43. signalwire_agents/core/contexts.py +418 -0
  44. signalwire_agents/core/data_map.py +3 -15
  45. signalwire_agents/core/function_result.py +116 -44
  46. signalwire_agents/core/logging_config.py +162 -18
  47. signalwire_agents/core/mixins/__init__.py +28 -0
  48. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  49. signalwire_agents/core/mixins/auth_mixin.py +280 -0
  50. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  51. signalwire_agents/core/mixins/serverless_mixin.py +460 -0
  52. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  53. signalwire_agents/core/mixins/state_mixin.py +153 -0
  54. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  55. signalwire_agents/core/mixins/web_mixin.py +1142 -0
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +84 -1
  58. signalwire_agents/core/skill_manager.py +62 -20
  59. signalwire_agents/core/swaig_function.py +18 -5
  60. signalwire_agents/core/swml_builder.py +207 -11
  61. signalwire_agents/core/swml_handler.py +27 -21
  62. signalwire_agents/core/swml_renderer.py +123 -312
  63. signalwire_agents/core/swml_service.py +171 -203
  64. signalwire_agents/mcp_gateway/__init__.py +29 -0
  65. signalwire_agents/mcp_gateway/gateway_service.py +564 -0
  66. signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
  67. signalwire_agents/mcp_gateway/session_manager.py +218 -0
  68. signalwire_agents/prefabs/concierge.py +0 -3
  69. signalwire_agents/prefabs/faq_bot.py +0 -3
  70. signalwire_agents/prefabs/info_gatherer.py +0 -3
  71. signalwire_agents/prefabs/receptionist.py +0 -3
  72. signalwire_agents/prefabs/survey.py +0 -3
  73. signalwire_agents/schema.json +9218 -5489
  74. signalwire_agents/search/__init__.py +7 -1
  75. signalwire_agents/search/document_processor.py +490 -31
  76. signalwire_agents/search/index_builder.py +307 -37
  77. signalwire_agents/search/migration.py +418 -0
  78. signalwire_agents/search/models.py +30 -0
  79. signalwire_agents/search/pgvector_backend.py +748 -0
  80. signalwire_agents/search/query_processor.py +162 -31
  81. signalwire_agents/search/search_engine.py +916 -35
  82. signalwire_agents/search/search_service.py +376 -53
  83. signalwire_agents/skills/README.md +452 -0
  84. signalwire_agents/skills/__init__.py +14 -2
  85. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  86. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  87. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  88. signalwire_agents/skills/datasphere/README.md +210 -0
  89. signalwire_agents/skills/datasphere/skill.py +84 -3
  90. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  91. signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
  92. signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
  93. signalwire_agents/skills/datetime/README.md +132 -0
  94. signalwire_agents/skills/datetime/__init__.py +9 -0
  95. signalwire_agents/skills/datetime/skill.py +20 -7
  96. signalwire_agents/skills/joke/README.md +149 -0
  97. signalwire_agents/skills/joke/__init__.py +9 -0
  98. signalwire_agents/skills/joke/skill.py +21 -0
  99. signalwire_agents/skills/math/README.md +161 -0
  100. signalwire_agents/skills/math/__init__.py +9 -0
  101. signalwire_agents/skills/math/skill.py +18 -4
  102. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  103. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  104. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  105. signalwire_agents/skills/native_vector_search/README.md +210 -0
  106. signalwire_agents/skills/native_vector_search/__init__.py +9 -0
  107. signalwire_agents/skills/native_vector_search/skill.py +569 -101
  108. signalwire_agents/skills/play_background_file/README.md +218 -0
  109. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  110. signalwire_agents/skills/play_background_file/skill.py +242 -0
  111. signalwire_agents/skills/registry.py +395 -40
  112. signalwire_agents/skills/spider/README.md +236 -0
  113. signalwire_agents/skills/spider/__init__.py +13 -0
  114. signalwire_agents/skills/spider/skill.py +598 -0
  115. signalwire_agents/skills/swml_transfer/README.md +395 -0
  116. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  117. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  118. signalwire_agents/skills/weather_api/README.md +178 -0
  119. signalwire_agents/skills/weather_api/__init__.py +12 -0
  120. signalwire_agents/skills/weather_api/skill.py +191 -0
  121. signalwire_agents/skills/web_search/README.md +163 -0
  122. signalwire_agents/skills/web_search/__init__.py +9 -0
  123. signalwire_agents/skills/web_search/skill.py +586 -112
  124. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  125. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  126. signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
  127. signalwire_agents/web/__init__.py +17 -0
  128. signalwire_agents/web/web_service.py +559 -0
  129. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
  130. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
  131. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
  132. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
  133. signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
  134. signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
  135. signalwire_agents/core/state/file_state_manager.py +0 -219
  136. signalwire_agents/core/state/state_manager.py +0 -101
  137. signalwire_agents/skills/wikipedia/__init__.py +0 -9
  138. signalwire_agents-0.1.13.data/data/schema.json +0 -5611
  139. signalwire_agents-0.1.13.dist-info/RECORD +0 -67
  140. signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
  141. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
  142. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
  143. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
@@ -26,11 +26,38 @@ logger = logging.getLogger(__name__)
 class SearchEngine:
     """Hybrid search engine for vector and keyword search"""
 
-    def __init__(self, index_path: str, model=None):
-        self.index_path = index_path
+    def __init__(self, backend: str = 'sqlite', index_path: Optional[str] = None,
+                 connection_string: Optional[str] = None, collection_name: Optional[str] = None,
+                 model=None):
+        """
+        Initialize search engine
+
+        Args:
+            backend: Storage backend ('sqlite' or 'pgvector')
+            index_path: Path to .swsearch file (for sqlite backend)
+            connection_string: PostgreSQL connection string (for pgvector backend)
+            collection_name: Collection name (for pgvector backend)
+            model: Optional sentence transformer model
+        """
+        self.backend = backend
         self.model = model
-        self.config = self._load_config()
-        self.embedding_dim = int(self.config.get('embedding_dimensions', 768))
+
+        if backend == 'sqlite':
+            if not index_path:
+                raise ValueError("index_path is required for sqlite backend")
+            self.index_path = index_path
+            self.config = self._load_config()
+            self.embedding_dim = int(self.config.get('embedding_dimensions', 768))
+            self._backend = None  # SQLite uses direct connection
+        elif backend == 'pgvector':
+            if not connection_string or not collection_name:
+                raise ValueError("connection_string and collection_name are required for pgvector backend")
+            from .pgvector_backend import PgVectorSearchBackend
+            self._backend = PgVectorSearchBackend(connection_string, collection_name)
+            self.config = self._backend.config
+            self.embedding_dim = int(self.config.get('embedding_dimensions', 768))
+        else:
+            raise ValueError(f"Invalid backend '{backend}'. Must be 'sqlite' or 'pgvector'")
 
     def _load_config(self) -> Dict[str, str]:
         """Load index configuration"""
@@ -46,58 +73,132 @@ class SearchEngine:
         return {}
 
     def search(self, query_vector: List[float], enhanced_text: str,
-               count: int = 3, distance_threshold: float = 0.0,
-               tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+               count: int = 3, similarity_threshold: float = 0.0,
+               tags: Optional[List[str]] = None,
+               keyword_weight: Optional[float] = None,
+               original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """
-        Perform hybrid search (vector + keyword)
+        Perform improved search with fast filtering and vector re-ranking
+
+        Strategy:
+        1. Fast candidate collection (filename, metadata, keywords)
+        2. Vector re-ranking on candidates only
+        3. Fallback to full vector search if few candidates
 
         Args:
             query_vector: Embedding vector for the query
             enhanced_text: Processed query text for keyword search
             count: Number of results to return
-            distance_threshold: Minimum similarity score
+            similarity_threshold: Minimum similarity score
             tags: Filter by tags
+            keyword_weight: Optional manual weight for keyword vs vector
+            original_query: Original query for exact matching
 
         Returns:
             List of search results with scores and metadata
         """
 
+        # Use pgvector backend if available
+        if self.backend == 'pgvector':
+            return self._backend.search(query_vector, enhanced_text, count, similarity_threshold, tags, keyword_weight)
+
+        # Check for numpy/sklearn availability
         if not np or not cosine_similarity:
             logger.warning("NumPy or scikit-learn not available. Using keyword search only.")
-            return self._keyword_search_only(enhanced_text, count, tags)
+            return self._keyword_search_only(enhanced_text, count, tags, original_query)
 
         # Convert query vector to numpy array
         try:
             query_array = np.array(query_vector).reshape(1, -1)
         except Exception as e:
             logger.error(f"Error converting query vector: {e}")
-            return self._keyword_search_only(enhanced_text, count, tags)
+            return self._keyword_search_only(enhanced_text, count, tags, original_query)
 
-        # Vector search
-        vector_results = self._vector_search(query_array, count * 2)
+        # HYBRID APPROACH: Search vector AND metadata in parallel
+        # Stage 1: Run both search types simultaneously
+        search_multiplier = 3
+
+        # Vector search (semantic similarity - primary ranking signal)
+        vector_results = self._vector_search(query_array, count * search_multiplier)
+
+        # Metadata/keyword searches (confirmation signals and backfill)
+        filename_results = self._filename_search(original_query or enhanced_text, count * search_multiplier)
+        metadata_results = self._metadata_search(original_query or enhanced_text, count * search_multiplier)
+        keyword_results = self._keyword_search(enhanced_text, count * search_multiplier, original_query)
+
+        logger.debug(f"Parallel search: vector={len(vector_results)}, filename={len(filename_results)}, "
+                     f"metadata={len(metadata_results)}, keyword={len(keyword_results)}")
+
+        # Stage 2: Merge all results into candidate pool
+        candidates = {}
+
+        # Add vector results first (primary signal)
+        for result in vector_results:
+            chunk_id = result['id']
+            candidates[chunk_id] = result
+            candidates[chunk_id]['vector_score'] = result['score']
+            candidates[chunk_id]['vector_distance'] = 1 - result['score']
+            candidates[chunk_id]['sources'] = {'vector': True}
+            candidates[chunk_id]['source_scores'] = {'vector': result['score']}
+
+        # Add metadata/keyword results (secondary signals that boost or backfill)
+        for result_set, source_type, source_weight in [(filename_results, 'filename', 2.0),
+                                                        (metadata_results, 'metadata', 1.5),
+                                                        (keyword_results, 'keyword', 1.0)]:
+            for result in result_set:
+                chunk_id = result['id']
+                if chunk_id not in candidates:
+                    # New candidate from metadata/keyword (no vector match)
+                    candidates[chunk_id] = result
+                    candidates[chunk_id]['sources'] = {source_type: True}
+                    candidates[chunk_id]['source_scores'] = {source_type: result['score'] * source_weight}
+                else:
+                    # Exists in vector results - add metadata/keyword as confirmation signal
+                    candidates[chunk_id]['sources'][source_type] = True
+                    candidates[chunk_id]['source_scores'][source_type] = result['score'] * source_weight
 
-        # Keyword search
-        keyword_results = self._keyword_search(enhanced_text, count * 2)
+        # Stage 3: Score and rank all candidates
+        final_results = []
+        for chunk_id, candidate in candidates.items():
+            # Calculate final score combining all signals
+            score = self._calculate_combined_score(candidate, similarity_threshold)
+            candidate['final_score'] = score
+            final_results.append(candidate)
 
-        # Merge and rank results
-        merged_results = self._merge_results(vector_results, keyword_results)
+        # Sort by final score
+        final_results.sort(key=lambda x: x['final_score'], reverse=True)
 
         # Filter by tags if specified
         if tags:
-            merged_results = self._filter_by_tags(merged_results, tags)
+            final_results = [r for r in final_results
+                             if any(tag in r['metadata'].get('tags', []) for tag in tags)]
+
+        # Apply distance threshold as final filter (soft threshold already applied in scoring)
+        if similarity_threshold > 0:
+            final_results = [r for r in final_results
+                             if r.get('vector_distance', 0) <= similarity_threshold * 1.5
+                             or 'vector' not in r.get('sources', {})]
 
-        # Filter by distance threshold
-        filtered_results = [
-            r for r in merged_results
-            if r['score'] >= distance_threshold
-        ]
+        # Boost exact matches if we have the original query
+        if original_query:
+            final_results = self._boost_exact_matches(final_results, original_query)
+            # Re-sort after boosting
+            final_results.sort(key=lambda x: x['final_score'], reverse=True)
 
-        return filtered_results[:count]
+        # Apply diversity penalties to prevent single-file dominance
+        final_results = self._apply_diversity_penalties(final_results, count)
+
+        # Ensure 'score' field exists for CLI compatibility
+        for r in final_results:
+            if 'score' not in r:
+                r['score'] = r.get('final_score', 0.0)
+
+        return final_results[:count]
 
     def _keyword_search_only(self, enhanced_text: str, count: int,
-                             tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+                             tags: Optional[List[str]] = None, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """Fallback to keyword search only when vector search is unavailable"""
-        keyword_results = self._keyword_search(enhanced_text, count)
+        keyword_results = self._keyword_search(enhanced_text, count, original_query)
 
         if tags:
             keyword_results = self._filter_by_tags(keyword_results, tags)
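
A sketch of a call against the new search() signature. This is illustrative only: `engine` and `model` are assumed to exist, with `model` being the same sentence-transformer used to build the index, and the parameter values are examples:

    query = "getting started with agents"
    query_vector = model.encode(query).tolist()  # assumed embedding step

    results = engine.search(
        query_vector=query_vector,
        enhanced_text=query,            # processed query text for keyword search
        count=5,
        similarity_threshold=0.2,       # replaces distance_threshold from 0.1.13
        tags=['docs'],
        original_query=query,           # enables exact-match boosting
    )

    for r in results:
        print(r['score'], r['metadata']['filename'], sorted(r.get('sources', {})))
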
@@ -161,7 +262,7 @@ class SearchEngine:
             logger.error(f"Error in vector search: {e}")
             return []
 
-    def _keyword_search(self, enhanced_text: str, count: int) -> List[Dict[str, Any]]:
+    def _keyword_search(self, enhanced_text: str, count: int, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """Perform full-text search"""
         try:
             conn = sqlite3.connect(self.index_path)
@@ -203,6 +304,12 @@ class SearchEngine:
                 })
 
             conn.close()
+
+            # If FTS returns no results, try fallback LIKE search
+            if not results:
+                logger.debug(f"FTS returned no results for '{enhanced_text}', trying fallback search")
+                return self._fallback_search(enhanced_text, count)
+
             return results
 
         except Exception as e:
@@ -227,35 +334,74 @@ class SearchEngine:
             conn = sqlite3.connect(self.index_path)
             cursor = conn.cursor()
 
-            # Simple LIKE search
+            # Simple LIKE search with word boundaries
             search_terms = enhanced_text.lower().split()
             like_conditions = []
             params = []
 
             for term in search_terms[:5]:  # Limit to 5 terms to avoid too complex queries
-                like_conditions.append("LOWER(processed_content) LIKE ?")
-                params.append(f"%{term}%")
+                # Search for term with word boundaries (space or punctuation)
+                like_conditions.append("""
+                    (LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?)
+                """)
+                params.extend([
+                    f"% {term} %",  # space on both sides
+                    f"{term} %",    # at beginning
+                    f"% {term}",    # at end
+                    f"{term}"       # exact match
+                ])
 
             if not like_conditions:
                 return []
 
+            # Also search in original content
+            content_conditions = []
+            for term in search_terms[:5]:
+                content_conditions.append("""
+                    (LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?)
+                """)
+                params.extend([
+                    f"% {term} %",  # with spaces
+                    f"{term} %",    # at beginning
+                    f"% {term}",    # at end
+                    f"{term}"       # exact match
+                ])
+
             query = f'''
                 SELECT id, content, filename, section, tags, metadata
                 FROM chunks
-                WHERE {" OR ".join(like_conditions)}
+                WHERE ({" OR ".join(like_conditions)})
+                   OR ({" OR ".join(content_conditions)})
                 LIMIT ?
             '''
             params.append(count)
 
+
             cursor.execute(query, params)
 
             results = []
             for row in cursor.fetchall():
                 chunk_id, content, filename, section, tags_json, metadata_json = row
 
-                # Simple scoring based on term matches
+                # Simple scoring based on term matches with word boundaries
                 content_lower = content.lower()
-                score = sum(1 for term in search_terms if term.lower() in content_lower) / len(search_terms)
+                # Check for whole word matches
+                word_matches = 0
+                for term in search_terms:
+                    term_lower = term.lower()
+                    # Check word boundaries
+                    if (f" {term_lower} " in f" {content_lower} " or
+                            content_lower.startswith(f"{term_lower} ") or
+                            content_lower.endswith(f" {term_lower}") or
+                            content_lower == term_lower):
+                        word_matches += 1
+                score = word_matches / len(search_terms) if search_terms else 0.0
 
                 results.append({
                     'id': chunk_id,
@@ -274,14 +420,23 @@ class SearchEngine:
 
             # Sort by score
             results.sort(key=lambda x: x['score'], reverse=True)
+
             return results
 
         except Exception as e:
             logger.error(f"Error in fallback search: {e}")
             return []
 
-    def _merge_results(self, vector_results: List[Dict], keyword_results: List[Dict]) -> List[Dict[str, Any]]:
+    def _merge_results(self, vector_results: List[Dict], keyword_results: List[Dict],
+                       vector_weight: Optional[float] = None,
+                       keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
         """Merge and rank vector and keyword search results"""
+        # Use provided weights or defaults
+        if vector_weight is None:
+            vector_weight = 0.7
+        if keyword_weight is None:
+            keyword_weight = 0.3
+
         # Create a combined list with weighted scores
         combined = {}
 
@@ -303,8 +458,6 @@ class SearchEngine:
             combined[chunk_id]['keyword_score'] = result['score']
 
         # Calculate combined score (weighted average)
-        vector_weight = 0.7
-        keyword_weight = 0.3
 
         for chunk_id, result in combined.items():
             vector_score = result.get('vector_score', 0.0)
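
For reference, `_merge_results` still combines the two signals as a weighted average; a standalone sketch of that arithmetic with the default weights above (not library code):

    def merge_score(vector_score, keyword_score, vector_weight=0.7, keyword_weight=0.3):
        # Weighted average of the two signals, matching the defaults now passed in
        return vector_score * vector_weight + keyword_score * keyword_weight

    print(merge_score(0.82, 0.10))  # 0.604: strong semantic match, weak keyword overlap
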
@@ -331,8 +484,736 @@ class SearchEngine:
                 filtered.append(result)
         return filtered
 
+    def _boost_exact_matches(self, results: List[Dict[str, Any]], original_query: str) -> List[Dict[str, Any]]:
+        """Boost scores for results that contain exact matches of the original query"""
+        if not original_query:
+            return results
+
+        # Extract key phrases to look for
+        query_lower = original_query.lower()
+
+        for result in results:
+            content_lower = result['content'].lower()
+            filename_lower = result['metadata'].get('filename', '').lower()
+
+            # Boost for exact phrase match in content
+            if query_lower in content_lower:
+                result['score'] *= 2.0  # Double score for exact match
+
+            # Boost for matches in filenames that suggest relevance
+            if any(term in filename_lower for term in ['example', 'sample', 'demo', 'tutorial', 'guide']):
+                if 'example' in query_lower or 'sample' in query_lower or 'code' in query_lower:
+                    result['score'] *= 1.5
+
+            # Boost for "getting started" type queries
+            if 'getting started' in query_lower and 'start' in content_lower:
+                result['score'] *= 1.5
+
+        return results
+
+    def _filename_search(self, query: str, count: int) -> List[Dict[str, Any]]:
+        """Search for query in filenames with term coverage scoring"""
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            query_lower = query.lower()
+            terms = query_lower.split()
+
+            # First try exact phrase match
+            cursor.execute('''
+                SELECT DISTINCT id, content, filename, section, tags, metadata
+                FROM chunks
+                WHERE LOWER(filename) LIKE ?
+                LIMIT ?
+            ''', (f'%{query_lower}%', count))
+
+            results = []
+            seen_ids = set()
+
+            # Process exact matches
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json = row
+                seen_ids.add(chunk_id)
+
+                # High score for exact phrase match
+                filename_lower = filename.lower()
+                basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower
+                if query_lower in basename:
+                    score = 3.0  # Exact match in basename (increased weight)
+                else:
+                    score = 2.0  # Exact match in path
+
+                results.append({
+                    'id': chunk_id,
+                    'content': content,
+                    'score': float(score),
+                    'metadata': {
+                        'filename': filename,
+                        'section': section,
+                        'tags': json.loads(tags_json) if tags_json else [],
+                        'metadata': json.loads(metadata_json) if metadata_json else {}
+                    },
+                    'search_type': 'filename',
+                    'match_coverage': 1.0  # Exact match = 100% coverage
+                })
+
+            # Then search for files containing ANY of the terms
+            if terms and len(results) < count * 3:  # Get more candidates
+                # Build OR query for any term match
+                conditions = []
+                params = []
+                for term in terms:
+                    conditions.append("LOWER(filename) LIKE ?")
+                    params.append(f'%{term}%')
+
+                sql = f'''
+                    SELECT DISTINCT id, content, filename, section, tags, metadata
+                    FROM chunks
+                    WHERE ({' OR '.join(conditions)})
+                    AND id NOT IN ({','.join(['?' for _ in seen_ids]) if seen_ids else '0'})
+                    LIMIT ?
+                '''
+                if seen_ids:
+                    params.extend(seen_ids)
+                params.append(count * 3)
+
+                cursor.execute(sql, params)
+
+                for row in cursor.fetchall():
+                    chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                    # Enhanced scoring based on term coverage
+                    filename_lower = filename.lower()
+                    basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower
+
+                    # Count matches in basename vs full path
+                    basename_matches = sum(1 for term in terms if term in basename)
+                    path_matches = sum(1 for term in terms if term in filename_lower)
+
+                    # Calculate term coverage (what % of query terms are matched)
+                    term_coverage = path_matches / len(terms) if terms else 0
+                    basename_coverage = basename_matches / len(terms) if terms else 0
+
+                    # Check for substring bonus (e.g., "code_examples" contains both terms together)
+                    substring_bonus = 0
+                    if len(terms) > 1:
+                        # Check if terms appear consecutively
+                        for i in range(len(terms) - 1):
+                            if f"{terms[i]}_{terms[i+1]}" in filename_lower or f"{terms[i]}{terms[i+1]}" in filename_lower:
+                                substring_bonus = 0.3
+                                break
+
+                    # Score based on coverage with exponential boost for more matches
+                    if basename_coverage > 0:
+                        # Exponential scoring for basename matches
+                        score = basename_coverage ** 1.5 + substring_bonus
+                    else:
+                        # Lower score for path-only matches
+                        score = (term_coverage * 0.5) ** 1.5 + substring_bonus
+
+                    results.append({
+                        'id': chunk_id,
+                        'content': content,
+                        'score': float(score),
+                        'metadata': {
+                            'filename': filename,
+                            'section': section,
+                            'tags': json.loads(tags_json) if tags_json else [],
+                            'metadata': json.loads(metadata_json) if metadata_json else {}
+                        },
+                        'search_type': 'filename',
+                        'match_coverage': term_coverage
+                    })
+
+            conn.close()
+
+            # Sort by score and return top results
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:count]
+
+        except Exception as e:
+            logger.error(f"Error in filename search: {e}")
+            return []
+
+    def _metadata_search(self, query: str, count: int) -> List[Dict[str, Any]]:
+        """Search in all metadata fields (tags, sections, category, product, source)"""
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            query_lower = query.lower()
+            terms = query_lower.split()
+            results = []
+            seen_ids = set()
+
+            # First, try to use the metadata_text column if it exists
+            try:
+                # Check if metadata_text column exists
+                cursor.execute("PRAGMA table_info(chunks)")
+                columns = [col[1] for col in cursor.fetchall()]
+                has_metadata_text = 'metadata_text' in columns
+            except:
+                has_metadata_text = False
+
+            if has_metadata_text:
+                # Use the new metadata_text column for efficient searching
+                # Build conditions for each term
+                conditions = []
+                for term in terms:
+                    conditions.append(f"metadata_text LIKE '%{term}%'")
+
+                if conditions:
+                    query_sql = f'''
+                        SELECT id, content, filename, section, tags, metadata
+                        FROM chunks
+                        WHERE {' AND '.join(conditions)}
+                        LIMIT ?
+                    '''
+                    cursor.execute(query_sql, (count * 10,))
+
+                    for row in cursor.fetchall():
+                        chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                        if chunk_id in seen_ids:
+                            continue
+
+                        # Parse metadata
+                        metadata = json.loads(metadata_json) if metadata_json else {}
+                        tags = json.loads(tags_json) if tags_json else []
+
+                        # Calculate score based on how many terms match
+                        score = 0
+                        for term in terms:
+                            # Check metadata values
+                            metadata_str = json.dumps(metadata).lower()
+                            if term in metadata_str:
+                                score += 1.5
+                            # Check tags
+                            if any(term in str(tag).lower() for tag in tags):
+                                score += 1.0
+                            # Check section
+                            if section and term in section.lower():
+                                score += 0.8
+
+                        if score > 0:
+                            seen_ids.add(chunk_id)
+                            results.append({
+                                'id': chunk_id,
+                                'content': content,
+                                'score': score,
+                                'metadata': {
+                                    'filename': filename,
+                                    'section': section,
+                                    'tags': tags,
+                                    'metadata': metadata
+                                },
+                                'search_type': 'metadata'
+                            })
+
+            # Fallback: search for JSON metadata embedded in content
+            # This ensures backwards compatibility
+            if len(results) < count:
+                # Build specific conditions for known patterns
+                specific_conditions = []
+
+                # Look for specific high-value patterns first
+                if 'code' in terms and 'examples' in terms:
+                    specific_conditions.append('content LIKE \'%"category": "Code Examples"%\'')
+                if 'sdk' in terms:
+                    specific_conditions.append('content LIKE \'%"product": "%\' || \'SDK\' || \'%"%\'')
+
+                # General term search in JSON content
+                for term in terms:
+                    specific_conditions.append(f"content LIKE '%\"{term}%'")
+
+                if specific_conditions:
+                    # Limit conditions to avoid too broad search
+                    conditions_to_use = specific_conditions[:10]
+                    query_sql = f'''
+                        SELECT id, content, filename, section, tags, metadata
+                        FROM chunks
+                        WHERE ({' OR '.join(conditions_to_use)})
+                        AND id NOT IN ({','.join(str(id) for id in seen_ids) if seen_ids else '0'})
+                        LIMIT ?
+                    '''
+                    cursor.execute(query_sql, (count * 5,))
+
+                    rows = cursor.fetchall()
+
+                    for row in rows:
+                        chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                        if chunk_id in seen_ids:
+                            continue
+
+                        # Try to extract metadata from JSON content
+                        json_metadata = {}
+                        try:
+                            # Look for metadata in JSON structure
+                            if '"metadata":' in content:
+                                import re
+                                # More robust regex to extract nested JSON object
+                                # This handles nested braces properly
+                                start = content.find('"metadata":')
+                                if start != -1:
+                                    # Find the opening brace
+                                    brace_start = content.find('{', start)
+                                    if brace_start != -1:
+                                        # Count braces to find matching closing brace
+                                        brace_count = 0
+                                        i = brace_start
+                                        while i < len(content):
+                                            if content[i] == '{':
+                                                brace_count += 1
+                                            elif content[i] == '}':
+                                                brace_count -= 1
+                                                if brace_count == 0:
+                                                    # Found matching closing brace
+                                                    metadata_str = content[brace_start:i+1]
+                                                    json_metadata = json.loads(metadata_str)
+                                                    break
+                                            i += 1
+                        except:
+                            pass
+
+                        # Calculate score based on matches
+                        score = 0
+                        fields_matched = 0
+
+                        # Check JSON metadata extracted from content
+                        if json_metadata:
+                            # Check category - count how many terms match
+                            category = json_metadata.get('category', '').lower()
+                            if category:
+                                category_matches = sum(1 for term in terms if term in category)
+                                if category_matches > 0:
+                                    score += 1.8 * (category_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Check product - count how many terms match
+                            product = json_metadata.get('product', '').lower()
+                            if product:
+                                product_matches = sum(1 for term in terms if term in product)
+                                if product_matches > 0:
+                                    score += 1.5 * (product_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Check source
+                            source = json_metadata.get('source', '').lower()
+                            if source:
+                                source_matches = sum(1 for term in terms if term in source)
+                                if source_matches > 0:
+                                    score += 1.2 * (source_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Also check tags from JSON metadata
+                            json_tags = json_metadata.get('tags', [])
+                            if json_tags:
+                                tags_str = str(json_tags).lower()
+                                tag_matches = sum(1 for term in terms if term in tags_str)
+                                if tag_matches > 0:
+                                    score += 1.3 * (tag_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                        if score > 0:
+                            seen_ids.add(chunk_id)
+                            results.append({
+                                'id': chunk_id,
+                                'content': content,
+                                'score': float(score),
+                                'metadata': {
+                                    'filename': filename,
+                                    'section': section,
+                                    'tags': json.loads(tags_json) if tags_json else [],
+                                    'metadata': json.loads(metadata_json) if metadata_json else {}
+                                },
+                                'search_type': 'metadata',
+                                'fields_matched': fields_matched
+                            })
+                            logger.debug(f"Metadata match: {filename} - score={score:.2f}, fields_matched={fields_matched}, json_metadata={json_metadata}")
+
+            # Also get chunks with regular metadata
+            cursor.execute('''
+                SELECT id, content, filename, section, tags, metadata
+                FROM chunks
+                WHERE (tags IS NOT NULL AND tags != '')
+                   OR (metadata IS NOT NULL AND metadata != '{}')
+                   OR (section IS NOT NULL AND section != '')
+                LIMIT ?
+            ''', (count * 10,))  # Get more to search through
+
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                if chunk_id in seen_ids:
+                    continue
+
+                # Parse metadata
+                tags = json.loads(tags_json) if tags_json else []
+                metadata = json.loads(metadata_json) if metadata_json else {}
+
+                # Flatten nested metadata if present
+                if 'metadata' in metadata:
+                    # Handle double-nested metadata from some indexes
+                    nested_meta = metadata['metadata']
+                    metadata.update(nested_meta)
+
+                # Initialize scoring components
+                score_components = {
+                    'tags': 0,
+                    'section': 0,
+                    'category': 0,
+                    'product': 0,
+                    'source': 0,
+                    'description': 0
+                }
+
+                # Check tags
+                if tags:
+                    tag_matches = 0
+                    for tag in tags:
+                        tag_lower = tag.lower()
+                        # Full query match in tag
+                        if query_lower in tag_lower:
+                            tag_matches += 2.0
+                        else:
+                            # Individual term matches
+                            term_matches = sum(1 for term in terms if term in tag_lower)
+                            tag_matches += term_matches * 0.5
+
+                    if tag_matches > 0:
+                        score_components['tags'] = min(1.0, tag_matches / len(tags))
+
+                # Check section
+                if section and section.lower() != 'none':
+                    section_lower = section.lower()
+                    if query_lower in section_lower:
+                        score_components['section'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in section_lower)
+                        score_components['section'] = (term_matches / len(terms)) * 0.8 if terms else 0
+
+                # Check category field
+                category = metadata.get('category', '')
+                if category:
+                    category_lower = category.lower()
+                    if query_lower in category_lower:
+                        score_components['category'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in category_lower)
+                        score_components['category'] = (term_matches / len(terms)) * 0.9 if terms else 0
+
+                # Check product field
+                product = metadata.get('product', '')
+                if product:
+                    product_lower = product.lower()
+                    if query_lower in product_lower:
+                        score_components['product'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in product_lower)
+                        score_components['product'] = (term_matches / len(terms)) * 0.8 if terms else 0
+
+                # Check source field (original filename)
+                source = metadata.get('source', '')
+                if source:
+                    source_lower = source.lower()
+                    if query_lower in source_lower:
+                        score_components['source'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in source_lower)
+                        score_components['source'] = (term_matches / len(terms)) * 0.7 if terms else 0
+
+                # Check description or title fields
+                description = metadata.get('description', metadata.get('title', ''))
+                if description:
+                    desc_lower = description.lower()
+                    if query_lower in desc_lower:
+                        score_components['description'] = 0.8
+                    else:
+                        term_matches = sum(1 for term in terms if term in desc_lower)
+                        score_components['description'] = (term_matches / len(terms)) * 0.6 if terms else 0
+
+                # Calculate total score with weights
+                weights = {
+                    'category': 1.8,     # Strong signal
+                    'product': 1.5,      # Strong signal
+                    'tags': 1.3,         # Good signal
+                    'source': 1.2,       # Good signal
+                    'section': 1.0,      # Moderate signal
+                    'description': 0.8   # Weaker signal
+                }
+
+                total_score = sum(score_components[field] * weights.get(field, 1.0)
+                                  for field in score_components)
+
+                # Track match coverage
+                fields_matched = sum(1 for score in score_components.values() if score > 0)
+                match_coverage = sum(1 for term in terms if any(
+                    term in str(field_value).lower()
+                    for field_value in [tags, section, category, product, source, description]
+                    if field_value
+                )) / len(terms) if terms else 0
+
+                if total_score > 0:
+                    results.append({
+                        'id': chunk_id,
+                        'content': content,
+                        'score': float(total_score),
+                        'metadata': {
+                            'filename': filename,
+                            'section': section,
+                            'tags': tags,
+                            'metadata': metadata,
+                            'category': category,
+                            'product': product,
+                            'source': source
+                        },
+                        'search_type': 'metadata',
+                        'metadata_matches': score_components,
+                        'fields_matched': fields_matched,
+                        'match_coverage': match_coverage
+                    })
+                    seen_ids.add(chunk_id)
+
+            conn.close()
+
+            # Sort by score and return top results
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:count]
+
+        except Exception as e:
+            logger.error(f"Error in metadata search: {e}")
+            return []
+
+    def _add_vector_scores_to_candidates(self, candidates: Dict[str, Dict], query_vector: NDArray,
+                                         similarity_threshold: float):
+        """Add vector similarity scores to existing candidates"""
+        if not candidates or not np:
+            return
+
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            # Get embeddings for candidate chunks only
+            chunk_ids = list(candidates.keys())
+            placeholders = ','.join(['?' for _ in chunk_ids])
+
+            cursor.execute(f'''
+                SELECT id, embedding
+                FROM chunks
+                WHERE id IN ({placeholders}) AND embedding IS NOT NULL AND embedding != ''
+            ''', chunk_ids)
+
+            for row in cursor.fetchall():
+                chunk_id, embedding_blob = row
+
+                if not embedding_blob:
+                    continue
+
+                try:
+                    # Convert embedding back to numpy array
+                    embedding = np.frombuffer(embedding_blob, dtype=np.float32).reshape(1, -1)
+
+                    # Calculate similarity
+                    similarity = cosine_similarity(query_vector, embedding)[0][0]
+                    distance = 1 - similarity
+
+                    # Add vector scores to candidate
+                    candidates[chunk_id]['vector_score'] = float(similarity)
+                    candidates[chunk_id]['vector_distance'] = float(distance)
+                    candidates[chunk_id]['sources']['vector_rerank'] = True
+
+                except Exception as e:
+                    logger.debug(f"Error processing embedding for chunk {chunk_id}: {e}")
+                    continue
+
+            conn.close()
+
+        except Exception as e:
+            logger.error(f"Error in vector re-ranking: {e}")
+
+    def _calculate_combined_score(self, candidate: Dict, similarity_threshold: float) -> float:
+        """Calculate final score with hybrid vector + metadata weighting
+
+        Hybrid approach:
+        - Vector score is the primary ranking signal (semantic similarity)
+        - Metadata/keyword matches provide confirmation boost
+        - Multiple signal types indicate high relevance (confirmation bonus)
+        - Special boost for 'code' tag matches when query contains code-related terms
+        """
+        sources = candidate.get('sources', {})
+        source_scores = candidate.get('source_scores', {})
+
+        # Vector score is PRIMARY
+        if 'vector_score' in candidate:
+            vector_score = candidate['vector_score']
+            base_score = vector_score
+
+            # Metadata/keyword matches provide confirmation boost
+            if len(sources) > 1:
+                # Has both vector AND metadata/keyword matches - strong confirmation signal
+                keyword_signals = sum(source_scores.get(k, 0) for k in ['keyword', 'filename', 'metadata'])
+                if keyword_signals > 0:
+                    # Normalize and apply boost (up to 30% for strong confirmation)
+                    keyword_boost = min(0.3, keyword_signals * 0.15)
+                    base_score = vector_score * (1.0 + keyword_boost)
+
+                # Additional boost if multiple signal types confirm (2+ sources)
+                num_metadata_sources = sum(1 for s in ['keyword', 'filename', 'metadata'] if s in sources)
+                if num_metadata_sources >= 2:
+                    # Multiple confirmation signals - very high confidence
+                    base_score *= 1.1
+
+            # Check for code-related tags to boost code examples
+            tags = candidate.get('metadata', {}).get('tags', [])
+            if 'code' in tags:
+                # This chunk contains code - boost if query is code-related
+                # (metadata search would have found it if query mentioned code/example/python/etc)
+                if 'metadata' in sources or 'keyword' in sources:
+                    # Query matched code-related metadata - apply code boost
+                    base_score *= 1.2
+        else:
+            # No vector score - this is a keyword-only result (backfill)
+            # Use keyword scores but penalize for lack of semantic match
+            base_score = sum(source_scores.values()) * 0.6  # 40% penalty for no vector
+
+            # Still boost code chunks if metadata matched
+            tags = candidate.get('metadata', {}).get('tags', [])
+            if 'code' in tags and 'metadata' in sources:
+                base_score *= 1.15
+
+        return base_score
+
+    def _apply_diversity_penalties(self, results: List[Dict], target_count: int) -> List[Dict]:
+        """Apply penalties to prevent single-file dominance while maintaining quality"""
+        if not results:
+            return results
+
+        # Track file occurrences
+        file_counts = {}
+        penalized_results = []
+
+        # Define penalty multipliers
+        occurrence_penalties = {
+            1: 1.0,    # First chunk: no penalty
+            2: 0.85,   # Second chunk: 15% penalty
+            3: 0.7,    # Third chunk: 30% penalty
+            4: 0.5,    # Fourth chunk: 50% penalty
+        }
+
+        for result in results:
+            filename = result['metadata']['filename']
+
+            # Get current count for this file
+            current_count = file_counts.get(filename, 0) + 1
+            file_counts[filename] = current_count
+
+            # Apply penalty based on occurrence
+            penalty = occurrence_penalties.get(current_count, 0.4)  # 60% penalty for 5+ chunks
+
+            # Create a copy to avoid modifying original
+            penalized_result = result.copy()
+            penalized_result['diversity_penalty'] = penalty
+            penalized_result['final_score'] = result.get('final_score', result.get('score', 0)) * penalty
+
+            penalized_results.append(penalized_result)
+
+        # Re-sort by penalized scores
+        penalized_results.sort(key=lambda x: x['final_score'], reverse=True)
+
+        # Ensure minimum diversity if we have enough results
+        if len(penalized_results) > target_count:
+            unique_files = len(set(r['metadata']['filename'] for r in penalized_results[:target_count]))
+
+            # If top results are too homogeneous (e.g., all from 1-2 files)
+            if unique_files < min(3, target_count):
+                # Try to inject some diversity
+                selected = penalized_results[:target_count]
+                seen_files = set(r['metadata']['filename'] for r in selected)
+
+                # Look for high-quality results from other files
+                for result in penalized_results[target_count:]:
+                    if result['metadata']['filename'] not in seen_files:
+                        # If it's reasonably good (within 50% of top score), include it
+                        if result['final_score'] > 0.5 * selected[0]['final_score']:
+                            # Replace the lowest scoring result from an over-represented file
+                            for i in range(len(selected) - 1, -1, -1):
+                                if file_counts[selected[i]['metadata']['filename']] > 2:
+                                    selected[i] = result
+                                    seen_files.add(result['metadata']['filename'])
+                                    break
+
+                penalized_results[:target_count] = selected
+
+        return penalized_results
+
+    def _apply_match_type_diversity(self, results: List[Dict], target_count: int) -> List[Dict]:
+        """Ensure diversity of match types in final results
+
+        Ensures we have a mix of:
+        - Vector-only matches (semantic similarity, good for code examples)
+        - Keyword-only matches (exact term matches)
+        - Hybrid matches (both vector + keyword/metadata)
+        """
+        if not results or len(results) <= target_count:
+            return results
+
+        # Categorize results by match type
+        vector_only = []
+        keyword_only = []
+        hybrid = []
+
+        for result in results:
+            sources = result.get('sources', {})
+            has_vector = 'vector' in sources
+            has_keyword = any(k in sources for k in ['keyword', 'filename', 'metadata'])
+
+            if has_vector and not has_keyword:
+                vector_only.append(result)
+            elif has_keyword and not has_vector:
+                keyword_only.append(result)
+            else:
+                hybrid.append(result)
+
+        # Build diverse result set
+        # Target distribution: 40% hybrid, 40% vector-only, 20% keyword-only
+        # This ensures we include semantic matches (code examples) even if keywords don't match
+        diversified = []
+
+        # Take top hybrid matches first (best overall)
+        hybrid_target = max(1, int(target_count * 0.4))
+        diversified.extend(hybrid[:hybrid_target])
+
+        # Ensure we have vector-only matches (critical for code examples)
+        vector_target = max(1, int(target_count * 0.4))
+        diversified.extend(vector_only[:vector_target])
+
+        # Add keyword-only matches
+        keyword_target = max(1, int(target_count * 0.2))
+        diversified.extend(keyword_only[:keyword_target])
+
+        # Fill remaining slots with best remaining results regardless of type
+        remaining_slots = target_count - len(diversified)
+        if remaining_slots > 0:
+            # Get all unused results
+            used_ids = set(r['id'] for r in diversified)
+            unused = [r for r in results if r['id'] not in used_ids]
+            diversified.extend(unused[:remaining_slots])
+
+        # Sort by final score to maintain quality ordering
+        diversified.sort(key=lambda x: x['final_score'], reverse=True)
+
+        return diversified
+
     def get_stats(self) -> Dict[str, Any]:
         """Get statistics about the search index"""
+        # Use pgvector backend if available
+        if self.backend == 'pgvector':
+            return self._backend.get_stats()
+
+        # Original SQLite implementation
         conn = sqlite3.connect(self.index_path)
         cursor = conn.cursor()
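
As a rough illustration of the occurrence penalties used by `_apply_diversity_penalties` above, repeated chunks from one file are progressively discounted. A sketch with made-up filenames and scores, not library code:

    occurrence_penalties = {1: 1.0, 2: 0.85, 3: 0.7, 4: 0.5}  # 5th and later chunks fall back to 0.4

    def penalize(ranked):
        counts, out = {}, []
        for filename, score in ranked:
            counts[filename] = counts.get(filename, 0) + 1
            out.append((filename, round(score * occurrence_penalties.get(counts[filename], 0.4), 3)))
        return out

    print(penalize([("README.md", 0.90), ("README.md", 0.88), ("README.md", 0.86), ("guide.md", 0.80)]))
    # [('README.md', 0.9), ('README.md', 0.748), ('README.md', 0.602), ('guide.md', 0.8)]
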