signalwire-agents 0.1.47__py3-none-any.whl → 0.1.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +1 -1
- signalwire_agents/cli/build_search.py +516 -12
- signalwire_agents/core/mixins/ai_config_mixin.py +4 -0
- signalwire_agents/schema.json +57 -1
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +11 -8
- signalwire_agents/search/index_builder.py +112 -13
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +236 -13
- signalwire_agents/search/query_processor.py +87 -9
- signalwire_agents/search/search_engine.py +835 -31
- signalwire_agents/search/search_service.py +56 -6
- signalwire_agents/skills/native_vector_search/skill.py +208 -33
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/METADATA +1 -1
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/RECORD +20 -18
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/entry_points.txt +0 -0
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/top_level.txt +0 -0
signalwire_agents/search/search_engine.py

@@ -74,9 +74,16 @@ class SearchEngine:
 
     def search(self, query_vector: List[float], enhanced_text: str,
                count: int = 3, distance_threshold: float = 0.0,
-               tags: Optional[List[str]] = None
+               tags: Optional[List[str]] = None,
+               keyword_weight: Optional[float] = None,
+               original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """
-        Perform
+        Perform improved search with fast filtering and vector re-ranking
+
+        Strategy:
+        1. Fast candidate collection (filename, metadata, keywords)
+        2. Vector re-ranking on candidates only
+        3. Fallback to full vector search if few candidates
 
         Args:
             query_vector: Embedding vector for the query
@@ -84,6 +91,8 @@ class SearchEngine:
             count: Number of results to return
             distance_threshold: Minimum similarity score
             tags: Filter by tags
+            keyword_weight: Optional manual weight for keyword vs vector
+            original_query: Original query for exact matching
 
         Returns:
             List of search results with scores and metadata
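For orientation, a minimal sketch of how a caller might use the extended signature; the constructor argument and the embedding helper below are illustrative assumptions, not part of this diff:

    # Hypothetical caller code - only the search() keyword arguments come from this diff.
    engine = SearchEngine("docs.swsearch")       # index path is an assumption
    query = "agent getting started example"
    query_vector = embed(query)                  # any embedding model returning List[float]
    results = engine.search(
        query_vector,
        enhanced_text=query,          # text used for keyword/FTS matching
        count=5,
        distance_threshold=0.3,
        tags=["docs"],
        keyword_weight=0.4,           # added in this diff: manual keyword vs vector weight
        original_query=query,         # added in this diff: enables exact-match boosting
    )
    for r in results:
        print(r["score"], r["metadata"]["filename"])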
@@ -91,45 +100,108 @@ class SearchEngine:
 
         # Use pgvector backend if available
         if self.backend == 'pgvector':
-            return self._backend.search(query_vector, enhanced_text, count, distance_threshold, tags)
+            return self._backend.search(query_vector, enhanced_text, count, distance_threshold, tags, keyword_weight)
 
-        #
+        # Check for numpy/sklearn availability
         if not np or not cosine_similarity:
             logger.warning("NumPy or scikit-learn not available. Using keyword search only.")
-            return self._keyword_search_only(enhanced_text, count, tags)
+            return self._keyword_search_only(enhanced_text, count, tags, original_query)
 
         # Convert query vector to numpy array
         try:
             query_array = np.array(query_vector).reshape(1, -1)
         except Exception as e:
             logger.error(f"Error converting query vector: {e}")
-            return self._keyword_search_only(enhanced_text, count, tags)
+            return self._keyword_search_only(enhanced_text, count, tags, original_query)
 
-        #
-
+        # Stage 1: Collect candidates using fast methods
+        candidates = {}
 
-        #
-
+        # Fast searches - collect all potential matches
+        filename_results = self._filename_search(original_query or enhanced_text, count * 3)
+        metadata_results = self._metadata_search(original_query or enhanced_text, count * 2)
+        keyword_results = self._keyword_search(enhanced_text, count * 2, original_query)
 
-
-
+        logger.debug(f"Search for '{original_query}': filename={len(filename_results)}, metadata={len(metadata_results)}, keyword={len(keyword_results)}")
+
+        # Merge candidates from different sources
+        for result_set, source_weight in [(filename_results, 2.0),
+                                          (metadata_results, 1.5),
+                                          (keyword_results, 1.0)]:
+            for result in result_set:
+                chunk_id = result['id']
+                if chunk_id not in candidates:
+                    candidates[chunk_id] = result
+                    candidates[chunk_id]['sources'] = {}
+                    candidates[chunk_id]['source_scores'] = {}
+
+                # Track which searches found this chunk
+                candidates[chunk_id]['sources'][result['search_type']] = True
+                candidates[chunk_id]['source_scores'][result['search_type']] = result['score'] * source_weight
+
+        # Stage 2: Check if we have enough candidates
+        if len(candidates) < count * 2:
+            # Not enough candidates from fast searches - add full vector search
+            logger.debug(f"Only {len(candidates)} candidates from fast search, adding full vector search")
+            vector_results = self._vector_search(query_array, count * 3)
+
+            for result in vector_results:
+                chunk_id = result['id']
+                if chunk_id not in candidates:
+                    candidates[chunk_id] = result
+                    candidates[chunk_id]['sources'] = {'vector': True}
+                    candidates[chunk_id]['source_scores'] = {}
+
+                # Add vector score
+                candidates[chunk_id]['vector_score'] = result['score']
+                candidates[chunk_id]['vector_distance'] = 1 - result['score']
+        else:
+            # We have enough candidates - just re-rank them with vectors
+            logger.debug(f"Re-ranking {len(candidates)} candidates with vector similarity")
+            self._add_vector_scores_to_candidates(candidates, query_array, distance_threshold)
+
+        # Stage 3: Score and rank all candidates
+        final_results = []
+        for chunk_id, candidate in candidates.items():
+            # Calculate final score combining all signals
+            score = self._calculate_combined_score(candidate, distance_threshold)
+            candidate['final_score'] = score
+            final_results.append(candidate)
+
+        # Sort by final score
+        final_results.sort(key=lambda x: x['final_score'], reverse=True)
 
         # Filter by tags if specified
         if tags:
-
+            final_results = [r for r in final_results
+                             if any(tag in r['metadata'].get('tags', []) for tag in tags)]
 
-        #
-
-            r for r in
-
-
+        # Apply distance threshold as final filter (soft threshold already applied in scoring)
+        if distance_threshold > 0:
+            final_results = [r for r in final_results
+                             if r.get('vector_distance', 0) <= distance_threshold * 1.5
+                             or 'vector' not in r.get('sources', {})]
 
-
+        # Boost exact matches if we have the original query
+        if original_query:
+            final_results = self._boost_exact_matches(final_results, original_query)
+            # Re-sort after boosting
+            final_results.sort(key=lambda x: x['final_score'], reverse=True)
+
+        # Apply diversity penalties to prevent single-file dominance
+        final_results = self._apply_diversity_penalties(final_results, count)
+
+        # Ensure 'score' field exists for CLI compatibility
+        for r in final_results:
+            if 'score' not in r:
+                r['score'] = r.get('final_score', 0.0)
+
+        return final_results[:count]
 
     def _keyword_search_only(self, enhanced_text: str, count: int,
-                             tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+                             tags: Optional[List[str]] = None, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """Fallback to keyword search only when vector search is unavailable"""
-        keyword_results = self._keyword_search(enhanced_text, count)
+        keyword_results = self._keyword_search(enhanced_text, count, original_query)
 
         if tags:
             keyword_results = self._filter_by_tags(keyword_results, tags)
@@ -193,7 +265,7 @@ class SearchEngine:
             logger.error(f"Error in vector search: {e}")
             return []
 
-    def _keyword_search(self, enhanced_text: str, count: int) -> List[Dict[str, Any]]:
+    def _keyword_search(self, enhanced_text: str, count: int, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """Perform full-text search"""
         try:
             conn = sqlite3.connect(self.index_path)
@@ -235,6 +307,12 @@ class SearchEngine:
                 })
 
             conn.close()
+
+            # If FTS returns no results, try fallback LIKE search
+            if not results:
+                logger.debug(f"FTS returned no results for '{enhanced_text}', trying fallback search")
+                return self._fallback_search(enhanced_text, count)
+
             return results
 
         except Exception as e:
@@ -259,35 +337,74 @@ class SearchEngine:
             conn = sqlite3.connect(self.index_path)
             cursor = conn.cursor()
 
-            # Simple LIKE search
+            # Simple LIKE search with word boundaries
             search_terms = enhanced_text.lower().split()
             like_conditions = []
             params = []
 
             for term in search_terms[:5]:  # Limit to 5 terms to avoid too complex queries
-
-
+                # Search for term with word boundaries (space or punctuation)
+                like_conditions.append("""
+                    (LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?)
+                """)
+                params.extend([
+                    f"% {term} %",  # space on both sides
+                    f"{term} %",    # at beginning
+                    f"% {term}",    # at end
+                    f"{term}"       # exact match
+                ])
 
             if not like_conditions:
                 return []
 
+            # Also search in original content
+            content_conditions = []
+            for term in search_terms[:5]:
+                content_conditions.append("""
+                    (LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?)
+                """)
+                params.extend([
+                    f"% {term} %",  # with spaces
+                    f"{term} %",    # at beginning
+                    f"% {term}",    # at end
+                    f"{term}"       # exact match
+                ])
+
             query = f'''
                 SELECT id, content, filename, section, tags, metadata
                 FROM chunks
-                WHERE {" OR ".join(like_conditions)}
+                WHERE ({" OR ".join(like_conditions)})
+                OR ({" OR ".join(content_conditions)})
                 LIMIT ?
             '''
             params.append(count)
 
+
             cursor.execute(query, params)
 
             results = []
             for row in cursor.fetchall():
                 chunk_id, content, filename, section, tags_json, metadata_json = row
 
-                # Simple scoring based on term matches
+                # Simple scoring based on term matches with word boundaries
                 content_lower = content.lower()
-
+                # Check for whole word matches
+                word_matches = 0
+                for term in search_terms:
+                    term_lower = term.lower()
+                    # Check word boundaries
+                    if (f" {term_lower} " in f" {content_lower} " or
+                            content_lower.startswith(f"{term_lower} ") or
+                            content_lower.endswith(f" {term_lower}") or
+                            content_lower == term_lower):
+                        word_matches += 1
+                score = word_matches / len(search_terms) if search_terms else 0.0
 
                 results.append({
                     'id': chunk_id,
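To make the word-boundary handling above concrete, these are the four LIKE patterns bound per term and per column; for a query term such as "agent" (an illustrative value, not taken from the diff) the parameter list would contain:

    ["% agent %",   # term surrounded by spaces
     "agent %",     # term at the start of the text
     "% agent",     # term at the end of the text
     "agent"]       # text that is exactly the term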
@@ -306,14 +423,23 @@ class SearchEngine:
 
             # Sort by score
             results.sort(key=lambda x: x['score'], reverse=True)
+
             return results
 
         except Exception as e:
             logger.error(f"Error in fallback search: {e}")
             return []
 
-    def _merge_results(self, vector_results: List[Dict], keyword_results: List[Dict]
+    def _merge_results(self, vector_results: List[Dict], keyword_results: List[Dict],
+                       vector_weight: Optional[float] = None,
+                       keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
         """Merge and rank vector and keyword search results"""
+        # Use provided weights or defaults
+        if vector_weight is None:
+            vector_weight = 0.7
+        if keyword_weight is None:
+            keyword_weight = 0.3
+
         # Create a combined list with weighted scores
         combined = {}
 
@@ -335,8 +461,6 @@ class SearchEngine:
             combined[chunk_id]['keyword_score'] = result['score']
 
         # Calculate combined score (weighted average)
-        vector_weight = 0.7
-        keyword_weight = 0.3
 
         for chunk_id, result in combined.items():
             vector_score = result.get('vector_score', 0.0)
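The fixed 0.7/0.3 weights move out of the method body into overridable parameters with the same defaults. A small numeric sketch of the effect on one merged chunk, assuming the weighted-average combination referenced in the hunk above (the scores are made up for illustration):

    vector_score, keyword_score = 0.82, 0.40
    # No weights passed: 0.7 * 0.82 + 0.3 * 0.40 = 0.694 (unchanged behavior)
    # keyword_weight=0.5 passed alone: vector_weight still defaults to 0.7,
    # so the combination becomes 0.7 * 0.82 + 0.5 * 0.40 = 0.774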
@@ -363,6 +487,686 @@ class SearchEngine:
             filtered.append(result)
         return filtered
 
+    def _boost_exact_matches(self, results: List[Dict[str, Any]], original_query: str) -> List[Dict[str, Any]]:
+        """Boost scores for results that contain exact matches of the original query"""
+        if not original_query:
+            return results
+
+        # Extract key phrases to look for
+        query_lower = original_query.lower()
+
+        for result in results:
+            content_lower = result['content'].lower()
+            filename_lower = result['metadata'].get('filename', '').lower()
+
+            # Boost for exact phrase match in content
+            if query_lower in content_lower:
+                result['score'] *= 2.0  # Double score for exact match
+
+            # Boost for matches in filenames that suggest relevance
+            if any(term in filename_lower for term in ['example', 'sample', 'demo', 'tutorial', 'guide']):
+                if 'example' in query_lower or 'sample' in query_lower or 'code' in query_lower:
+                    result['score'] *= 1.5
+
+            # Boost for "getting started" type queries
+            if 'getting started' in query_lower and 'start' in content_lower:
+                result['score'] *= 1.5
+
+        return results
+
+    def _filename_search(self, query: str, count: int) -> List[Dict[str, Any]]:
+        """Search for query in filenames with term coverage scoring"""
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            query_lower = query.lower()
+            terms = query_lower.split()
+
+            # First try exact phrase match
+            cursor.execute('''
+                SELECT DISTINCT id, content, filename, section, tags, metadata
+                FROM chunks
+                WHERE LOWER(filename) LIKE ?
+                LIMIT ?
+            ''', (f'%{query_lower}%', count))
+
+            results = []
+            seen_ids = set()
+
+            # Process exact matches
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json = row
+                seen_ids.add(chunk_id)
+
+                # High score for exact phrase match
+                filename_lower = filename.lower()
+                basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower
+                if query_lower in basename:
+                    score = 3.0  # Exact match in basename (increased weight)
+                else:
+                    score = 2.0  # Exact match in path
+
+                results.append({
+                    'id': chunk_id,
+                    'content': content,
+                    'score': float(score),
+                    'metadata': {
+                        'filename': filename,
+                        'section': section,
+                        'tags': json.loads(tags_json) if tags_json else [],
+                        'metadata': json.loads(metadata_json) if metadata_json else {}
+                    },
+                    'search_type': 'filename',
+                    'match_coverage': 1.0  # Exact match = 100% coverage
+                })
+
+            # Then search for files containing ANY of the terms
+            if terms and len(results) < count * 3:  # Get more candidates
+                # Build OR query for any term match
+                conditions = []
+                params = []
+                for term in terms:
+                    conditions.append("LOWER(filename) LIKE ?")
+                    params.append(f'%{term}%')
+
+                sql = f'''
+                    SELECT DISTINCT id, content, filename, section, tags, metadata
+                    FROM chunks
+                    WHERE ({' OR '.join(conditions)})
+                    AND id NOT IN ({','.join(['?' for _ in seen_ids]) if seen_ids else '0'})
+                    LIMIT ?
+                '''
+                if seen_ids:
+                    params.extend(seen_ids)
+                params.append(count * 3)
+
+                cursor.execute(sql, params)
+
+                for row in cursor.fetchall():
+                    chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                    # Enhanced scoring based on term coverage
+                    filename_lower = filename.lower()
+                    basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower
+
+                    # Count matches in basename vs full path
+                    basename_matches = sum(1 for term in terms if term in basename)
+                    path_matches = sum(1 for term in terms if term in filename_lower)
+
+                    # Calculate term coverage (what % of query terms are matched)
+                    term_coverage = path_matches / len(terms) if terms else 0
+                    basename_coverage = basename_matches / len(terms) if terms else 0
+
+                    # Check for substring bonus (e.g., "code_examples" contains both terms together)
+                    substring_bonus = 0
+                    if len(terms) > 1:
+                        # Check if terms appear consecutively
+                        for i in range(len(terms) - 1):
+                            if f"{terms[i]}_{terms[i+1]}" in filename_lower or f"{terms[i]}{terms[i+1]}" in filename_lower:
+                                substring_bonus = 0.3
+                                break
+
+                    # Score based on coverage with exponential boost for more matches
+                    if basename_coverage > 0:
+                        # Exponential scoring for basename matches
+                        score = basename_coverage ** 1.5 + substring_bonus
+                    else:
+                        # Lower score for path-only matches
+                        score = (term_coverage * 0.5) ** 1.5 + substring_bonus
+
+                    results.append({
+                        'id': chunk_id,
+                        'content': content,
+                        'score': float(score),
+                        'metadata': {
+                            'filename': filename,
+                            'section': section,
+                            'tags': json.loads(tags_json) if tags_json else [],
+                            'metadata': json.loads(metadata_json) if metadata_json else {}
+                        },
+                        'search_type': 'filename',
+                        'match_coverage': term_coverage
+                    })
+
+            conn.close()
+
+            # Sort by score and return top results
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:count]
+
+        except Exception as e:
+            logger.error(f"Error in filename search: {e}")
+            return []
+
+    def _metadata_search(self, query: str, count: int) -> List[Dict[str, Any]]:
+        """Search in all metadata fields (tags, sections, category, product, source)"""
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            query_lower = query.lower()
+            terms = query_lower.split()
+            results = []
+            seen_ids = set()
+
+            # First, try to use the metadata_text column if it exists
+            try:
+                # Check if metadata_text column exists
+                cursor.execute("PRAGMA table_info(chunks)")
+                columns = [col[1] for col in cursor.fetchall()]
+                has_metadata_text = 'metadata_text' in columns
+            except:
+                has_metadata_text = False
+
+            if has_metadata_text:
+                # Use the new metadata_text column for efficient searching
+                # Build conditions for each term
+                conditions = []
+                for term in terms:
+                    conditions.append(f"metadata_text LIKE '%{term}%'")
+
+                if conditions:
+                    query_sql = f'''
+                        SELECT id, content, filename, section, tags, metadata
+                        FROM chunks
+                        WHERE {' AND '.join(conditions)}
+                        LIMIT ?
+                    '''
+                    cursor.execute(query_sql, (count * 10,))
+
+                    for row in cursor.fetchall():
+                        chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                        if chunk_id in seen_ids:
+                            continue
+
+                        # Parse metadata
+                        metadata = json.loads(metadata_json) if metadata_json else {}
+                        tags = json.loads(tags_json) if tags_json else []
+
+                        # Calculate score based on how many terms match
+                        score = 0
+                        for term in terms:
+                            # Check metadata values
+                            metadata_str = json.dumps(metadata).lower()
+                            if term in metadata_str:
+                                score += 1.5
+                            # Check tags
+                            if any(term in str(tag).lower() for tag in tags):
+                                score += 1.0
+                            # Check section
+                            if section and term in section.lower():
+                                score += 0.8
+
+                        if score > 0:
+                            seen_ids.add(chunk_id)
+                            results.append({
+                                'id': chunk_id,
+                                'content': content,
+                                'score': score,
+                                'metadata': {
+                                    'filename': filename,
+                                    'section': section,
+                                    'tags': tags,
+                                    'metadata': metadata
+                                },
+                                'search_type': 'metadata'
+                            })
+
+            # Fallback: search for JSON metadata embedded in content
+            # This ensures backwards compatibility
+            if len(results) < count:
+                # Build specific conditions for known patterns
+                specific_conditions = []
+
+                # Look for specific high-value patterns first
+                if 'code' in terms and 'examples' in terms:
+                    specific_conditions.append('content LIKE \'%"category": "Code Examples"%\'')
+                if 'sdk' in terms:
+                    specific_conditions.append('content LIKE \'%"product": "%\' || \'SDK\' || \'%"%\'')
+
+                # General term search in JSON content
+                for term in terms:
+                    specific_conditions.append(f"content LIKE '%\"{term}%'")
+
+                if specific_conditions:
+                    # Limit conditions to avoid too broad search
+                    conditions_to_use = specific_conditions[:10]
+                    query_sql = f'''
+                        SELECT id, content, filename, section, tags, metadata
+                        FROM chunks
+                        WHERE ({' OR '.join(conditions_to_use)})
+                        AND id NOT IN ({','.join(str(id) for id in seen_ids) if seen_ids else '0'})
+                        LIMIT ?
+                    '''
+                    cursor.execute(query_sql, (count * 5,))
+
+                    rows = cursor.fetchall()
+
+                    for row in rows:
+                        chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                        if chunk_id in seen_ids:
+                            continue
+
+                        # Try to extract metadata from JSON content
+                        json_metadata = {}
+                        try:
+                            # Look for metadata in JSON structure
+                            if '"metadata":' in content:
+                                import re
+                                # More robust regex to extract nested JSON object
+                                # This handles nested braces properly
+                                start = content.find('"metadata":')
+                                if start != -1:
+                                    # Find the opening brace
+                                    brace_start = content.find('{', start)
+                                    if brace_start != -1:
+                                        # Count braces to find matching closing brace
+                                        brace_count = 0
+                                        i = brace_start
+                                        while i < len(content):
+                                            if content[i] == '{':
+                                                brace_count += 1
+                                            elif content[i] == '}':
+                                                brace_count -= 1
+                                                if brace_count == 0:
+                                                    # Found matching closing brace
+                                                    metadata_str = content[brace_start:i+1]
+                                                    json_metadata = json.loads(metadata_str)
+                                                    break
+                                            i += 1
+                        except:
+                            pass
+
+                        # Calculate score based on matches
+                        score = 0
+                        fields_matched = 0
+
+                        # Check JSON metadata extracted from content
+                        if json_metadata:
+                            # Check category - count how many terms match
+                            category = json_metadata.get('category', '').lower()
+                            if category:
+                                category_matches = sum(1 for term in terms if term in category)
+                                if category_matches > 0:
+                                    score += 1.8 * (category_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Check product - count how many terms match
+                            product = json_metadata.get('product', '').lower()
+                            if product:
+                                product_matches = sum(1 for term in terms if term in product)
+                                if product_matches > 0:
+                                    score += 1.5 * (product_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Check source
+                            source = json_metadata.get('source', '').lower()
+                            if source:
+                                source_matches = sum(1 for term in terms if term in source)
+                                if source_matches > 0:
+                                    score += 1.2 * (source_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Also check tags from JSON metadata
+                            json_tags = json_metadata.get('tags', [])
+                            if json_tags:
+                                tags_str = str(json_tags).lower()
+                                tag_matches = sum(1 for term in terms if term in tags_str)
+                                if tag_matches > 0:
+                                    score += 1.3 * (tag_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                        if score > 0:
+                            seen_ids.add(chunk_id)
+                            results.append({
+                                'id': chunk_id,
+                                'content': content,
+                                'score': float(score),
+                                'metadata': {
+                                    'filename': filename,
+                                    'section': section,
+                                    'tags': json.loads(tags_json) if tags_json else [],
+                                    'metadata': json.loads(metadata_json) if metadata_json else {}
+                                },
+                                'search_type': 'metadata',
+                                'fields_matched': fields_matched
+                            })
+                            logger.debug(f"Metadata match: {filename} - score={score:.2f}, fields_matched={fields_matched}, json_metadata={json_metadata}")
+
+            # Also get chunks with regular metadata
+            cursor.execute('''
+                SELECT id, content, filename, section, tags, metadata
+                FROM chunks
+                WHERE (tags IS NOT NULL AND tags != '')
+                OR (metadata IS NOT NULL AND metadata != '{}')
+                OR (section IS NOT NULL AND section != '')
+                LIMIT ?
+            ''', (count * 10,))  # Get more to search through
+
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                if chunk_id in seen_ids:
+                    continue
+
+                # Parse metadata
+                tags = json.loads(tags_json) if tags_json else []
+                metadata = json.loads(metadata_json) if metadata_json else {}
+
+                # Flatten nested metadata if present
+                if 'metadata' in metadata:
+                    # Handle double-nested metadata from some indexes
+                    nested_meta = metadata['metadata']
+                    metadata.update(nested_meta)
+
+                # Initialize scoring components
+                score_components = {
+                    'tags': 0,
+                    'section': 0,
+                    'category': 0,
+                    'product': 0,
+                    'source': 0,
+                    'description': 0
+                }
+
+                # Check tags
+                if tags:
+                    tag_matches = 0
+                    for tag in tags:
+                        tag_lower = tag.lower()
+                        # Full query match in tag
+                        if query_lower in tag_lower:
+                            tag_matches += 2.0
+                        else:
+                            # Individual term matches
+                            term_matches = sum(1 for term in terms if term in tag_lower)
+                            tag_matches += term_matches * 0.5
+
+                    if tag_matches > 0:
+                        score_components['tags'] = min(1.0, tag_matches / len(tags))
+
+                # Check section
+                if section and section.lower() != 'none':
+                    section_lower = section.lower()
+                    if query_lower in section_lower:
+                        score_components['section'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in section_lower)
+                        score_components['section'] = (term_matches / len(terms)) * 0.8 if terms else 0
+
+                # Check category field
+                category = metadata.get('category', '')
+                if category:
+                    category_lower = category.lower()
+                    if query_lower in category_lower:
+                        score_components['category'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in category_lower)
+                        score_components['category'] = (term_matches / len(terms)) * 0.9 if terms else 0
+
+                # Check product field
+                product = metadata.get('product', '')
+                if product:
+                    product_lower = product.lower()
+                    if query_lower in product_lower:
+                        score_components['product'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in product_lower)
+                        score_components['product'] = (term_matches / len(terms)) * 0.8 if terms else 0
+
+                # Check source field (original filename)
+                source = metadata.get('source', '')
+                if source:
+                    source_lower = source.lower()
+                    if query_lower in source_lower:
+                        score_components['source'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in source_lower)
+                        score_components['source'] = (term_matches / len(terms)) * 0.7 if terms else 0
+
+                # Check description or title fields
+                description = metadata.get('description', metadata.get('title', ''))
+                if description:
+                    desc_lower = description.lower()
+                    if query_lower in desc_lower:
+                        score_components['description'] = 0.8
+                    else:
+                        term_matches = sum(1 for term in terms if term in desc_lower)
+                        score_components['description'] = (term_matches / len(terms)) * 0.6 if terms else 0
+
+                # Calculate total score with weights
+                weights = {
+                    'category': 1.8,    # Strong signal
+                    'product': 1.5,     # Strong signal
+                    'tags': 1.3,        # Good signal
+                    'source': 1.2,      # Good signal
+                    'section': 1.0,     # Moderate signal
+                    'description': 0.8  # Weaker signal
+                }
+
+                total_score = sum(score_components[field] * weights.get(field, 1.0)
+                                  for field in score_components)
+
+                # Track match coverage
+                fields_matched = sum(1 for score in score_components.values() if score > 0)
+                match_coverage = sum(1 for term in terms if any(
+                    term in str(field_value).lower()
+                    for field_value in [tags, section, category, product, source, description]
+                    if field_value
+                )) / len(terms) if terms else 0
+
+                if total_score > 0:
+                    results.append({
+                        'id': chunk_id,
+                        'content': content,
+                        'score': float(total_score),
+                        'metadata': {
+                            'filename': filename,
+                            'section': section,
+                            'tags': tags,
+                            'metadata': metadata,
+                            'category': category,
+                            'product': product,
+                            'source': source
+                        },
+                        'search_type': 'metadata',
+                        'metadata_matches': score_components,
+                        'fields_matched': fields_matched,
+                        'match_coverage': match_coverage
+                    })
+                    seen_ids.add(chunk_id)
+
+            conn.close()
+
+            # Sort by score and return top results
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:count]
+
+        except Exception as e:
+            logger.error(f"Error in metadata search: {e}")
+            return []
+
+    def _add_vector_scores_to_candidates(self, candidates: Dict[str, Dict], query_vector: NDArray,
+                                         distance_threshold: float):
+        """Add vector similarity scores to existing candidates"""
+        if not candidates or not np:
+            return
+
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            # Get embeddings for candidate chunks only
+            chunk_ids = list(candidates.keys())
+            placeholders = ','.join(['?' for _ in chunk_ids])
+
+            cursor.execute(f'''
+                SELECT id, embedding
+                FROM chunks
+                WHERE id IN ({placeholders}) AND embedding IS NOT NULL AND embedding != ''
+            ''', chunk_ids)
+
+            for row in cursor.fetchall():
+                chunk_id, embedding_blob = row
+
+                if not embedding_blob:
+                    continue
+
+                try:
+                    # Convert embedding back to numpy array
+                    embedding = np.frombuffer(embedding_blob, dtype=np.float32).reshape(1, -1)
+
+                    # Calculate similarity
+                    similarity = cosine_similarity(query_vector, embedding)[0][0]
+                    distance = 1 - similarity
+
+                    # Add vector scores to candidate
+                    candidates[chunk_id]['vector_score'] = float(similarity)
+                    candidates[chunk_id]['vector_distance'] = float(distance)
+                    candidates[chunk_id]['sources']['vector_rerank'] = True
+
+                except Exception as e:
+                    logger.debug(f"Error processing embedding for chunk {chunk_id}: {e}")
+                    continue
+
+            conn.close()
+
+        except Exception as e:
+            logger.error(f"Error in vector re-ranking: {e}")
+
+    def _calculate_combined_score(self, candidate: Dict, distance_threshold: float) -> float:
+        """Calculate final score combining all signals with comprehensive match bonus"""
+        # Base scores from different sources
+        source_scores = candidate.get('source_scores', {})
+
+        # Check for comprehensive matching (multiple signals)
+        sources = candidate.get('sources', {})
+        num_sources = len(sources)
+
+        # Get match coverage information
+        match_coverage = candidate.get('match_coverage', 0)
+        fields_matched = candidate.get('fields_matched', 0)
+
+        # Calculate base score with exponential boost for multiple sources
+        if num_sources > 1:
+            # Multiple signal matches are exponentially better
+            multi_signal_boost = 1.0 + (0.3 * (num_sources - 1))
+            base_score = sum(source_scores.values()) * multi_signal_boost
+        else:
+            base_score = sum(source_scores.values())
+
+        # Apply comprehensive match bonus
+        if match_coverage > 0.5:  # More than 50% of query terms matched
+            coverage_bonus = 1.0 + (match_coverage - 0.5) * 0.5
+            base_score *= coverage_bonus
+
+        # Apply field diversity bonus (matching in multiple metadata fields)
+        if fields_matched > 2:
+            field_bonus = 1.0 + (fields_matched - 2) * 0.1
+            base_score *= field_bonus
+
+        # Apply vector similarity multiplier if available
+        if 'vector_score' in candidate:
+            vector_score = candidate['vector_score']
+            vector_distance = candidate.get('vector_distance', 1 - vector_score)
+
+            # Distance-aware scoring
+            if distance_threshold > 0:
+                if vector_distance <= distance_threshold:
+                    # Within threshold - full vector score
+                    vector_multiplier = vector_score
+                elif vector_distance <= distance_threshold * 1.5:
+                    # Near threshold - gradual decay
+                    overflow = (vector_distance - distance_threshold) / (distance_threshold * 0.5)
+                    vector_multiplier = vector_score * (1 - overflow * 0.3)
+                else:
+                    # Beyond threshold - minimal contribution
+                    vector_multiplier = vector_score * 0.3
+            else:
+                vector_multiplier = vector_score
+
+            # For chunks found by vector-only search, use vector score directly
+            if 'vector' in sources and len(sources) == 1:
+                base_score = vector_score
+            else:
+                # For chunks found by multiple methods, apply vector as quality check
+                base_score *= vector_multiplier
+
+        # Special handling for strong metadata matches
+        if 'metadata' in sources:
+            metadata_matches = candidate.get('metadata_matches', {})
+            # Strong category or product match should boost significantly
+            if metadata_matches.get('category', 0) > 0.8 or metadata_matches.get('product', 0) > 0.8:
+                base_score *= 1.2
+
+        return base_score
+
+    def _apply_diversity_penalties(self, results: List[Dict], target_count: int) -> List[Dict]:
+        """Apply penalties to prevent single-file dominance while maintaining quality"""
+        if not results:
+            return results
+
+        # Track file occurrences
+        file_counts = {}
+        penalized_results = []
+
+        # Define penalty multipliers
+        occurrence_penalties = {
+            1: 1.0,    # First chunk: no penalty
+            2: 0.85,   # Second chunk: 15% penalty
+            3: 0.7,    # Third chunk: 30% penalty
+            4: 0.5,    # Fourth chunk: 50% penalty
+        }
+
+        for result in results:
+            filename = result['metadata']['filename']
+
+            # Get current count for this file
+            current_count = file_counts.get(filename, 0) + 1
+            file_counts[filename] = current_count
+
+            # Apply penalty based on occurrence
+            penalty = occurrence_penalties.get(current_count, 0.4)  # 60% penalty for 5+ chunks
+
+            # Create a copy to avoid modifying original
+            penalized_result = result.copy()
+            penalized_result['diversity_penalty'] = penalty
+            penalized_result['final_score'] = result.get('final_score', result.get('score', 0)) * penalty
+
+            penalized_results.append(penalized_result)
+
+        # Re-sort by penalized scores
+        penalized_results.sort(key=lambda x: x['final_score'], reverse=True)
+
+        # Ensure minimum diversity if we have enough results
+        if len(penalized_results) > target_count:
+            unique_files = len(set(r['metadata']['filename'] for r in penalized_results[:target_count]))
+
+            # If top results are too homogeneous (e.g., all from 1-2 files)
+            if unique_files < min(3, target_count):
+                # Try to inject some diversity
+                selected = penalized_results[:target_count]
+                seen_files = set(r['metadata']['filename'] for r in selected)
+
+                # Look for high-quality results from other files
+                for result in penalized_results[target_count:]:
+                    if result['metadata']['filename'] not in seen_files:
+                        # If it's reasonably good (within 50% of top score), include it
+                        if result['final_score'] > 0.5 * selected[0]['final_score']:
+                            # Replace the lowest scoring result from an over-represented file
+                            for i in range(len(selected) - 1, -1, -1):
+                                if file_counts[selected[i]['metadata']['filename']] > 2:
+                                    selected[i] = result
+                                    seen_files.add(result['metadata']['filename'])
+                                    break
+
+                penalized_results[:target_count] = selected
+
+        return penalized_results
+
     def get_stats(self) -> Dict[str, Any]:
         """Get statistics about the search index"""
         # Use pgvector backend if available
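The occurrence penalties in _apply_diversity_penalties demote repeated chunks from the same file before the final cut. A small worked example of the multipliers, assuming several chunks from one file with the same raw score (the 0.90 figure and the filename are illustrative only):

    raw = 0.90
    first  = raw * 1.0    # 0.90  - first chunk from guide.md keeps its score
    second = raw * 0.85   # 0.765 - second chunk from the same file
    third  = raw * 0.7    # 0.63  - third chunk
    fourth = raw * 0.5    # 0.45  - fourth chunk
    later  = raw * 0.4    # 0.36  - fifth and later chunks (default penalty)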
|