signalwire-agents 0.1.46__py3-none-any.whl → 0.1.48__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -74,9 +74,16 @@ class SearchEngine:
 
     def search(self, query_vector: List[float], enhanced_text: str,
                count: int = 3, distance_threshold: float = 0.0,
-               tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+               tags: Optional[List[str]] = None,
+               keyword_weight: Optional[float] = None,
+               original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """
-        Perform hybrid search (vector + keyword)
+        Perform improved search with fast filtering and vector re-ranking
+
+        Strategy:
+        1. Fast candidate collection (filename, metadata, keywords)
+        2. Vector re-ranking on candidates only
+        3. Fallback to full vector search if few candidates
 
         Args:
             query_vector: Embedding vector for the query
@@ -84,6 +91,8 @@ class SearchEngine:
             count: Number of results to return
             distance_threshold: Minimum similarity score
             tags: Filter by tags
+            keyword_weight: Optional manual weight for keyword vs vector
+            original_query: Original query for exact matching
 
         Returns:
             List of search results with scores and metadata
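Note: both new parameters are optional keyword arguments, so existing callers of search() keep working. A minimal calling sketch follows; the engine instance and embedding vector are hypothetical stand-ins, not part of the diff.

    # Hypothetical usage; assumes an already-constructed SearchEngine ("engine")
    # and a query embedding ("embedding") produced by the caller.
    results = engine.search(
        query_vector=embedding,
        enhanced_text="getting started guide",
        count=5,
        distance_threshold=0.3,
        tags=["docs"],                     # optional tag filter
        keyword_weight=0.4,                # optional manual keyword vs. vector weight
        original_query="getting started",  # raw query used for exact-match boosting
    )
    for r in results:
        print(r["score"], r["metadata"]["filename"])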
@@ -91,45 +100,108 @@ class SearchEngine:
 
         # Use pgvector backend if available
         if self.backend == 'pgvector':
-            return self._backend.search(query_vector, enhanced_text, count, distance_threshold, tags)
+            return self._backend.search(query_vector, enhanced_text, count, distance_threshold, tags, keyword_weight)
 
-        # Original SQLite implementation
+        # Check for numpy/sklearn availability
         if not np or not cosine_similarity:
             logger.warning("NumPy or scikit-learn not available. Using keyword search only.")
-            return self._keyword_search_only(enhanced_text, count, tags)
+            return self._keyword_search_only(enhanced_text, count, tags, original_query)
 
         # Convert query vector to numpy array
         try:
             query_array = np.array(query_vector).reshape(1, -1)
         except Exception as e:
             logger.error(f"Error converting query vector: {e}")
-            return self._keyword_search_only(enhanced_text, count, tags)
+            return self._keyword_search_only(enhanced_text, count, tags, original_query)
 
-        # Vector search
-        vector_results = self._vector_search(query_array, count * 2)
+        # Stage 1: Collect candidates using fast methods
+        candidates = {}
 
-        # Keyword search
-        keyword_results = self._keyword_search(enhanced_text, count * 2)
+        # Fast searches - collect all potential matches
+        filename_results = self._filename_search(original_query or enhanced_text, count * 3)
+        metadata_results = self._metadata_search(original_query or enhanced_text, count * 2)
+        keyword_results = self._keyword_search(enhanced_text, count * 2, original_query)
 
-        # Merge and rank results
-        merged_results = self._merge_results(vector_results, keyword_results)
+        logger.debug(f"Search for '{original_query}': filename={len(filename_results)}, metadata={len(metadata_results)}, keyword={len(keyword_results)}")
+
+        # Merge candidates from different sources
+        for result_set, source_weight in [(filename_results, 2.0),
+                                          (metadata_results, 1.5),
+                                          (keyword_results, 1.0)]:
+            for result in result_set:
+                chunk_id = result['id']
+                if chunk_id not in candidates:
+                    candidates[chunk_id] = result
+                    candidates[chunk_id]['sources'] = {}
+                    candidates[chunk_id]['source_scores'] = {}
+
+                # Track which searches found this chunk
+                candidates[chunk_id]['sources'][result['search_type']] = True
+                candidates[chunk_id]['source_scores'][result['search_type']] = result['score'] * source_weight
+
+        # Stage 2: Check if we have enough candidates
+        if len(candidates) < count * 2:
+            # Not enough candidates from fast searches - add full vector search
+            logger.debug(f"Only {len(candidates)} candidates from fast search, adding full vector search")
+            vector_results = self._vector_search(query_array, count * 3)
+
+            for result in vector_results:
+                chunk_id = result['id']
+                if chunk_id not in candidates:
+                    candidates[chunk_id] = result
+                    candidates[chunk_id]['sources'] = {'vector': True}
+                    candidates[chunk_id]['source_scores'] = {}
+
+                # Add vector score
+                candidates[chunk_id]['vector_score'] = result['score']
+                candidates[chunk_id]['vector_distance'] = 1 - result['score']
+        else:
+            # We have enough candidates - just re-rank them with vectors
+            logger.debug(f"Re-ranking {len(candidates)} candidates with vector similarity")
+            self._add_vector_scores_to_candidates(candidates, query_array, distance_threshold)
+
+        # Stage 3: Score and rank all candidates
+        final_results = []
+        for chunk_id, candidate in candidates.items():
+            # Calculate final score combining all signals
+            score = self._calculate_combined_score(candidate, distance_threshold)
+            candidate['final_score'] = score
+            final_results.append(candidate)
+
+        # Sort by final score
+        final_results.sort(key=lambda x: x['final_score'], reverse=True)
 
         # Filter by tags if specified
         if tags:
-            merged_results = self._filter_by_tags(merged_results, tags)
+            final_results = [r for r in final_results
+                             if any(tag in r['metadata'].get('tags', []) for tag in tags)]
 
-        # Filter by distance threshold
-        filtered_results = [
-            r for r in merged_results
-            if r['score'] >= distance_threshold
-        ]
+        # Apply distance threshold as final filter (soft threshold already applied in scoring)
+        if distance_threshold > 0:
+            final_results = [r for r in final_results
+                             if r.get('vector_distance', 0) <= distance_threshold * 1.5
+                             or 'vector' not in r.get('sources', {})]
 
-        return filtered_results[:count]
+        # Boost exact matches if we have the original query
+        if original_query:
+            final_results = self._boost_exact_matches(final_results, original_query)
+            # Re-sort after boosting
+            final_results.sort(key=lambda x: x['final_score'], reverse=True)
+
+        # Apply diversity penalties to prevent single-file dominance
+        final_results = self._apply_diversity_penalties(final_results, count)
+
+        # Ensure 'score' field exists for CLI compatibility
+        for r in final_results:
+            if 'score' not in r:
+                r['score'] = r.get('final_score', 0.0)
+
+        return final_results[:count]
 
     def _keyword_search_only(self, enhanced_text: str, count: int,
-                             tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+                             tags: Optional[List[str]] = None, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """Fallback to keyword search only when vector search is unavailable"""
-        keyword_results = self._keyword_search(enhanced_text, count)
+        keyword_results = self._keyword_search(enhanced_text, count, original_query)
 
         if tags:
             keyword_results = self._filter_by_tags(keyword_results, tags)
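Note: the rewritten search() now merges candidates from three fast sources before any vector work, weighting filename hits at 2.0, metadata hits at 1.5, and keyword hits at 1.0, and remembering which searches found each chunk. A self-contained sketch of that merge pattern; merge_candidates is an illustrative name, not a function in the package.

    from typing import Any, Dict, List

    def merge_candidates(filename_hits: List[Dict[str, Any]],
                         metadata_hits: List[Dict[str, Any]],
                         keyword_hits: List[Dict[str, Any]]) -> Dict[Any, Dict[str, Any]]:
        # Combine results from the fast searches, keyed by chunk id.
        candidates: Dict[Any, Dict[str, Any]] = {}
        for result_set, weight in [(filename_hits, 2.0),
                                   (metadata_hits, 1.5),
                                   (keyword_hits, 1.0)]:
            for result in result_set:
                entry = candidates.setdefault(
                    result['id'], {**result, 'sources': {}, 'source_scores': {}})
                entry['sources'][result['search_type']] = True
                entry['source_scores'][result['search_type']] = result['score'] * weight
        return candidates

    # A chunk found by both filename and keyword search accumulates both weighted scores.
    cands = merge_candidates(
        [{'id': 1, 'score': 3.0, 'search_type': 'filename'}],
        [],
        [{'id': 1, 'score': 0.6, 'search_type': 'keyword'}])
    print(cands[1]['source_scores'])   # {'filename': 6.0, 'keyword': 0.6}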
@@ -193,7 +265,7 @@ class SearchEngine:
             logger.error(f"Error in vector search: {e}")
             return []
 
-    def _keyword_search(self, enhanced_text: str, count: int) -> List[Dict[str, Any]]:
+    def _keyword_search(self, enhanced_text: str, count: int, original_query: Optional[str] = None) -> List[Dict[str, Any]]:
         """Perform full-text search"""
         try:
             conn = sqlite3.connect(self.index_path)
@@ -235,6 +307,12 @@ class SearchEngine:
                 })
 
             conn.close()
+
+            # If FTS returns no results, try fallback LIKE search
+            if not results:
+                logger.debug(f"FTS returned no results for '{enhanced_text}', trying fallback search")
+                return self._fallback_search(enhanced_text, count)
+
             return results
 
         except Exception as e:
@@ -259,35 +337,74 @@ class SearchEngine:
             conn = sqlite3.connect(self.index_path)
             cursor = conn.cursor()
 
-            # Simple LIKE search
+            # Simple LIKE search with word boundaries
             search_terms = enhanced_text.lower().split()
             like_conditions = []
             params = []
 
             for term in search_terms[:5]:  # Limit to 5 terms to avoid too complex queries
-                like_conditions.append("LOWER(processed_content) LIKE ?")
-                params.append(f"%{term}%")
+                # Search for term with word boundaries (space or punctuation)
+                like_conditions.append("""
+                    (LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?
+                     OR LOWER(processed_content) LIKE ?)
+                """)
+                params.extend([
+                    f"% {term} %",  # space on both sides
+                    f"{term} %",    # at beginning
+                    f"% {term}",    # at end
+                    f"{term}"       # exact match
+                ])
 
             if not like_conditions:
                 return []
 
+            # Also search in original content
+            content_conditions = []
+            for term in search_terms[:5]:
+                content_conditions.append("""
+                    (LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?
+                     OR LOWER(content) LIKE ?)
+                """)
+                params.extend([
+                    f"% {term} %",  # with spaces
+                    f"{term} %",    # at beginning
+                    f"% {term}",    # at end
+                    f"{term}"       # exact match
+                ])
+
             query = f'''
                 SELECT id, content, filename, section, tags, metadata
                 FROM chunks
-                WHERE {" OR ".join(like_conditions)}
+                WHERE ({" OR ".join(like_conditions)})
+                   OR ({" OR ".join(content_conditions)})
                 LIMIT ?
             '''
             params.append(count)
 
+
             cursor.execute(query, params)
 
             results = []
             for row in cursor.fetchall():
                 chunk_id, content, filename, section, tags_json, metadata_json = row
 
-                # Simple scoring based on term matches
+                # Simple scoring based on term matches with word boundaries
                 content_lower = content.lower()
-                score = sum(1 for term in search_terms if term.lower() in content_lower) / len(search_terms)
+                # Check for whole word matches
+                word_matches = 0
+                for term in search_terms:
+                    term_lower = term.lower()
+                    # Check word boundaries
+                    if (f" {term_lower} " in f" {content_lower} " or
+                        content_lower.startswith(f"{term_lower} ") or
+                        content_lower.endswith(f" {term_lower}") or
+                        content_lower == term_lower):
+                        word_matches += 1
+                score = word_matches / len(search_terms) if search_terms else 0.0
 
                 results.append({
                     'id': chunk_id,
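Note: the fallback LIKE search now approximates whole-word matching by issuing four LIKE patterns per term (interior, start, end, exact) and scoring rows by the fraction of query terms that match as whole words. A standalone sketch of that scoring; like_patterns and whole_word_score are illustrative names only.

    from typing import List

    def like_patterns(term: str) -> List[str]:
        # The four LIKE shapes used per term: interior, start of text, end of text, whole text.
        return [f"% {term} %", f"{term} %", f"% {term}", term]

    def whole_word_score(content: str, terms: List[str]) -> float:
        # Fraction of query terms that appear as whole words in the content.
        content_lower = content.lower()
        hits = 0
        for term in terms:
            t = term.lower()
            if (f" {t} " in f" {content_lower} "
                    or content_lower.startswith(f"{t} ")
                    or content_lower.endswith(f" {t}")
                    or content_lower == t):
                hits += 1
        return hits / len(terms) if terms else 0.0

    print(like_patterns("agent"))                                # ['% agent %', 'agent %', '% agent', 'agent']
    print(whole_word_score("start an agent server", ["agent"]))  # 1.0
    print(whole_word_score("agents only", ["agent"]))            # 0.0, no whole-word hit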
@@ -306,14 +423,23 @@ class SearchEngine:
 
             # Sort by score
             results.sort(key=lambda x: x['score'], reverse=True)
+
             return results
 
         except Exception as e:
             logger.error(f"Error in fallback search: {e}")
             return []
 
-    def _merge_results(self, vector_results: List[Dict], keyword_results: List[Dict]) -> List[Dict[str, Any]]:
+    def _merge_results(self, vector_results: List[Dict], keyword_results: List[Dict],
+                       vector_weight: Optional[float] = None,
+                       keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
         """Merge and rank vector and keyword search results"""
+        # Use provided weights or defaults
+        if vector_weight is None:
+            vector_weight = 0.7
+        if keyword_weight is None:
+            keyword_weight = 0.3
+
         # Create a combined list with weighted scores
         combined = {}
 
@@ -335,8 +461,6 @@ class SearchEngine:
                 combined[chunk_id]['keyword_score'] = result['score']
 
         # Calculate combined score (weighted average)
-        vector_weight = 0.7
-        keyword_weight = 0.3
 
         for chunk_id, result in combined.items():
             vector_score = result.get('vector_score', 0.0)
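Note: the hardcoded 0.7/0.3 split is gone from the merge body; those values are now only the defaults applied inside _merge_results when no weights are supplied, so a caller-provided weight can shift the balance. The combination itself is a plain weighted average, roughly as in this illustrative helper (not a function in the package):

    def combined_score(vector_score: float, keyword_score: float,
                       vector_weight: float = 0.7, keyword_weight: float = 0.3) -> float:
        # Weighted average of the two signals, as in the merge step.
        return vector_score * vector_weight + keyword_score * keyword_weight

    print(combined_score(0.82, 0.40))            # default 0.7/0.3 split, about 0.694
    print(combined_score(0.82, 0.40, 0.5, 0.5))  # caller-supplied weights, about 0.61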
@@ -363,6 +487,686 @@ class SearchEngine:
                 filtered.append(result)
         return filtered
 
+    def _boost_exact_matches(self, results: List[Dict[str, Any]], original_query: str) -> List[Dict[str, Any]]:
+        """Boost scores for results that contain exact matches of the original query"""
+        if not original_query:
+            return results
+
+        # Extract key phrases to look for
+        query_lower = original_query.lower()
+
+        for result in results:
+            content_lower = result['content'].lower()
+            filename_lower = result['metadata'].get('filename', '').lower()
+
+            # Boost for exact phrase match in content
+            if query_lower in content_lower:
+                result['score'] *= 2.0  # Double score for exact match
+
+            # Boost for matches in filenames that suggest relevance
+            if any(term in filename_lower for term in ['example', 'sample', 'demo', 'tutorial', 'guide']):
+                if 'example' in query_lower or 'sample' in query_lower or 'code' in query_lower:
+                    result['score'] *= 1.5
+
+            # Boost for "getting started" type queries
+            if 'getting started' in query_lower and 'start' in content_lower:
+                result['score'] *= 1.5
+
+        return results
+
+    def _filename_search(self, query: str, count: int) -> List[Dict[str, Any]]:
+        """Search for query in filenames with term coverage scoring"""
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            query_lower = query.lower()
+            terms = query_lower.split()
+
+            # First try exact phrase match
+            cursor.execute('''
+                SELECT DISTINCT id, content, filename, section, tags, metadata
+                FROM chunks
+                WHERE LOWER(filename) LIKE ?
+                LIMIT ?
+            ''', (f'%{query_lower}%', count))
+
+            results = []
+            seen_ids = set()
+
+            # Process exact matches
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json = row
+                seen_ids.add(chunk_id)
+
+                # High score for exact phrase match
+                filename_lower = filename.lower()
+                basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower
+                if query_lower in basename:
+                    score = 3.0  # Exact match in basename (increased weight)
+                else:
+                    score = 2.0  # Exact match in path
+
+                results.append({
+                    'id': chunk_id,
+                    'content': content,
+                    'score': float(score),
+                    'metadata': {
+                        'filename': filename,
+                        'section': section,
+                        'tags': json.loads(tags_json) if tags_json else [],
+                        'metadata': json.loads(metadata_json) if metadata_json else {}
+                    },
+                    'search_type': 'filename',
+                    'match_coverage': 1.0  # Exact match = 100% coverage
+                })
+
+            # Then search for files containing ANY of the terms
+            if terms and len(results) < count * 3:  # Get more candidates
+                # Build OR query for any term match
+                conditions = []
+                params = []
+                for term in terms:
+                    conditions.append("LOWER(filename) LIKE ?")
+                    params.append(f'%{term}%')
+
+                sql = f'''
+                    SELECT DISTINCT id, content, filename, section, tags, metadata
+                    FROM chunks
+                    WHERE ({' OR '.join(conditions)})
+                    AND id NOT IN ({','.join(['?' for _ in seen_ids]) if seen_ids else '0'})
+                    LIMIT ?
+                '''
+                if seen_ids:
+                    params.extend(seen_ids)
+                params.append(count * 3)
+
+                cursor.execute(sql, params)
+
+                for row in cursor.fetchall():
+                    chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                    # Enhanced scoring based on term coverage
+                    filename_lower = filename.lower()
+                    basename = filename_lower.split('/')[-1] if '/' in filename_lower else filename_lower
+
+                    # Count matches in basename vs full path
+                    basename_matches = sum(1 for term in terms if term in basename)
+                    path_matches = sum(1 for term in terms if term in filename_lower)
+
+                    # Calculate term coverage (what % of query terms are matched)
+                    term_coverage = path_matches / len(terms) if terms else 0
+                    basename_coverage = basename_matches / len(terms) if terms else 0
+
+                    # Check for substring bonus (e.g., "code_examples" contains both terms together)
+                    substring_bonus = 0
+                    if len(terms) > 1:
+                        # Check if terms appear consecutively
+                        for i in range(len(terms) - 1):
+                            if f"{terms[i]}_{terms[i+1]}" in filename_lower or f"{terms[i]}{terms[i+1]}" in filename_lower:
+                                substring_bonus = 0.3
+                                break
+
+                    # Score based on coverage with exponential boost for more matches
+                    if basename_coverage > 0:
+                        # Exponential scoring for basename matches
+                        score = basename_coverage ** 1.5 + substring_bonus
+                    else:
+                        # Lower score for path-only matches
+                        score = (term_coverage * 0.5) ** 1.5 + substring_bonus
+
+                    results.append({
+                        'id': chunk_id,
+                        'content': content,
+                        'score': float(score),
+                        'metadata': {
+                            'filename': filename,
+                            'section': section,
+                            'tags': json.loads(tags_json) if tags_json else [],
+                            'metadata': json.loads(metadata_json) if metadata_json else {}
+                        },
+                        'search_type': 'filename',
+                        'match_coverage': term_coverage
+                    })
+
+            conn.close()
+
+            # Sort by score and return top results
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:count]
+
+        except Exception as e:
+            logger.error(f"Error in filename search: {e}")
+            return []
+
+    def _metadata_search(self, query: str, count: int) -> List[Dict[str, Any]]:
+        """Search in all metadata fields (tags, sections, category, product, source)"""
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            query_lower = query.lower()
+            terms = query_lower.split()
+            results = []
+            seen_ids = set()
+
+            # First, try to use the metadata_text column if it exists
+            try:
+                # Check if metadata_text column exists
+                cursor.execute("PRAGMA table_info(chunks)")
+                columns = [col[1] for col in cursor.fetchall()]
+                has_metadata_text = 'metadata_text' in columns
+            except:
+                has_metadata_text = False
+
+            if has_metadata_text:
+                # Use the new metadata_text column for efficient searching
+                # Build conditions for each term
+                conditions = []
+                for term in terms:
+                    conditions.append(f"metadata_text LIKE '%{term}%'")
+
+                if conditions:
+                    query_sql = f'''
+                        SELECT id, content, filename, section, tags, metadata
+                        FROM chunks
+                        WHERE {' AND '.join(conditions)}
+                        LIMIT ?
+                    '''
+                    cursor.execute(query_sql, (count * 10,))
+
+                    for row in cursor.fetchall():
+                        chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                        if chunk_id in seen_ids:
+                            continue
+
+                        # Parse metadata
+                        metadata = json.loads(metadata_json) if metadata_json else {}
+                        tags = json.loads(tags_json) if tags_json else []
+
+                        # Calculate score based on how many terms match
+                        score = 0
+                        for term in terms:
+                            # Check metadata values
+                            metadata_str = json.dumps(metadata).lower()
+                            if term in metadata_str:
+                                score += 1.5
+                            # Check tags
+                            if any(term in str(tag).lower() for tag in tags):
+                                score += 1.0
+                            # Check section
+                            if section and term in section.lower():
+                                score += 0.8
+
+                        if score > 0:
+                            seen_ids.add(chunk_id)
+                            results.append({
+                                'id': chunk_id,
+                                'content': content,
+                                'score': score,
+                                'metadata': {
+                                    'filename': filename,
+                                    'section': section,
+                                    'tags': tags,
+                                    'metadata': metadata
+                                },
+                                'search_type': 'metadata'
+                            })
+
+            # Fallback: search for JSON metadata embedded in content
+            # This ensures backwards compatibility
+            if len(results) < count:
+                # Build specific conditions for known patterns
+                specific_conditions = []
+
+                # Look for specific high-value patterns first
+                if 'code' in terms and 'examples' in terms:
+                    specific_conditions.append('content LIKE \'%"category": "Code Examples"%\'')
+                if 'sdk' in terms:
+                    specific_conditions.append('content LIKE \'%"product": "%\' || \'SDK\' || \'%"%\'')
+
+                # General term search in JSON content
+                for term in terms:
+                    specific_conditions.append(f"content LIKE '%\"{term}%'")
+
+                if specific_conditions:
+                    # Limit conditions to avoid too broad search
+                    conditions_to_use = specific_conditions[:10]
+                    query_sql = f'''
+                        SELECT id, content, filename, section, tags, metadata
+                        FROM chunks
+                        WHERE ({' OR '.join(conditions_to_use)})
+                        AND id NOT IN ({','.join(str(id) for id in seen_ids) if seen_ids else '0'})
+                        LIMIT ?
+                    '''
+                    cursor.execute(query_sql, (count * 5,))
+
+                    rows = cursor.fetchall()
+
+                    for row in rows:
+                        chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                        if chunk_id in seen_ids:
+                            continue
+
+                        # Try to extract metadata from JSON content
+                        json_metadata = {}
+                        try:
+                            # Look for metadata in JSON structure
+                            if '"metadata":' in content:
+                                import re
+                                # More robust regex to extract nested JSON object
+                                # This handles nested braces properly
+                                start = content.find('"metadata":')
+                                if start != -1:
+                                    # Find the opening brace
+                                    brace_start = content.find('{', start)
+                                    if brace_start != -1:
+                                        # Count braces to find matching closing brace
+                                        brace_count = 0
+                                        i = brace_start
+                                        while i < len(content):
+                                            if content[i] == '{':
+                                                brace_count += 1
+                                            elif content[i] == '}':
+                                                brace_count -= 1
+                                                if brace_count == 0:
+                                                    # Found matching closing brace
+                                                    metadata_str = content[brace_start:i+1]
+                                                    json_metadata = json.loads(metadata_str)
+                                                    break
+                                            i += 1
+                        except:
+                            pass
+
+                        # Calculate score based on matches
+                        score = 0
+                        fields_matched = 0
+
+                        # Check JSON metadata extracted from content
+                        if json_metadata:
+                            # Check category - count how many terms match
+                            category = json_metadata.get('category', '').lower()
+                            if category:
+                                category_matches = sum(1 for term in terms if term in category)
+                                if category_matches > 0:
+                                    score += 1.8 * (category_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Check product - count how many terms match
+                            product = json_metadata.get('product', '').lower()
+                            if product:
+                                product_matches = sum(1 for term in terms if term in product)
+                                if product_matches > 0:
+                                    score += 1.5 * (product_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Check source
+                            source = json_metadata.get('source', '').lower()
+                            if source:
+                                source_matches = sum(1 for term in terms if term in source)
+                                if source_matches > 0:
+                                    score += 1.2 * (source_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                            # Also check tags from JSON metadata
+                            json_tags = json_metadata.get('tags', [])
+                            if json_tags:
+                                tags_str = str(json_tags).lower()
+                                tag_matches = sum(1 for term in terms if term in tags_str)
+                                if tag_matches > 0:
+                                    score += 1.3 * (tag_matches / len(terms) if terms else 1)
+                                    fields_matched += 1
+
+                        if score > 0:
+                            seen_ids.add(chunk_id)
+                            results.append({
+                                'id': chunk_id,
+                                'content': content,
+                                'score': float(score),
+                                'metadata': {
+                                    'filename': filename,
+                                    'section': section,
+                                    'tags': json.loads(tags_json) if tags_json else [],
+                                    'metadata': json.loads(metadata_json) if metadata_json else {}
+                                },
+                                'search_type': 'metadata',
+                                'fields_matched': fields_matched
+                            })
+                            logger.debug(f"Metadata match: {filename} - score={score:.2f}, fields_matched={fields_matched}, json_metadata={json_metadata}")
+
+            # Also get chunks with regular metadata
+            cursor.execute('''
+                SELECT id, content, filename, section, tags, metadata
+                FROM chunks
+                WHERE (tags IS NOT NULL AND tags != '')
+                   OR (metadata IS NOT NULL AND metadata != '{}')
+                   OR (section IS NOT NULL AND section != '')
+                LIMIT ?
+            ''', (count * 10,))  # Get more to search through
+
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json = row
+
+                if chunk_id in seen_ids:
+                    continue
+
+                # Parse metadata
+                tags = json.loads(tags_json) if tags_json else []
+                metadata = json.loads(metadata_json) if metadata_json else {}
+
+                # Flatten nested metadata if present
+                if 'metadata' in metadata:
+                    # Handle double-nested metadata from some indexes
+                    nested_meta = metadata['metadata']
+                    metadata.update(nested_meta)
+
+                # Initialize scoring components
+                score_components = {
+                    'tags': 0,
+                    'section': 0,
+                    'category': 0,
+                    'product': 0,
+                    'source': 0,
+                    'description': 0
+                }
+
+                # Check tags
+                if tags:
+                    tag_matches = 0
+                    for tag in tags:
+                        tag_lower = tag.lower()
+                        # Full query match in tag
+                        if query_lower in tag_lower:
+                            tag_matches += 2.0
+                        else:
+                            # Individual term matches
+                            term_matches = sum(1 for term in terms if term in tag_lower)
+                            tag_matches += term_matches * 0.5
+
+                    if tag_matches > 0:
+                        score_components['tags'] = min(1.0, tag_matches / len(tags))
+
+                # Check section
+                if section and section.lower() != 'none':
+                    section_lower = section.lower()
+                    if query_lower in section_lower:
+                        score_components['section'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in section_lower)
+                        score_components['section'] = (term_matches / len(terms)) * 0.8 if terms else 0
+
+                # Check category field
+                category = metadata.get('category', '')
+                if category:
+                    category_lower = category.lower()
+                    if query_lower in category_lower:
+                        score_components['category'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in category_lower)
+                        score_components['category'] = (term_matches / len(terms)) * 0.9 if terms else 0
+
+                # Check product field
+                product = metadata.get('product', '')
+                if product:
+                    product_lower = product.lower()
+                    if query_lower in product_lower:
+                        score_components['product'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in product_lower)
+                        score_components['product'] = (term_matches / len(terms)) * 0.8 if terms else 0
+
+                # Check source field (original filename)
+                source = metadata.get('source', '')
+                if source:
+                    source_lower = source.lower()
+                    if query_lower in source_lower:
+                        score_components['source'] = 1.0
+                    else:
+                        term_matches = sum(1 for term in terms if term in source_lower)
+                        score_components['source'] = (term_matches / len(terms)) * 0.7 if terms else 0
+
+                # Check description or title fields
+                description = metadata.get('description', metadata.get('title', ''))
+                if description:
+                    desc_lower = description.lower()
+                    if query_lower in desc_lower:
+                        score_components['description'] = 0.8
+                    else:
+                        term_matches = sum(1 for term in terms if term in desc_lower)
+                        score_components['description'] = (term_matches / len(terms)) * 0.6 if terms else 0
+
+                # Calculate total score with weights
+                weights = {
+                    'category': 1.8,     # Strong signal
+                    'product': 1.5,      # Strong signal
+                    'tags': 1.3,         # Good signal
+                    'source': 1.2,       # Good signal
+                    'section': 1.0,      # Moderate signal
+                    'description': 0.8   # Weaker signal
+                }
+
+                total_score = sum(score_components[field] * weights.get(field, 1.0)
+                                  for field in score_components)
+
+                # Track match coverage
+                fields_matched = sum(1 for score in score_components.values() if score > 0)
+                match_coverage = sum(1 for term in terms if any(
+                    term in str(field_value).lower()
+                    for field_value in [tags, section, category, product, source, description]
+                    if field_value
+                )) / len(terms) if terms else 0
+
+                if total_score > 0:
+                    results.append({
+                        'id': chunk_id,
+                        'content': content,
+                        'score': float(total_score),
+                        'metadata': {
+                            'filename': filename,
+                            'section': section,
+                            'tags': tags,
+                            'metadata': metadata,
+                            'category': category,
+                            'product': product,
+                            'source': source
+                        },
+                        'search_type': 'metadata',
+                        'metadata_matches': score_components,
+                        'fields_matched': fields_matched,
+                        'match_coverage': match_coverage
+                    })
+                    seen_ids.add(chunk_id)
+
+            conn.close()
+
+            # Sort by score and return top results
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:count]
+
+        except Exception as e:
+            logger.error(f"Error in metadata search: {e}")
+            return []
+
+    def _add_vector_scores_to_candidates(self, candidates: Dict[str, Dict], query_vector: NDArray,
+                                         distance_threshold: float):
+        """Add vector similarity scores to existing candidates"""
+        if not candidates or not np:
+            return
+
+        try:
+            conn = sqlite3.connect(self.index_path)
+            cursor = conn.cursor()
+
+            # Get embeddings for candidate chunks only
+            chunk_ids = list(candidates.keys())
+            placeholders = ','.join(['?' for _ in chunk_ids])
+
+            cursor.execute(f'''
+                SELECT id, embedding
+                FROM chunks
+                WHERE id IN ({placeholders}) AND embedding IS NOT NULL AND embedding != ''
+            ''', chunk_ids)
+
+            for row in cursor.fetchall():
+                chunk_id, embedding_blob = row
+
+                if not embedding_blob:
+                    continue
+
+                try:
+                    # Convert embedding back to numpy array
+                    embedding = np.frombuffer(embedding_blob, dtype=np.float32).reshape(1, -1)
+
+                    # Calculate similarity
+                    similarity = cosine_similarity(query_vector, embedding)[0][0]
+                    distance = 1 - similarity
+
+                    # Add vector scores to candidate
+                    candidates[chunk_id]['vector_score'] = float(similarity)
+                    candidates[chunk_id]['vector_distance'] = float(distance)
+                    candidates[chunk_id]['sources']['vector_rerank'] = True
+
+                except Exception as e:
+                    logger.debug(f"Error processing embedding for chunk {chunk_id}: {e}")
+                    continue
+
+            conn.close()
+
+        except Exception as e:
+            logger.error(f"Error in vector re-ranking: {e}")
+
+    def _calculate_combined_score(self, candidate: Dict, distance_threshold: float) -> float:
+        """Calculate final score combining all signals with comprehensive match bonus"""
+        # Base scores from different sources
+        source_scores = candidate.get('source_scores', {})
+
+        # Check for comprehensive matching (multiple signals)
+        sources = candidate.get('sources', {})
+        num_sources = len(sources)
+
+        # Get match coverage information
+        match_coverage = candidate.get('match_coverage', 0)
+        fields_matched = candidate.get('fields_matched', 0)
+
+        # Calculate base score with exponential boost for multiple sources
+        if num_sources > 1:
+            # Multiple signal matches are exponentially better
+            multi_signal_boost = 1.0 + (0.3 * (num_sources - 1))
+            base_score = sum(source_scores.values()) * multi_signal_boost
+        else:
+            base_score = sum(source_scores.values())
+
+        # Apply comprehensive match bonus
+        if match_coverage > 0.5:  # More than 50% of query terms matched
+            coverage_bonus = 1.0 + (match_coverage - 0.5) * 0.5
+            base_score *= coverage_bonus
+
+        # Apply field diversity bonus (matching in multiple metadata fields)
+        if fields_matched > 2:
+            field_bonus = 1.0 + (fields_matched - 2) * 0.1
+            base_score *= field_bonus
+
+        # Apply vector similarity multiplier if available
+        if 'vector_score' in candidate:
+            vector_score = candidate['vector_score']
+            vector_distance = candidate.get('vector_distance', 1 - vector_score)
+
+            # Distance-aware scoring
+            if distance_threshold > 0:
+                if vector_distance <= distance_threshold:
+                    # Within threshold - full vector score
+                    vector_multiplier = vector_score
+                elif vector_distance <= distance_threshold * 1.5:
+                    # Near threshold - gradual decay
+                    overflow = (vector_distance - distance_threshold) / (distance_threshold * 0.5)
+                    vector_multiplier = vector_score * (1 - overflow * 0.3)
+                else:
+                    # Beyond threshold - minimal contribution
+                    vector_multiplier = vector_score * 0.3
+            else:
+                vector_multiplier = vector_score
+
+            # For chunks found by vector-only search, use vector score directly
+            if 'vector' in sources and len(sources) == 1:
+                base_score = vector_score
+            else:
+                # For chunks found by multiple methods, apply vector as quality check
+                base_score *= vector_multiplier
+
+        # Special handling for strong metadata matches
+        if 'metadata' in sources:
+            metadata_matches = candidate.get('metadata_matches', {})
+            # Strong category or product match should boost significantly
+            if metadata_matches.get('category', 0) > 0.8 or metadata_matches.get('product', 0) > 0.8:
+                base_score *= 1.2
+
+        return base_score
+
+    def _apply_diversity_penalties(self, results: List[Dict], target_count: int) -> List[Dict]:
+        """Apply penalties to prevent single-file dominance while maintaining quality"""
+        if not results:
+            return results
+
+        # Track file occurrences
+        file_counts = {}
+        penalized_results = []
+
+        # Define penalty multipliers
+        occurrence_penalties = {
+            1: 1.0,    # First chunk: no penalty
+            2: 0.85,   # Second chunk: 15% penalty
+            3: 0.7,    # Third chunk: 30% penalty
+            4: 0.5,    # Fourth chunk: 50% penalty
+        }
+
+        for result in results:
+            filename = result['metadata']['filename']
+
+            # Get current count for this file
+            current_count = file_counts.get(filename, 0) + 1
+            file_counts[filename] = current_count
+
+            # Apply penalty based on occurrence
+            penalty = occurrence_penalties.get(current_count, 0.4)  # 60% penalty for 5+ chunks
+
+            # Create a copy to avoid modifying original
+            penalized_result = result.copy()
+            penalized_result['diversity_penalty'] = penalty
+            penalized_result['final_score'] = result.get('final_score', result.get('score', 0)) * penalty
+
+            penalized_results.append(penalized_result)
+
+        # Re-sort by penalized scores
+        penalized_results.sort(key=lambda x: x['final_score'], reverse=True)
+
+        # Ensure minimum diversity if we have enough results
+        if len(penalized_results) > target_count:
+            unique_files = len(set(r['metadata']['filename'] for r in penalized_results[:target_count]))
+
+            # If top results are too homogeneous (e.g., all from 1-2 files)
+            if unique_files < min(3, target_count):
+                # Try to inject some diversity
+                selected = penalized_results[:target_count]
+                seen_files = set(r['metadata']['filename'] for r in selected)
+
+                # Look for high-quality results from other files
+                for result in penalized_results[target_count:]:
+                    if result['metadata']['filename'] not in seen_files:
+                        # If it's reasonably good (within 50% of top score), include it
+                        if result['final_score'] > 0.5 * selected[0]['final_score']:
+                            # Replace the lowest scoring result from an over-represented file
+                            for i in range(len(selected) - 1, -1, -1):
+                                if file_counts[selected[i]['metadata']['filename']] > 2:
+                                    selected[i] = result
+                                    seen_files.add(result['metadata']['filename'])
+                                    break
+
+                penalized_results[:target_count] = selected
+
+        return penalized_results
+
     def get_stats(self) -> Dict[str, Any]:
         """Get statistics about the search index"""
         # Use pgvector backend if available
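Note: _apply_diversity_penalties discounts repeated chunks from the same file with multipliers 1.0, 0.85, 0.7, 0.5, and 0.4 for the fifth and later occurrences, then re-sorts; the extra re-injection pass only runs when the top results are too homogeneous. A compact sketch of just the penalty pass; penalize_by_file is an illustrative name and the re-injection logic is omitted.

    from typing import Any, Dict, List

    PENALTIES = {1: 1.0, 2: 0.85, 3: 0.7, 4: 0.5}   # fifth and later occurrences fall back to 0.4

    def penalize_by_file(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        # Scale each result's final_score by how often its file has already appeared.
        seen: Dict[str, int] = {}
        out = []
        for r in results:
            fname = r['metadata']['filename']
            seen[fname] = seen.get(fname, 0) + 1
            penalty = PENALTIES.get(seen[fname], 0.4)
            out.append({**r, 'diversity_penalty': penalty,
                        'final_score': r.get('final_score', r.get('score', 0)) * penalty})
        return sorted(out, key=lambda x: x['final_score'], reverse=True)

    ranked = penalize_by_file([
        {'metadata': {'filename': 'a.md'}, 'final_score': 1.0},
        {'metadata': {'filename': 'a.md'}, 'final_score': 0.95},
        {'metadata': {'filename': 'b.md'}, 'final_score': 0.90},
    ])
    # The second a.md chunk is discounted by 15% and drops below b.md.
    print([(r['metadata']['filename'], r['final_score']) for r in ranked])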