signalwire-agents 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -24,51 +24,11 @@ import types
  from typing import Dict, List, Any, Optional, Union, Callable, Tuple, Type
  from urllib.parse import urlparse

- # Import and configure structlog
- try:
-     import structlog
-
-     # Only configure if not already configured
-     if not hasattr(structlog, "_configured") or not structlog._configured:
-         structlog.configure(
-             processors=[
-                 structlog.stdlib.filter_by_level,
-                 structlog.stdlib.add_logger_name,
-                 structlog.stdlib.add_log_level,
-                 structlog.stdlib.PositionalArgumentsFormatter(),
-                 structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S"),
-                 structlog.processors.StackInfoRenderer(),
-                 structlog.processors.format_exc_info,
-                 structlog.processors.UnicodeDecoder(),
-                 structlog.dev.ConsoleRenderer()
-             ],
-             context_class=dict,
-             logger_factory=structlog.stdlib.LoggerFactory(),
-             wrapper_class=structlog.stdlib.BoundLogger,
-             cache_logger_on_first_use=True,
-         )
-
-         # Set up root logger with structlog
-         logging.basicConfig(
-             format="%(message)s",
-             stream=sys.stdout,
-             level=logging.INFO,
-         )
-
-         # Mark as configured to avoid duplicate configuration
-         structlog._configured = True
-
-     # Create the module logger
-     logger = structlog.get_logger("swml_service")
-
- except ImportError:
-     # Fallback to standard logging if structlog is not available
-     logging.basicConfig(
-         level=logging.INFO,
-         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-         stream=sys.stdout
-     )
-     logger = logging.getLogger("swml_service")
+ # Import centralized logging system
+ from signalwire_agents.core.logging_config import get_logger
+
+ # Create the module logger using centralized system
+ logger = get_logger("swml_service")

  try:
      import fastapi
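For reference, downstream modules now obtain their logger from the centralized configuration instead of configuring structlog themselves. A minimal sketch, assuming get_logger returns a stdlib/structlog-style logger with the usual methods:

    # Sketch only: the exact configuration now lives in signalwire_agents.core.logging_config.
    from signalwire_agents.core.logging_config import get_logger

    logger = get_logger("swml_service")
    logger.info("SWML service initialized")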
@@ -74,29 +74,42 @@ logger = logging.getLogger(__name__)
  class DocumentProcessor:
      """Enhanced document processor with smart chunking capabilities"""

-     def __init__(self, chunking_strategy: str = 'sentence',
-                  max_sentences_per_chunk: int = 50,
-                  chunk_size: int = 50,
-                  overlap_size: int = 10,
-                  split_newlines: Optional[int] = None):
+     def __init__(
+         self,
+         chunking_strategy: str = 'sentence',
+         max_sentences_per_chunk: int = 5,
+         chunk_size: int = 50,
+         chunk_overlap: int = 10,
+         split_newlines: Optional[int] = None,
+         index_nlp_backend: str = 'nltk',
+         verbose: bool = False,
+         semantic_threshold: float = 0.5,
+         topic_threshold: float = 0.3
+     ):
          """
-         Initialize document processor with chunking strategy
+         Initialize document processor

          Args:
-             chunking_strategy: 'sentence', 'sliding', 'paragraph', or 'page'
-             max_sentences_per_chunk: For sentence strategy (default: 50)
+             chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+             max_sentences_per_chunk: For sentence strategy (default: 5)
              chunk_size: For sliding strategy - words per chunk (default: 50)
-             overlap_size: For sliding strategy - overlap in words (default: 10)
+             chunk_overlap: For sliding strategy - overlap in words (default: 10)
              split_newlines: For sentence strategy - split on multiple newlines (optional)
+             index_nlp_backend: NLP backend for indexing (default: 'nltk')
+             verbose: Whether to enable verbose logging (default: False)
+             semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+             topic_threshold: Similarity threshold for topic chunking (default: 0.3)
          """
          self.chunking_strategy = chunking_strategy
          self.max_sentences_per_chunk = max_sentences_per_chunk
          self.chunk_size = chunk_size
-         self.overlap_size = overlap_size
+         self.chunk_overlap = chunk_overlap
          self.split_newlines = split_newlines
+         self.semantic_threshold = semantic_threshold
+         self.topic_threshold = topic_threshold

          # Legacy support for old character-based chunking
-         self.chunk_overlap = overlap_size
+         self.chunk_overlap = chunk_overlap

      def create_chunks(self, content: str, filename: str,
                        file_type: str) -> List[Dict[str, Any]]:
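Note the breaking rename in this constructor: overlap_size becomes chunk_overlap, and the default for max_sentences_per_chunk drops from 50 to 5. A hedged migration sketch (the import path is an assumption; adjust to wherever DocumentProcessor is exported in your install):

    from signalwire_agents.search.document_processor import DocumentProcessor  # path assumed

    # 0.1.13: DocumentProcessor(chunking_strategy='sliding', chunk_size=50, overlap_size=10)
    # 0.1.14:
    processor = DocumentProcessor(
        chunking_strategy='sliding',
        chunk_size=50,
        chunk_overlap=10,   # renamed from overlap_size
        index_nlp_backend='nltk',
    )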
@@ -121,6 +134,12 @@ class DocumentProcessor:
              return self._chunk_by_paragraphs(content, filename, file_type)
          elif self.chunking_strategy == 'page':
              return self._chunk_by_pages(content, filename, file_type)
+         elif self.chunking_strategy == 'semantic':
+             return self._chunk_by_semantic(content, filename, file_type)
+         elif self.chunking_strategy == 'topic':
+             return self._chunk_by_topics(content, filename, file_type)
+         elif self.chunking_strategy == 'qa':
+             return self._chunk_by_qa_optimization(content, filename, file_type)
          else:
              # Fallback to sentence-based chunking
              return self._chunk_by_sentences(content, filename, file_type)
@@ -674,7 +693,7 @@ class DocumentProcessor:
          chunk_index = 0

          # Create overlapping chunks
-         for i in range(0, len(words), self.chunk_size - self.overlap_size):
+         for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
              chunk_words = words[i:i + self.chunk_size]
              if chunk_words:
                  chunk_content = ' '.join(chunk_words)
@@ -686,7 +705,7 @@ class DocumentProcessor:
                      'chunk_method': 'sliding_window',
                      'chunk_index': chunk_index,
                      'chunk_size_words': self.chunk_size,
-                     'overlap_size_words': self.overlap_size,
+                     'overlap_size_words': self.chunk_overlap,
                      'start_word': i,
                      'end_word': i + len(chunk_words)
                  }
@@ -761,4 +780,246 @@ class DocumentProcessor:
                  }
              ))

-         return chunks
+         return chunks
+
+     def _chunk_by_semantic(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk based on semantic similarity between sentences"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Get sentences
+         if sent_tokenize:
+             sentences = sent_tokenize(content)
+         else:
+             sentences = content.split('. ')
+             sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+         if len(sentences) <= 1:
+             return [self._create_chunk(content, filename, "Section 1",
+                                        metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+         # Generate embeddings for sentences (using the same model as the index)
+         try:
+             from sentence_transformers import SentenceTransformer
+             from sklearn.metrics.pairwise import cosine_similarity
+             import numpy as np
+
+             model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+             embeddings = model.encode(sentences, show_progress_bar=False)
+
+             # Calculate similarity between adjacent sentences
+             similarities = []
+             for i in range(len(embeddings) - 1):
+                 sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+                 similarities.append(sim)
+
+             # Find split points where similarity drops below threshold
+             split_points = [0]
+             for i, sim in enumerate(similarities):
+                 if sim < self.semantic_threshold:
+                     split_points.append(i + 1)
+             split_points.append(len(sentences))
+
+             # Create chunks
+             chunks = []
+             for i in range(len(split_points) - 1):
+                 start_idx = split_points[i]
+                 end_idx = split_points[i + 1]
+                 chunk_sentences = sentences[start_idx:end_idx]
+
+                 # Ensure minimum chunk size
+                 if len(chunk_sentences) < 2 and i > 0:
+                     # Merge with previous chunk
+                     chunks[-1]['content'] += ' ' + ' '.join(chunk_sentences)
+                     continue
+
+                 chunk_content = ' '.join(chunk_sentences)
+                 chunks.append(self._create_chunk(
+                     content=chunk_content,
+                     filename=filename,
+                     section=f"Semantic Section {i+1}",
+                     metadata={
+                         'chunk_method': 'semantic',
+                         'chunk_index': i,
+                         'semantic_threshold': self.semantic_threshold,
+                         'sentence_count': len(chunk_sentences)
+                     }
+                 ))
+
+             return chunks if chunks else [self._create_chunk(content, filename, "Section 1",
+                                                              metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+         except ImportError:
+             # Fallback to sentence-based chunking
+             return self._chunk_by_sentences(content, filename, file_type)
+
+     def _chunk_by_topics(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk based on topic changes using keyword analysis"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         if sent_tokenize:
+             sentences = sent_tokenize(content)
+         else:
+             sentences = content.split('. ')
+             sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+         if len(sentences) <= 3:
+             return [self._create_chunk(content, filename, "Topic 1",
+                                        metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+         try:
+             # Simple topic detection using keyword overlap
+             from collections import Counter
+             import re
+
+             # Extract keywords from each sentence
+             sentence_keywords = []
+             for sentence in sentences:
+                 # Simple keyword extraction (could be enhanced with NLP)
+                 words = re.findall(r'\b[a-zA-Z]{3,}\b', sentence.lower())
+                 # Filter common words (basic stopwords)
+                 stopwords = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man', 'way', 'she', 'use', 'her', 'many', 'oil', 'sit', 'set', 'run', 'eat', 'far', 'sea', 'eye', 'ask', 'own', 'say', 'too', 'any', 'try', 'us', 'an', 'as', 'at', 'be', 'he', 'if', 'in', 'is', 'it', 'my', 'of', 'on', 'or', 'to', 'up', 'we', 'go', 'no', 'so', 'am', 'by', 'do', 'me'}
+                 keywords = [w for w in words if w not in stopwords and len(w) > 3]
+                 sentence_keywords.append(set(keywords))
+
+             # Find topic boundaries based on keyword overlap
+             chunks = []
+             current_chunk = [sentences[0]]
+             current_keywords = sentence_keywords[0]
+
+             for i in range(1, len(sentences)):
+                 # Calculate keyword overlap with current chunk
+                 overlap = len(current_keywords.intersection(sentence_keywords[i]))
+                 total_keywords = len(current_keywords.union(sentence_keywords[i]))
+
+                 if total_keywords > 0:
+                     similarity = overlap / total_keywords
+                 else:
+                     similarity = 0
+
+                 # If similarity is low, start new chunk
+                 if similarity < self.topic_threshold and len(current_chunk) >= 2:
+                     chunk_content = ' '.join(current_chunk)
+                     chunks.append(self._create_chunk(
+                         content=chunk_content,
+                         filename=filename,
+                         section=f"Topic {len(chunks)+1}",
+                         metadata={
+                             'chunk_method': 'topic',
+                             'chunk_index': len(chunks),
+                             'topic_keywords': list(current_keywords)[:10], # Top keywords
+                             'sentence_count': len(current_chunk),
+                             'topic_threshold': self.topic_threshold
+                         }
+                     ))
+                     current_chunk = [sentences[i]]
+                     current_keywords = sentence_keywords[i]
+                 else:
+                     current_chunk.append(sentences[i])
+                     current_keywords = current_keywords.union(sentence_keywords[i])
+
+             # Add final chunk
+             if current_chunk:
+                 chunk_content = ' '.join(current_chunk)
+                 chunks.append(self._create_chunk(
+                     content=chunk_content,
+                     filename=filename,
+                     section=f"Topic {len(chunks)+1}",
+                     metadata={
+                         'chunk_method': 'topic',
+                         'chunk_index': len(chunks),
+                         'topic_keywords': list(current_keywords)[:10],
+                         'sentence_count': len(current_chunk),
+                         'topic_threshold': self.topic_threshold
+                     }
+                 ))
+
+             return chunks if chunks else [self._create_chunk(content, filename, "Topic 1",
+                                                              metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+         except Exception:
+             # Fallback to sentence-based chunking
+             return self._chunk_by_sentences(content, filename, file_type)
+
+     def _chunk_by_qa_optimization(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Create chunks optimized for question-answering"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         if sent_tokenize:
+             sentences = sent_tokenize(content)
+         else:
+             sentences = content.split('. ')
+             sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+         # Patterns that indicate Q&A structure
+         question_patterns = [
+             r'\?', # Questions
+             r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
+             r'(step|steps|process|procedure|method|way to)',
+             r'(example|examples|instance|case)',
+             r'(definition|meaning|refers to|means)',
+         ]
+
+         chunks = []
+         current_chunk = []
+         current_context = []
+
+         for i, sentence in enumerate(sentences):
+             sentence_lower = sentence.lower().strip()
+
+             # Check if this sentence contains Q&A indicators
+             is_qa_relevant = any(re.search(pattern, sentence_lower) for pattern in question_patterns)
+
+             if is_qa_relevant or len(current_chunk) == 0:
+                 current_chunk.append(sentence)
+                 # Add surrounding context (previous and next sentences)
+                 if i > 0 and sentences[i-1] not in current_chunk:
+                     current_context.append(sentences[i-1])
+                 if i < len(sentences) - 1:
+                     current_context.append(sentences[i+1])
+             else:
+                 current_chunk.append(sentence)
+
+             # Create chunk when we have enough content or reach a natural break
+             if (len(current_chunk) >= 3 and
+                 (i == len(sentences) - 1 or # Last sentence
+                  sentence.endswith('.') and len(current_chunk) >= 5)): # Natural break
+
+                 # Combine chunk with context
+                 full_content = current_context + current_chunk
+                 chunk_content = ' '.join(full_content)
+
+                 chunks.append(self._create_chunk(
+                     content=chunk_content,
+                     filename=filename,
+                     section=f"QA Section {len(chunks)+1}",
+                     metadata={
+                         'chunk_method': 'qa_optimized',
+                         'chunk_index': len(chunks),
+                         'has_question': any('?' in s for s in current_chunk),
+                         'has_process': any(re.search(r'(step|process|method)', s.lower()) for s in current_chunk),
+                         'sentence_count': len(full_content)
+                     }
+                 ))
+
+                 current_chunk = []
+                 current_context = []
+
+         # Handle remaining content
+         if current_chunk:
+             chunk_content = ' '.join(current_context + current_chunk)
+             chunks.append(self._create_chunk(
+                 content=chunk_content,
+                 filename=filename,
+                 section=f"QA Section {len(chunks)+1}",
+                 metadata={
+                     'chunk_method': 'qa_optimized',
+                     'chunk_index': len(chunks),
+                     'sentence_count': len(current_context + current_chunk)
+                 }
+             ))
+
+         return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
+                                                          metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
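The three new strategies plug into the create_chunks dispatch shown earlier. A small usage sketch with illustrative text (the import path is assumed; note that 'semantic' silently falls back to sentence chunking if sentence-transformers is not installed):

    from signalwire_agents.search.document_processor import DocumentProcessor  # path assumed

    text = (
        "What is SWML? SWML is a markup language for describing call flows. "
        "To deploy an agent, install the package and start the service. "
        "Billing is handled separately and has its own dashboard."
    )

    for strategy in ('semantic', 'topic', 'qa'):
        processor = DocumentProcessor(chunking_strategy=strategy)
        chunks = processor.create_chunks(content=text, filename="faq.md", file_type="md")
        print(strategy, len(chunks))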
@@ -35,27 +35,61 @@ logger = logging.getLogger(__name__)
  class IndexBuilder:
      """Build searchable indexes from document directories"""

-     def __init__(self, model_name: str = 'sentence-transformers/all-mpnet-base-v2',
-                  chunking_strategy: str = 'sentence',
-                  max_sentences_per_chunk: int = 50,
-                  chunk_size: int = 50,
-                  chunk_overlap: int = 10,
-                  split_newlines: Optional[int] = None,
-                  verbose: bool = False):
+     def __init__(
+         self,
+         model_name: str = 'sentence-transformers/all-mpnet-base-v2',
+         chunking_strategy: str = 'sentence',
+         max_sentences_per_chunk: int = 5,
+         chunk_size: int = 50,
+         chunk_overlap: int = 10,
+         split_newlines: Optional[int] = None,
+         index_nlp_backend: str = 'nltk',
+         verbose: bool = False,
+         semantic_threshold: float = 0.5,
+         topic_threshold: float = 0.3
+     ):
+         """
+         Initialize the index builder
+
+         Args:
+             model_name: Name of the sentence transformer model to use
+             chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+             max_sentences_per_chunk: For sentence strategy (default: 5)
+             chunk_size: For sliding strategy - words per chunk (default: 50)
+             chunk_overlap: For sliding strategy - overlap in words (default: 10)
+             split_newlines: For sentence strategy - split on multiple newlines (optional)
+             index_nlp_backend: NLP backend for indexing (default: 'nltk')
+             verbose: Whether to enable verbose logging (default: False)
+             semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+             topic_threshold: Similarity threshold for topic chunking (default: 0.3)
+         """
          self.model_name = model_name
          self.chunking_strategy = chunking_strategy
          self.max_sentences_per_chunk = max_sentences_per_chunk
          self.chunk_size = chunk_size
          self.chunk_overlap = chunk_overlap
          self.split_newlines = split_newlines
+         self.index_nlp_backend = index_nlp_backend
          self.verbose = verbose
+         self.semantic_threshold = semantic_threshold
+         self.topic_threshold = topic_threshold
          self.model = None
+
+         # Validate NLP backend
+         if self.index_nlp_backend not in ['nltk', 'spacy']:
+             logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
+             self.index_nlp_backend = 'nltk'
+
          self.doc_processor = DocumentProcessor(
              chunking_strategy=chunking_strategy,
              max_sentences_per_chunk=max_sentences_per_chunk,
              chunk_size=chunk_size,
-             overlap_size=chunk_overlap,
-             split_newlines=split_newlines
+             chunk_overlap=chunk_overlap,
+             split_newlines=split_newlines,
+             index_nlp_backend=self.index_nlp_backend,
+             verbose=self.verbose,
+             semantic_threshold=self.semantic_threshold,
+             topic_threshold=self.topic_threshold
          )

      def _load_model(self):
@@ -130,7 +164,8 @@ class IndexBuilder:
              # Preprocess content for better search
              processed = preprocess_document_content(
                  chunk['content'],
-                 language=chunk.get('language', 'en')
+                 language=chunk.get('language', 'en'),
+                 index_nlp_backend=self.index_nlp_backend
              )

              chunk['processed_content'] = processed['enhanced_text']
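A hedged sketch of driving the updated IndexBuilder directly; source_dir, output_file, and the .swsearch name are illustrative, and only the keyword names visible in this diff are assumed:

    from signalwire_agents.search import IndexBuilder

    builder = IndexBuilder(
        chunking_strategy='sentence',
        max_sentences_per_chunk=5,
        index_nlp_backend='nltk',   # 'spacy' trades indexing speed for quality
        verbose=True,
    )
    builder.build_index(source_dir="./docs", output_file="docs.swsearch")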
@@ -186,7 +186,8 @@ def remove_duplicate_words(input_string: str) -> str:

  def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[List[str]] = None,
                       max_synonyms: int = 5, debug: bool = False, vector: bool = False,
-                      vectorize_query_param: bool = False, nlp_backend: str = 'nltk') -> Dict[str, Any]:
+                      vectorize_query_param: bool = False, nlp_backend: str = None,
+                      query_nlp_backend: str = 'nltk') -> Dict[str, Any]:
      """
      Advanced query preprocessing with language detection, POS tagging, synonym expansion, and vectorization

@@ -198,12 +199,19 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
          debug: Enable debug output
          vector: Include vector embedding in output
          vectorize_query_param: If True, just vectorize without other processing
-         nlp_backend: NLP backend to use ('nltk' for fast, 'spacy' for better quality)
+         nlp_backend: DEPRECATED - use query_nlp_backend instead
+         query_nlp_backend: NLP backend for query processing ('nltk' for fast, 'spacy' for better quality)

      Returns:
          Dict containing processed query, language, POS tags, and optionally vector
      """

+     # Handle backward compatibility
+     if nlp_backend is not None:
+         query_nlp_backend = nlp_backend
+         if debug:
+             logger.info(f"Using deprecated 'nlp_backend' parameter, please use 'query_nlp_backend' instead")
+
      if vectorize_query_param:
          # Vectorize the query directly
          vectorized_query = vectorize_query(query)
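Callers migrating from the old keyword only need to rename it; the shim above keeps nlp_backend working for now. A short sketch using only names visible in this diff:

    from signalwire_agents.search.query_processor import preprocess_query

    # Old spelling (still accepted, logged as deprecated when debug=True):
    # enhanced = preprocess_query("reset my password", nlp_backend='spacy')

    # New spelling:
    enhanced = preprocess_query("reset my password", query_nlp_backend='spacy', vector=True)
    print(enhanced['enhanced_text'], enhanced['nlp_backend_used'])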
@@ -226,15 +234,16 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L

      # Load spaCy model based on the language and backend choice
      nlp = None
-     if nlp_backend == 'spacy':
+     if query_nlp_backend == 'spacy':
          nlp = load_spacy_model(language)
          if nlp is None and debug:
              logger.info("spaCy backend requested but not available, falling back to NLTK")
-     elif nlp_backend == 'nltk':
+     elif query_nlp_backend == 'nltk':
          if debug:
-             logger.info("Using NLTK backend for NLP processing")
+             logger.info("Using NLTK backend for query processing")
      else:
-         logger.warning(f"Unknown NLP backend '{nlp_backend}', using NLTK")
+         logger.warning(f"Unknown query NLP backend '{query_nlp_backend}', using NLTK")
+         query_nlp_backend = 'nltk'

      # Tokenization and stop word removal
      tokens = nltk.word_tokenize(query)
@@ -258,7 +267,7 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
      lemmas = []
      pos_tags = {}

-     if nlp and nlp_backend == 'spacy':
+     if nlp and query_nlp_backend == 'spacy':
          # Use spaCy for better POS tagging
          doc = nlp(" ".join(tokens))
          for token in doc:
@@ -303,14 +312,14 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L

      if debug:
          logger.info(f"Expanded Query: {final_query_str}")
-         logger.info(f"NLP Backend Used: {nlp_backend if nlp or nlp_backend == 'nltk' else 'nltk (fallback)'}")
+         logger.info(f"NLP Backend Used: {query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk (fallback)'}")

      formatted_output = {
          'input': final_query_str,
          'enhanced_text': final_query_str, # Alias for compatibility
          'language': language,
          'POS': pos_tags,
-         'nlp_backend_used': nlp_backend if nlp or nlp_backend == 'nltk' else 'nltk'
+         'nlp_backend_used': query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk'
      }

      # Vectorize query if requested
@@ -323,19 +332,25 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L

      return formatted_output

- def preprocess_document_content(content: str, language: str = 'en', nlp_backend: str = 'nltk') -> Dict[str, Any]:
+ def preprocess_document_content(content: str, language: str = 'en', nlp_backend: str = None,
+                                 index_nlp_backend: str = 'nltk') -> Dict[str, Any]:
      """
      Preprocess document content for better searchability

      Args:
          content: Document content to process
          language: Language code for processing
-         nlp_backend: NLP backend to use ('nltk' for fast, 'spacy' for better quality)
+         nlp_backend: DEPRECATED - use index_nlp_backend instead
+         index_nlp_backend: NLP backend for document processing ('nltk' for fast, 'spacy' for better quality)

      Returns:
          Dict containing enhanced text and extracted keywords
      """

+     # Handle backward compatibility
+     if nlp_backend is not None:
+         index_nlp_backend = nlp_backend
+
      # Use existing preprocessing but adapted for documents
      processed = preprocess_query(
          content,
@@ -344,7 +359,7 @@ def preprocess_document_content(content: str, language: str = 'en', nlp_backend:
          max_synonyms=2, # Fewer synonyms for documents
          debug=False,
          vector=False,
-         nlp_backend=nlp_backend
+         query_nlp_backend=index_nlp_backend
      )

      # Extract key terms for keyword search
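The document-side helper follows the same deprecation pattern with index_nlp_backend. A minimal sketch, assuming it is exported from the same query_processor module:

    from signalwire_agents.search.query_processor import preprocess_document_content  # path assumed

    chunk_text = "SignalWire agents expose SWAIG functions for vector search."
    processed = preprocess_document_content(chunk_text, language='en', index_nlp_backend='nltk')
    enhanced_text = processed['enhanced_text']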
@@ -9,6 +9,6 @@ Skills are automatically discovered from subdirectories.
  from .registry import skill_registry

  # Trigger skill discovery on import
- skill_registry.discover_skills()
+ # skill_registry.discover_skills()

  __all__ = ["skill_registry"]
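Since skill discovery is no longer triggered at import time here, code that relied on that side effect may need to trigger it explicitly. A sketch under the assumption that discover_skills() remains a public method of the registry:

    from signalwire_agents.skills import skill_registry

    # Explicitly populate the registry before looking up skills.
    skill_registry.discover_skills()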
@@ -75,10 +75,25 @@ class NativeVectorSearchSkill(SkillBase):
          self.swaig_fields = self.params.get('swaig_fields', {})

          # NLP backend configuration
-         self.nlp_backend = self.params.get('nlp_backend', 'nltk') # Default to faster NLTK
-         if self.nlp_backend not in ['nltk', 'spacy']:
-             self.logger.warning(f"Invalid nlp_backend '{self.nlp_backend}', using 'nltk'")
-             self.nlp_backend = 'nltk'
+         self.nlp_backend = self.params.get('nlp_backend') # Backward compatibility
+         self.index_nlp_backend = self.params.get('index_nlp_backend', 'nltk') # Default to fast NLTK for indexing
+         self.query_nlp_backend = self.params.get('query_nlp_backend', 'nltk') # Default to fast NLTK for search
+
+         # Handle backward compatibility
+         if self.nlp_backend is not None:
+             self.logger.warning("Parameter 'nlp_backend' is deprecated. Use 'index_nlp_backend' and 'query_nlp_backend' instead.")
+             # If old parameter is used, apply it to both
+             self.index_nlp_backend = self.nlp_backend
+             self.query_nlp_backend = self.nlp_backend
+
+         # Validate parameters
+         if self.index_nlp_backend not in ['nltk', 'spacy']:
+             self.logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
+             self.index_nlp_backend = 'nltk'
+
+         if self.query_nlp_backend not in ['nltk', 'spacy']:
+             self.logger.warning(f"Invalid query_nlp_backend '{self.query_nlp_backend}', using 'nltk'")
+             self.query_nlp_backend = 'nltk'

          # Auto-build index if requested and search is available
          if self.build_index and self.source_dir and self.search_available:
@@ -93,7 +108,10 @@ class NativeVectorSearchSkill(SkillBase):
              self.logger.info(f"Building search index from {self.source_dir}...")
              from signalwire_agents.search import IndexBuilder

-             builder = IndexBuilder(verbose=self.params.get('verbose', False))
+             builder = IndexBuilder(
+                 verbose=self.params.get('verbose', False),
+                 index_nlp_backend=self.index_nlp_backend
+             )
              builder.build_index(
                  source_dir=self.source_dir,
                  output_file=self.index_file,
@@ -187,7 +205,7 @@ class NativeVectorSearchSkill(SkillBase):
          try:
              # Preprocess the query
              from signalwire_agents.search.query_processor import preprocess_query
-             enhanced = preprocess_query(query, language='en', vector=True, nlp_backend=self.nlp_backend)
+             enhanced = preprocess_query(query, language='en', vector=True, query_nlp_backend=self.query_nlp_backend)

              # Perform search (local or remote)
              if self.use_remote:
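Taken together, the skill now reads separate backends for indexing and for query preprocessing from its params. A hedged configuration sketch, assuming an AgentBase instance named agent and its add_skill(name, params) interface; the key names follow the params read above, and the index_file value is illustrative:

    agent.add_skill("native_vector_search", {
        "index_file": "docs.swsearch",   # illustrative path
        "index_nlp_backend": "nltk",     # used when building the index
        "query_nlp_backend": "spacy",    # used when preprocessing queries
    })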