signalwire-agents 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. signalwire_agents/__init__.py +5 -1
  2. signalwire_agents/agent_server.py +222 -13
  3. signalwire_agents/cli/build_search.py +457 -0
  4. signalwire_agents/cli/test_swaig.py +177 -113
  5. signalwire_agents/core/agent_base.py +1 -3
  6. signalwire_agents/core/logging_config.py +232 -0
  7. signalwire_agents/core/swaig_function.py +2 -3
  8. signalwire_agents/core/swml_renderer.py +43 -28
  9. signalwire_agents/search/__init__.py +131 -0
  10. signalwire_agents/search/document_processor.py +764 -0
  11. signalwire_agents/search/index_builder.py +534 -0
  12. signalwire_agents/search/query_processor.py +371 -0
  13. signalwire_agents/search/search_engine.py +383 -0
  14. signalwire_agents/search/search_service.py +251 -0
  15. signalwire_agents/skills/native_vector_search/__init__.py +1 -0
  16. signalwire_agents/skills/native_vector_search/skill.py +352 -0
  17. signalwire_agents/skills/registry.py +2 -15
  18. signalwire_agents/utils/__init__.py +13 -1
  19. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/METADATA +110 -3
  20. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/RECORD +25 -16
  21. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/entry_points.txt +1 -0
  22. signalwire_agents/utils/serverless.py +0 -38
  23. {signalwire_agents-0.1.11.data → signalwire_agents-0.1.13.data}/data/schema.json +0 -0
  24. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/WHEEL +0 -0
  25. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/licenses/LICENSE +0 -0
  26. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/top_level.txt +0 -0
signalwire_agents/search/query_processor.py
@@ -0,0 +1,371 @@
+ """
+ Copyright (c) 2025 SignalWire
+
+ This file is part of the SignalWire AI Agents SDK.
+
+ Licensed under the MIT License.
+ See LICENSE file in the project root for full license information.
+ """
+
+ import os
+ import nltk
+ import re
+ from typing import Dict, Any, List, Optional
+ from nltk.corpus import wordnet as wn
+ from nltk.stem import PorterStemmer
+ import logging
+
+ # Configure logging
+ logger = logging.getLogger(__name__)
+
+ # Global flag to track if we've already warned about spaCy
+ _spacy_warning_shown = False
+
+ # Language detection and spaCy model loading
+ def detect_language(text: str) -> str:
+     """
+     Detect language of input text
+     Simple implementation - can be enhanced with langdetect library
+     """
+     # Simple heuristic-based detection
+     # In a full implementation, you'd use langdetect or similar
+     common_english_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'must'}
+     common_spanish_words = {'el', 'la', 'de', 'que', 'y', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al', 'del', 'los', 'las', 'una', 'como', 'pero', 'sus', 'han', 'fue', 'ser', 'está', 'todo', 'más', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'donde', 'quien', 'desde', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mío', 'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas'}
+
+     words = text.lower().split()
+     english_count = sum(1 for word in words if word in common_english_words)
+     spanish_count = sum(1 for word in words if word in common_spanish_words)
+
+     if spanish_count > english_count:
+         return 'es'
+     else:
+         return 'en'
+
+ def load_spacy_model(language: str):
+     """
+     Load spaCy model for the given language
+     Returns None if spaCy is not available or model not found
+     """
+     global _spacy_warning_shown
+
+     try:
+         import spacy
+
+         # Language model mapping
+         model_map = {
+             'en': 'en_core_web_sm',
+             'es': 'es_core_news_sm',
+             'fr': 'fr_core_news_sm',
+             'de': 'de_core_news_sm',
+             'it': 'it_core_news_sm',
+             'pt': 'pt_core_news_sm'
+         }
+
+         model_name = model_map.get(language, 'en_core_web_sm')
+
+         try:
+             return spacy.load(model_name)
+         except OSError:
+             if not _spacy_warning_shown:
+                 logger.warning(f"spaCy model '{model_name}' not found. Falling back to NLTK.")
+                 _spacy_warning_shown = True
+             return None
+
+     except ImportError:
+         if not _spacy_warning_shown:
+             logger.warning("spaCy not available. Using NLTK for POS tagging.")
+             _spacy_warning_shown = True
+         return None
+
+ def vectorize_query(query: str):
+     """
+     Vectorize query using sentence transformers
+     Returns numpy array of embeddings
+     """
+     try:
+         from sentence_transformers import SentenceTransformer
+         import numpy as np
+
+         # Use the same model as specified in the architecture
+         model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+         embedding = model.encode(query, show_progress_bar=False)
+         return embedding
+
+     except ImportError:
+         logger.error("sentence-transformers not available. Cannot vectorize query.")
+         return None
+
+ # Language to NLTK stopwords mapping
+ stopwords_language_map = {
+     'en': 'english',
+     'es': 'spanish',
+     'fr': 'french',
+     'de': 'german',
+     'it': 'italian',
+     'pt': 'portuguese',
+     'nl': 'dutch',
+     'ru': 'russian',
+     'ar': 'arabic',
+     'da': 'danish',
+     'fi': 'finnish',
+     'hu': 'hungarian',
+     'no': 'norwegian',
+     'ro': 'romanian',
+     'sv': 'swedish',
+     'tr': 'turkish'
+ }
+
+ # Function to ensure NLTK resources are downloaded
+ def ensure_nltk_resources():
+     """Download required NLTK resources if not already present"""
+     resources = ['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords']
+     for resource in resources:
+         try:
+             nltk.data.find(f'corpora/{resource}')
+         except LookupError:
+             try:
+                 nltk.download(resource, quiet=True)
+             except Exception as e:
+                 logger.warning(f"Failed to download NLTK resource '{resource}': {e}")
+
+ # Initialize NLTK resources
+ ensure_nltk_resources()
+
+ # Mapping spaCy POS tags to WordNet POS tags
+ pos_mapping = {
+     'NOUN': wn.NOUN,
+     'VERB': wn.VERB,
+     'ADJ': wn.ADJ,
+     'ADV': wn.ADV,
+     'PROPN': wn.NOUN, # Proper nouns as nouns
+ }
+
+ def get_wordnet_pos(spacy_pos):
+     """Map spaCy POS tags to WordNet POS tags."""
+     return pos_mapping.get(spacy_pos, wn.NOUN)
+
+ def get_synonyms(word: str, pos_tag: str, max_synonyms: int = 5) -> List[str]:
+     """Get synonyms for a word using WordNet"""
+     try:
+         wn_pos = get_wordnet_pos(pos_tag)
+         synsets = wn.synsets(word, pos=wn_pos)
+         synonyms = set()
+         for synset in synsets:
+             for lemma in synset.lemmas():
+                 synonym = lemma.name().replace('_', ' ')
+                 synonyms.add(synonym.lower())
+                 if len(synonyms) >= max_synonyms:
+                     break
+             if len(synonyms) >= max_synonyms:
+                 break
+         return list(synonyms)
+     except Exception as e:
+         logger.warning(f"Error getting synonyms for '{word}': {e}")
+         return []
+
+ def remove_duplicate_words(input_string: str) -> str:
+     """Remove duplicate words from the input string while preserving the order and punctuation."""
+     words = re.findall(r'\b\w+\b', input_string)
+     seen = set()
+     result = []
+
+     for word in words:
+         if word.lower() not in seen:
+             seen.add(word.lower())
+             result.append(word)
+
+     words_with_punctuation = input_string.split()
+     final_result = []
+     for word in words_with_punctuation:
+         clean_word = re.sub(r'\W+', '', word)
+         if clean_word.lower() in seen:
+             final_result.append(word)
+             seen.remove(clean_word.lower())
+
+     return ' '.join(final_result)
+
+ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[List[str]] = None,
+                      max_synonyms: int = 5, debug: bool = False, vector: bool = False,
+                      vectorize_query_param: bool = False, nlp_backend: str = 'nltk') -> Dict[str, Any]:
+     """
+     Advanced query preprocessing with language detection, POS tagging, synonym expansion, and vectorization
+
+     Args:
+         query: Input query string
+         language: Language code ('en', 'es', etc.) or 'auto' for detection
+         pos_to_expand: List of POS tags to expand with synonyms
+         max_synonyms: Maximum synonyms per word
+         debug: Enable debug output
+         vector: Include vector embedding in output
+         vectorize_query_param: If True, just vectorize without other processing
+         nlp_backend: NLP backend to use ('nltk' for fast, 'spacy' for better quality)
+
+     Returns:
+         Dict containing processed query, language, POS tags, and optionally vector
+     """
+
+     if vectorize_query_param:
+         # Vectorize the query directly
+         vectorized_query = vectorize_query(query)
+         if vectorized_query is not None:
+             return {
+                 'input': query,
+                 'vector': vectorized_query.tolist()
+             }
+         else:
+             return {'input': query, 'vector': None}
+
+     if pos_to_expand is None:
+         pos_to_expand = ['NOUN', 'VERB', 'ADJ'] # Default to expanding synonyms for nouns, verbs, and adjectives
+
+     # Detect language if set to 'auto'
+     if language == 'auto':
+         language = detect_language(query)
+         if debug:
+             logger.info(f"Detected language: {language}")
+
+     # Load spaCy model based on the language and backend choice
+     nlp = None
+     if nlp_backend == 'spacy':
+         nlp = load_spacy_model(language)
+         if nlp is None and debug:
+             logger.info("spaCy backend requested but not available, falling back to NLTK")
+     elif nlp_backend == 'nltk':
+         if debug:
+             logger.info("Using NLTK backend for NLP processing")
+     else:
+         logger.warning(f"Unknown NLP backend '{nlp_backend}', using NLTK")
+
+     # Tokenization and stop word removal
+     tokens = nltk.word_tokenize(query)
+     nltk_language = stopwords_language_map.get(language, 'english')
+
+     try:
+         stop_words = set(nltk.corpus.stopwords.words(nltk_language))
+     except LookupError:
+         try:
+             nltk.download('stopwords', quiet=True)
+             stop_words = set(nltk.corpus.stopwords.words(nltk_language))
+         except:
+             logger.warning(f"Could not load stopwords for language '{nltk_language}', using English")
+             stop_words = set(nltk.corpus.stopwords.words('english'))
+
+     tokens = [word for word in tokens if word.lower() not in stop_words]
+
+     # Lemmatization and POS Tagging using spaCy or NLTK
+     lemmatizer = nltk.WordNetLemmatizer()
+     stemmer = PorterStemmer()
+     lemmas = []
+     pos_tags = {}
+
+     if nlp and nlp_backend == 'spacy':
+         # Use spaCy for better POS tagging
+         doc = nlp(" ".join(tokens))
+         for token in doc:
+             lemma = token.lemma_.lower()
+             stemmed = stemmer.stem(lemma)
+             lemmas.append((token.text.lower(), stemmed))
+             pos_tags[token.text.lower()] = token.pos_
+         if debug:
+             logger.info(f"POS Tagging Results (spaCy): {pos_tags}")
+     else:
+         # Use NLTK (default or fallback)
+         nltk_pos_tags = nltk.pos_tag(tokens)
+         for token, pos_tag in nltk_pos_tags:
+             lemma = lemmatizer.lemmatize(token, get_wordnet_pos(pos_tag)).lower()
+             stemmed = stemmer.stem(lemma)
+             lemmas.append((token.lower(), stemmed))
+             pos_tags[token.lower()] = pos_tag
+         if debug:
+             logger.info(f"POS Tagging Results (NLTK): {pos_tags}")
+
+     # Expanding query with synonyms
+     expanded_query_set = set()
+     expanded_query = []
+
+     for original, lemma in lemmas:
+         if original not in expanded_query_set:
+             expanded_query.append(original)
+             expanded_query_set.add(original)
+         if lemma not in expanded_query_set:
+             expanded_query.append(lemma)
+             expanded_query_set.add(lemma)
+         if pos_tags.get(original) in pos_to_expand:
+             synonyms = get_synonyms(lemma, pos_tags[original], max_synonyms)
+             for synonym in synonyms:
+                 if synonym not in expanded_query_set:
+                     expanded_query.append(synonym)
+                     expanded_query_set.add(synonym)
+
+     # Convert to array, remove duplicates, and join back to string
+     final_query_str = " ".join(expanded_query)
+     final_query_str = remove_duplicate_words(final_query_str)
+
+     if debug:
+         logger.info(f"Expanded Query: {final_query_str}")
+         logger.info(f"NLP Backend Used: {nlp_backend if nlp or nlp_backend == 'nltk' else 'nltk (fallback)'}")
+
+     formatted_output = {
+         'input': final_query_str,
+         'enhanced_text': final_query_str, # Alias for compatibility
+         'language': language,
+         'POS': pos_tags,
+         'nlp_backend_used': nlp_backend if nlp or nlp_backend == 'nltk' else 'nltk'
+     }
+
+     # Vectorize query if requested
+     if vector:
+         vectorized_query = vectorize_query(final_query_str)
+         if vectorized_query is not None:
+             formatted_output['vector'] = vectorized_query.tolist()
+         else:
+             formatted_output['vector'] = None
+
+     return formatted_output
+
+ def preprocess_document_content(content: str, language: str = 'en', nlp_backend: str = 'nltk') -> Dict[str, Any]:
+     """
+     Preprocess document content for better searchability
+
+     Args:
+         content: Document content to process
+         language: Language code for processing
+         nlp_backend: NLP backend to use ('nltk' for fast, 'spacy' for better quality)
+
+     Returns:
+         Dict containing enhanced text and extracted keywords
+     """
+
+     # Use existing preprocessing but adapted for documents
+     processed = preprocess_query(
+         content,
+         language=language,
+         pos_to_expand=['NOUN', 'VERB'], # Less aggressive for documents
+         max_synonyms=2, # Fewer synonyms for documents
+         debug=False,
+         vector=False,
+         nlp_backend=nlp_backend
+     )
+
+     # Extract key terms for keyword search
+     try:
+         tokens = nltk.word_tokenize(processed['input'])
+         nltk_language = stopwords_language_map.get(language, 'english')
+
+         try:
+             stop_words = set(nltk.corpus.stopwords.words(nltk_language))
+         except:
+             stop_words = set(nltk.corpus.stopwords.words('english'))
+
+         keywords = [word.lower() for word in tokens if word.lower() not in stop_words and len(word) > 2]
+
+     except Exception as e:
+         logger.warning(f"Error extracting keywords: {e}")
+         keywords = []
+
+     return {
+         'enhanced_text': processed['input'],
+         'keywords': keywords[:20], # Limit to top 20 keywords
+         'language': processed.get('language', language),
+         'pos_analysis': processed.get('POS', {})
+     }
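
Usage sketch (not part of the diff above): the example below shows how the two entry points added in this hunk might be called, assuming the hunk belongs to signalwire_agents/search/query_processor.py (the only file in the list with +371 lines) and that the optional nltk and sentence-transformers dependencies are installed. The query and document strings are made up.

# Illustrative only; function names and return keys come from the code above,
# while the module path and the input strings are assumptions.
from signalwire_agents.search.query_processor import (
    preprocess_query,
    preprocess_document_content,
)

# Expand a search query with lemmas and WordNet synonyms using the default NLTK
# backend; nlp_backend='spacy' is used only when a spaCy language model is installed.
result = preprocess_query(
    "How do I configure the agent server?",
    language='auto',    # heuristic English/Spanish detection via detect_language()
    max_synonyms=3,
    vector=True,        # also embed the expanded query (needs sentence-transformers)
)
print(result['enhanced_text'])                      # expanded, de-duplicated query string
print(result['language'], result['nlp_backend_used'])

# Prepare document text for indexing: milder synonym expansion plus keyword extraction.
doc = preprocess_document_content(
    "The agent server routes SWAIG function calls to registered skills.",
    language='en',
)
print(doc['keywords'])      # up to 20 lowercased, stop-word-filtered keywords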