signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. signalwire_agents/__init__.py +130 -4
  2. signalwire_agents/agent_server.py +438 -32
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +18 -0
  5. signalwire_agents/cli/build_search.py +1367 -0
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/execution/__init__.py +10 -0
  13. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  14. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  15. signalwire_agents/cli/init_project.py +1225 -0
  16. signalwire_agents/cli/output/__init__.py +10 -0
  17. signalwire_agents/cli/output/output_formatter.py +255 -0
  18. signalwire_agents/cli/output/swml_dump.py +186 -0
  19. signalwire_agents/cli/simulation/__init__.py +10 -0
  20. signalwire_agents/cli/simulation/data_generation.py +374 -0
  21. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  22. signalwire_agents/cli/simulation/mock_env.py +282 -0
  23. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  24. signalwire_agents/cli/test_swaig.py +809 -0
  25. signalwire_agents/cli/types.py +81 -0
  26. signalwire_agents/core/__init__.py +2 -2
  27. signalwire_agents/core/agent/__init__.py +12 -0
  28. signalwire_agents/core/agent/config/__init__.py +12 -0
  29. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  30. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  31. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  32. signalwire_agents/core/agent/prompt/manager.py +306 -0
  33. signalwire_agents/core/agent/routing/__init__.py +9 -0
  34. signalwire_agents/core/agent/security/__init__.py +9 -0
  35. signalwire_agents/core/agent/swml/__init__.py +9 -0
  36. signalwire_agents/core/agent/tools/__init__.py +15 -0
  37. signalwire_agents/core/agent/tools/decorator.py +97 -0
  38. signalwire_agents/core/agent/tools/registry.py +210 -0
  39. signalwire_agents/core/agent_base.py +959 -2166
  40. signalwire_agents/core/auth_handler.py +233 -0
  41. signalwire_agents/core/config_loader.py +259 -0
  42. signalwire_agents/core/contexts.py +707 -0
  43. signalwire_agents/core/data_map.py +487 -0
  44. signalwire_agents/core/function_result.py +1150 -1
  45. signalwire_agents/core/logging_config.py +376 -0
  46. signalwire_agents/core/mixins/__init__.py +28 -0
  47. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  48. signalwire_agents/core/mixins/auth_mixin.py +287 -0
  49. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  50. signalwire_agents/core/mixins/serverless_mixin.py +368 -0
  51. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  52. signalwire_agents/core/mixins/state_mixin.py +153 -0
  53. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  54. signalwire_agents/core/mixins/web_mixin.py +1134 -0
  55. signalwire_agents/core/security/session_manager.py +174 -86
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +200 -0
  58. signalwire_agents/core/skill_manager.py +244 -0
  59. signalwire_agents/core/swaig_function.py +33 -9
  60. signalwire_agents/core/swml_builder.py +212 -12
  61. signalwire_agents/core/swml_handler.py +43 -13
  62. signalwire_agents/core/swml_renderer.py +123 -297
  63. signalwire_agents/core/swml_service.py +277 -260
  64. signalwire_agents/prefabs/concierge.py +6 -2
  65. signalwire_agents/prefabs/info_gatherer.py +149 -33
  66. signalwire_agents/prefabs/receptionist.py +14 -22
  67. signalwire_agents/prefabs/survey.py +6 -2
  68. signalwire_agents/schema.json +9218 -5489
  69. signalwire_agents/search/__init__.py +137 -0
  70. signalwire_agents/search/document_processor.py +1223 -0
  71. signalwire_agents/search/index_builder.py +804 -0
  72. signalwire_agents/search/migration.py +418 -0
  73. signalwire_agents/search/models.py +30 -0
  74. signalwire_agents/search/pgvector_backend.py +752 -0
  75. signalwire_agents/search/query_processor.py +502 -0
  76. signalwire_agents/search/search_engine.py +1264 -0
  77. signalwire_agents/search/search_service.py +574 -0
  78. signalwire_agents/skills/README.md +452 -0
  79. signalwire_agents/skills/__init__.py +23 -0
  80. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  81. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  82. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  83. signalwire_agents/skills/datasphere/README.md +210 -0
  84. signalwire_agents/skills/datasphere/__init__.py +12 -0
  85. signalwire_agents/skills/datasphere/skill.py +310 -0
  86. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  87. signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
  88. signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
  89. signalwire_agents/skills/datetime/README.md +132 -0
  90. signalwire_agents/skills/datetime/__init__.py +10 -0
  91. signalwire_agents/skills/datetime/skill.py +126 -0
  92. signalwire_agents/skills/joke/README.md +149 -0
  93. signalwire_agents/skills/joke/__init__.py +10 -0
  94. signalwire_agents/skills/joke/skill.py +109 -0
  95. signalwire_agents/skills/math/README.md +161 -0
  96. signalwire_agents/skills/math/__init__.py +10 -0
  97. signalwire_agents/skills/math/skill.py +105 -0
  98. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  99. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  100. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  101. signalwire_agents/skills/native_vector_search/README.md +210 -0
  102. signalwire_agents/skills/native_vector_search/__init__.py +10 -0
  103. signalwire_agents/skills/native_vector_search/skill.py +820 -0
  104. signalwire_agents/skills/play_background_file/README.md +218 -0
  105. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  106. signalwire_agents/skills/play_background_file/skill.py +242 -0
  107. signalwire_agents/skills/registry.py +459 -0
  108. signalwire_agents/skills/spider/README.md +236 -0
  109. signalwire_agents/skills/spider/__init__.py +13 -0
  110. signalwire_agents/skills/spider/skill.py +598 -0
  111. signalwire_agents/skills/swml_transfer/README.md +395 -0
  112. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  113. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  114. signalwire_agents/skills/weather_api/README.md +178 -0
  115. signalwire_agents/skills/weather_api/__init__.py +12 -0
  116. signalwire_agents/skills/weather_api/skill.py +191 -0
  117. signalwire_agents/skills/web_search/README.md +163 -0
  118. signalwire_agents/skills/web_search/__init__.py +10 -0
  119. signalwire_agents/skills/web_search/skill.py +739 -0
  120. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  121. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  122. signalwire_agents/skills/wikipedia_search/skill.py +210 -0
  123. signalwire_agents/utils/__init__.py +14 -0
  124. signalwire_agents/utils/schema_utils.py +111 -44
  125. signalwire_agents/web/__init__.py +17 -0
  126. signalwire_agents/web/web_service.py +559 -0
  127. signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
  128. signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
  129. signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
  130. signalwire_agents-1.0.7.dist-info/METADATA +992 -0
  131. signalwire_agents-1.0.7.dist-info/RECORD +142 -0
  132. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
  133. signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
  134. signalwire_agents/core/state/file_state_manager.py +0 -219
  135. signalwire_agents/core/state/state_manager.py +0 -101
  136. signalwire_agents-0.1.6.data/data/schema.json +0 -5611
  137. signalwire_agents-0.1.6.dist-info/METADATA +0 -199
  138. signalwire_agents-0.1.6.dist-info/RECORD +0 -34
  139. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
  140. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0
signalwire_agents/search/query_processor.py
@@ -0,0 +1,502 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+import os
+import nltk
+import re
+from typing import Dict, Any, List, Optional
+from nltk.corpus import wordnet as wn
+from nltk.stem import PorterStemmer
+import logging
+
+# Configure logging
+logger = logging.getLogger(__name__)
+
+# Global flag to track if we've already warned about spaCy
+_spacy_warning_shown = False
+
+# Language detection and spaCy model loading
+def detect_language(text: str) -> str:
+    """
+    Detect language of input text
+    Simple implementation - can be enhanced with langdetect library
+    """
+    # Simple heuristic-based detection
+    # In a full implementation, you'd use langdetect or similar
+    common_english_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'must'}
+    common_spanish_words = {'el', 'la', 'de', 'que', 'y', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al', 'del', 'los', 'las', 'una', 'como', 'pero', 'sus', 'han', 'fue', 'ser', 'está', 'todo', 'más', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'donde', 'quien', 'desde', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mío', 'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas'}
+
+    words = text.lower().split()
+    english_count = sum(1 for word in words if word in common_english_words)
+    spanish_count = sum(1 for word in words if word in common_spanish_words)
+
+    if spanish_count > english_count:
+        return 'es'
+    else:
+        return 'en'
+
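A quick usage sketch for the heuristic above (illustrative only; it assumes this hunk is signalwire_agents/search/query_processor.py and that the package is installed):

    from signalwire_agents.search.query_processor import detect_language

    # Spanish function words outnumber English ones here, so the heuristic returns 'es'
    print(detect_language("el gato está en la casa"))   # 'es'
    # English function words dominate, so it returns 'en'
    print(detect_language("the cat is on the mat"))     # 'en'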
+def load_spacy_model(language: str):
+    """
+    Load spaCy model for the given language
+    Returns None if spaCy is not available or model not found
+    """
+    global _spacy_warning_shown
+
+    try:
+        import spacy
+
+        # Language model mapping
+        model_map = {
+            'en': 'en_core_web_sm',
+            'es': 'es_core_news_sm',
+            'fr': 'fr_core_news_sm',
+            'de': 'de_core_news_sm',
+            'it': 'it_core_news_sm',
+            'pt': 'pt_core_news_sm'
+        }
+
+        model_name = model_map.get(language, 'en_core_web_sm')
+
+        try:
+            return spacy.load(model_name)
+        except OSError:
+            if not _spacy_warning_shown:
+                logger.warning(f"spaCy model '{model_name}' not found. Falling back to NLTK.")
+                _spacy_warning_shown = True
+            return None
+
+    except ImportError:
+        if not _spacy_warning_shown:
+            logger.warning("spaCy not available. Using NLTK for POS tagging.")
+            _spacy_warning_shown = True
+        return None
+
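A small sketch of the fallback behavior above (assumes spaCy is optionally installed; en_core_web_sm must be downloaded separately, e.g. with `python -m spacy download en_core_web_sm`):

    from signalwire_agents.search.query_processor import load_spacy_model

    nlp = load_spacy_model("en")
    if nlp is None:
        # spaCy or its model is missing; callers fall back to NLTK tagging
        print("falling back to NLTK")
    else:
        print([(t.text, t.pos_) for t in nlp("transfer the call")])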
+# Model cache - stores multiple models by name
+_model_cache = {} # model_name -> SentenceTransformer instance
+_model_lock = None
+
+def set_global_model(model):
+    """Legacy function - adds model to cache instead of setting globally"""
+    if model and hasattr(model, 'model_name'):
+        _model_cache[model.model_name] = model
+        logger.info(f"Model added to cache: {model.model_name}")
+
+def _get_cached_model(model_name: str = None):
+    """Get or create cached sentence transformer model
+
+    Args:
+        model_name: Optional model name. If not provided, uses default.
+    """
+    global _model_cache, _model_lock
+
+    # Default model
+    if model_name is None:
+        model_name = 'sentence-transformers/all-mpnet-base-v2'
+
+    # Initialize lock if needed
+    if _model_lock is None:
+        import threading
+        _model_lock = threading.Lock()
+
+    # Check if model is already in cache
+    if model_name in _model_cache:
+        return _model_cache[model_name]
+
+    # Load model with lock to prevent race conditions
+    with _model_lock:
+        # Double check in case another thread loaded it
+        if model_name in _model_cache:
+            return _model_cache[model_name]
+
+        try:
+            from sentence_transformers import SentenceTransformer
+            logger.info(f"Loading sentence transformer model: {model_name}")
+            model = SentenceTransformer(model_name)
+            # Store the model name for identification
+            model.model_name = model_name
+            # Add to cache
+            _model_cache[model_name] = model
+            logger.info(f"Successfully loaded and cached model: {model_name}")
+            return model
+        except ImportError:
+            logger.error("sentence-transformers not available. Cannot load model.")
+            return None
+        except Exception as e:
+            logger.error(f"Failed to load model {model_name}: {e}")
+            return None
+
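The cache above can also be primed explicitly; a minimal sketch assuming sentence-transformers is installed (set_global_model stores any object that exposes a model_name attribute):

    from sentence_transformers import SentenceTransformer
    from signalwire_agents.search.query_processor import set_global_model

    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    model.model_name = "sentence-transformers/all-mpnet-base-v2"  # attribute checked by set_global_model
    set_global_model(model)  # later _get_cached_model() calls reuse this instance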
+def vectorize_query(query: str, model=None, model_name: str = None):
+    """
+    Vectorize query using sentence transformers
+    Returns numpy array of embeddings
+
+    Args:
+        query: Query string to vectorize
+        model: Optional pre-loaded model instance. If not provided, uses cached model.
+        model_name: Optional model name to use if loading a new model
+    """
+    try:
+        import numpy as np
+
+        # Use provided model or get cached one
+        if model is None:
+            model = _get_cached_model(model_name)
+            if model is None:
+                return None
+
+        embedding = model.encode(query, show_progress_bar=False)
+        return embedding
+
+    except ImportError:
+        logger.error("numpy not available. Cannot vectorize query.")
+        return None
+    except Exception as e:
+        logger.error(f"Error vectorizing query: {e}")
+        return None
+
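A minimal usage sketch for the cached-model path above (assumes sentence-transformers is installed; the MiniLM name is only an example of an alternative model):

    from signalwire_agents.search.query_processor import vectorize_query

    # First call loads and caches 'sentence-transformers/all-mpnet-base-v2';
    # subsequent calls reuse the cached instance instead of reloading it.
    embedding = vectorize_query("reset my voicemail PIN")
    if embedding is not None:
        print(embedding.shape)  # e.g. (768,) for all-mpnet-base-v2

    # A different model can be requested by name; it gets its own cache slot.
    small = vectorize_query("reset my voicemail PIN",
                            model_name="sentence-transformers/all-MiniLM-L6-v2")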
+# Language to NLTK stopwords mapping
+stopwords_language_map = {
+    'en': 'english',
+    'es': 'spanish',
+    'fr': 'french',
+    'de': 'german',
+    'it': 'italian',
+    'pt': 'portuguese',
+    'nl': 'dutch',
+    'ru': 'russian',
+    'ar': 'arabic',
+    'da': 'danish',
+    'fi': 'finnish',
+    'hu': 'hungarian',
+    'no': 'norwegian',
+    'ro': 'romanian',
+    'sv': 'swedish',
+    'tr': 'turkish'
+}
+
+# Function to ensure NLTK resources are downloaded
+def ensure_nltk_resources():
+    """Download required NLTK resources if not already present"""
+    resources = ['punkt', 'punkt_tab', 'wordnet', 'averaged_perceptron_tagger', 'stopwords']
+    for resource in resources:
+        try:
+            # Try different paths for different resource types
+            if resource in ['punkt', 'punkt_tab']:
+                nltk.data.find(f'tokenizers/{resource}')
+            elif resource in ['wordnet']:
+                nltk.data.find(f'corpora/{resource}')
+            elif resource in ['averaged_perceptron_tagger']:
+                nltk.data.find(f'taggers/{resource}')
+            elif resource in ['stopwords']:
+                nltk.data.find(f'corpora/{resource}')
+            else:
+                nltk.data.find(f'corpora/{resource}')
+        except LookupError:
+            try:
+                logger.info(f"Downloading NLTK resource '{resource}'...")
+                nltk.download(resource, quiet=True)
+                logger.info(f"Successfully downloaded NLTK resource '{resource}'")
+            except Exception as e:
+                logger.warning(f"Failed to download NLTK resource '{resource}': {e}")
+                # Continue without this resource - some functionality may be degraded
+
+# Initialize NLTK resources
+ensure_nltk_resources()
+
+# Mapping spaCy POS tags to WordNet POS tags
+pos_mapping = {
+    'NOUN': wn.NOUN,
+    'VERB': wn.VERB,
+    'ADJ': wn.ADJ,
+    'ADV': wn.ADV,
+    'PROPN': wn.NOUN, # Proper nouns as nouns
+}
+
+def get_wordnet_pos(spacy_pos):
+    """Map spaCy POS tags to WordNet POS tags."""
+    return pos_mapping.get(spacy_pos, wn.NOUN)
+
+def get_synonyms(word: str, pos_tag: str, max_synonyms: int = 5) -> List[str]:
+    """Get synonyms for a word using WordNet"""
+    try:
+        wn_pos = get_wordnet_pos(pos_tag)
+        synsets = wn.synsets(word, pos=wn_pos)
+        synonyms = set()
+        for synset in synsets:
+            for lemma in synset.lemmas():
+                synonym = lemma.name().replace('_', ' ')
+                synonyms.add(synonym.lower())
+                if len(synonyms) >= max_synonyms:
+                    break
+            if len(synonyms) >= max_synonyms:
+                break
+        return list(synonyms)
+    except Exception as e:
+        logger.warning(f"Error getting synonyms for '{word}': {e}")
+        return []
+
+def remove_duplicate_words(input_string: str) -> str:
+    """Remove duplicate words from the input string while preserving the order and punctuation."""
+    words = re.findall(r'\b\w+\b', input_string)
+    seen = set()
+    result = []
+
+    for word in words:
+        if word.lower() not in seen:
+            seen.add(word.lower())
+            result.append(word)
+
+    words_with_punctuation = input_string.split()
+    final_result = []
+    for word in words_with_punctuation:
+        clean_word = re.sub(r'\W+', '', word)
+        if clean_word.lower() in seen:
+            final_result.append(word)
+            seen.remove(clean_word.lower())
+
+    return ' '.join(final_result)
+
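A short illustration of the two helpers above (synonym output depends on the installed WordNet data; the duplicate-removal result was traced by hand against the code, so treat it as a sketch):

    from signalwire_agents.search.query_processor import get_synonyms, remove_duplicate_words

    # WordNet-based synonyms for a noun; contents vary with the installed corpus
    print(get_synonyms("car", "NOUN", max_synonyms=3))

    # Repeated words are dropped after their first appearance; punctuation on kept words survives
    print(remove_duplicate_words("the cat sat on the mat, the cat"))
    # -> "the cat sat on mat,"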
+def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[List[str]] = None,
+                     max_synonyms: int = 5, debug: bool = False, vector: bool = False,
+                     vectorize_query_param: bool = False, nlp_backend: str = None,
+                     query_nlp_backend: str = 'nltk', model_name: str = None,
+                     preserve_original: bool = True) -> Dict[str, Any]:
+    """
+    Advanced query preprocessing with language detection, POS tagging, synonym expansion, and vectorization
+
+    Args:
+        query: Input query string
+        language: Language code ('en', 'es', etc.) or 'auto' for detection
+        pos_to_expand: List of POS tags to expand with synonyms
+        max_synonyms: Maximum synonyms per word
+        debug: Enable debug output
+        vector: Include vector embedding in output
+        vectorize_query_param: If True, just vectorize without other processing
+        nlp_backend: DEPRECATED - use query_nlp_backend instead
+        query_nlp_backend: NLP backend for query processing ('nltk' for fast, 'spacy' for better quality)
+
+    Returns:
+        Dict containing processed query, language, POS tags, and optionally vector
+    """
+
+    # Handle backward compatibility
+    if nlp_backend is not None:
+        query_nlp_backend = nlp_backend
+        if debug:
+            logger.info(f"Using deprecated 'nlp_backend' parameter, please use 'query_nlp_backend' instead")
+
+    if vectorize_query_param:
+        # Vectorize the query directly
+        vectorized_query = vectorize_query(query)
+        if vectorized_query is not None:
+            return {
+                'input': query,
+                'vector': vectorized_query.tolist()
+            }
+        else:
+            return {'input': query, 'vector': None}
+
+    if pos_to_expand is None:
+        pos_to_expand = ['NOUN', 'VERB', 'ADJ'] # Default to expanding synonyms for nouns, verbs, and adjectives
+
+    # Detect language if set to 'auto'
+    if language == 'auto':
+        language = detect_language(query)
+        if debug:
+            logger.info(f"Detected language: {language}")
+
+    # Load spaCy model based on the language and backend choice
+    nlp = None
+    if query_nlp_backend == 'spacy':
+        nlp = load_spacy_model(language)
+        if nlp is None and debug:
+            logger.info("spaCy backend requested but not available, falling back to NLTK")
+    elif query_nlp_backend == 'nltk':
+        if debug:
+            logger.info("Using NLTK backend for query processing")
+    else:
+        logger.warning(f"Unknown query NLP backend '{query_nlp_backend}', using NLTK")
+        query_nlp_backend = 'nltk'
+
+    # Tokenization and stop word removal
+    try:
+        tokens = nltk.word_tokenize(query)
+    except LookupError as e:
+        # If tokenization fails, try to download punkt resources
+        logger.warning(f"NLTK tokenization failed: {e}")
+        try:
+            nltk.download('punkt', quiet=True)
+            nltk.download('punkt_tab', quiet=True)
+            tokens = nltk.word_tokenize(query)
+        except Exception as fallback_error:
+            # If all else fails, use simple split as fallback
+            logger.warning(f"NLTK tokenization fallback failed: {fallback_error}. Using simple word splitting.")
+            tokens = query.split()
+
+    nltk_language = stopwords_language_map.get(language, 'english')
+
+    try:
+        stop_words = set(nltk.corpus.stopwords.words(nltk_language))
+    except LookupError:
+        try:
+            nltk.download('stopwords', quiet=True)
+            stop_words = set(nltk.corpus.stopwords.words(nltk_language))
+        except:
+            logger.warning(f"Could not load stopwords for language '{nltk_language}', using English")
+            stop_words = set(nltk.corpus.stopwords.words('english'))
+
+    tokens = [word for word in tokens if word.lower() not in stop_words]
+
+    # Lemmatization and POS Tagging using spaCy or NLTK
+    lemmatizer = nltk.WordNetLemmatizer()
+    stemmer = PorterStemmer()
+    lemmas = []
+    pos_tags = {}
+
+    if nlp and query_nlp_backend == 'spacy':
+        # Use spaCy for better POS tagging
+        doc = nlp(" ".join(tokens))
+        for token in doc:
+            lemma = token.lemma_.lower()
+            stemmed = stemmer.stem(lemma)
+            lemmas.append((token.text.lower(), stemmed))
+            pos_tags[token.text.lower()] = token.pos_
+        if debug:
+            logger.info(f"POS Tagging Results (spaCy): {pos_tags}")
+    else:
+        # Use NLTK (default or fallback)
+        try:
+            nltk_pos_tags = nltk.pos_tag(tokens)
+            for token, pos_tag in nltk_pos_tags:
+                try:
+                    lemma = lemmatizer.lemmatize(token, get_wordnet_pos(pos_tag)).lower()
+                except Exception:
+                    # Fallback if lemmatization fails
+                    lemma = token.lower()
+                stemmed = stemmer.stem(lemma)
+                lemmas.append((token.lower(), stemmed))
+                pos_tags[token.lower()] = pos_tag
+            if debug:
+                logger.info(f"POS Tagging Results (NLTK): {pos_tags}")
+        except Exception as pos_error:
+            # Fallback if POS tagging fails completely
+            logger.warning(f"NLTK POS tagging failed: {pos_error}. Using basic token processing.")
+            for token in tokens:
+                lemma = token.lower()
+                stemmed = stemmer.stem(lemma)
+                lemmas.append((token.lower(), stemmed))
+                pos_tags[token.lower()] = 'NN' # Default to noun
+            if debug:
+                logger.info(f"Using fallback token processing for: {tokens}")
+
+    # Expanding query with synonyms
+    expanded_query_set = set()
+    expanded_query = []
+
+    # If preserve_original is True, always include the original query first
+    if preserve_original:
+        # Add original query terms first (maintains exact phrases)
+        original_tokens = query.lower().split()
+        for token in original_tokens:
+            if token not in expanded_query_set:
+                expanded_query.append(token)
+                expanded_query_set.add(token)
+
+    for original, lemma in lemmas:
+        if original not in expanded_query_set:
+            expanded_query.append(original)
+            expanded_query_set.add(original)
+        if lemma not in expanded_query_set and not preserve_original: # Only add lemmas if not preserving original
+            expanded_query.append(lemma)
+            expanded_query_set.add(lemma)
+        if pos_tags.get(original) in pos_to_expand and max_synonyms > 0:
+            synonyms = get_synonyms(lemma, pos_tags[original], max_synonyms)
+            for synonym in synonyms:
+                if synonym not in expanded_query_set:
+                    expanded_query.append(synonym)
+                    expanded_query_set.add(synonym)
+
+    # Convert to array, remove duplicates, and join back to string
+    final_query_str = " ".join(expanded_query)
+    final_query_str = remove_duplicate_words(final_query_str)
+
+    if debug:
+        logger.info(f"Expanded Query: {final_query_str}")
+        logger.info(f"NLP Backend Used: {query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk (fallback)'}")
+
+    formatted_output = {
+        'input': final_query_str,
+        'enhanced_text': final_query_str, # Alias for compatibility
+        'language': language,
+        'POS': pos_tags,
+        'nlp_backend_used': query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk'
+    }
+
+    # Vectorize query if requested
+    if vector:
+        vectorized_query = vectorize_query(final_query_str, model_name=model_name)
+        if vectorized_query is not None:
+            formatted_output['vector'] = vectorized_query.tolist()
+        else:
+            formatted_output['vector'] = None
+
+    return formatted_output
+
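A hedged end-to-end sketch of preprocess_query as defined above (key names come from formatted_output; the exact expansion depends on which NLTK resources are available):

    from signalwire_agents.search.query_processor import preprocess_query

    result = preprocess_query(
        "How do I transfer an incoming call?",
        language="auto",          # heuristic detection picks 'en' or 'es'
        max_synonyms=3,
        query_nlp_backend="nltk",
        vector=False,             # set True to also return a sentence-transformer embedding
    )

    print(result["language"])          # detected or passed-through language code
    print(result["input"])             # original terms plus synonym expansions
    print(result["nlp_backend_used"])  # 'nltk' here; 'spacy' only if that backend loaded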
+def preprocess_document_content(content: str, language: str = 'en', nlp_backend: str = None,
+                                index_nlp_backend: str = 'nltk') -> Dict[str, Any]:
+    """
+    Preprocess document content for better searchability
+
+    Args:
+        content: Document content to process
+        language: Language code for processing
+        nlp_backend: DEPRECATED - use index_nlp_backend instead
+        index_nlp_backend: NLP backend for document processing ('nltk' for fast, 'spacy' for better quality)
+
+    Returns:
+        Dict containing enhanced text and extracted keywords
+    """
+
+    # Handle backward compatibility
+    if nlp_backend is not None:
+        index_nlp_backend = nlp_backend
+
+    # Use existing preprocessing but adapted for documents
+    processed = preprocess_query(
+        content,
+        language=language,
+        pos_to_expand=['NOUN', 'VERB'], # Less aggressive for documents
+        max_synonyms=2, # Fewer synonyms for documents
+        debug=False,
+        vector=False,
+        query_nlp_backend=index_nlp_backend
+    )
+
+    # Extract key terms for keyword search
+    try:
+        tokens = nltk.word_tokenize(processed['input'])
+        nltk_language = stopwords_language_map.get(language, 'english')
+
+        try:
+            stop_words = set(nltk.corpus.stopwords.words(nltk_language))
+        except:
+            stop_words = set(nltk.corpus.stopwords.words('english'))
+
+        keywords = [word.lower() for word in tokens if word.lower() not in stop_words and len(word) > 2]
+
+    except Exception as e:
+        logger.warning(f"Error extracting keywords: {e}")
+        keywords = []
+
+    return {
+        'enhanced_text': processed['input'],
+        'keywords': keywords[:20], # Limit to top 20 keywords
+        'language': processed.get('language', language),
+        'pos_analysis': processed.get('POS', {})
+    }
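A matching sketch for the document-side helper (same import-path assumption; the keyword list is capped at 20 entries by the code above):

    from signalwire_agents.search.query_processor import preprocess_document_content

    doc = "SignalWire agents route calls, transcribe audio, and execute SWAIG functions."
    processed = preprocess_document_content(doc, language="en", index_nlp_backend="nltk")

    print(processed["enhanced_text"])  # content lightly expanded with synonyms
    print(processed["keywords"][:5])   # lowercased, stopword-filtered terms longer than 2 chars
    print(processed["language"])       # 'en'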