signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +99 -15
- signalwire_agents/agent_server.py +248 -60
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +9 -0
- signalwire_agents/cli/build_search.py +951 -41
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/dokku.py +2320 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +2636 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +566 -2366
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +845 -2916
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +418 -0
- signalwire_agents/core/data_map.py +3 -15
- signalwire_agents/core/function_result.py +116 -44
- signalwire_agents/core/logging_config.py +162 -18
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +280 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +460 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1142 -0
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +84 -1
- signalwire_agents/core/skill_manager.py +62 -20
- signalwire_agents/core/swaig_function.py +18 -5
- signalwire_agents/core/swml_builder.py +207 -11
- signalwire_agents/core/swml_handler.py +27 -21
- signalwire_agents/core/swml_renderer.py +123 -312
- signalwire_agents/core/swml_service.py +171 -203
- signalwire_agents/mcp_gateway/__init__.py +29 -0
- signalwire_agents/mcp_gateway/gateway_service.py +564 -0
- signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
- signalwire_agents/mcp_gateway/session_manager.py +218 -0
- signalwire_agents/prefabs/concierge.py +0 -3
- signalwire_agents/prefabs/faq_bot.py +0 -3
- signalwire_agents/prefabs/info_gatherer.py +0 -3
- signalwire_agents/prefabs/receptionist.py +0 -3
- signalwire_agents/prefabs/survey.py +0 -3
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +490 -31
- signalwire_agents/search/index_builder.py +307 -37
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +748 -0
- signalwire_agents/search/query_processor.py +162 -31
- signalwire_agents/search/search_engine.py +916 -35
- signalwire_agents/search/search_service.py +376 -53
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +14 -2
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/skill.py +84 -3
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +9 -0
- signalwire_agents/skills/datetime/skill.py +20 -7
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +9 -0
- signalwire_agents/skills/joke/skill.py +21 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +9 -0
- signalwire_agents/skills/math/skill.py +18 -4
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +9 -0
- signalwire_agents/skills/native_vector_search/skill.py +569 -101
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +395 -40
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +9 -0
- signalwire_agents/skills/web_search/skill.py +586 -112
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
- signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
- signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents/skills/wikipedia/__init__.py +0 -9
- signalwire_agents-0.1.13.data/data/schema.json +0 -5611
- signalwire_agents-0.1.13.dist-info/RECORD +0 -67
- signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
@@ -77,22 +77,87 @@ def load_spacy_model(language: str):
         _spacy_warning_shown = True
         return None
 
-
+# Model cache - stores multiple models by name
+_model_cache = {}  # model_name -> SentenceTransformer instance
+_model_lock = None
+
+def set_global_model(model):
+    """Legacy function - adds model to cache instead of setting globally"""
+    if model and hasattr(model, 'model_name'):
+        _model_cache[model.model_name] = model
+        logger.info(f"Model added to cache: {model.model_name}")
+
+def _get_cached_model(model_name: str = None):
+    """Get or create cached sentence transformer model
+
+    Args:
+        model_name: Optional model name. If not provided, uses default.
+    """
+    global _model_cache, _model_lock
+
+    # Default model
+    if model_name is None:
+        model_name = 'sentence-transformers/all-mpnet-base-v2'
+
+    # Initialize lock if needed
+    if _model_lock is None:
+        import threading
+        _model_lock = threading.Lock()
+
+    # Check if model is already in cache
+    if model_name in _model_cache:
+        return _model_cache[model_name]
+
+    # Load model with lock to prevent race conditions
+    with _model_lock:
+        # Double check in case another thread loaded it
+        if model_name in _model_cache:
+            return _model_cache[model_name]
+
+        try:
+            from sentence_transformers import SentenceTransformer
+            logger.info(f"Loading sentence transformer model: {model_name}")
+            model = SentenceTransformer(model_name)
+            # Store the model name for identification
+            model.model_name = model_name
+            # Add to cache
+            _model_cache[model_name] = model
+            logger.info(f"Successfully loaded and cached model: {model_name}")
+            return model
+        except ImportError:
+            logger.error("sentence-transformers not available. Cannot load model.")
+            return None
+        except Exception as e:
+            logger.error(f"Failed to load model {model_name}: {e}")
+            return None
+
+def vectorize_query(query: str, model=None, model_name: str = None):
     """
     Vectorize query using sentence transformers
     Returns numpy array of embeddings
+
+    Args:
+        query: Query string to vectorize
+        model: Optional pre-loaded model instance. If not provided, uses cached model.
+        model_name: Optional model name to use if loading a new model
     """
     try:
-        from sentence_transformers import SentenceTransformer
         import numpy as np
 
-        # Use
-        model
+        # Use provided model or get cached one
+        if model is None:
+            model = _get_cached_model(model_name)
+            if model is None:
+                return None
+
         embedding = model.encode(query, show_progress_bar=False)
         return embedding
 
     except ImportError:
-        logger.error("
+        logger.error("numpy not available. Cannot vectorize query.")
+        return None
+    except Exception as e:
+        logger.error(f"Error vectorizing query: {e}")
         return None
 
 # Language to NLTK stopwords mapping
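
The hunk above replaces the old single global model with a per-name cache and lets vectorize_query accept either a pre-loaded model or a model_name. A minimal usage sketch, assuming these helpers live in signalwire_agents/search/query_processor.py (consistent with the file listing, though the hunks themselves do not name the file) and that sentence-transformers is installed; the queries and the alternate model name are illustrative:

```python
# Illustrative sketch of the cached-model path introduced above.
from signalwire_agents.search.query_processor import vectorize_query

# First call loads and caches the default model
# ('sentence-transformers/all-mpnet-base-v2' per the diff); later calls reuse it.
embedding = vectorize_query("reset my voicemail pin")

# Passing model_name loads (and caches) a different model instead of
# re-instantiating SentenceTransformer on every call.
small = vectorize_query("reset my voicemail pin",
                        model_name="sentence-transformers/all-MiniLM-L6-v2")

if embedding is not None:
    print(embedding.shape)  # e.g. (768,) for the default mpnet model
```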
@@ -118,15 +183,28 @@ stopwords_language_map = {
 # Function to ensure NLTK resources are downloaded
 def ensure_nltk_resources():
     """Download required NLTK resources if not already present"""
-    resources = ['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords']
+    resources = ['punkt', 'punkt_tab', 'wordnet', 'averaged_perceptron_tagger', 'stopwords']
     for resource in resources:
         try:
-
+            # Try different paths for different resource types
+            if resource in ['punkt', 'punkt_tab']:
+                nltk.data.find(f'tokenizers/{resource}')
+            elif resource in ['wordnet']:
+                nltk.data.find(f'corpora/{resource}')
+            elif resource in ['averaged_perceptron_tagger']:
+                nltk.data.find(f'taggers/{resource}')
+            elif resource in ['stopwords']:
+                nltk.data.find(f'corpora/{resource}')
+            else:
+                nltk.data.find(f'corpora/{resource}')
         except LookupError:
             try:
+                logger.info(f"Downloading NLTK resource '{resource}'...")
                 nltk.download(resource, quiet=True)
+                logger.info(f"Successfully downloaded NLTK resource '{resource}'")
             except Exception as e:
                 logger.warning(f"Failed to download NLTK resource '{resource}': {e}")
+                # Continue without this resource - some functionality may be degraded
 
 # Initialize NLTK resources
 ensure_nltk_resources()
@@ -186,7 +264,9 @@ def remove_duplicate_words(input_string: str) -> str:
 
 def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[List[str]] = None,
                      max_synonyms: int = 5, debug: bool = False, vector: bool = False,
-                     vectorize_query_param: bool = False, nlp_backend: str =
+                     vectorize_query_param: bool = False, nlp_backend: str = None,
+                     query_nlp_backend: str = 'nltk', model_name: str = None,
+                     preserve_original: bool = True) -> Dict[str, Any]:
     """
     Advanced query preprocessing with language detection, POS tagging, synonym expansion, and vectorization
 
@@ -198,12 +278,19 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
         debug: Enable debug output
         vector: Include vector embedding in output
         vectorize_query_param: If True, just vectorize without other processing
-        nlp_backend:
+        nlp_backend: DEPRECATED - use query_nlp_backend instead
+        query_nlp_backend: NLP backend for query processing ('nltk' for fast, 'spacy' for better quality)
 
     Returns:
         Dict containing processed query, language, POS tags, and optionally vector
     """
 
+    # Handle backward compatibility
+    if nlp_backend is not None:
+        query_nlp_backend = nlp_backend
+        if debug:
+            logger.info(f"Using deprecated 'nlp_backend' parameter, please use 'query_nlp_backend' instead")
+
     if vectorize_query_param:
         # Vectorize the query directly
         vectorized_query = vectorize_query(query)
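
These two hunks rework the preprocess_query signature: nlp_backend becomes a deprecated alias that is copied into the new query_nlp_backend, and model_name / preserve_original are added. A hedged migration sketch under the same module-path assumption as above; the query text and option values are illustrative:

```python
# Illustrative calls based on the signature shown in the diff.
from signalwire_agents.search.query_processor import preprocess_query

# Old style still works, but routes through the deprecation shim.
legacy = preprocess_query("how do I forward calls", nlp_backend="nltk")

# New style: explicit query backend, optional embedding model, and
# preserve_original to keep the exact query terms ahead of any expansion.
result = preprocess_query(
    "how do I forward calls",
    language="en",
    query_nlp_backend="nltk",   # or "spacy" for higher-quality tagging
    max_synonyms=3,
    vector=True,                # include an embedding in the output
    model_name="sentence-transformers/all-mpnet-base-v2",
    preserve_original=True,
)

print(result["enhanced_text"])     # expanded query string
print(result["nlp_backend_used"])  # backend that actually ran
```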
@@ -226,18 +313,32 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
 
     # Load spaCy model based on the language and backend choice
     nlp = None
-    if
+    if query_nlp_backend == 'spacy':
         nlp = load_spacy_model(language)
         if nlp is None and debug:
             logger.info("spaCy backend requested but not available, falling back to NLTK")
-    elif
+    elif query_nlp_backend == 'nltk':
         if debug:
-            logger.info("Using NLTK backend for
+            logger.info("Using NLTK backend for query processing")
     else:
-        logger.warning(f"Unknown NLP backend '{
+        logger.warning(f"Unknown query NLP backend '{query_nlp_backend}', using NLTK")
+        query_nlp_backend = 'nltk'
 
     # Tokenization and stop word removal
-
+    try:
+        tokens = nltk.word_tokenize(query)
+    except LookupError as e:
+        # If tokenization fails, try to download punkt resources
+        logger.warning(f"NLTK tokenization failed: {e}")
+        try:
+            nltk.download('punkt', quiet=True)
+            nltk.download('punkt_tab', quiet=True)
+            tokens = nltk.word_tokenize(query)
+        except Exception as fallback_error:
+            # If all else fails, use simple split as fallback
+            logger.warning(f"NLTK tokenization fallback failed: {fallback_error}. Using simple word splitting.")
+            tokens = query.split()
+
     nltk_language = stopwords_language_map.get(language, 'english')
 
     try:
@@ -258,7 +359,7 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
     lemmas = []
     pos_tags = {}
 
-    if nlp and
+    if nlp and query_nlp_backend == 'spacy':
         # Use spaCy for better POS tagging
         doc = nlp(" ".join(tokens))
         for token in doc:
@@ -270,27 +371,51 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
             logger.info(f"POS Tagging Results (spaCy): {pos_tags}")
     else:
         # Use NLTK (default or fallback)
-
-
-
-
-
-
-
-
+        try:
+            nltk_pos_tags = nltk.pos_tag(tokens)
+            for token, pos_tag in nltk_pos_tags:
+                try:
+                    lemma = lemmatizer.lemmatize(token, get_wordnet_pos(pos_tag)).lower()
+                except Exception:
+                    # Fallback if lemmatization fails
+                    lemma = token.lower()
+                stemmed = stemmer.stem(lemma)
+                lemmas.append((token.lower(), stemmed))
+                pos_tags[token.lower()] = pos_tag
+            if debug:
+                logger.info(f"POS Tagging Results (NLTK): {pos_tags}")
+        except Exception as pos_error:
+            # Fallback if POS tagging fails completely
+            logger.warning(f"NLTK POS tagging failed: {pos_error}. Using basic token processing.")
+            for token in tokens:
+                lemma = token.lower()
+                stemmed = stemmer.stem(lemma)
+                lemmas.append((token.lower(), stemmed))
+                pos_tags[token.lower()] = 'NN'  # Default to noun
+            if debug:
+                logger.info(f"Using fallback token processing for: {tokens}")
 
     # Expanding query with synonyms
     expanded_query_set = set()
     expanded_query = []
 
+    # If preserve_original is True, always include the original query first
+    if preserve_original:
+        # Add original query terms first (maintains exact phrases)
+        original_tokens = query.lower().split()
+        for token in original_tokens:
+            if token not in expanded_query_set:
+                expanded_query.append(token)
+                expanded_query_set.add(token)
+
     for original, lemma in lemmas:
         if original not in expanded_query_set:
             expanded_query.append(original)
             expanded_query_set.add(original)
-        if lemma not in expanded_query_set:
+        if lemma not in expanded_query_set and not preserve_original:  # Only add lemmas if not preserving original
             expanded_query.append(lemma)
             expanded_query_set.add(lemma)
-        if pos_tags.get(original) in pos_to_expand:
+        if pos_tags.get(original) in pos_to_expand and max_synonyms > 0:
             synonyms = get_synonyms(lemma, pos_tags[original], max_synonyms)
             for synonym in synonyms:
                 if synonym not in expanded_query_set:
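
The expansion loop now seeds the output with the original query tokens when preserve_original is true and only adds stems/lemmas when it is false. A standalone sketch of just that ordering and deduplication logic (simplified: it omits the POS and synonym branches, so it illustrates the behaviour rather than reproducing the library code):

```python
# Simplified illustration of the preserve_original ordering from the diff.
# Not the library code: real lemmas/stems come from NLTK or spaCy.
def expand(query, lemmas, preserve_original=True):
    seen = set()
    out = []

    if preserve_original:
        # Original query terms go first, keeping the exact wording intact.
        for tok in query.lower().split():
            if tok not in seen:
                out.append(tok)
                seen.add(tok)

    for original, stem in lemmas:
        if original not in seen:
            out.append(original)
            seen.add(original)
        # Stems are only added when the original wording is not being preserved.
        if stem not in seen and not preserve_original:
            out.append(stem)
            seen.add(stem)
    return out

lemmas = [("forwarding", "forward"), ("calls", "call")]
print(expand("forwarding calls", lemmas))                           # ['forwarding', 'calls']
print(expand("forwarding calls", lemmas, preserve_original=False))  # ['forwarding', 'forward', 'calls', 'call']
```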
@@ -303,19 +428,19 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
 
     if debug:
         logger.info(f"Expanded Query: {final_query_str}")
-        logger.info(f"NLP Backend Used: {
+        logger.info(f"NLP Backend Used: {query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk (fallback)'}")
 
     formatted_output = {
         'input': final_query_str,
         'enhanced_text': final_query_str,  # Alias for compatibility
         'language': language,
         'POS': pos_tags,
-        'nlp_backend_used':
+        'nlp_backend_used': query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk'
     }
 
     # Vectorize query if requested
     if vector:
-        vectorized_query = vectorize_query(final_query_str)
+        vectorized_query = vectorize_query(final_query_str, model_name=model_name)
         if vectorized_query is not None:
             formatted_output['vector'] = vectorized_query.tolist()
         else:
@@ -323,19 +448,25 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
 
     return formatted_output
 
-def preprocess_document_content(content: str, language: str = 'en', nlp_backend: str =
+def preprocess_document_content(content: str, language: str = 'en', nlp_backend: str = None,
+                                index_nlp_backend: str = 'nltk') -> Dict[str, Any]:
     """
     Preprocess document content for better searchability
 
     Args:
         content: Document content to process
         language: Language code for processing
-        nlp_backend:
+        nlp_backend: DEPRECATED - use index_nlp_backend instead
+        index_nlp_backend: NLP backend for document processing ('nltk' for fast, 'spacy' for better quality)
 
     Returns:
         Dict containing enhanced text and extracted keywords
     """
 
+    # Handle backward compatibility
+    if nlp_backend is not None:
+        index_nlp_backend = nlp_backend
+
     # Use existing preprocessing but adapted for documents
     processed = preprocess_query(
         content,
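
preprocess_document_content gets the same treatment: nlp_backend is kept as a deprecated shim for the new index_nlp_backend. A hedged usage sketch under the same module-path assumption; the document text is illustrative and the exact keys of the returned dict are taken only from the docstring:

```python
# Illustrative call based on the signature in the hunk above.
from signalwire_agents.search.query_processor import preprocess_document_content

chunk = "SignalWire agents can transfer calls using the swml_transfer skill."

# index_nlp_backend picks the NLP backend used while building the search index.
doc = preprocess_document_content(chunk, language="en", index_nlp_backend="nltk")

# Per the docstring, the result carries enhanced text plus extracted keywords.
print(sorted(doc.keys()))
```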
@@ -344,7 +475,7 @@ def preprocess_document_content(content: str, language: str = 'en', nlp_backend:
         max_synonyms=2,  # Fewer synonyms for documents
         debug=False,
         vector=False,
-
+        query_nlp_backend=index_nlp_backend
     )
 
     # Extract key terms for keyword search
|