signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- signalwire_agents/__init__.py +130 -4
- signalwire_agents/agent_server.py +438 -32
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +18 -0
- signalwire_agents/cli/build_search.py +1367 -0
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +1225 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +809 -0
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +959 -2166
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +707 -0
- signalwire_agents/core/data_map.py +487 -0
- signalwire_agents/core/function_result.py +1150 -1
- signalwire_agents/core/logging_config.py +376 -0
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +287 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +368 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1134 -0
- signalwire_agents/core/security/session_manager.py +174 -86
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +200 -0
- signalwire_agents/core/skill_manager.py +244 -0
- signalwire_agents/core/swaig_function.py +33 -9
- signalwire_agents/core/swml_builder.py +212 -12
- signalwire_agents/core/swml_handler.py +43 -13
- signalwire_agents/core/swml_renderer.py +123 -297
- signalwire_agents/core/swml_service.py +277 -260
- signalwire_agents/prefabs/concierge.py +6 -2
- signalwire_agents/prefabs/info_gatherer.py +149 -33
- signalwire_agents/prefabs/receptionist.py +14 -22
- signalwire_agents/prefabs/survey.py +6 -2
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +137 -0
- signalwire_agents/search/document_processor.py +1223 -0
- signalwire_agents/search/index_builder.py +804 -0
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +752 -0
- signalwire_agents/search/query_processor.py +502 -0
- signalwire_agents/search/search_engine.py +1264 -0
- signalwire_agents/search/search_service.py +574 -0
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +23 -0
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/__init__.py +12 -0
- signalwire_agents/skills/datasphere/skill.py +310 -0
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +10 -0
- signalwire_agents/skills/datetime/skill.py +126 -0
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +10 -0
- signalwire_agents/skills/joke/skill.py +109 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +10 -0
- signalwire_agents/skills/math/skill.py +105 -0
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +10 -0
- signalwire_agents/skills/native_vector_search/skill.py +820 -0
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +459 -0
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +10 -0
- signalwire_agents/skills/web_search/skill.py +739 -0
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/wikipedia_search/skill.py +210 -0
- signalwire_agents/utils/__init__.py +14 -0
- signalwire_agents/utils/schema_utils.py +111 -44
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
- signalwire_agents-1.0.7.dist-info/METADATA +992 -0
- signalwire_agents-1.0.7.dist-info/RECORD +142 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
- signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents-0.1.6.data/data/schema.json +0 -5611
- signalwire_agents-0.1.6.dist-info/METADATA +0 -199
- signalwire_agents-0.1.6.dist-info/RECORD +0 -34
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,502 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+import os
+import nltk
+import re
+from typing import Dict, Any, List, Optional
+from nltk.corpus import wordnet as wn
+from nltk.stem import PorterStemmer
+import logging
+
+# Configure logging
+logger = logging.getLogger(__name__)
+
+# Global flag to track if we've already warned about spaCy
+_spacy_warning_shown = False
+
+# Language detection and spaCy model loading
+def detect_language(text: str) -> str:
+    """
+    Detect language of input text
+    Simple implementation - can be enhanced with langdetect library
+    """
+    # Simple heuristic-based detection
+    # In a full implementation, you'd use langdetect or similar
+    common_english_words = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should', 'may', 'might', 'can', 'must'}
+    common_spanish_words = {'el', 'la', 'de', 'que', 'y', 'en', 'un', 'es', 'se', 'no', 'te', 'lo', 'le', 'da', 'su', 'por', 'son', 'con', 'para', 'al', 'del', 'los', 'las', 'una', 'como', 'pero', 'sus', 'han', 'fue', 'ser', 'está', 'todo', 'más', 'muy', 'sin', 'sobre', 'también', 'me', 'hasta', 'donde', 'quien', 'desde', 'nos', 'durante', 'todos', 'uno', 'les', 'ni', 'contra', 'otros', 'ese', 'eso', 'ante', 'ellos', 'e', 'esto', 'mí', 'antes', 'algunos', 'qué', 'unos', 'yo', 'otro', 'otras', 'otra', 'él', 'tanto', 'esa', 'estos', 'mucho', 'quienes', 'nada', 'muchos', 'cual', 'poco', 'ella', 'estar', 'estas', 'algunas', 'algo', 'nosotros', 'mi', 'mis', 'tú', 'te', 'ti', 'tu', 'tus', 'ellas', 'nosotras', 'vosotros', 'vosotras', 'os', 'mío', 'mía', 'míos', 'mías', 'tuyo', 'tuya', 'tuyos', 'tuyas', 'suyo', 'suya', 'suyos', 'suyas', 'nuestro', 'nuestra', 'nuestros', 'nuestras', 'vuestro', 'vuestra', 'vuestros', 'vuestras', 'esos', 'esas'}
+
+    words = text.lower().split()
+    english_count = sum(1 for word in words if word in common_english_words)
+    spanish_count = sum(1 for word in words if word in common_spanish_words)
+
+    if spanish_count > english_count:
+        return 'es'
+    else:
+        return 'en'
+
+def load_spacy_model(language: str):
+    """
+    Load spaCy model for the given language
+    Returns None if spaCy is not available or model not found
+    """
+    global _spacy_warning_shown
+
+    try:
+        import spacy
+
+        # Language model mapping
+        model_map = {
+            'en': 'en_core_web_sm',
+            'es': 'es_core_news_sm',
+            'fr': 'fr_core_news_sm',
+            'de': 'de_core_news_sm',
+            'it': 'it_core_news_sm',
+            'pt': 'pt_core_news_sm'
+        }
+
+        model_name = model_map.get(language, 'en_core_web_sm')
+
+        try:
+            return spacy.load(model_name)
+        except OSError:
+            if not _spacy_warning_shown:
+                logger.warning(f"spaCy model '{model_name}' not found. Falling back to NLTK.")
+                _spacy_warning_shown = True
+            return None
+
+    except ImportError:
+        if not _spacy_warning_shown:
+            logger.warning("spaCy not available. Using NLTK for POS tagging.")
+            _spacy_warning_shown = True
+        return None
+
+# Model cache - stores multiple models by name
+_model_cache = {} # model_name -> SentenceTransformer instance
+_model_lock = None
+
+def set_global_model(model):
+    """Legacy function - adds model to cache instead of setting globally"""
+    if model and hasattr(model, 'model_name'):
+        _model_cache[model.model_name] = model
+        logger.info(f"Model added to cache: {model.model_name}")
+
+def _get_cached_model(model_name: str = None):
+    """Get or create cached sentence transformer model
+
+    Args:
+        model_name: Optional model name. If not provided, uses default.
+    """
+    global _model_cache, _model_lock
+
+    # Default model
+    if model_name is None:
+        model_name = 'sentence-transformers/all-mpnet-base-v2'
+
+    # Initialize lock if needed
+    if _model_lock is None:
+        import threading
+        _model_lock = threading.Lock()
+
+    # Check if model is already in cache
+    if model_name in _model_cache:
+        return _model_cache[model_name]
+
+    # Load model with lock to prevent race conditions
+    with _model_lock:
+        # Double check in case another thread loaded it
+        if model_name in _model_cache:
+            return _model_cache[model_name]
+
+        try:
+            from sentence_transformers import SentenceTransformer
+            logger.info(f"Loading sentence transformer model: {model_name}")
+            model = SentenceTransformer(model_name)
+            # Store the model name for identification
+            model.model_name = model_name
+            # Add to cache
+            _model_cache[model_name] = model
+            logger.info(f"Successfully loaded and cached model: {model_name}")
+            return model
+        except ImportError:
+            logger.error("sentence-transformers not available. Cannot load model.")
+            return None
+        except Exception as e:
+            logger.error(f"Failed to load model {model_name}: {e}")
+            return None
+
+def vectorize_query(query: str, model=None, model_name: str = None):
+    """
+    Vectorize query using sentence transformers
+    Returns numpy array of embeddings
+
+    Args:
+        query: Query string to vectorize
+        model: Optional pre-loaded model instance. If not provided, uses cached model.
+        model_name: Optional model name to use if loading a new model
+    """
+    try:
+        import numpy as np
+
+        # Use provided model or get cached one
+        if model is None:
+            model = _get_cached_model(model_name)
+            if model is None:
+                return None
+
+        embedding = model.encode(query, show_progress_bar=False)
+        return embedding
+
+    except ImportError:
+        logger.error("numpy not available. Cannot vectorize query.")
+        return None
+    except Exception as e:
+        logger.error(f"Error vectorizing query: {e}")
+        return None
+
+# Language to NLTK stopwords mapping
+stopwords_language_map = {
+    'en': 'english',
+    'es': 'spanish',
+    'fr': 'french',
+    'de': 'german',
+    'it': 'italian',
+    'pt': 'portuguese',
+    'nl': 'dutch',
+    'ru': 'russian',
+    'ar': 'arabic',
+    'da': 'danish',
+    'fi': 'finnish',
+    'hu': 'hungarian',
+    'no': 'norwegian',
+    'ro': 'romanian',
+    'sv': 'swedish',
+    'tr': 'turkish'
+}
+
+# Function to ensure NLTK resources are downloaded
+def ensure_nltk_resources():
+    """Download required NLTK resources if not already present"""
+    resources = ['punkt', 'punkt_tab', 'wordnet', 'averaged_perceptron_tagger', 'stopwords']
+    for resource in resources:
+        try:
+            # Try different paths for different resource types
+            if resource in ['punkt', 'punkt_tab']:
+                nltk.data.find(f'tokenizers/{resource}')
+            elif resource in ['wordnet']:
+                nltk.data.find(f'corpora/{resource}')
+            elif resource in ['averaged_perceptron_tagger']:
+                nltk.data.find(f'taggers/{resource}')
+            elif resource in ['stopwords']:
+                nltk.data.find(f'corpora/{resource}')
+            else:
+                nltk.data.find(f'corpora/{resource}')
+        except LookupError:
+            try:
+                logger.info(f"Downloading NLTK resource '{resource}'...")
+                nltk.download(resource, quiet=True)
+                logger.info(f"Successfully downloaded NLTK resource '{resource}'")
+            except Exception as e:
+                logger.warning(f"Failed to download NLTK resource '{resource}': {e}")
+                # Continue without this resource - some functionality may be degraded
+
+# Initialize NLTK resources
+ensure_nltk_resources()
+
+# Mapping spaCy POS tags to WordNet POS tags
+pos_mapping = {
+    'NOUN': wn.NOUN,
+    'VERB': wn.VERB,
+    'ADJ': wn.ADJ,
+    'ADV': wn.ADV,
+    'PROPN': wn.NOUN, # Proper nouns as nouns
+}
+
+def get_wordnet_pos(spacy_pos):
+    """Map spaCy POS tags to WordNet POS tags."""
+    return pos_mapping.get(spacy_pos, wn.NOUN)
+
+def get_synonyms(word: str, pos_tag: str, max_synonyms: int = 5) -> List[str]:
+    """Get synonyms for a word using WordNet"""
+    try:
+        wn_pos = get_wordnet_pos(pos_tag)
+        synsets = wn.synsets(word, pos=wn_pos)
+        synonyms = set()
+        for synset in synsets:
+            for lemma in synset.lemmas():
+                synonym = lemma.name().replace('_', ' ')
+                synonyms.add(synonym.lower())
+                if len(synonyms) >= max_synonyms:
+                    break
+            if len(synonyms) >= max_synonyms:
+                break
+        return list(synonyms)
+    except Exception as e:
+        logger.warning(f"Error getting synonyms for '{word}': {e}")
+        return []
+
+def remove_duplicate_words(input_string: str) -> str:
+    """Remove duplicate words from the input string while preserving the order and punctuation."""
+    words = re.findall(r'\b\w+\b', input_string)
+    seen = set()
+    result = []
+
+    for word in words:
+        if word.lower() not in seen:
+            seen.add(word.lower())
+            result.append(word)
+
+    words_with_punctuation = input_string.split()
+    final_result = []
+    for word in words_with_punctuation:
+        clean_word = re.sub(r'\W+', '', word)
+        if clean_word.lower() in seen:
+            final_result.append(word)
+            seen.remove(clean_word.lower())
+
+    return ' '.join(final_result)
+
+def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[List[str]] = None,
+                     max_synonyms: int = 5, debug: bool = False, vector: bool = False,
+                     vectorize_query_param: bool = False, nlp_backend: str = None,
+                     query_nlp_backend: str = 'nltk', model_name: str = None,
+                     preserve_original: bool = True) -> Dict[str, Any]:
+    """
+    Advanced query preprocessing with language detection, POS tagging, synonym expansion, and vectorization
+
+    Args:
+        query: Input query string
+        language: Language code ('en', 'es', etc.) or 'auto' for detection
+        pos_to_expand: List of POS tags to expand with synonyms
+        max_synonyms: Maximum synonyms per word
+        debug: Enable debug output
+        vector: Include vector embedding in output
+        vectorize_query_param: If True, just vectorize without other processing
+        nlp_backend: DEPRECATED - use query_nlp_backend instead
+        query_nlp_backend: NLP backend for query processing ('nltk' for fast, 'spacy' for better quality)
+
+    Returns:
+        Dict containing processed query, language, POS tags, and optionally vector
+    """
+
+    # Handle backward compatibility
+    if nlp_backend is not None:
+        query_nlp_backend = nlp_backend
+        if debug:
+            logger.info(f"Using deprecated 'nlp_backend' parameter, please use 'query_nlp_backend' instead")
+
+    if vectorize_query_param:
+        # Vectorize the query directly
+        vectorized_query = vectorize_query(query)
+        if vectorized_query is not None:
+            return {
+                'input': query,
+                'vector': vectorized_query.tolist()
+            }
+        else:
+            return {'input': query, 'vector': None}
+
+    if pos_to_expand is None:
+        pos_to_expand = ['NOUN', 'VERB', 'ADJ'] # Default to expanding synonyms for nouns, verbs, and adjectives
+
+    # Detect language if set to 'auto'
+    if language == 'auto':
+        language = detect_language(query)
+        if debug:
+            logger.info(f"Detected language: {language}")
+
+    # Load spaCy model based on the language and backend choice
+    nlp = None
+    if query_nlp_backend == 'spacy':
+        nlp = load_spacy_model(language)
+        if nlp is None and debug:
+            logger.info("spaCy backend requested but not available, falling back to NLTK")
+    elif query_nlp_backend == 'nltk':
+        if debug:
+            logger.info("Using NLTK backend for query processing")
+    else:
+        logger.warning(f"Unknown query NLP backend '{query_nlp_backend}', using NLTK")
+        query_nlp_backend = 'nltk'
+
+    # Tokenization and stop word removal
+    try:
+        tokens = nltk.word_tokenize(query)
+    except LookupError as e:
+        # If tokenization fails, try to download punkt resources
+        logger.warning(f"NLTK tokenization failed: {e}")
+        try:
+            nltk.download('punkt', quiet=True)
+            nltk.download('punkt_tab', quiet=True)
+            tokens = nltk.word_tokenize(query)
+        except Exception as fallback_error:
+            # If all else fails, use simple split as fallback
+            logger.warning(f"NLTK tokenization fallback failed: {fallback_error}. Using simple word splitting.")
+            tokens = query.split()
+
+    nltk_language = stopwords_language_map.get(language, 'english')
+
+    try:
+        stop_words = set(nltk.corpus.stopwords.words(nltk_language))
+    except LookupError:
+        try:
+            nltk.download('stopwords', quiet=True)
+            stop_words = set(nltk.corpus.stopwords.words(nltk_language))
+        except:
+            logger.warning(f"Could not load stopwords for language '{nltk_language}', using English")
+            stop_words = set(nltk.corpus.stopwords.words('english'))
+
+    tokens = [word for word in tokens if word.lower() not in stop_words]
+
+    # Lemmatization and POS Tagging using spaCy or NLTK
+    lemmatizer = nltk.WordNetLemmatizer()
+    stemmer = PorterStemmer()
+    lemmas = []
+    pos_tags = {}
+
+    if nlp and query_nlp_backend == 'spacy':
+        # Use spaCy for better POS tagging
+        doc = nlp(" ".join(tokens))
+        for token in doc:
+            lemma = token.lemma_.lower()
+            stemmed = stemmer.stem(lemma)
+            lemmas.append((token.text.lower(), stemmed))
+            pos_tags[token.text.lower()] = token.pos_
+        if debug:
+            logger.info(f"POS Tagging Results (spaCy): {pos_tags}")
+    else:
+        # Use NLTK (default or fallback)
+        try:
+            nltk_pos_tags = nltk.pos_tag(tokens)
+            for token, pos_tag in nltk_pos_tags:
+                try:
+                    lemma = lemmatizer.lemmatize(token, get_wordnet_pos(pos_tag)).lower()
+                except Exception:
+                    # Fallback if lemmatization fails
+                    lemma = token.lower()
+                stemmed = stemmer.stem(lemma)
+                lemmas.append((token.lower(), stemmed))
+                pos_tags[token.lower()] = pos_tag
+            if debug:
+                logger.info(f"POS Tagging Results (NLTK): {pos_tags}")
+        except Exception as pos_error:
+            # Fallback if POS tagging fails completely
+            logger.warning(f"NLTK POS tagging failed: {pos_error}. Using basic token processing.")
+            for token in tokens:
+                lemma = token.lower()
+                stemmed = stemmer.stem(lemma)
+                lemmas.append((token.lower(), stemmed))
+                pos_tags[token.lower()] = 'NN' # Default to noun
+            if debug:
+                logger.info(f"Using fallback token processing for: {tokens}")
+
+    # Expanding query with synonyms
+    expanded_query_set = set()
+    expanded_query = []
+
+    # If preserve_original is True, always include the original query first
+    if preserve_original:
+        # Add original query terms first (maintains exact phrases)
+        original_tokens = query.lower().split()
+        for token in original_tokens:
+            if token not in expanded_query_set:
+                expanded_query.append(token)
+                expanded_query_set.add(token)
+
+    for original, lemma in lemmas:
+        if original not in expanded_query_set:
+            expanded_query.append(original)
+            expanded_query_set.add(original)
+        if lemma not in expanded_query_set and not preserve_original: # Only add lemmas if not preserving original
+            expanded_query.append(lemma)
+            expanded_query_set.add(lemma)
+        if pos_tags.get(original) in pos_to_expand and max_synonyms > 0:
+            synonyms = get_synonyms(lemma, pos_tags[original], max_synonyms)
+            for synonym in synonyms:
+                if synonym not in expanded_query_set:
+                    expanded_query.append(synonym)
+                    expanded_query_set.add(synonym)
+
+    # Convert to array, remove duplicates, and join back to string
+    final_query_str = " ".join(expanded_query)
+    final_query_str = remove_duplicate_words(final_query_str)
+
+    if debug:
+        logger.info(f"Expanded Query: {final_query_str}")
+        logger.info(f"NLP Backend Used: {query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk (fallback)'}")
+
+    formatted_output = {
+        'input': final_query_str,
+        'enhanced_text': final_query_str, # Alias for compatibility
+        'language': language,
+        'POS': pos_tags,
+        'nlp_backend_used': query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk'
+    }
+
+    # Vectorize query if requested
+    if vector:
+        vectorized_query = vectorize_query(final_query_str, model_name=model_name)
+        if vectorized_query is not None:
+            formatted_output['vector'] = vectorized_query.tolist()
+        else:
+            formatted_output['vector'] = None
+
+    return formatted_output
+
+def preprocess_document_content(content: str, language: str = 'en', nlp_backend: str = None,
+                                index_nlp_backend: str = 'nltk') -> Dict[str, Any]:
+    """
+    Preprocess document content for better searchability
+
+    Args:
+        content: Document content to process
+        language: Language code for processing
+        nlp_backend: DEPRECATED - use index_nlp_backend instead
+        index_nlp_backend: NLP backend for document processing ('nltk' for fast, 'spacy' for better quality)
+
+    Returns:
+        Dict containing enhanced text and extracted keywords
+    """
+
+    # Handle backward compatibility
+    if nlp_backend is not None:
+        index_nlp_backend = nlp_backend
+
+    # Use existing preprocessing but adapted for documents
+    processed = preprocess_query(
+        content,
+        language=language,
+        pos_to_expand=['NOUN', 'VERB'], # Less aggressive for documents
+        max_synonyms=2, # Fewer synonyms for documents
+        debug=False,
+        vector=False,
+        query_nlp_backend=index_nlp_backend
+    )
+
+    # Extract key terms for keyword search
+    try:
+        tokens = nltk.word_tokenize(processed['input'])
+        nltk_language = stopwords_language_map.get(language, 'english')
+
+        try:
+            stop_words = set(nltk.corpus.stopwords.words(nltk_language))
+        except:
+            stop_words = set(nltk.corpus.stopwords.words('english'))
+
+        keywords = [word.lower() for word in tokens if word.lower() not in stop_words and len(word) > 2]
+
+    except Exception as e:
+        logger.warning(f"Error extracting keywords: {e}")
+        keywords = []
+
+    return {
+        'enhanced_text': processed['input'],
+        'keywords': keywords[:20], # Limit to top 20 keywords
+        'language': processed.get('language', language),
+        'pos_analysis': processed.get('POS', {})
+    }