signalwire-agents 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff compares publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- signalwire_agents/__init__.py +28 -11
- signalwire_agents/cli/build_search.py +174 -14
- signalwire_agents/cli/test_swaig.py +159 -114
- signalwire_agents/core/agent_base.py +7 -36
- signalwire_agents/core/logging_config.py +143 -14
- signalwire_agents/core/skill_manager.py +2 -2
- signalwire_agents/core/swml_service.py +5 -45
- signalwire_agents/search/document_processor.py +275 -14
- signalwire_agents/search/index_builder.py +45 -10
- signalwire_agents/search/query_processor.py +27 -12
- signalwire_agents/skills/__init__.py +1 -1
- signalwire_agents/skills/native_vector_search/skill.py +24 -6
- signalwire_agents/skills/registry.py +58 -42
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.14.dist-info}/METADATA +1 -1
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.14.dist-info}/RECORD +20 -20
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.14.dist-info}/entry_points.txt +1 -1
- {signalwire_agents-0.1.13.data → signalwire_agents-0.1.14.data}/data/schema.json +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.14.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.14.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.14.dist-info}/top_level.txt +0 -0
signalwire_agents/core/swml_service.py

@@ -24,51 +24,11 @@ import types
 from typing import Dict, List, Any, Optional, Union, Callable, Tuple, Type
 from urllib.parse import urlparse

-# Import
-
-
-
-
-if not hasattr(structlog, "_configured") or not structlog._configured:
-    structlog.configure(
-        processors=[
-            structlog.stdlib.filter_by_level,
-            structlog.stdlib.add_logger_name,
-            structlog.stdlib.add_log_level,
-            structlog.stdlib.PositionalArgumentsFormatter(),
-            structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S"),
-            structlog.processors.StackInfoRenderer(),
-            structlog.processors.format_exc_info,
-            structlog.processors.UnicodeDecoder(),
-            structlog.dev.ConsoleRenderer()
-        ],
-        context_class=dict,
-        logger_factory=structlog.stdlib.LoggerFactory(),
-        wrapper_class=structlog.stdlib.BoundLogger,
-        cache_logger_on_first_use=True,
-    )
-
-    # Set up root logger with structlog
-    logging.basicConfig(
-        format="%(message)s",
-        stream=sys.stdout,
-        level=logging.INFO,
-    )
-
-    # Mark as configured to avoid duplicate configuration
-    structlog._configured = True
-
-    # Create the module logger
-    logger = structlog.get_logger("swml_service")
-
-except ImportError:
-    # Fallback to standard logging if structlog is not available
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-        stream=sys.stdout
-    )
-    logger = logging.getLogger("swml_service")
+# Import centralized logging system
+from signalwire_agents.core.logging_config import get_logger
+
+# Create the module logger using centralized system
+logger = get_logger("swml_service")

 try:
     import fastapi
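This hunk drops the per-module structlog setup in favor of the shared logger from signalwire_agents/core/logging_config.py, which avoids the duplicated configure/fallback block each module previously carried. A minimal sketch of what a module does after this change, based only on the two calls visible in the added lines (any extra configuration options offered by logging_config are not shown in this diff and would be assumptions):

    from signalwire_agents.core.logging_config import get_logger

    # One call per module; configuration is handled centrally in logging_config.
    logger = get_logger("my_module")
    logger.info("module initialized")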
signalwire_agents/search/document_processor.py

@@ -74,29 +74,42 @@ logger = logging.getLogger(__name__)
 class DocumentProcessor:
     """Enhanced document processor with smart chunking capabilities"""

-    def __init__(
-
-
-
-
+    def __init__(
+        self,
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3
+    ):
         """
-        Initialize document processor
+        Initialize document processor

         Args:
-            chunking_strategy: 'sentence', 'sliding', 'paragraph',
-            max_sentences_per_chunk: For sentence strategy (default:
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
             chunk_size: For sliding strategy - words per chunk (default: 50)
-
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
             split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
         """
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
-        self.
+        self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold

         # Legacy support for old character-based chunking
-        self.chunk_overlap =
+        self.chunk_overlap = chunk_overlap

     def create_chunks(self, content: str, filename: str,
                       file_type: str) -> List[Dict[str, Any]]:

@@ -121,6 +134,12 @@ class DocumentProcessor:
             return self._chunk_by_paragraphs(content, filename, file_type)
         elif self.chunking_strategy == 'page':
             return self._chunk_by_pages(content, filename, file_type)
+        elif self.chunking_strategy == 'semantic':
+            return self._chunk_by_semantic(content, filename, file_type)
+        elif self.chunking_strategy == 'topic':
+            return self._chunk_by_topics(content, filename, file_type)
+        elif self.chunking_strategy == 'qa':
+            return self._chunk_by_qa_optimization(content, filename, file_type)
         else:
             # Fallback to sentence-based chunking
             return self._chunk_by_sentences(content, filename, file_type)
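DocumentProcessor.__init__ now accepts the overlap, NLP backend, verbosity, and threshold options directly, and create_chunks dispatches on the three new strategies. A short sketch of constructing the processor with the new signature (the sample text and file names are illustrative only):

    from signalwire_agents.search.document_processor import DocumentProcessor

    processor = DocumentProcessor(
        chunking_strategy='semantic',   # or 'sentence', 'sliding', 'paragraph', 'page', 'topic', 'qa'
        chunk_overlap=10,
        semantic_threshold=0.5,
        index_nlp_backend='nltk',
        verbose=True,
    )

    text = "SWML describes call flows. Agents expose SWAIG functions. Pricing is documented separately."
    chunks = processor.create_chunks(content=text, filename="notes.txt", file_type="txt")
    print(len(chunks))

If sentence-transformers and scikit-learn are not installed, the semantic strategy falls back to sentence chunking (see the except ImportError branch added later in this diff).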
signalwire_agents/search/document_processor.py

@@ -674,7 +693,7 @@ class DocumentProcessor:
         chunk_index = 0

         # Create overlapping chunks
-        for i in range(0, len(words), self.chunk_size - self.
+        for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
             chunk_words = words[i:i + self.chunk_size]
             if chunk_words:
                 chunk_content = ' '.join(chunk_words)

@@ -686,7 +705,7 @@ class DocumentProcessor:
                         'chunk_method': 'sliding_window',
                         'chunk_index': chunk_index,
                         'chunk_size_words': self.chunk_size,
-                        'overlap_size_words': self.
+                        'overlap_size_words': self.chunk_overlap,
                         'start_word': i,
                         'end_word': i + len(chunk_words)
                     }
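Both one-line fixes restore the truncated references to self.chunk_overlap, so the sliding window advances by chunk_size - chunk_overlap words and records the overlap in the chunk metadata. A standalone illustration of the resulting stride (not library code; the word list is made up):

    words = [f"w{i}" for i in range(120)]
    chunk_size, chunk_overlap = 50, 10

    # step = 50 - 10 = 40, so consecutive chunks share 10 words
    for start in range(0, len(words), chunk_size - chunk_overlap):
        window = words[start:start + chunk_size]
        print(start, start + len(window))   # -> 0 50, 40 90, 80 120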
signalwire_agents/search/document_processor.py

@@ -761,4 +780,246 @@ class DocumentProcessor:
             }
         ))

-        return chunks
+        return chunks
+
+    def _chunk_by_semantic(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on semantic similarity between sentences"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Get sentences
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 1:
+            return [self._create_chunk(content, filename, "Section 1",
+                                       metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        # Generate embeddings for sentences (using the same model as the index)
+        try:
+            from sentence_transformers import SentenceTransformer
+            from sklearn.metrics.pairwise import cosine_similarity
+            import numpy as np
+
+            model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+            embeddings = model.encode(sentences, show_progress_bar=False)
+
+            # Calculate similarity between adjacent sentences
+            similarities = []
+            for i in range(len(embeddings) - 1):
+                sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+                similarities.append(sim)
+
+            # Find split points where similarity drops below threshold
+            split_points = [0]
+            for i, sim in enumerate(similarities):
+                if sim < self.semantic_threshold:
+                    split_points.append(i + 1)
+            split_points.append(len(sentences))
+
+            # Create chunks
+            chunks = []
+            for i in range(len(split_points) - 1):
+                start_idx = split_points[i]
+                end_idx = split_points[i + 1]
+                chunk_sentences = sentences[start_idx:end_idx]
+
+                # Ensure minimum chunk size
+                if len(chunk_sentences) < 2 and i > 0:
+                    # Merge with previous chunk
+                    chunks[-1]['content'] += ' ' + ' '.join(chunk_sentences)
+                    continue
+
+                chunk_content = ' '.join(chunk_sentences)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Semantic Section {i+1}",
+                    metadata={
+                        'chunk_method': 'semantic',
+                        'chunk_index': i,
+                        'semantic_threshold': self.semantic_threshold,
+                        'sentence_count': len(chunk_sentences)
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Section 1",
+                                                             metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        except ImportError:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
+    def _chunk_by_topics(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on topic changes using keyword analysis"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 3:
+            return [self._create_chunk(content, filename, "Topic 1",
+                                       metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        try:
+            # Simple topic detection using keyword overlap
+            from collections import Counter
+            import re
+
+            # Extract keywords from each sentence
+            sentence_keywords = []
+            for sentence in sentences:
+                # Simple keyword extraction (could be enhanced with NLP)
+                words = re.findall(r'\b[a-zA-Z]{3,}\b', sentence.lower())
+                # Filter common words (basic stopwords)
+                stopwords = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man', 'way', 'she', 'use', 'her', 'many', 'oil', 'sit', 'set', 'run', 'eat', 'far', 'sea', 'eye', 'ask', 'own', 'say', 'too', 'any', 'try', 'us', 'an', 'as', 'at', 'be', 'he', 'if', 'in', 'is', 'it', 'my', 'of', 'on', 'or', 'to', 'up', 'we', 'go', 'no', 'so', 'am', 'by', 'do', 'me'}
+                keywords = [w for w in words if w not in stopwords and len(w) > 3]
+                sentence_keywords.append(set(keywords))
+
+            # Find topic boundaries based on keyword overlap
+            chunks = []
+            current_chunk = [sentences[0]]
+            current_keywords = sentence_keywords[0]
+
+            for i in range(1, len(sentences)):
+                # Calculate keyword overlap with current chunk
+                overlap = len(current_keywords.intersection(sentence_keywords[i]))
+                total_keywords = len(current_keywords.union(sentence_keywords[i]))
+
+                if total_keywords > 0:
+                    similarity = overlap / total_keywords
+                else:
+                    similarity = 0
+
+                # If similarity is low, start new chunk
+                if similarity < self.topic_threshold and len(current_chunk) >= 2:
+                    chunk_content = ' '.join(current_chunk)
+                    chunks.append(self._create_chunk(
+                        content=chunk_content,
+                        filename=filename,
+                        section=f"Topic {len(chunks)+1}",
+                        metadata={
+                            'chunk_method': 'topic',
+                            'chunk_index': len(chunks),
+                            'topic_keywords': list(current_keywords)[:10],  # Top keywords
+                            'sentence_count': len(current_chunk),
+                            'topic_threshold': self.topic_threshold
+                        }
+                    ))
+                    current_chunk = [sentences[i]]
+                    current_keywords = sentence_keywords[i]
+                else:
+                    current_chunk.append(sentences[i])
+                    current_keywords = current_keywords.union(sentence_keywords[i])
+
+            # Add final chunk
+            if current_chunk:
+                chunk_content = ' '.join(current_chunk)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Topic {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'topic',
+                        'chunk_index': len(chunks),
+                        'topic_keywords': list(current_keywords)[:10],
+                        'sentence_count': len(current_chunk),
+                        'topic_threshold': self.topic_threshold
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Topic 1",
+                                                             metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        except Exception:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
+    def _chunk_by_qa_optimization(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Create chunks optimized for question-answering"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        # Patterns that indicate Q&A structure
+        question_patterns = [
+            r'\?',  # Questions
+            r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
+            r'(step|steps|process|procedure|method|way to)',
+            r'(example|examples|instance|case)',
+            r'(definition|meaning|refers to|means)',
+        ]
+
+        chunks = []
+        current_chunk = []
+        current_context = []
+
+        for i, sentence in enumerate(sentences):
+            sentence_lower = sentence.lower().strip()
+
+            # Check if this sentence contains Q&A indicators
+            is_qa_relevant = any(re.search(pattern, sentence_lower) for pattern in question_patterns)
+
+            if is_qa_relevant or len(current_chunk) == 0:
+                current_chunk.append(sentence)
+                # Add surrounding context (previous and next sentences)
+                if i > 0 and sentences[i-1] not in current_chunk:
+                    current_context.append(sentences[i-1])
+                if i < len(sentences) - 1:
+                    current_context.append(sentences[i+1])
+            else:
+                current_chunk.append(sentence)
+
+            # Create chunk when we have enough content or reach a natural break
+            if (len(current_chunk) >= 3 and
+                (i == len(sentences) - 1 or  # Last sentence
+                 sentence.endswith('.') and len(current_chunk) >= 5)):  # Natural break
+
+                # Combine chunk with context
+                full_content = current_context + current_chunk
+                chunk_content = ' '.join(full_content)
+
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"QA Section {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'qa_optimized',
+                        'chunk_index': len(chunks),
+                        'has_question': any('?' in s for s in current_chunk),
+                        'has_process': any(re.search(r'(step|process|method)', s.lower()) for s in current_chunk),
+                        'sentence_count': len(full_content)
+                    }
+                ))
+
+                current_chunk = []
+                current_context = []
+
+        # Handle remaining content
+        if current_chunk:
+            chunk_content = ' '.join(current_context + current_chunk)
+            chunks.append(self._create_chunk(
+                content=chunk_content,
+                filename=filename,
+                section=f"QA Section {len(chunks)+1}",
+                metadata={
+                    'chunk_method': 'qa_optimized',
+                    'chunk_index': len(chunks),
+                    'sentence_count': len(current_context + current_chunk)
+                }
+            ))
+
+        return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
+                                                         metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
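The three new private methods back the 'semantic', 'topic', and 'qa' strategies wired into create_chunks earlier in this diff. Semantic chunking needs sentence-transformers and scikit-learn at runtime and falls back to sentence chunking when they are missing; the other two rely only on the standard library. A quick sketch of exercising them through the public entry point (sample text is illustrative; only the chunk count is inspected because the exact chunk dict layout comes from _create_chunk, which is outside this diff):

    from signalwire_agents.search.document_processor import DocumentProcessor

    text = ("What is SWML? SWML describes call flows. "
            "To build an index, install the package, point the builder at a directory, and run it. "
            "For example, an agent can search its own documentation at runtime.")

    for strategy in ('semantic', 'topic', 'qa'):
        chunks = DocumentProcessor(chunking_strategy=strategy).create_chunks(
            content=text, filename="faq.txt", file_type="txt")
        print(strategy, len(chunks))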
signalwire_agents/search/index_builder.py

@@ -35,27 +35,61 @@ logger = logging.getLogger(__name__)
 class IndexBuilder:
     """Build searchable indexes from document directories"""

-    def __init__(
-
-
-
-
-
-
+    def __init__(
+        self,
+        model_name: str = 'sentence-transformers/all-mpnet-base-v2',
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3
+    ):
+        """
+        Initialize the index builder
+
+        Args:
+            model_name: Name of the sentence transformer model to use
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
+            chunk_size: For sliding strategy - words per chunk (default: 50)
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
+            split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
+        """
         self.model_name = model_name
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.index_nlp_backend = index_nlp_backend
         self.verbose = verbose
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
         self.model = None
+
+        # Validate NLP backend
+        if self.index_nlp_backend not in ['nltk', 'spacy']:
+            logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
+            self.index_nlp_backend = 'nltk'
+
         self.doc_processor = DocumentProcessor(
             chunking_strategy=chunking_strategy,
             max_sentences_per_chunk=max_sentences_per_chunk,
             chunk_size=chunk_size,
-
-            split_newlines=split_newlines
+            chunk_overlap=chunk_overlap,
+            split_newlines=split_newlines,
+            index_nlp_backend=self.index_nlp_backend,
+            verbose=self.verbose,
+            semantic_threshold=self.semantic_threshold,
+            topic_threshold=self.topic_threshold
         )

     def _load_model(self):

@@ -130,7 +164,8 @@ class IndexBuilder:
             # Preprocess content for better search
             processed = preprocess_document_content(
                 chunk['content'],
-                language=chunk.get('language', 'en')
+                language=chunk.get('language', 'en'),
+                index_nlp_backend=self.index_nlp_backend
             )

             chunk['processed_content'] = processed['enhanced_text']
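IndexBuilder now mirrors the DocumentProcessor options and forwards them, together with the validated index_nlp_backend, into chunking and content preprocessing. A sketch of driving it directly (the directory and output path are placeholders; the build_index keyword names follow the call visible in the native_vector_search skill later in this diff, and any further arguments it accepts are not shown here):

    from signalwire_agents.search import IndexBuilder

    builder = IndexBuilder(
        chunking_strategy='topic',
        topic_threshold=0.3,
        index_nlp_backend='nltk',   # anything other than 'nltk'/'spacy' is reset to 'nltk' with a warning
        verbose=True,
    )
    builder.build_index(source_dir="./docs", output_file="./docs_index")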
signalwire_agents/search/query_processor.py

@@ -186,7 +186,8 @@ def remove_duplicate_words(input_string: str) -> str:

 def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[List[str]] = None,
                      max_synonyms: int = 5, debug: bool = False, vector: bool = False,
-                     vectorize_query_param: bool = False, nlp_backend: str =
+                     vectorize_query_param: bool = False, nlp_backend: str = None,
+                     query_nlp_backend: str = 'nltk') -> Dict[str, Any]:
     """
     Advanced query preprocessing with language detection, POS tagging, synonym expansion, and vectorization

@@ -198,12 +199,19 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
         debug: Enable debug output
         vector: Include vector embedding in output
         vectorize_query_param: If True, just vectorize without other processing
-        nlp_backend:
+        nlp_backend: DEPRECATED - use query_nlp_backend instead
+        query_nlp_backend: NLP backend for query processing ('nltk' for fast, 'spacy' for better quality)

     Returns:
         Dict containing processed query, language, POS tags, and optionally vector
     """

+    # Handle backward compatibility
+    if nlp_backend is not None:
+        query_nlp_backend = nlp_backend
+        if debug:
+            logger.info(f"Using deprecated 'nlp_backend' parameter, please use 'query_nlp_backend' instead")
+
     if vectorize_query_param:
         # Vectorize the query directly
         vectorized_query = vectorize_query(query)

@@ -226,15 +234,16 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L

     # Load spaCy model based on the language and backend choice
     nlp = None
-    if
+    if query_nlp_backend == 'spacy':
         nlp = load_spacy_model(language)
         if nlp is None and debug:
             logger.info("spaCy backend requested but not available, falling back to NLTK")
-    elif
+    elif query_nlp_backend == 'nltk':
         if debug:
-            logger.info("Using NLTK backend for
+            logger.info("Using NLTK backend for query processing")
     else:
-        logger.warning(f"Unknown NLP backend '{
+        logger.warning(f"Unknown query NLP backend '{query_nlp_backend}', using NLTK")
+        query_nlp_backend = 'nltk'

     # Tokenization and stop word removal
     tokens = nltk.word_tokenize(query)

@@ -258,7 +267,7 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L
     lemmas = []
     pos_tags = {}

-    if nlp and
+    if nlp and query_nlp_backend == 'spacy':
         # Use spaCy for better POS tagging
         doc = nlp(" ".join(tokens))
         for token in doc:

@@ -303,14 +312,14 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L

     if debug:
         logger.info(f"Expanded Query: {final_query_str}")
-        logger.info(f"NLP Backend Used: {
+        logger.info(f"NLP Backend Used: {query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk (fallback)'}")

     formatted_output = {
         'input': final_query_str,
         'enhanced_text': final_query_str,  # Alias for compatibility
         'language': language,
         'POS': pos_tags,
-        'nlp_backend_used':
+        'nlp_backend_used': query_nlp_backend if nlp or query_nlp_backend == 'nltk' else 'nltk'
     }

     # Vectorize query if requested

@@ -323,19 +332,25 @@ def preprocess_query(query: str, language: str = 'en', pos_to_expand: Optional[L

     return formatted_output

-def preprocess_document_content(content: str, language: str = 'en', nlp_backend: str =
+def preprocess_document_content(content: str, language: str = 'en', nlp_backend: str = None,
+                                index_nlp_backend: str = 'nltk') -> Dict[str, Any]:
     """
     Preprocess document content for better searchability

     Args:
         content: Document content to process
         language: Language code for processing
-        nlp_backend:
+        nlp_backend: DEPRECATED - use index_nlp_backend instead
+        index_nlp_backend: NLP backend for document processing ('nltk' for fast, 'spacy' for better quality)

     Returns:
         Dict containing enhanced text and extracted keywords
     """

+    # Handle backward compatibility
+    if nlp_backend is not None:
+        index_nlp_backend = nlp_backend
+
     # Use existing preprocessing but adapted for documents
     processed = preprocess_query(
         content,

@@ -344,7 +359,7 @@ def preprocess_document_content(content: str, language: str = 'en', nlp_backend:
         max_synonyms=2,  # Fewer synonyms for documents
         debug=False,
         vector=False,
-
+        query_nlp_backend=index_nlp_backend
     )

     # Extract key terms for keyword search
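Both entry points keep the old nlp_backend keyword working by copying it onto the new parameter, so existing callers do not break. A short sketch of the new-style calls, using only the parameters and result keys visible in this diff:

    from signalwire_agents.search.query_processor import (
        preprocess_query,
        preprocess_document_content,
    )

    # Query-side processing: query_nlp_backend replaces the deprecated nlp_backend.
    result = preprocess_query("how do I restart the agent",
                              language='en', query_nlp_backend='nltk', debug=True)
    print(result['enhanced_text'], result['nlp_backend_used'])

    # Document-side processing: index_nlp_backend plays the same role for indexing.
    doc = preprocess_document_content("Agents expose SWAIG functions.",
                                      index_nlp_backend='nltk')
    print(doc['enhanced_text'])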
signalwire_agents/skills/native_vector_search/skill.py

@@ -75,10 +75,25 @@ class NativeVectorSearchSkill(SkillBase):
         self.swaig_fields = self.params.get('swaig_fields', {})

         # NLP backend configuration
-        self.nlp_backend = self.params.get('nlp_backend'
-
-
-
+        self.nlp_backend = self.params.get('nlp_backend')  # Backward compatibility
+        self.index_nlp_backend = self.params.get('index_nlp_backend', 'nltk')  # Default to fast NLTK for indexing
+        self.query_nlp_backend = self.params.get('query_nlp_backend', 'nltk')  # Default to fast NLTK for search
+
+        # Handle backward compatibility
+        if self.nlp_backend is not None:
+            self.logger.warning("Parameter 'nlp_backend' is deprecated. Use 'index_nlp_backend' and 'query_nlp_backend' instead.")
+            # If old parameter is used, apply it to both
+            self.index_nlp_backend = self.nlp_backend
+            self.query_nlp_backend = self.nlp_backend
+
+        # Validate parameters
+        if self.index_nlp_backend not in ['nltk', 'spacy']:
+            self.logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
+            self.index_nlp_backend = 'nltk'
+
+        if self.query_nlp_backend not in ['nltk', 'spacy']:
+            self.logger.warning(f"Invalid query_nlp_backend '{self.query_nlp_backend}', using 'nltk'")
+            self.query_nlp_backend = 'nltk'

         # Auto-build index if requested and search is available
         if self.build_index and self.source_dir and self.search_available:

@@ -93,7 +108,10 @@ class NativeVectorSearchSkill(SkillBase):
                 self.logger.info(f"Building search index from {self.source_dir}...")
                 from signalwire_agents.search import IndexBuilder

-                builder = IndexBuilder(
+                builder = IndexBuilder(
+                    verbose=self.params.get('verbose', False),
+                    index_nlp_backend=self.index_nlp_backend
+                )
                 builder.build_index(
                     source_dir=self.source_dir,
                     output_file=self.index_file,

@@ -187,7 +205,7 @@ class NativeVectorSearchSkill(SkillBase):
             try:
                 # Preprocess the query
                 from signalwire_agents.search.query_processor import preprocess_query
-                enhanced = preprocess_query(query, language='en', vector=True,
+                enhanced = preprocess_query(query, language='en', vector=True, query_nlp_backend=self.query_nlp_backend)

                 # Perform search (local or remote)
                 if self.use_remote:
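The skill now reads separate index_nlp_backend and query_nlp_backend params, validates them, and threads them into IndexBuilder and preprocess_query. A hypothetical params dict showing only the keys read in the hunks above; how the dict reaches the skill (for example through the agent's skill-loading API) is outside this diff:

    params = {
        "index_nlp_backend": "nltk",    # used when auto-building the search index
        "query_nlp_backend": "spacy",   # used when preprocessing incoming queries
        "verbose": True,
        # "nlp_backend": "nltk",        # deprecated: if present, overrides both values above
    }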