signalwire-agents 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
@@ -269,6 +269,7 @@ class SwmlRenderer:
 
         # Return in requested format
         if format.lower() == "yaml":
+            import yaml
             return yaml.dump(swml, sort_keys=False)
         else:
             return json.dumps(swml, indent=2)
@@ -305,17 +306,24 @@ class SwmlRenderer:
         # Add any actions
         if actions:
             for action in actions:
-                if action["type"] == "play":
-                    service.add_verb("play", {
-                        "url": action["url"]
-                    })
-                elif action["type"] == "transfer":
-                    service.add_verb("connect", [
-                        {"to": action["dest"]}
-                    ])
-                elif action["type"] == "hang_up":
-                    service.add_verb("hangup", {})
-                # Additional action types could be added here
+                # Support both type-based actions and direct SWML verbs
+                if "type" in action:
+                    # Type-based action format
+                    if action["type"] == "play":
+                        service.add_verb("play", {
+                            "url": action["url"]
+                        })
+                    elif action["type"] == "transfer":
+                        service.add_verb("connect", [
+                            {"to": action["dest"]}
+                        ])
+                    elif action["type"] == "hang_up":
+                        service.add_verb("hangup", {})
+                    # Additional action types could be added here
+                else:
+                    # Direct SWML verb format
+                    for verb_name, verb_config in action.items():
+                        service.add_verb(verb_name, verb_config)
 
         # Return in requested format
         if format.lower() == "yaml":
@@ -343,26 +351,33 @@ class SwmlRenderer:
         # Add any actions
         if actions:
             for action in actions:
-                if action["type"] == "play":
-                    swml["sections"]["main"].append({
-                        "play": {
-                            "url": action["url"]
-                        }
-                    })
-                elif action["type"] == "transfer":
-                    swml["sections"]["main"].append({
-                        "connect": [
-                            {"to": action["dest"]}
-                        ]
-                    })
-                elif action["type"] == "hang_up":
-                    swml["sections"]["main"].append({
-                        "hangup": {}
-                    })
-                # Additional action types could be added here
+                # Support both type-based actions and direct SWML verbs
+                if "type" in action:
+                    # Type-based action format
+                    if action["type"] == "play":
+                        swml["sections"]["main"].append({
+                            "play": {
+                                "url": action["url"]
+                            }
+                        })
+                    elif action["type"] == "transfer":
+                        swml["sections"]["main"].append({
+                            "connect": [
+                                {"to": action["dest"]}
+                            ]
+                        })
+                    elif action["type"] == "hang_up":
+                        swml["sections"]["main"].append({
+                            "hangup": {}
+                        })
+                    # Additional action types could be added here
+                else:
+                    # Direct SWML verb format - add the action as-is
+                    swml["sections"]["main"].append(action)
 
         # Return in requested format
         if format.lower() == "yaml":
+            import yaml
             return yaml.dump(swml, sort_keys=False)
         else:
             return json.dumps(swml)
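
Note on the two hunks above: the renderer's action handling now accepts two shapes in the same actions list. Entries carrying a "type" key keep the old play/transfer/hang_up mapping, while any other dict is treated as a raw SWML verb, either passed to service.add_verb(verb_name, verb_config) or appended to the document's main section unchanged. A minimal sketch of the two shapes (the values are invented; only the keys come from this diff):

    # Sketch only: the two action formats the updated renderer accepts.
    actions = [
        {"type": "play", "url": "https://example.com/hold.mp3"},   # type-based action
        {"type": "transfer", "dest": "+15551234567"},              # type-based action
        {"play": {"url": "https://example.com/greeting.mp3"}},     # direct SWML verb, added as-is
        {"hangup": {}},                                            # direct SWML verb
    ]
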
@@ -24,51 +24,11 @@ import types
 from typing import Dict, List, Any, Optional, Union, Callable, Tuple, Type
 from urllib.parse import urlparse
 
-# Import and configure structlog
-try:
-    import structlog
-
-    # Only configure if not already configured
-    if not hasattr(structlog, "_configured") or not structlog._configured:
-        structlog.configure(
-            processors=[
-                structlog.stdlib.filter_by_level,
-                structlog.stdlib.add_logger_name,
-                structlog.stdlib.add_log_level,
-                structlog.stdlib.PositionalArgumentsFormatter(),
-                structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S"),
-                structlog.processors.StackInfoRenderer(),
-                structlog.processors.format_exc_info,
-                structlog.processors.UnicodeDecoder(),
-                structlog.dev.ConsoleRenderer()
-            ],
-            context_class=dict,
-            logger_factory=structlog.stdlib.LoggerFactory(),
-            wrapper_class=structlog.stdlib.BoundLogger,
-            cache_logger_on_first_use=True,
-        )
-
-        # Set up root logger with structlog
-        logging.basicConfig(
-            format="%(message)s",
-            stream=sys.stdout,
-            level=logging.INFO,
-        )
-
-        # Mark as configured to avoid duplicate configuration
-        structlog._configured = True
-
-    # Create the module logger
-    logger = structlog.get_logger("swml_service")
-
-except ImportError:
-    # Fallback to standard logging if structlog is not available
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-        stream=sys.stdout
-    )
-    logger = logging.getLogger("swml_service")
+# Import centralized logging system
+from signalwire_agents.core.logging_config import get_logger
+
+# Create the module logger using centralized system
+logger = get_logger("swml_service")
 
 try:
     import fastapi
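
Note on the hunk above: the per-module structlog setup is replaced by a single centralized helper. Based only on what this diff shows, the usage pattern is:

    # Usage as it appears in this diff; configuration now lives in
    # signalwire_agents.core.logging_config (not shown here).
    from signalwire_agents.core.logging_config import get_logger

    logger = get_logger("swml_service")
    logger.info("service initialized")  # assumes the returned logger keeps a stdlib/structlog-style API
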
@@ -74,29 +74,42 @@ logger = logging.getLogger(__name__)
 class DocumentProcessor:
     """Enhanced document processor with smart chunking capabilities"""
 
-    def __init__(self, chunking_strategy: str = 'sentence',
-                 max_sentences_per_chunk: int = 50,
-                 chunk_size: int = 50,
-                 overlap_size: int = 10,
-                 split_newlines: Optional[int] = None):
+    def __init__(
+        self,
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3
+    ):
         """
-        Initialize document processor with chunking strategy
+        Initialize document processor
 
         Args:
-            chunking_strategy: 'sentence', 'sliding', 'paragraph', or 'page'
-            max_sentences_per_chunk: For sentence strategy (default: 50)
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
             chunk_size: For sliding strategy - words per chunk (default: 50)
-            overlap_size: For sliding strategy - overlap in words (default: 10)
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
             split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
         """
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
-        self.overlap_size = overlap_size
+        self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
 
         # Legacy support for old character-based chunking
-        self.chunk_overlap = overlap_size
+        self.chunk_overlap = chunk_overlap
 
     def create_chunks(self, content: str, filename: str,
                       file_type: str) -> List[Dict[str, Any]]:
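
Note on the hunk above: the keyword argument overlap_size is renamed to chunk_overlap, the sentence-strategy default drops from 50 to 5 sentences per chunk, and new strategies and thresholds are exposed. A hedged construction sketch using only parameters visible in this diff (the file name and content value are invented):

    # Parameters as declared in the new __init__; the values here are illustrative.
    processor = DocumentProcessor(
        chunking_strategy='semantic',   # 'sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', or 'qa'
        max_sentences_per_chunk=5,
        chunk_size=50,
        chunk_overlap=10,               # renamed from overlap_size
        semantic_threshold=0.5,
        topic_threshold=0.3,
    )
    chunks = processor.create_chunks(content, filename="guide.md", file_type="md")
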
@@ -121,6 +134,12 @@ class DocumentProcessor:
             return self._chunk_by_paragraphs(content, filename, file_type)
         elif self.chunking_strategy == 'page':
             return self._chunk_by_pages(content, filename, file_type)
+        elif self.chunking_strategy == 'semantic':
+            return self._chunk_by_semantic(content, filename, file_type)
+        elif self.chunking_strategy == 'topic':
+            return self._chunk_by_topics(content, filename, file_type)
+        elif self.chunking_strategy == 'qa':
+            return self._chunk_by_qa_optimization(content, filename, file_type)
         else:
             # Fallback to sentence-based chunking
             return self._chunk_by_sentences(content, filename, file_type)
@@ -674,7 +693,7 @@ class DocumentProcessor:
         chunk_index = 0
 
         # Create overlapping chunks
-        for i in range(0, len(words), self.chunk_size - self.overlap_size):
+        for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
             chunk_words = words[i:i + self.chunk_size]
             if chunk_words:
                 chunk_content = ' '.join(chunk_words)
@@ -686,7 +705,7 @@ class DocumentProcessor:
                         'chunk_method': 'sliding_window',
                         'chunk_index': chunk_index,
                         'chunk_size_words': self.chunk_size,
-                        'overlap_size_words': self.overlap_size,
+                        'overlap_size_words': self.chunk_overlap,
                         'start_word': i,
                         'end_word': i + len(chunk_words)
                     }
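
Note on the two hunks above: only the attribute name changes; the sliding window still advances by chunk_size - chunk_overlap words, so with the defaults each 50-word chunk starts 40 words after the previous one and repeats the last 10 words of its predecessor. A small self-contained illustration (the token list is invented):

    # Window arithmetic used by the sliding strategy, with the defaults from this diff.
    chunk_size, chunk_overlap = 50, 10
    words = [f"w{i}" for i in range(120)]   # stand-in tokens
    step = chunk_size - chunk_overlap       # 40-word advance between chunk starts
    windows = [words[i:i + chunk_size] for i in range(0, len(words), step)]
    # chunk starts at words 0, 40, 80; adjacent chunks share 10 words
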
@@ -761,4 +780,246 @@ class DocumentProcessor:
                 }
             ))
 
-        return chunks
+        return chunks
+
+    def _chunk_by_semantic(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on semantic similarity between sentences"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Get sentences
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 1:
+            return [self._create_chunk(content, filename, "Section 1",
+                                       metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        # Generate embeddings for sentences (using the same model as the index)
+        try:
+            from sentence_transformers import SentenceTransformer
+            from sklearn.metrics.pairwise import cosine_similarity
+            import numpy as np
+
+            model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+            embeddings = model.encode(sentences, show_progress_bar=False)
+
+            # Calculate similarity between adjacent sentences
+            similarities = []
+            for i in range(len(embeddings) - 1):
+                sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+                similarities.append(sim)
+
+            # Find split points where similarity drops below threshold
+            split_points = [0]
+            for i, sim in enumerate(similarities):
+                if sim < self.semantic_threshold:
+                    split_points.append(i + 1)
+            split_points.append(len(sentences))
+
+            # Create chunks
+            chunks = []
+            for i in range(len(split_points) - 1):
+                start_idx = split_points[i]
+                end_idx = split_points[i + 1]
+                chunk_sentences = sentences[start_idx:end_idx]
+
+                # Ensure minimum chunk size
+                if len(chunk_sentences) < 2 and i > 0:
+                    # Merge with previous chunk
+                    chunks[-1]['content'] += ' ' + ' '.join(chunk_sentences)
+                    continue
+
+                chunk_content = ' '.join(chunk_sentences)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Semantic Section {i+1}",
+                    metadata={
+                        'chunk_method': 'semantic',
+                        'chunk_index': i,
+                        'semantic_threshold': self.semantic_threshold,
+                        'sentence_count': len(chunk_sentences)
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Section 1",
+                                                             metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        except ImportError:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
+    def _chunk_by_topics(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on topic changes using keyword analysis"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 3:
+            return [self._create_chunk(content, filename, "Topic 1",
+                                       metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        try:
+            # Simple topic detection using keyword overlap
+            from collections import Counter
+            import re
+
+            # Extract keywords from each sentence
+            sentence_keywords = []
+            for sentence in sentences:
+                # Simple keyword extraction (could be enhanced with NLP)
+                words = re.findall(r'\b[a-zA-Z]{3,}\b', sentence.lower())
+                # Filter common words (basic stopwords)
+                stopwords = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man', 'way', 'she', 'use', 'her', 'many', 'oil', 'sit', 'set', 'run', 'eat', 'far', 'sea', 'eye', 'ask', 'own', 'say', 'too', 'any', 'try', 'us', 'an', 'as', 'at', 'be', 'he', 'if', 'in', 'is', 'it', 'my', 'of', 'on', 'or', 'to', 'up', 'we', 'go', 'no', 'so', 'am', 'by', 'do', 'me'}
+                keywords = [w for w in words if w not in stopwords and len(w) > 3]
+                sentence_keywords.append(set(keywords))
+
+            # Find topic boundaries based on keyword overlap
+            chunks = []
+            current_chunk = [sentences[0]]
+            current_keywords = sentence_keywords[0]
+
+            for i in range(1, len(sentences)):
+                # Calculate keyword overlap with current chunk
+                overlap = len(current_keywords.intersection(sentence_keywords[i]))
+                total_keywords = len(current_keywords.union(sentence_keywords[i]))
+
+                if total_keywords > 0:
+                    similarity = overlap / total_keywords
+                else:
+                    similarity = 0
+
+                # If similarity is low, start new chunk
+                if similarity < self.topic_threshold and len(current_chunk) >= 2:
+                    chunk_content = ' '.join(current_chunk)
+                    chunks.append(self._create_chunk(
+                        content=chunk_content,
+                        filename=filename,
+                        section=f"Topic {len(chunks)+1}",
+                        metadata={
+                            'chunk_method': 'topic',
+                            'chunk_index': len(chunks),
+                            'topic_keywords': list(current_keywords)[:10],  # Top keywords
+                            'sentence_count': len(current_chunk),
+                            'topic_threshold': self.topic_threshold
+                        }
+                    ))
+                    current_chunk = [sentences[i]]
+                    current_keywords = sentence_keywords[i]
+                else:
+                    current_chunk.append(sentences[i])
+                    current_keywords = current_keywords.union(sentence_keywords[i])
+
+            # Add final chunk
+            if current_chunk:
+                chunk_content = ' '.join(current_chunk)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Topic {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'topic',
+                        'chunk_index': len(chunks),
+                        'topic_keywords': list(current_keywords)[:10],
+                        'sentence_count': len(current_chunk),
+                        'topic_threshold': self.topic_threshold
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Topic 1",
+                                                             metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        except Exception:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
+    def _chunk_by_qa_optimization(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Create chunks optimized for question-answering"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        # Patterns that indicate Q&A structure
+        question_patterns = [
+            r'\?',  # Questions
+            r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
+            r'(step|steps|process|procedure|method|way to)',
+            r'(example|examples|instance|case)',
+            r'(definition|meaning|refers to|means)',
+        ]
+
+        chunks = []
+        current_chunk = []
+        current_context = []
+
+        for i, sentence in enumerate(sentences):
+            sentence_lower = sentence.lower().strip()
+
+            # Check if this sentence contains Q&A indicators
+            is_qa_relevant = any(re.search(pattern, sentence_lower) for pattern in question_patterns)
+
+            if is_qa_relevant or len(current_chunk) == 0:
+                current_chunk.append(sentence)
+                # Add surrounding context (previous and next sentences)
+                if i > 0 and sentences[i-1] not in current_chunk:
+                    current_context.append(sentences[i-1])
+                if i < len(sentences) - 1:
+                    current_context.append(sentences[i+1])
+            else:
+                current_chunk.append(sentence)
+
+            # Create chunk when we have enough content or reach a natural break
+            if (len(current_chunk) >= 3 and
+                (i == len(sentences) - 1 or  # Last sentence
+                 sentence.endswith('.') and len(current_chunk) >= 5)):  # Natural break
+
+                # Combine chunk with context
+                full_content = current_context + current_chunk
+                chunk_content = ' '.join(full_content)
+
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"QA Section {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'qa_optimized',
+                        'chunk_index': len(chunks),
+                        'has_question': any('?' in s for s in current_chunk),
+                        'has_process': any(re.search(r'(step|process|method)', s.lower()) for s in current_chunk),
+                        'sentence_count': len(full_content)
+                    }
+                ))
+
+                current_chunk = []
+                current_context = []
+
+        # Handle remaining content
+        if current_chunk:
+            chunk_content = ' '.join(current_context + current_chunk)
+            chunks.append(self._create_chunk(
+                content=chunk_content,
+                filename=filename,
+                section=f"QA Section {len(chunks)+1}",
+                metadata={
+                    'chunk_method': 'qa_optimized',
+                    'chunk_index': len(chunks),
+                    'sentence_count': len(current_context + current_chunk)
+                }
+            ))
+
+        return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
+                                                         metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
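
Note on the methods added above: _chunk_by_semantic splits wherever the cosine similarity of adjacent sentence embeddings drops below semantic_threshold, _chunk_by_topics splits on low keyword-overlap ratios against topic_threshold, and _chunk_by_qa_optimization groups sentences around question-like patterns with surrounding context. A standalone sketch of the semantic split-point idea, using the same model name as the diff (the sample sentences are invented; the snippet assumes sentence-transformers and scikit-learn are installed):

    from sentence_transformers import SentenceTransformer
    from sklearn.metrics.pairwise import cosine_similarity

    sentences = [
        "SignalWire agents can render SWML documents.",
        "The renderer supports JSON and YAML output.",
        "Pizza dough needs time to rise before baking.",
    ]
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    emb = model.encode(sentences, show_progress_bar=False)

    threshold = 0.5
    splits = [0]
    for i in range(len(emb) - 1):
        if cosine_similarity([emb[i]], [emb[i + 1]])[0][0] < threshold:
            splits.append(i + 1)        # similarity drop: start a new chunk here
    splits.append(len(sentences))

    chunks = [' '.join(sentences[a:b]) for a, b in zip(splits, splits[1:])]
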
@@ -35,27 +35,61 @@ logger = logging.getLogger(__name__)
 class IndexBuilder:
     """Build searchable indexes from document directories"""
 
-    def __init__(self, model_name: str = 'sentence-transformers/all-mpnet-base-v2',
-                 chunking_strategy: str = 'sentence',
-                 max_sentences_per_chunk: int = 50,
-                 chunk_size: int = 50,
-                 chunk_overlap: int = 10,
-                 split_newlines: Optional[int] = None,
-                 verbose: bool = False):
+    def __init__(
+        self,
+        model_name: str = 'sentence-transformers/all-mpnet-base-v2',
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3
+    ):
+        """
+        Initialize the index builder
+
+        Args:
+            model_name: Name of the sentence transformer model to use
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
+            chunk_size: For sliding strategy - words per chunk (default: 50)
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
+            split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
+        """
         self.model_name = model_name
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.index_nlp_backend = index_nlp_backend
         self.verbose = verbose
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
         self.model = None
+
+        # Validate NLP backend
+        if self.index_nlp_backend not in ['nltk', 'spacy']:
+            logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
+            self.index_nlp_backend = 'nltk'
+
         self.doc_processor = DocumentProcessor(
             chunking_strategy=chunking_strategy,
             max_sentences_per_chunk=max_sentences_per_chunk,
             chunk_size=chunk_size,
-            overlap_size=chunk_overlap,
-            split_newlines=split_newlines
+            chunk_overlap=chunk_overlap,
+            split_newlines=split_newlines,
+            index_nlp_backend=self.index_nlp_backend,
+            verbose=self.verbose,
+            semantic_threshold=self.semantic_threshold,
+            topic_threshold=self.topic_threshold
         )
 
     def _load_model(self):
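
Note on the hunk above: IndexBuilder now validates index_nlp_backend (anything other than 'nltk' or 'spacy' falls back to 'nltk' with a warning) and forwards the renamed and newly added chunking options to DocumentProcessor. A construction sketch limited to parameters shown in this diff; how the built index is produced and consumed lies outside this hunk:

    # Constructor arguments as declared in the new __init__; values are illustrative.
    builder = IndexBuilder(
        model_name='sentence-transformers/all-mpnet-base-v2',
        chunking_strategy='topic',
        chunk_size=50,
        chunk_overlap=10,          # renamed keyword in the internal DocumentProcessor call
        index_nlp_backend='nltk',  # 'nltk' or 'spacy'; anything else is coerced to 'nltk'
        verbose=True,
        topic_threshold=0.3,
    )
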
@@ -130,7 +164,8 @@ class IndexBuilder:
             # Preprocess content for better search
             processed = preprocess_document_content(
                 chunk['content'],
-                language=chunk.get('language', 'en')
+                language=chunk.get('language', 'en'),
+                index_nlp_backend=self.index_nlp_backend
             )
 
             chunk['processed_content'] = processed['enhanced_text']