signalwire-agents 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -89,28 +89,85 @@ class StructuredLoggerWrapper:
     # Also support the 'warn' alias
     warn = warning
 
+    def bind(self, **kwargs) -> 'StructuredLoggerWrapper':
+        """
+        Create a new logger instance with bound context data
+
+        This maintains compatibility with structlog's bind() method.
+        The bound data will be included in all subsequent log messages.
+        """
+        # Create a new wrapper that includes the bound context
+        return BoundStructuredLoggerWrapper(self._logger, kwargs)
+
     # Support direct access to underlying logger attributes if needed
     def __getattr__(self, name: str) -> Any:
         """Delegate any unknown attributes to the underlying logger"""
         return getattr(self._logger, name)
 
 
-def get_execution_mode() -> str:
+class BoundStructuredLoggerWrapper(StructuredLoggerWrapper):
+    """
+    A structured logger wrapper that includes bound context data in all messages
+    """
+
+    def __init__(self, logger: logging.Logger, bound_data: Dict[str, Any]):
+        super().__init__(logger)
+        self._bound_data = bound_data
+
+    def _format_structured_message(self, message: str, **kwargs) -> str:
+        """Format a message with both bound data and additional keyword arguments"""
+        # Combine bound data with additional kwargs
+        all_kwargs = {**self._bound_data, **kwargs}
+        return super()._format_structured_message(message, **all_kwargs)
+
+    def bind(self, **kwargs) -> 'BoundStructuredLoggerWrapper':
+        """Create a new logger with additional bound context"""
+        # Combine existing bound data with new data
+        new_bound_data = {**self._bound_data, **kwargs}
+        return BoundStructuredLoggerWrapper(self._logger, new_bound_data)
+
+
+def get_execution_mode():
     """
     Determine the execution mode based on environment variables
 
     Returns:
-        'cgi' if running in CGI mode
-        'lambda' if running in AWS Lambda
-        'server' for normal server mode
+        str: 'server', 'cgi', 'lambda', 'google_cloud_function', 'azure_function', or 'unknown'
     """
+    # Check for CGI environment
     if os.getenv('GATEWAY_INTERFACE'):
         return 'cgi'
+
+    # Check for AWS Lambda environment
     if os.getenv('AWS_LAMBDA_FUNCTION_NAME') or os.getenv('LAMBDA_TASK_ROOT'):
         return 'lambda'
+
+    # Check for Google Cloud Functions environment
+    if (os.getenv('FUNCTION_TARGET') or
+        os.getenv('K_SERVICE') or
+        os.getenv('GOOGLE_CLOUD_PROJECT')):
+        return 'google_cloud_function'
+
+    # Check for Azure Functions environment
+    if (os.getenv('AZURE_FUNCTIONS_ENVIRONMENT') or
+        os.getenv('FUNCTIONS_WORKER_RUNTIME') or
+        os.getenv('AzureWebJobsStorage')):
+        return 'azure_function'
+
+    # Default to server mode
     return 'server'
 
 
+def reset_logging_configuration():
+    """
+    Reset the logging configuration flag to allow reconfiguration
+
+    This is useful when environment variables change after initial configuration.
+    """
+    global _logging_configured
+    _logging_configured = False
+
+
 def configure_logging():
     """
     Configure logging system once, globally, based on environment variables
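
For context, the new bind() mirrors structlog-style context binding: each call returns a new wrapper that merges its context into every subsequent message. A minimal usage sketch, assuming the wrapper exposes the usual info()/warning() methods and using the get_logger import path shown in the other hunks of this diff (the logger name and keys are illustrative):

from signalwire_agents.core.logging_config import get_logger

log = get_logger("example.agent")             # plain StructuredLoggerWrapper
call_log = log.bind(call_id="abc123")         # BoundStructuredLoggerWrapper carrying call_id
step_log = call_log.bind(step="greeting")     # merges {"call_id": ..., "step": ...}
step_log.info("handling call")                # bound keys are folded into the formatted message
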
@@ -182,31 +239,39 @@ def _configure_off_mode():
 
 
 def _configure_stderr_mode(log_level: str):
-    """Configure logging to stderr"""
+    """Configure logging to stderr with colored formatting"""
     # Clear existing handlers
     logging.getLogger().handlers.clear()
 
     # Convert log level
     numeric_level = getattr(logging, log_level.upper(), logging.INFO)
 
-    # Configure to stderr
-    logging.basicConfig(
-        stream=sys.stderr,
-        level=numeric_level,
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-    )
+    # Create handler with colored formatter
+    handler = logging.StreamHandler(sys.stderr)
+    handler.setFormatter(ColoredFormatter())
+
+    # Configure root logger
+    root_logger = logging.getLogger()
+    root_logger.setLevel(numeric_level)
+    root_logger.addHandler(handler)
 
 
 def _configure_default_mode(log_level: str):
-    """Configure standard logging behavior"""
+    """Configure standard logging behavior with colored formatting"""
+    # Clear existing handlers
+    logging.getLogger().handlers.clear()
+
     # Convert log level
     numeric_level = getattr(logging, log_level.upper(), logging.INFO)
 
-    # Configure standard logging
-    logging.basicConfig(
-        level=numeric_level,
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
-    )
+    # Create handler with colored formatter
+    handler = logging.StreamHandler()
+    handler.setFormatter(ColoredFormatter())
+
+    # Configure root logger
+    root_logger = logging.getLogger()
+    root_logger.setLevel(numeric_level)
+    root_logger.addHandler(handler)
 
 
 def get_logger(name: str) -> StructuredLoggerWrapper:
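
Both helpers now follow the same handler-based pattern instead of calling logging.basicConfig: clear any existing root handlers, attach a single StreamHandler carrying the custom formatter, and set the level. A standalone sketch of that pattern, runnable without the package (a plain Formatter stands in for ColoredFormatter):

import logging
import sys

root = logging.getLogger()
root.handlers.clear()                                            # drop handlers left by earlier basicConfig calls

numeric_level = getattr(logging, "info".upper(), logging.INFO)   # same level conversion as the diff
handler = logging.StreamHandler(sys.stderr)                      # stderr mode; omit the stream for default mode
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s %(message)s"))

root.setLevel(numeric_level)
root.addHandler(handler)
logging.getLogger("demo").info("handler-based configuration in place")
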
@@ -229,4 +294,83 @@ def get_logger(name: str) -> StructuredLoggerWrapper:
     python_logger = logging.getLogger(name)
 
     # Wrap it with our structured logging interface
-    return StructuredLoggerWrapper(python_logger)
+    return StructuredLoggerWrapper(python_logger)
+
+
+class ColoredFormatter(logging.Formatter):
+    """
+    A beautiful colored logging formatter that makes logs easy to read and visually appealing
+    """
+
+    # ANSI color codes
+    COLORS = {
+        'DEBUG': '\033[36m',     # Cyan
+        'INFO': '\033[32m',      # Green
+        'WARNING': '\033[33m',   # Yellow
+        'ERROR': '\033[31m',     # Red
+        'CRITICAL': '\033[35m',  # Magenta
+        'RESET': '\033[0m',      # Reset
+        'BOLD': '\033[1m',       # Bold
+        'DIM': '\033[2m',        # Dim
+        'WHITE': '\033[37m',     # White
+        'BLUE': '\033[34m',      # Blue
+        'BLACK': '\033[30m',     # Black (for brackets)
+    }
+
+    def __init__(self):
+        super().__init__()
+
+    def format(self, record):
+        # Check if we should use colors (not in raw mode, and stdout is a tty)
+        use_colors = (
+            hasattr(sys.stdout, 'isatty') and sys.stdout.isatty() and
+            os.getenv('SIGNALWIRE_LOG_MODE') != 'off' and
+            '--raw' not in sys.argv and '--dump-swml' not in sys.argv
+        )
+
+        if use_colors:
+            # Get colors
+            level_color = self.COLORS.get(record.levelname, self.COLORS['WHITE'])
+            reset = self.COLORS['RESET']
+            dim = self.COLORS['DIM']
+            bold = self.COLORS['BOLD']
+            blue = self.COLORS['BLUE']
+            black = self.COLORS['BLACK']
+
+            # Format timestamp in a compact, readable way
+            timestamp = self.formatTime(record, '%H:%M:%S')
+
+            # Format level with appropriate color and consistent width
+            level_name = f"{level_color}{record.levelname:<8}{reset}"
+
+            # Format logger name - keep it short and readable
+            logger_name = record.name
+            if len(logger_name) > 15:
+                # Truncate long logger names but keep the end (most specific part)
+                logger_name = "..." + logger_name[-12:]
+
+            # Get function and line info if available
+            func_info = ""
+            if hasattr(record, 'funcName') and hasattr(record, 'lineno'):
+                func_name = getattr(record, 'funcName', '')
+                line_no = getattr(record, 'lineno', 0)
+                if func_name and func_name != '<module>':
+                    func_info = f" {dim}({func_name}:{line_no}){reset}"
+
+            # Format the message
+            message = record.getMessage()
+
+            # Create the final formatted message with a clean, readable layout
+            formatted = (
+                f"{black}[{reset}{dim}{timestamp}{reset}{black}]{reset} "
+                f"{level_name} "
+                f"{blue}{logger_name:<15}{reset}"
+                f"{func_info} "
+                f"{message}"
+            )
+
+            return formatted
+        else:
+            # Non-colored format (fallback for files, pipes, etc.)
+            timestamp = self.formatTime(record, '%Y-%m-%d %H:%M:%S')
+            return f"{timestamp} {record.levelname:<8} {record.name} {record.getMessage()}"
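
The formatter only emits ANSI colors when stdout is a TTY, colors are not disabled via SIGNALWIRE_LOG_MODE=off, and neither --raw nor --dump-swml is on the command line; otherwise it falls back to a plain "timestamp LEVEL name message" line. A small usage sketch; the import path for ColoredFormatter is assumed from the other hunks in this diff, not stated in this one:

import logging

from signalwire_agents.core.logging_config import ColoredFormatter  # assumed module path

handler = logging.StreamHandler()
handler.setFormatter(ColoredFormatter())

demo = logging.getLogger("formatter.demo")
demo.addHandler(handler)
demo.setLevel(logging.DEBUG)
demo.debug("colored when attached to a TTY, plain otherwise")
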
@@ -8,7 +8,7 @@ See LICENSE file in the project root for full license information.
 """
 
 from typing import Dict, List, Type, Any, Optional
-import logging
+from signalwire_agents.core.logging_config import get_logger
 from signalwire_agents.core.skill_base import SkillBase
 
 class SkillManager:
@@ -17,7 +17,7 @@ class SkillManager:
     def __init__(self, agent):
         self.agent = agent
         self.loaded_skills: Dict[str, SkillBase] = {}
-        self.logger = logging.getLogger("skill_manager")
+        self.logger = get_logger("skill_manager")
 
     def load_skill(self, skill_name: str, skill_class: Type[SkillBase] = None, params: Optional[Dict[str, Any]] = None) -> tuple[bool, str]:
         """
@@ -24,51 +24,11 @@ import types
 from typing import Dict, List, Any, Optional, Union, Callable, Tuple, Type
 from urllib.parse import urlparse
 
-# Import and configure structlog
-try:
-    import structlog
-
-    # Only configure if not already configured
-    if not hasattr(structlog, "_configured") or not structlog._configured:
-        structlog.configure(
-            processors=[
-                structlog.stdlib.filter_by_level,
-                structlog.stdlib.add_logger_name,
-                structlog.stdlib.add_log_level,
-                structlog.stdlib.PositionalArgumentsFormatter(),
-                structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S"),
-                structlog.processors.StackInfoRenderer(),
-                structlog.processors.format_exc_info,
-                structlog.processors.UnicodeDecoder(),
-                structlog.dev.ConsoleRenderer()
-            ],
-            context_class=dict,
-            logger_factory=structlog.stdlib.LoggerFactory(),
-            wrapper_class=structlog.stdlib.BoundLogger,
-            cache_logger_on_first_use=True,
-        )
-
-        # Set up root logger with structlog
-        logging.basicConfig(
-            format="%(message)s",
-            stream=sys.stdout,
-            level=logging.INFO,
-        )
-
-        # Mark as configured to avoid duplicate configuration
-        structlog._configured = True
-
-    # Create the module logger
-    logger = structlog.get_logger("swml_service")
-
-except ImportError:
-    # Fallback to standard logging if structlog is not available
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-        stream=sys.stdout
-    )
-    logger = logging.getLogger("swml_service")
+# Import centralized logging system
+from signalwire_agents.core.logging_config import get_logger
+
+# Create the module logger using centralized system
+logger = get_logger("swml_service")
 
 try:
     import fastapi
@@ -74,29 +74,42 @@ logger = logging.getLogger(__name__)
 class DocumentProcessor:
     """Enhanced document processor with smart chunking capabilities"""
 
-    def __init__(self, chunking_strategy: str = 'sentence',
-                 max_sentences_per_chunk: int = 50,
-                 chunk_size: int = 50,
-                 overlap_size: int = 10,
-                 split_newlines: Optional[int] = None):
+    def __init__(
+        self,
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3
+    ):
         """
-        Initialize document processor with chunking strategy
+        Initialize document processor
 
         Args:
-            chunking_strategy: 'sentence', 'sliding', 'paragraph', or 'page'
-            max_sentences_per_chunk: For sentence strategy (default: 50)
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
             chunk_size: For sliding strategy - words per chunk (default: 50)
-            overlap_size: For sliding strategy - overlap in words (default: 10)
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
             split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
         """
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
-        self.overlap_size = overlap_size
+        self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
 
         # Legacy support for old character-based chunking
-        self.chunk_overlap = overlap_size
+        self.chunk_overlap = chunk_overlap
 
     def create_chunks(self, content: str, filename: str,
                       file_type: str) -> List[Dict[str, Any]]:
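
The constructor keeps the old ordering for the first few arguments but renames overlap_size to chunk_overlap and adds the semantic/topic thresholds, so existing keyword callers need updating. A construction sketch using only parameters shown above; the import path for DocumentProcessor is not visible in this hunk and is assumed:

processor = DocumentProcessor(
    chunking_strategy='sliding',
    chunk_size=50,        # words per chunk
    chunk_overlap=10,     # was overlap_size in 0.1.13
    verbose=False,
)
semantic_processor = DocumentProcessor(
    chunking_strategy='semantic',
    semantic_threshold=0.5,   # adjacent-sentence similarity below this starts a new chunk
)
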
@@ -121,6 +134,12 @@ class DocumentProcessor:
             return self._chunk_by_paragraphs(content, filename, file_type)
         elif self.chunking_strategy == 'page':
             return self._chunk_by_pages(content, filename, file_type)
+        elif self.chunking_strategy == 'semantic':
+            return self._chunk_by_semantic(content, filename, file_type)
+        elif self.chunking_strategy == 'topic':
+            return self._chunk_by_topics(content, filename, file_type)
+        elif self.chunking_strategy == 'qa':
+            return self._chunk_by_qa_optimization(content, filename, file_type)
         else:
             # Fallback to sentence-based chunking
             return self._chunk_by_sentences(content, filename, file_type)
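
With this change create_chunks() dispatches on seven named strategies and silently falls back to sentence chunking for anything unrecognized. A hedged call sketch; the sample text and filename are illustrative, and the chunk/metadata layout is assumed from the metadata dictionaries shown elsewhere in this diff:

text = "What is SWML? SWML is a markup for calls. How do I use it? Define verbs in JSON."
chunks = DocumentProcessor(chunking_strategy='qa').create_chunks(
    content=text,
    filename="faq.txt",
    file_type="txt",
)
for chunk in chunks:
    print(chunk.get('metadata', {}).get('chunk_method'))  # expected: 'qa_optimized'
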
@@ -674,7 +693,7 @@ class DocumentProcessor:
         chunk_index = 0
 
         # Create overlapping chunks
-        for i in range(0, len(words), self.chunk_size - self.overlap_size):
+        for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
             chunk_words = words[i:i + self.chunk_size]
             if chunk_words:
                 chunk_content = ' '.join(chunk_words)
@@ -686,7 +705,7 @@ class DocumentProcessor:
                         'chunk_method': 'sliding_window',
                         'chunk_index': chunk_index,
                         'chunk_size_words': self.chunk_size,
-                        'overlap_size_words': self.overlap_size,
+                        'overlap_size_words': self.chunk_overlap,
                         'start_word': i,
                         'end_word': i + len(chunk_words)
                     }
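
The sliding-window stride is unchanged by the rename: windows start every chunk_size - chunk_overlap words. A quick standalone check of the boundaries this produces for the default sizes:

words = [f"w{i}" for i in range(120)]
chunk_size, chunk_overlap = 50, 10

starts = list(range(0, len(words), chunk_size - chunk_overlap))
print(starts)                                                    # [0, 40, 80]
print([(s, min(s + chunk_size, len(words))) for s in starts])    # [(0, 50), (40, 90), (80, 120)]
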
@@ -761,4 +780,246 @@ class DocumentProcessor:
                 }
             ))
 
-        return chunks
+        return chunks
+
+    def _chunk_by_semantic(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on semantic similarity between sentences"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Get sentences
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 1:
+            return [self._create_chunk(content, filename, "Section 1",
+                                       metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        # Generate embeddings for sentences (using the same model as the index)
+        try:
+            from sentence_transformers import SentenceTransformer
+            from sklearn.metrics.pairwise import cosine_similarity
+            import numpy as np
+
+            model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+            embeddings = model.encode(sentences, show_progress_bar=False)
+
+            # Calculate similarity between adjacent sentences
+            similarities = []
+            for i in range(len(embeddings) - 1):
+                sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+                similarities.append(sim)
+
+            # Find split points where similarity drops below threshold
+            split_points = [0]
+            for i, sim in enumerate(similarities):
+                if sim < self.semantic_threshold:
+                    split_points.append(i + 1)
+            split_points.append(len(sentences))
+
+            # Create chunks
+            chunks = []
+            for i in range(len(split_points) - 1):
+                start_idx = split_points[i]
+                end_idx = split_points[i + 1]
+                chunk_sentences = sentences[start_idx:end_idx]
+
+                # Ensure minimum chunk size
+                if len(chunk_sentences) < 2 and i > 0:
+                    # Merge with previous chunk
+                    chunks[-1]['content'] += ' ' + ' '.join(chunk_sentences)
+                    continue
+
+                chunk_content = ' '.join(chunk_sentences)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Semantic Section {i+1}",
+                    metadata={
+                        'chunk_method': 'semantic',
+                        'chunk_index': i,
+                        'semantic_threshold': self.semantic_threshold,
+                        'sentence_count': len(chunk_sentences)
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Section 1",
+                                                             metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        except ImportError:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
+    def _chunk_by_topics(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on topic changes using keyword analysis"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 3:
+            return [self._create_chunk(content, filename, "Topic 1",
+                                       metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        try:
+            # Simple topic detection using keyword overlap
+            from collections import Counter
+            import re
+
+            # Extract keywords from each sentence
+            sentence_keywords = []
+            for sentence in sentences:
+                # Simple keyword extraction (could be enhanced with NLP)
+                words = re.findall(r'\b[a-zA-Z]{3,}\b', sentence.lower())
+                # Filter common words (basic stopwords)
+                stopwords = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man', 'way', 'she', 'use', 'her', 'many', 'oil', 'sit', 'set', 'run', 'eat', 'far', 'sea', 'eye', 'ask', 'own', 'say', 'too', 'any', 'try', 'us', 'an', 'as', 'at', 'be', 'he', 'if', 'in', 'is', 'it', 'my', 'of', 'on', 'or', 'to', 'up', 'we', 'go', 'no', 'so', 'am', 'by', 'do', 'me'}
+                keywords = [w for w in words if w not in stopwords and len(w) > 3]
+                sentence_keywords.append(set(keywords))
+
+            # Find topic boundaries based on keyword overlap
+            chunks = []
+            current_chunk = [sentences[0]]
+            current_keywords = sentence_keywords[0]
+
+            for i in range(1, len(sentences)):
+                # Calculate keyword overlap with current chunk
+                overlap = len(current_keywords.intersection(sentence_keywords[i]))
+                total_keywords = len(current_keywords.union(sentence_keywords[i]))
+
+                if total_keywords > 0:
+                    similarity = overlap / total_keywords
+                else:
+                    similarity = 0
+
+                # If similarity is low, start new chunk
+                if similarity < self.topic_threshold and len(current_chunk) >= 2:
+                    chunk_content = ' '.join(current_chunk)
+                    chunks.append(self._create_chunk(
+                        content=chunk_content,
+                        filename=filename,
+                        section=f"Topic {len(chunks)+1}",
+                        metadata={
+                            'chunk_method': 'topic',
+                            'chunk_index': len(chunks),
+                            'topic_keywords': list(current_keywords)[:10],  # Top keywords
+                            'sentence_count': len(current_chunk),
+                            'topic_threshold': self.topic_threshold
+                        }
+                    ))
+                    current_chunk = [sentences[i]]
+                    current_keywords = sentence_keywords[i]
+                else:
+                    current_chunk.append(sentences[i])
+                    current_keywords = current_keywords.union(sentence_keywords[i])
+
+            # Add final chunk
+            if current_chunk:
+                chunk_content = ' '.join(current_chunk)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Topic {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'topic',
+                        'chunk_index': len(chunks),
+                        'topic_keywords': list(current_keywords)[:10],
+                        'sentence_count': len(current_chunk),
+                        'topic_threshold': self.topic_threshold
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Topic 1",
+                                                             metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        except Exception:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
+    def _chunk_by_qa_optimization(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Create chunks optimized for question-answering"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        # Patterns that indicate Q&A structure
+        question_patterns = [
+            r'\?',  # Questions
+            r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
+            r'(step|steps|process|procedure|method|way to)',
+            r'(example|examples|instance|case)',
+            r'(definition|meaning|refers to|means)',
+        ]
+
+        chunks = []
+        current_chunk = []
+        current_context = []
+
+        for i, sentence in enumerate(sentences):
+            sentence_lower = sentence.lower().strip()
+
+            # Check if this sentence contains Q&A indicators
+            is_qa_relevant = any(re.search(pattern, sentence_lower) for pattern in question_patterns)
+
+            if is_qa_relevant or len(current_chunk) == 0:
+                current_chunk.append(sentence)
+                # Add surrounding context (previous and next sentences)
+                if i > 0 and sentences[i-1] not in current_chunk:
+                    current_context.append(sentences[i-1])
+                if i < len(sentences) - 1:
+                    current_context.append(sentences[i+1])
+            else:
+                current_chunk.append(sentence)
+
+            # Create chunk when we have enough content or reach a natural break
+            if (len(current_chunk) >= 3 and
+                (i == len(sentences) - 1 or  # Last sentence
+                 sentence.endswith('.') and len(current_chunk) >= 5)):  # Natural break
+
+                # Combine chunk with context
+                full_content = current_context + current_chunk
+                chunk_content = ' '.join(full_content)
+
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"QA Section {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'qa_optimized',
+                        'chunk_index': len(chunks),
+                        'has_question': any('?' in s for s in current_chunk),
+                        'has_process': any(re.search(r'(step|process|method)', s.lower()) for s in current_chunk),
+                        'sentence_count': len(full_content)
+                    }
+                ))
+
+                current_chunk = []
+                current_context = []
+
+        # Handle remaining content
+        if current_chunk:
+            chunk_content = ' '.join(current_context + current_chunk)
+            chunks.append(self._create_chunk(
+                content=chunk_content,
+                filename=filename,
+                section=f"QA Section {len(chunks)+1}",
+                metadata={
+                    'chunk_method': 'qa_optimized',
+                    'chunk_index': len(chunks),
+                    'sentence_count': len(current_context + current_chunk)
+                }
+            ))
+
+        return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
+                                                         metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
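
All three new chunkers reduce to the same pattern: score adjacency between neighbouring sentences, cut where the score drops below a threshold, and fall back to sentence chunking when their optional dependencies or heuristics fail. A dependency-free sketch of the split-point rule the semantic chunker applies to its similarity list (the sentences and similarity values here are made up for illustration):

semantic_threshold = 0.5
sentences = ["Call flow starts.", "The agent answers.", "Billing is separate.", "Invoices go out monthly."]
similarities = [0.8, 0.2, 0.7]       # cosine similarity of each adjacent sentence pair (illustrative)

split_points = [0]
for i, sim in enumerate(similarities):
    if sim < semantic_threshold:
        split_points.append(i + 1)    # a drop in similarity starts a new chunk at sentence i+1
split_points.append(len(sentences))

chunks = [' '.join(sentences[a:b]) for a, b in zip(split_points, split_points[1:])]
print(chunks)  # ['Call flow starts. The agent answers.', 'Billing is separate. Invoices go out monthly.']
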