signalwire-agents 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to its public registry, and is provided for informational purposes only.
- signalwire_agents/__init__.py +28 -11
- signalwire_agents/cli/build_search.py +174 -14
- signalwire_agents/cli/test_swaig.py +159 -114
- signalwire_agents/core/agent_base.py +446 -78
- signalwire_agents/core/logging_config.py +162 -18
- signalwire_agents/core/skill_manager.py +2 -2
- signalwire_agents/core/swml_service.py +5 -45
- signalwire_agents/search/document_processor.py +275 -14
- signalwire_agents/search/index_builder.py +45 -10
- signalwire_agents/search/query_processor.py +27 -12
- signalwire_agents/skills/__init__.py +1 -1
- signalwire_agents/skills/native_vector_search/skill.py +24 -6
- signalwire_agents/skills/registry.py +58 -42
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.15.dist-info}/METADATA +1 -1
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.15.dist-info}/RECORD +20 -20
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.15.dist-info}/entry_points.txt +1 -1
- {signalwire_agents-0.1.13.data → signalwire_agents-0.1.15.data}/data/schema.json +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.15.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.15.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-0.1.15.dist-info}/top_level.txt +0 -0
signalwire_agents/core/logging_config.py

```diff
@@ -89,28 +89,85 @@ class StructuredLoggerWrapper:
     # Also support the 'warn' alias
     warn = warning
 
+    def bind(self, **kwargs) -> 'StructuredLoggerWrapper':
+        """
+        Create a new logger instance with bound context data
+
+        This maintains compatibility with structlog's bind() method.
+        The bound data will be included in all subsequent log messages.
+        """
+        # Create a new wrapper that includes the bound context
+        return BoundStructuredLoggerWrapper(self._logger, kwargs)
+
     # Support direct access to underlying logger attributes if needed
     def __getattr__(self, name: str) -> Any:
         """Delegate any unknown attributes to the underlying logger"""
        return getattr(self._logger, name)
 
 
-def get_execution_mode():
+class BoundStructuredLoggerWrapper(StructuredLoggerWrapper):
+    """
+    A structured logger wrapper that includes bound context data in all messages
+    """
+
+    def __init__(self, logger: logging.Logger, bound_data: Dict[str, Any]):
+        super().__init__(logger)
+        self._bound_data = bound_data
+
+    def _format_structured_message(self, message: str, **kwargs) -> str:
+        """Format a message with both bound data and additional keyword arguments"""
+        # Combine bound data with additional kwargs
+        all_kwargs = {**self._bound_data, **kwargs}
+        return super()._format_structured_message(message, **all_kwargs)
+
+    def bind(self, **kwargs) -> 'BoundStructuredLoggerWrapper':
+        """Create a new logger with additional bound context"""
+        # Combine existing bound data with new data
+        new_bound_data = {**self._bound_data, **kwargs}
+        return BoundStructuredLoggerWrapper(self._logger, new_bound_data)
+
+
+def get_execution_mode():
     """
     Determine the execution mode based on environment variables
 
     Returns:
-        'cgi'
-        'lambda' if running in AWS Lambda
-        'server' for normal server mode
+        str: 'server', 'cgi', 'lambda', 'google_cloud_function', 'azure_function', or 'unknown'
     """
+    # Check for CGI environment
     if os.getenv('GATEWAY_INTERFACE'):
         return 'cgi'
+
+    # Check for AWS Lambda environment
     if os.getenv('AWS_LAMBDA_FUNCTION_NAME') or os.getenv('LAMBDA_TASK_ROOT'):
         return 'lambda'
+
+    # Check for Google Cloud Functions environment
+    if (os.getenv('FUNCTION_TARGET') or
+        os.getenv('K_SERVICE') or
+        os.getenv('GOOGLE_CLOUD_PROJECT')):
+        return 'google_cloud_function'
+
+    # Check for Azure Functions environment
+    if (os.getenv('AZURE_FUNCTIONS_ENVIRONMENT') or
+        os.getenv('FUNCTIONS_WORKER_RUNTIME') or
+        os.getenv('AzureWebJobsStorage')):
+        return 'azure_function'
+
+    # Default to server mode
     return 'server'
 
 
+def reset_logging_configuration():
+    """
+    Reset the logging configuration flag to allow reconfiguration
+
+    This is useful when environment variables change after initial configuration.
+    """
+    global _logging_configured
+    _logging_configured = False
+
+
 def configure_logging():
     """
     Configure logging system once, globally, based on environment variables
```
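The new `bind()` support mirrors structlog's context API: every call returns a fresh wrapper, so bound context accumulates without mutating the parent logger, and `BoundStructuredLoggerWrapper` merges its stored context into each message via `_format_structured_message`. A minimal usage sketch, assuming the wrapper exposes the usual `info`/`debug` level methods alongside the `warning`/`warn` pair shown above (the logger name and key/value pairs are illustrative):

```python
from signalwire_agents.core.logging_config import get_logger

log = get_logger("my_agent")               # illustrative logger name

call_log = log.bind(call_id="abc123")      # new wrapper; `log` is unchanged
step_log = call_log.bind(step="greeting")  # contexts merge: call_id + step

call_log.info("call started")              # rendered with call_id=abc123
step_log.debug("prompt rendered")          # rendered with call_id and step
```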
```diff
@@ -182,31 +239,39 @@ def _configure_off_mode():
 
 
 def _configure_stderr_mode(log_level: str):
-    """Configure logging to stderr"""
+    """Configure logging to stderr with colored formatting"""
     # Clear existing handlers
     logging.getLogger().handlers.clear()
 
     # Convert log level
     numeric_level = getattr(logging, log_level.upper(), logging.INFO)
 
-    #
-    logging.
-
-
-
-    )
+    # Create handler with colored formatter
+    handler = logging.StreamHandler(sys.stderr)
+    handler.setFormatter(ColoredFormatter())
+
+    # Configure root logger
+    root_logger = logging.getLogger()
+    root_logger.setLevel(numeric_level)
+    root_logger.addHandler(handler)
 
 
 def _configure_default_mode(log_level: str):
-    """Configure standard logging behavior"""
+    """Configure standard logging behavior with colored formatting"""
+    # Clear existing handlers
+    logging.getLogger().handlers.clear()
+
     # Convert log level
     numeric_level = getattr(logging, log_level.upper(), logging.INFO)
 
-    #
-    logging.
-
-
-
+    # Create handler with colored formatter
+    handler = logging.StreamHandler()
+    handler.setFormatter(ColoredFormatter())
+
+    # Configure root logger
+    root_logger = logging.getLogger()
+    root_logger.setLevel(numeric_level)
+    root_logger.addHandler(handler)
 
 
 def get_logger(name: str) -> StructuredLoggerWrapper:
```
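Both mode helpers now install a `ColoredFormatter` handler on the root logger; the only real difference is that stderr mode passes `sys.stderr` explicitly (a bare `logging.StreamHandler()` also defaults to stderr). Since `configure_logging()` runs only once per process, the new `reset_logging_configuration()` is the escape hatch when the environment changes afterwards. A sketch, assuming the `SIGNALWIRE_LOG_MODE` variable checked by the formatter below also selects the mode here:

```python
import os
from signalwire_agents.core.logging_config import (
    configure_logging,
    reset_logging_configuration,
)

configure_logging()                           # first call configures; repeats are no-ops

os.environ["SIGNALWIRE_LOG_MODE"] = "stderr"  # assumed mode value, matching _configure_stderr_mode
reset_logging_configuration()                 # clear the module-level configured flag...
configure_logging()                           # ...so the new environment takes effect
```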
```diff
@@ -229,4 +294,83 @@ def get_logger(name: str) -> StructuredLoggerWrapper:
     python_logger = logging.getLogger(name)
 
     # Wrap it with our structured logging interface
-    return StructuredLoggerWrapper(python_logger)
+    return StructuredLoggerWrapper(python_logger)
+
+
+class ColoredFormatter(logging.Formatter):
+    """
+    A beautiful colored logging formatter that makes logs easy to read and visually appealing
+    """
+
+    # ANSI color codes
+    COLORS = {
+        'DEBUG': '\033[36m',      # Cyan
+        'INFO': '\033[32m',       # Green
+        'WARNING': '\033[33m',    # Yellow
+        'ERROR': '\033[31m',      # Red
+        'CRITICAL': '\033[35m',   # Magenta
+        'RESET': '\033[0m',       # Reset
+        'BOLD': '\033[1m',        # Bold
+        'DIM': '\033[2m',         # Dim
+        'WHITE': '\033[37m',      # White
+        'BLUE': '\033[34m',       # Blue
+        'BLACK': '\033[30m',      # Black (for brackets)
+    }
+
+    def __init__(self):
+        super().__init__()
+
+    def format(self, record):
+        # Check if we should use colors (not in raw mode, and stdout is a tty)
+        use_colors = (
+            hasattr(sys.stdout, 'isatty') and sys.stdout.isatty() and
+            os.getenv('SIGNALWIRE_LOG_MODE') != 'off' and
+            '--raw' not in sys.argv and '--dump-swml' not in sys.argv
+        )
+
+        if use_colors:
+            # Get colors
+            level_color = self.COLORS.get(record.levelname, self.COLORS['WHITE'])
+            reset = self.COLORS['RESET']
+            dim = self.COLORS['DIM']
+            bold = self.COLORS['BOLD']
+            blue = self.COLORS['BLUE']
+            black = self.COLORS['BLACK']
+
+            # Format timestamp in a compact, readable way
+            timestamp = self.formatTime(record, '%H:%M:%S')
+
+            # Format level with appropriate color and consistent width
+            level_name = f"{level_color}{record.levelname:<8}{reset}"
+
+            # Format logger name - keep it short and readable
+            logger_name = record.name
+            if len(logger_name) > 15:
+                # Truncate long logger names but keep the end (most specific part)
+                logger_name = "..." + logger_name[-12:]
+
+            # Get function and line info if available
+            func_info = ""
+            if hasattr(record, 'funcName') and hasattr(record, 'lineno'):
+                func_name = getattr(record, 'funcName', '')
+                line_no = getattr(record, 'lineno', 0)
+                if func_name and func_name != '<module>':
+                    func_info = f" {dim}({func_name}:{line_no}){reset}"
+
+            # Format the message
+            message = record.getMessage()
+
+            # Create the final formatted message with a clean, readable layout
+            formatted = (
+                f"{black}[{reset}{dim}{timestamp}{reset}{black}]{reset} "
+                f"{level_name} "
+                f"{blue}{logger_name:<15}{reset}"
+                f"{func_info} "
+                f"{message}"
+            )
+
+            return formatted
+        else:
+            # Non-colored format (fallback for files, pipes, etc.)
+            timestamp = self.formatTime(record, '%Y-%m-%d %H:%M:%S')
+            return f"{timestamp} {record.levelname:<8} {record.name} {record.getMessage()}"
```
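Note that `format()` keys its color decision off global state rather than the handler's own stream: `sys.stdout.isatty()` (even though the handler may write to stderr), the `SIGNALWIRE_LOG_MODE` setting, and the presence of `--raw` or `--dump-swml` in `sys.argv`, so piped output and SWML dumps automatically fall back to plain text. A quick check of that fallback branch, assuming the class is importable from `signalwire_agents.core.logging_config`:

```python
import logging
from signalwire_agents.core.logging_config import ColoredFormatter  # assumed export

record = logging.LogRecord(
    name="signalwire_agents.demo", level=logging.INFO,
    pathname="demo.py", lineno=1, msg="hello %s", args=("world",),
    exc_info=None,
)

# When stdout is redirected (isatty() is False), format() takes the
# non-colored branch: "YYYY-mm-dd HH:MM:SS LEVEL    name message".
print(ColoredFormatter().format(record))
```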
signalwire_agents/core/skill_manager.py

```diff
@@ -8,7 +8,7 @@ See LICENSE file in the project root for full license information.
 """
 
 from typing import Dict, List, Type, Any, Optional
-import
+from signalwire_agents.core.logging_config import get_logger
 from signalwire_agents.core.skill_base import SkillBase
 
 class SkillManager:
```
```diff
@@ -17,7 +17,7 @@ class SkillManager:
     def __init__(self, agent):
         self.agent = agent
         self.loaded_skills: Dict[str, SkillBase] = {}
-        self.logger =
+        self.logger = get_logger("skill_manager")
 
     def load_skill(self, skill_name: str, skill_class: Type[SkillBase] = None, params: Optional[Dict[str, Any]] = None) -> tuple[bool, str]:
         """
```
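Together with the import swap above, this is the whole of the +2/-2 change in skill_manager.py: the module-local logger setup is replaced by the centralized wrapper. The same one-liner pattern recurs across the package:

```python
# 0.1.15 pattern: one import, one call; handler setup happens centrally.
from signalwire_agents.core.logging_config import get_logger

logger = get_logger("skill_manager")
logger.info("skill loaded", skill="example")  # illustrative structured key/value
```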
signalwire_agents/core/swml_service.py

```diff
@@ -24,51 +24,11 @@ import types
 from typing import Dict, List, Any, Optional, Union, Callable, Tuple, Type
 from urllib.parse import urlparse
 
-# Import
-
-
-
-
-    if not hasattr(structlog, "_configured") or not structlog._configured:
-        structlog.configure(
-            processors=[
-                structlog.stdlib.filter_by_level,
-                structlog.stdlib.add_logger_name,
-                structlog.stdlib.add_log_level,
-                structlog.stdlib.PositionalArgumentsFormatter(),
-                structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S"),
-                structlog.processors.StackInfoRenderer(),
-                structlog.processors.format_exc_info,
-                structlog.processors.UnicodeDecoder(),
-                structlog.dev.ConsoleRenderer()
-            ],
-            context_class=dict,
-            logger_factory=structlog.stdlib.LoggerFactory(),
-            wrapper_class=structlog.stdlib.BoundLogger,
-            cache_logger_on_first_use=True,
-        )
-
-        # Set up root logger with structlog
-        logging.basicConfig(
-            format="%(message)s",
-            stream=sys.stdout,
-            level=logging.INFO,
-        )
-
-        # Mark as configured to avoid duplicate configuration
-        structlog._configured = True
-
-    # Create the module logger
-    logger = structlog.get_logger("swml_service")
-
-except ImportError:
-    # Fallback to standard logging if structlog is not available
-    logging.basicConfig(
-        level=logging.INFO,
-        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
-        stream=sys.stdout
-    )
-    logger = logging.getLogger("swml_service")
+# Import centralized logging system
+from signalwire_agents.core.logging_config import get_logger
+
+# Create the module logger using centralized system
+logger = get_logger("swml_service")
 
 try:
     import fastapi
```
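This deletes roughly forty lines of import-time side effects: in 0.1.13, merely importing the module could configure structlog and call `logging.basicConfig(...)` against stdout before the application ran any code of its own. In 0.1.15 the import only creates the named wrapper, and handler setup is deferred to the centralized configuration:

```python
import signalwire_agents.core.swml_service  # 0.1.15: no root-logger mutation on import

# Handlers are now installed in one place, on demand:
from signalwire_agents.core.logging_config import configure_logging
configure_logging()
```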
signalwire_agents/search/document_processor.py

```diff
@@ -74,29 +74,42 @@ logger = logging.getLogger(__name__)
 class DocumentProcessor:
     """Enhanced document processor with smart chunking capabilities"""
 
-    def __init__(
-
-
-
-
+    def __init__(
+        self,
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3
+    ):
         """
-        Initialize document processor
+        Initialize document processor
 
         Args:
-            chunking_strategy: 'sentence', 'sliding', 'paragraph',
-            max_sentences_per_chunk: For sentence strategy (default:
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
             chunk_size: For sliding strategy - words per chunk (default: 50)
-
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
             split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
         """
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
-        self.
+        self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
 
         # Legacy support for old character-based chunking
-        self.chunk_overlap =
+        self.chunk_overlap = chunk_overlap
 
     def create_chunks(self, content: str, filename: str,
                       file_type: str) -> List[Dict[str, Any]]:
```
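The constructor gains the NLP backend, verbosity, and threshold knobs alongside the existing chunking parameters. A construction sketch (the file name, content, and `file_type` value are placeholders):

```python
from signalwire_agents.search.document_processor import DocumentProcessor

processor = DocumentProcessor(
    chunking_strategy='semantic',  # or 'sentence', 'sliding', 'paragraph', 'page', 'topic', 'qa'
    semantic_threshold=0.5,        # split where adjacent-sentence similarity dips below this
)

content = open('guide.md').read()  # placeholder document
chunks = processor.create_chunks(content, filename='guide.md', file_type='md')
```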
```diff
@@ -121,6 +134,12 @@ class DocumentProcessor:
             return self._chunk_by_paragraphs(content, filename, file_type)
         elif self.chunking_strategy == 'page':
             return self._chunk_by_pages(content, filename, file_type)
+        elif self.chunking_strategy == 'semantic':
+            return self._chunk_by_semantic(content, filename, file_type)
+        elif self.chunking_strategy == 'topic':
+            return self._chunk_by_topics(content, filename, file_type)
+        elif self.chunking_strategy == 'qa':
+            return self._chunk_by_qa_optimization(content, filename, file_type)
         else:
             # Fallback to sentence-based chunking
             return self._chunk_by_sentences(content, filename, file_type)
```
```diff
@@ -674,7 +693,7 @@ class DocumentProcessor:
         chunk_index = 0
 
         # Create overlapping chunks
-        for i in range(0, len(words), self.chunk_size - self.
+        for i in range(0, len(words), self.chunk_size - self.chunk_overlap):
             chunk_words = words[i:i + self.chunk_size]
             if chunk_words:
                 chunk_content = ' '.join(chunk_words)
@@ -686,7 +705,7 @@ class DocumentProcessor:
                         'chunk_method': 'sliding_window',
                         'chunk_index': chunk_index,
                         'chunk_size_words': self.chunk_size,
-                        'overlap_size_words': self.
+                        'overlap_size_words': self.chunk_overlap,
                         'start_word': i,
                         'end_word': i + len(chunk_words)
                     }
```
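With `self.chunk_overlap` restored, the sliding-window arithmetic reads directly: each window starts `chunk_size - chunk_overlap` words after the previous one, so neighboring chunks share exactly `chunk_overlap` words. A small worked check with the defaults:

```python
# Defaults: chunk_size=50, chunk_overlap=10, so the step is 40 words.
words = [f"w{i}" for i in range(120)]
chunk_size, chunk_overlap = 50, 10

starts = list(range(0, len(words), chunk_size - chunk_overlap))
assert starts == [0, 40, 80]

chunks = [words[i:i + chunk_size] for i in starts]
assert chunks[0][-10:] == chunks[1][:10]  # 10-word overlap between neighbors
```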
```diff
@@ -761,4 +780,246 @@ class DocumentProcessor:
                 }
             ))
 
-        return chunks
+        return chunks
+
+    def _chunk_by_semantic(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on semantic similarity between sentences"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Get sentences
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 1:
+            return [self._create_chunk(content, filename, "Section 1",
+                                       metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        # Generate embeddings for sentences (using the same model as the index)
+        try:
+            from sentence_transformers import SentenceTransformer
+            from sklearn.metrics.pairwise import cosine_similarity
+            import numpy as np
+
+            model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
+            embeddings = model.encode(sentences, show_progress_bar=False)
+
+            # Calculate similarity between adjacent sentences
+            similarities = []
+            for i in range(len(embeddings) - 1):
+                sim = cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0]
+                similarities.append(sim)
+
+            # Find split points where similarity drops below threshold
+            split_points = [0]
+            for i, sim in enumerate(similarities):
+                if sim < self.semantic_threshold:
+                    split_points.append(i + 1)
+            split_points.append(len(sentences))
+
+            # Create chunks
+            chunks = []
+            for i in range(len(split_points) - 1):
+                start_idx = split_points[i]
+                end_idx = split_points[i + 1]
+                chunk_sentences = sentences[start_idx:end_idx]
+
+                # Ensure minimum chunk size
+                if len(chunk_sentences) < 2 and i > 0:
+                    # Merge with previous chunk
+                    chunks[-1]['content'] += ' ' + ' '.join(chunk_sentences)
+                    continue
+
+                chunk_content = ' '.join(chunk_sentences)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Semantic Section {i+1}",
+                    metadata={
+                        'chunk_method': 'semantic',
+                        'chunk_index': i,
+                        'semantic_threshold': self.semantic_threshold,
+                        'sentence_count': len(chunk_sentences)
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Section 1",
+                                          metadata={'chunk_method': 'semantic', 'chunk_index': 0})]
+
+        except ImportError:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
+    def _chunk_by_topics(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk based on topic changes using keyword analysis"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        if len(sentences) <= 3:
+            return [self._create_chunk(content, filename, "Topic 1",
+                                       metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        try:
+            # Simple topic detection using keyword overlap
+            from collections import Counter
+            import re
+
+            # Extract keywords from each sentence
+            sentence_keywords = []
+            for sentence in sentences:
+                # Simple keyword extraction (could be enhanced with NLP)
+                words = re.findall(r'\b[a-zA-Z]{3,}\b', sentence.lower())
+                # Filter common words (basic stopwords)
+                stopwords = {'the', 'and', 'for', 'are', 'but', 'not', 'you', 'all', 'can', 'had', 'her', 'was', 'one', 'our', 'out', 'day', 'get', 'has', 'him', 'his', 'how', 'its', 'may', 'new', 'now', 'old', 'see', 'two', 'who', 'boy', 'did', 'man', 'way', 'she', 'use', 'her', 'many', 'oil', 'sit', 'set', 'run', 'eat', 'far', 'sea', 'eye', 'ask', 'own', 'say', 'too', 'any', 'try', 'us', 'an', 'as', 'at', 'be', 'he', 'if', 'in', 'is', 'it', 'my', 'of', 'on', 'or', 'to', 'up', 'we', 'go', 'no', 'so', 'am', 'by', 'do', 'me'}
+                keywords = [w for w in words if w not in stopwords and len(w) > 3]
+                sentence_keywords.append(set(keywords))
+
+            # Find topic boundaries based on keyword overlap
+            chunks = []
+            current_chunk = [sentences[0]]
+            current_keywords = sentence_keywords[0]
+
+            for i in range(1, len(sentences)):
+                # Calculate keyword overlap with current chunk
+                overlap = len(current_keywords.intersection(sentence_keywords[i]))
+                total_keywords = len(current_keywords.union(sentence_keywords[i]))
+
+                if total_keywords > 0:
+                    similarity = overlap / total_keywords
+                else:
+                    similarity = 0
+
+                # If similarity is low, start new chunk
+                if similarity < self.topic_threshold and len(current_chunk) >= 2:
+                    chunk_content = ' '.join(current_chunk)
+                    chunks.append(self._create_chunk(
+                        content=chunk_content,
+                        filename=filename,
+                        section=f"Topic {len(chunks)+1}",
+                        metadata={
+                            'chunk_method': 'topic',
+                            'chunk_index': len(chunks),
+                            'topic_keywords': list(current_keywords)[:10],  # Top keywords
+                            'sentence_count': len(current_chunk),
+                            'topic_threshold': self.topic_threshold
+                        }
+                    ))
+                    current_chunk = [sentences[i]]
+                    current_keywords = sentence_keywords[i]
+                else:
+                    current_chunk.append(sentences[i])
+                    current_keywords = current_keywords.union(sentence_keywords[i])
+
+            # Add final chunk
+            if current_chunk:
+                chunk_content = ' '.join(current_chunk)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Topic {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'topic',
+                        'chunk_index': len(chunks),
+                        'topic_keywords': list(current_keywords)[:10],
+                        'sentence_count': len(current_chunk),
+                        'topic_threshold': self.topic_threshold
+                    }
+                ))
+
+            return chunks if chunks else [self._create_chunk(content, filename, "Topic 1",
+                                          metadata={'chunk_method': 'topic', 'chunk_index': 0})]
+
+        except Exception:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
+    def _chunk_by_qa_optimization(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Create chunks optimized for question-answering"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        if sent_tokenize:
+            sentences = sent_tokenize(content)
+        else:
+            sentences = content.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+
+        # Patterns that indicate Q&A structure
+        question_patterns = [
+            r'\?',  # Questions
+            r'^(what|how|why|when|where|who|which|can|does|is|are|will|would|should)',
+            r'(step|steps|process|procedure|method|way to)',
+            r'(example|examples|instance|case)',
+            r'(definition|meaning|refers to|means)',
+        ]
+
+        chunks = []
+        current_chunk = []
+        current_context = []
+
+        for i, sentence in enumerate(sentences):
+            sentence_lower = sentence.lower().strip()
+
+            # Check if this sentence contains Q&A indicators
+            is_qa_relevant = any(re.search(pattern, sentence_lower) for pattern in question_patterns)
+
+            if is_qa_relevant or len(current_chunk) == 0:
+                current_chunk.append(sentence)
+                # Add surrounding context (previous and next sentences)
+                if i > 0 and sentences[i-1] not in current_chunk:
+                    current_context.append(sentences[i-1])
+                if i < len(sentences) - 1:
+                    current_context.append(sentences[i+1])
+            else:
+                current_chunk.append(sentence)
+
+            # Create chunk when we have enough content or reach a natural break
+            if (len(current_chunk) >= 3 and
+                (i == len(sentences) - 1 or  # Last sentence
+                 sentence.endswith('.') and len(current_chunk) >= 5)):  # Natural break
+
+                # Combine chunk with context
+                full_content = current_context + current_chunk
+                chunk_content = ' '.join(full_content)
+
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"QA Section {len(chunks)+1}",
+                    metadata={
+                        'chunk_method': 'qa_optimized',
+                        'chunk_index': len(chunks),
+                        'has_question': any('?' in s for s in current_chunk),
+                        'has_process': any(re.search(r'(step|process|method)', s.lower()) for s in current_chunk),
+                        'sentence_count': len(full_content)
+                    }
+                ))
+
+                current_chunk = []
+                current_context = []
+
+        # Handle remaining content
+        if current_chunk:
+            chunk_content = ' '.join(current_context + current_chunk)
+            chunks.append(self._create_chunk(
+                content=chunk_content,
+                filename=filename,
+                section=f"QA Section {len(chunks)+1}",
+                metadata={
+                    'chunk_method': 'qa_optimized',
+                    'chunk_index': len(chunks),
+                    'sentence_count': len(current_context + current_chunk)
+                }
+            ))
+
+        return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
+                                      metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
```