signalwire-agents 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- signalwire_agents/__init__.py +5 -1
- signalwire_agents/agent_server.py +222 -13
- signalwire_agents/cli/build_search.py +457 -0
- signalwire_agents/cli/test_swaig.py +177 -113
- signalwire_agents/core/agent_base.py +1 -1
- signalwire_agents/core/logging_config.py +232 -0
- signalwire_agents/search/__init__.py +131 -0
- signalwire_agents/search/document_processor.py +764 -0
- signalwire_agents/search/index_builder.py +534 -0
- signalwire_agents/search/query_processor.py +371 -0
- signalwire_agents/search/search_engine.py +383 -0
- signalwire_agents/search/search_service.py +251 -0
- signalwire_agents/skills/native_vector_search/__init__.py +1 -0
- signalwire_agents/skills/native_vector_search/skill.py +352 -0
- signalwire_agents/skills/registry.py +2 -15
- signalwire_agents/utils/__init__.py +13 -1
- {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/METADATA +110 -3
- {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/RECORD +23 -14
- {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/entry_points.txt +1 -0
- signalwire_agents/utils/serverless.py +0 -38
- {signalwire_agents-0.1.11.data → signalwire_agents-0.1.12.data}/data/schema.json +0 -0
- {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,764 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+import re
+import hashlib
+import json
+import logging
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
+# Document processing imports
+try:
+    import pdfplumber
+except ImportError:
+    pdfplumber = None
+
+try:
+    from docx import Document as DocxDocument
+except ImportError:
+    DocxDocument = None
+
+try:
+    from bs4 import BeautifulSoup
+except ImportError:
+    BeautifulSoup = None
+
+try:
+    import markdown
+except ImportError:
+    markdown = None
+
+try:
+    from striprtf.striprtf import rtf_to_text
+except ImportError:
+    rtf_to_text = None
+
+try:
+    from openpyxl import load_workbook
+except ImportError:
+    load_workbook = None
+
+try:
+    from pptx import Presentation
+except ImportError:
+    Presentation = None
+
+try:
+    from nltk.tokenize import sent_tokenize
+    import nltk
+    # Ensure NLTK data is available
+    try:
+        nltk.data.find('tokenizers/punkt')
+    except LookupError:
+        nltk.download('punkt', quiet=True)
+except ImportError:
+    sent_tokenize = None
+    nltk = None
+
+try:
+    import magic
+except ImportError:
+    magic = None
+
+from .query_processor import preprocess_document_content
+
+logger = logging.getLogger(__name__)
+
+class DocumentProcessor:
+    """Enhanced document processor with smart chunking capabilities"""
+
+    def __init__(self, chunking_strategy: str = 'sentence',
+                 max_sentences_per_chunk: int = 50,
+                 chunk_size: int = 50,
+                 overlap_size: int = 10,
+                 split_newlines: Optional[int] = None):
+        """
+        Initialize document processor with chunking strategy
+
+        Args:
+            chunking_strategy: 'sentence', 'sliding', 'paragraph', or 'page'
+            max_sentences_per_chunk: For sentence strategy (default: 50)
+            chunk_size: For sliding strategy - words per chunk (default: 50)
+            overlap_size: For sliding strategy - overlap in words (default: 10)
+            split_newlines: For sentence strategy - split on multiple newlines (optional)
+        """
+        self.chunking_strategy = chunking_strategy
+        self.max_sentences_per_chunk = max_sentences_per_chunk
+        self.chunk_size = chunk_size
+        self.overlap_size = overlap_size
+        self.split_newlines = split_newlines
+
+        # Legacy support for old character-based chunking
+        self.chunk_overlap = overlap_size
+
+    def create_chunks(self, content: str, filename: str,
+                      file_type: str) -> List[Dict[str, Any]]:
+        """
+        Create chunks from document content using specified chunking strategy
+
+        Args:
+            content: Document content (string) - should be the actual content, not a file path
+            filename: Name of the file (for metadata)
+            file_type: File extension/type
+
+        Returns:
+            List of chunk dictionaries
+        """
+
+        # Apply chunking strategy
+        if self.chunking_strategy == 'sentence':
+            return self._chunk_by_sentences(content, filename, file_type)
+        elif self.chunking_strategy == 'sliding':
+            return self._chunk_by_sliding_window(content, filename, file_type)
+        elif self.chunking_strategy == 'paragraph':
+            return self._chunk_by_paragraphs(content, filename, file_type)
+        elif self.chunking_strategy == 'page':
+            return self._chunk_by_pages(content, filename, file_type)
+        else:
+            # Fallback to sentence-based chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+
+    def _extract_text_from_file(self, file_path: str) -> Any:
+        """Extract text from various file formats"""
+        if not magic:
+            # Fallback to extension-based detection
+            file_path_obj = Path(file_path)
+            extension = file_path_obj.suffix.lower()
+
+            if extension == '.pdf':
+                file_type = 'application/pdf'
+            elif extension == '.docx':
+                file_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+            elif extension in ['.txt', '.md']:
+                file_type = 'text/plain'
+            elif extension == '.html':
+                file_type = 'text/html'
+            elif extension == '.rtf':
+                file_type = 'application/rtf'
+            elif extension == '.xlsx':
+                file_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+            elif extension == '.pptx':
+                file_type = 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+            else:
+                file_type = 'text/plain'
+        else:
+            mime = magic.Magic(mime=True)
+            file_type = mime.from_file(file_path)
+
+        if 'pdf' in file_type:
+            return self._extract_pdf(file_path)
+        elif 'vnd.openxmlformats-officedocument.wordprocessingml.document' in file_type:
+            return self._extract_docx(file_path)
+        elif 'plain' in file_type or 'text' in file_type:
+            return self._extract_text(file_path)
+        elif 'html' in file_type:
+            return self._extract_html(file_path)
+        elif 'markdown' in file_type or file_path.endswith('.md'):
+            return self._extract_markdown(file_path)
+        elif 'rtf' in file_type:
+            return self._extract_rtf(file_path)
+        elif 'vnd.openxmlformats-officedocument.spreadsheetml.sheet' in file_type:
+            return self._extract_excel(file_path)
+        elif 'vnd.openxmlformats-officedocument.presentationml.presentation' in file_type:
+            return self._extract_powerpoint(file_path)
+        else:
+            return json.dumps({"error": f"Unsupported file type: {file_type}"})
+
+    def _extract_pdf(self, file_path: str):
+        """Extract text from PDF files"""
+        if not pdfplumber:
+            return json.dumps({"error": "pdfplumber not available for PDF processing"})
+
+        try:
+            with pdfplumber.open(file_path) as pdf:
+                pages = []
+                for page in pdf.pages:
+                    text = page.extract_text()
+                    if text:
+                        # Remove page number from the beginning
+                        text = re.sub(r'^\d+\.\s*', '', text.strip())
+                        pages.append(text)
+                return pages
+        except Exception as e:
+            return json.dumps({"error": f"Error processing PDF: {e}"})
+
+    def _extract_docx(self, file_path: str):
+        """Extract text from DOCX files"""
+        if not DocxDocument:
+            return json.dumps({"error": "python-docx not available for DOCX processing"})
+
+        try:
+            doc = DocxDocument(file_path)
+            return [para.text for para in doc.paragraphs if para.text.strip()]
+        except Exception as e:
+            return json.dumps({"error": f"Error processing DOCX: {e}"})
+
+    def _extract_text(self, file_path: str):
+        """Extract text from plain text files"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                return file.read()
+        except Exception as e:
+            return json.dumps({"error": f"Error processing TXT: {e}"})
+
+    def _extract_html(self, file_path: str):
+        """Extract text from HTML files"""
+        if not BeautifulSoup:
+            return json.dumps({"error": "beautifulsoup4 not available for HTML processing"})
+
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                soup = BeautifulSoup(file, 'html.parser')
+                return soup.get_text(separator='\n')
+        except Exception as e:
+            return json.dumps({"error": f"Error processing HTML: {e}"})
+
+    def _extract_markdown(self, file_path: str):
+        """Extract text from Markdown files"""
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                content = file.read()
+            if markdown and BeautifulSoup:
+                html = markdown.markdown(content)
+                soup = BeautifulSoup(html, 'html.parser')
+                return soup.get_text(separator='\n')
+            else:
+                # Fallback to raw markdown
+                return content
+        except Exception as e:
+            return json.dumps({"error": f"Error processing Markdown: {e}"})
+
+    def _extract_rtf(self, file_path: str):
+        """Extract text from RTF files"""
+        if not rtf_to_text:
+            return json.dumps({"error": "striprtf not available for RTF processing"})
+
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                return rtf_to_text(file.read())
+        except Exception as e:
+            return json.dumps({"error": f"Error processing RTF: {e}"})
+
+    def _extract_excel(self, file_path: str):
+        """Extract text from Excel files"""
+        if not load_workbook:
+            return json.dumps({"error": "openpyxl not available for Excel processing"})
+
+        try:
+            wb = load_workbook(file_path)
+            sheets_text = []
+            for sheet in wb.worksheets:
+                for row in sheet.iter_rows(values_only=True):
+                    row_text = ' '.join([str(cell) for cell in row if cell is not None])
+                    sheets_text.append(row_text)
+            return "\n".join(sheets_text)
+        except Exception as e:
+            return json.dumps({"error": f"Error processing Excel: {e}"})
+
+    def _extract_powerpoint(self, file_path: str):
+        """Extract text from PowerPoint files"""
+        if not Presentation:
+            return json.dumps({"error": "python-pptx not available for PowerPoint processing"})
+
+        try:
+            prs = Presentation(file_path)
+            slides_text = []
+            for slide in prs.slides:
+                slide_text = []
+                for shape in slide.shapes:
+                    if hasattr(shape, "text"):
+                        slide_text.append(shape.text)
+                slides_text.append("\n".join(slide_text))
+            return slides_text
+        except Exception as e:
+            return json.dumps({"error": f"Error processing PowerPoint: {e}"})
+
+    def _chunk_document_aware(self, content: Any, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Smart chunking for documents with natural structure"""
+        chunks = []
+
+        if isinstance(content, list):
+            # Handle page-based or paragraph-based content (PDF, DOCX, PPTX)
+            for i, page_content in enumerate(content):
+                if not page_content or not page_content.strip():
+                    continue
+
+                # For each page/slide, use sentence-based chunking if it's large
+                if len(page_content) > self.chunk_size:
+                    page_chunks = self._sentence_based_chunking(
+                        page_content,
+                        max_sentences_per_chunk=self._calculate_sentences_per_chunk(page_content)
+                    )
+                    for j, chunk_content in enumerate(page_chunks):
+                        chunks.append(self._create_chunk(
+                            content=chunk_content,
+                            filename=filename,
+                            section=f"Page {i+1}" if file_type == 'pdf' else f"Slide {i+1}" if file_type == 'pptx' else f"Section {i+1}",
+                            metadata={'page_number': i+1, 'chunk_index': j}
+                        ))
+                else:
+                    # Small page/slide - keep as single chunk
+                    chunks.append(self._create_chunk(
+                        content=page_content,
+                        filename=filename,
+                        section=f"Page {i+1}" if file_type == 'pdf' else f"Slide {i+1}" if file_type == 'pptx' else f"Section {i+1}",
+                        metadata={'page_number': i+1}
+                    ))
+        else:
+            # Single text content - use paragraph-aware chunking
+            chunks = self._chunk_text_enhanced(content, filename)
+
+        return chunks
+
+    def _chunk_markdown_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
+        """Enhanced markdown chunking with better header handling"""
+        chunks = []
+        lines = content.split('\n')
+
+        current_section = None
+        current_hierarchy = []  # Track header hierarchy
+        current_chunk = []
+        current_size = 0
+        line_start = 1
+
+        for line_num, line in enumerate(lines, 1):
+            # Check for headers with hierarchy tracking
+            header_match = re.match(r'^(#{1,6})\s+(.+)', line)
+            if header_match:
+                header_level = len(header_match.group(1))
+                header_text = header_match.group(2).strip()
+
+                # Save current chunk if it exists
+                if current_chunk:
+                    chunks.append(self._create_chunk(
+                        content='\n'.join(current_chunk),
+                        filename=filename,
+                        section=self._build_section_path(current_hierarchy),
+                        start_line=line_start,
+                        end_line=line_num - 1
+                    ))
+
+                # Update hierarchy
+                current_hierarchy = current_hierarchy[:header_level-1] + [header_text]
+                current_section = header_text
+                current_chunk = [line]
+                current_size = len(line)
+                line_start = line_num
+
+            else:
+                current_chunk.append(line)
+                current_size += len(line) + 1
+
+                # Check if chunk is getting too large - use smart splitting
+                if current_size >= self.chunk_size:
+                    # Try to split at paragraph boundary first
+                    split_point = self._find_best_split_point(current_chunk)
+
+                    chunk_to_save = current_chunk[:split_point]
+                    chunks.append(self._create_chunk(
+                        content='\n'.join(chunk_to_save),
+                        filename=filename,
+                        section=self._build_section_path(current_hierarchy),
+                        start_line=line_start,
+                        end_line=line_start + split_point - 1
+                    ))
+
+                    # Start new chunk with overlap
+                    overlap_lines = self._get_overlap_lines(chunk_to_save)
+                    remaining_lines = current_chunk[split_point:]
+                    current_chunk = overlap_lines + remaining_lines
+                    current_size = sum(len(line) + 1 for line in current_chunk)
+                    line_start = line_start + split_point - len(overlap_lines)
+
+        # Add final chunk
+        if current_chunk:
+            chunks.append(self._create_chunk(
+                content='\n'.join(current_chunk),
+                filename=filename,
+                section=self._build_section_path(current_hierarchy),
+                start_line=line_start,
+                end_line=len(lines)
+            ))
+
+        return chunks
+
+    def _chunk_python_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
+        """Enhanced Python code chunking with better function/class detection"""
+        chunks = []
+        lines = content.split('\n')
+
+        current_function = None
+        current_class = None
+        current_chunk = []
+        current_size = 0
+        line_start = 1
+        indent_level = 0
+
+        for line_num, line in enumerate(lines, 1):
+            # Detect class definitions
+            class_match = re.match(r'^(\s*)(class\s+([^(:\s]+))', line)
+            if class_match:
+                indent = len(class_match.group(1))
+                class_name = class_match.group(3)
+
+                # Save current chunk if switching context
+                if current_chunk and (indent <= indent_level or current_class != class_name):
+                    chunks.append(self._create_chunk(
+                        content='\n'.join(current_chunk),
+                        filename=filename,
+                        section=self._build_python_section(current_class, current_function),
+                        start_line=line_start,
+                        end_line=line_num - 1
+                    ))
+                    current_chunk = []
+                    line_start = line_num
+
+                current_class = class_name
+                current_function = None
+                indent_level = indent
+
+            # Detect function definitions
+            func_match = re.match(r'^(\s*)(def\s+([^(:\s]+)|async\s+def\s+([^(:\s]+))', line)
+            if func_match:
+                indent = len(func_match.group(1))
+                func_name = func_match.group(3) or func_match.group(4)
+
+                # Save current chunk if switching to new function at same or lower level
+                if current_chunk and indent <= indent_level:
+                    chunks.append(self._create_chunk(
+                        content='\n'.join(current_chunk),
+                        filename=filename,
+                        section=self._build_python_section(current_class, current_function),
+                        start_line=line_start,
+                        end_line=line_num - 1
+                    ))
+                    current_chunk = []
+                    line_start = line_num
+
+                if indent >= indent_level:  # Method within class or nested function
+                    current_function = func_name
+                else:  # Top-level function
+                    current_function = func_name
+                    current_class = None
+
+                indent_level = indent
+
+            current_chunk.append(line)
+            current_size += len(line) + 1
+
+            # Handle oversized chunks
+            if current_size >= self.chunk_size:
+                chunks.append(self._create_chunk(
+                    content='\n'.join(current_chunk),
+                    filename=filename,
+                    section=self._build_python_section(current_class, current_function),
+                    start_line=line_start,
+                    end_line=line_num
+                ))
+
+                # Start new chunk with minimal overlap for code
+                overlap_lines = current_chunk[-2:] if len(current_chunk) > 2 else current_chunk
+                current_chunk = overlap_lines
+                current_size = sum(len(line) + 1 for line in overlap_lines)
+                line_start = line_num - len(overlap_lines) + 1
+
+        # Add final chunk
+        if current_chunk:
+            chunks.append(self._create_chunk(
+                content='\n'.join(current_chunk),
+                filename=filename,
+                section=self._build_python_section(current_class, current_function),
+                start_line=line_start,
+                end_line=len(lines)
+            ))
+
+        return chunks
+
+    def _chunk_text_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
+        """Enhanced text chunking using sentence-based approach"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Use sentence-based chunking for better coherence
+        max_sentences = self._calculate_sentences_per_chunk(content)
+        sentences = self._sentence_based_chunking(content, max_sentences)
+
+        chunks = []
+        for i, chunk_content in enumerate(sentences):
+            chunks.append(self._create_chunk(
+                content=chunk_content,
+                filename=filename,
+                section=f"Section {i+1}",
+                metadata={'chunk_method': 'sentence_based', 'chunk_index': i}
+            ))
+
+        return chunks
+
+    def _sentence_based_chunking(self, text: str, max_sentences_per_chunk: int, split_newlines: int = 2) -> List[str]:
+        """Sentence-based chunking with enhancements"""
+        if not sent_tokenize:
+            # Fallback to simple splitting
+            sentences = text.split('. ')
+            sentences = [s.strip() + '.' for s in sentences if s.strip()]
+        else:
+            sentences = []
+
+            if split_newlines > 0:
+                # Create regex pattern for specified number of newlines
+                newline_pattern = r'(\n{%d,})' % split_newlines
+                parts = re.split(newline_pattern, text)
+
+                for part in parts:
+                    part = part.strip()
+                    if part and not re.match(newline_pattern, part):
+                        sentences.extend(sent_tokenize(part))
+                    elif re.match(newline_pattern, part):
+                        sentences.append(part)
+            else:
+                sentences = [sentence.strip() for sentence in sent_tokenize(text) if sentence.strip()]
+
+        # Create chunks of sentences with overlap
+        chunks = []
+        overlap_sentences = max(1, max_sentences_per_chunk // 4)  # 25% overlap
+
+        for i in range(0, len(sentences), max_sentences_per_chunk - overlap_sentences):
+            chunk_sentences = sentences[i:i + max_sentences_per_chunk]
+            if chunk_sentences:
+                chunks.append(' '.join(chunk_sentences))
+
+        return chunks
+
+    def _calculate_sentences_per_chunk(self, text: str) -> int:
+        """Calculate optimal sentences per chunk based on average sentence length"""
+        if not sent_tokenize:
+            # Fallback calculation
+            sentences = text.split('. ')
+        else:
+            sentences = sent_tokenize(text)
+
+        if not sentences:
+            return 1
+
+        avg_sentence_length = sum(len(s) for s in sentences) / len(sentences)
+        # Target chunk size divided by average sentence length
+        optimal_sentences = max(1, int(self.chunk_size / avg_sentence_length))
+        return min(optimal_sentences, 10)  # Cap at 10 sentences for readability
+
+    def _build_section_path(self, hierarchy: List[str]) -> str:
+        """Build hierarchical section path from header hierarchy"""
+        return ' > '.join(hierarchy) if hierarchy else None
+
+    def _build_python_section(self, class_name: Optional[str], function_name: Optional[str]) -> str:
+        """Build section name for Python code"""
+        if class_name and function_name:
+            return f"{class_name}.{function_name}"
+        elif class_name:
+            return class_name
+        elif function_name:
+            return function_name
+        else:
+            return None
+
+    def _find_best_split_point(self, lines: List[str]) -> int:
+        """Find the best point to split a chunk (prefer paragraph boundaries)"""
+        # Look for empty lines (paragraph boundaries) in the last 25% of the chunk
+        start_search = max(1, len(lines) * 3 // 4)
+
+        for i in range(len(lines) - 1, start_search - 1, -1):
+            if not lines[i].strip():  # Empty line
+                return i
+
+        # If no paragraph boundary found, split at 75% of chunk size
+        return max(1, len(lines) * 3 // 4)
+
+    def _create_chunk(self, content: str, filename: str,
+                      section: Optional[str] = None,
+                      start_line: Optional[int] = None,
+                      end_line: Optional[int] = None,
+                      metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """Create chunk dictionary with enhanced metadata"""
+        base_metadata = {
+            'file_type': Path(filename).suffix.lstrip('.'),
+            'chunk_size': len(content),
+            'word_count': len(content.split()),
+        }
+
+        # Handle sentence count with fallback
+        try:
+            if sent_tokenize and content.strip():
+                base_metadata['sentence_count'] = len(sent_tokenize(content))
+            else:
+                # Fallback: count sentences by periods
+                base_metadata['sentence_count'] = len([s for s in content.split('.') if s.strip()])
+        except Exception as e:
+            logger.warning(f"Error counting sentences: {e}")
+            # Simple fallback: count periods
+            base_metadata['sentence_count'] = len([s for s in content.split('.') if s.strip()])
+
+        if metadata:
+            base_metadata.update(metadata)
+
+        return {
+            'content': content.strip(),
+            'filename': filename,
+            'section': section,
+            'start_line': start_line,
+            'end_line': end_line,
+            'metadata': base_metadata
+        }
+
+    def _get_overlap_lines(self, lines: List[str]) -> List[str]:
+        """Get overlap lines for chunk continuity"""
+        if not lines:
+            return []
+
+        # Calculate overlap size in characters
+        overlap_chars = self.chunk_overlap
+        overlap_lines = []
+        char_count = 0
+
+        # Take lines from the end until we reach overlap size
+        for line in reversed(lines):
+            if char_count + len(line) <= overlap_chars:
+                overlap_lines.insert(0, line)
+                char_count += len(line) + 1
+            else:
+                break
+
+        return overlap_lines
+
+    def _chunk_by_sentences(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk content by sentences with specified max sentences per chunk"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Use sentence-based chunking
+        split_newlines = self.split_newlines if self.split_newlines is not None else 2
+        sentences = self._sentence_based_chunking(content, self.max_sentences_per_chunk, split_newlines)
+
+        chunks = []
+        for i, chunk_content in enumerate(sentences):
+            chunks.append(self._create_chunk(
+                content=chunk_content,
+                filename=filename,
+                section=f"Section {i+1}",
+                metadata={
+                    'chunk_method': 'sentence_based',
+                    'chunk_index': i,
+                    'max_sentences_per_chunk': self.max_sentences_per_chunk,
+                    'split_newlines': split_newlines
+                }
+            ))
+
+        return chunks
+
+    def _chunk_by_sliding_window(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk content using sliding window approach with word-based chunks"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Split content into words
+        words = content.split()
+
+        if not words:
+            return []
+
+        chunks = []
+        chunk_index = 0
+
+        # Create overlapping chunks
+        for i in range(0, len(words), self.chunk_size - self.overlap_size):
+            chunk_words = words[i:i + self.chunk_size]
+            if chunk_words:
+                chunk_content = ' '.join(chunk_words)
+                chunks.append(self._create_chunk(
+                    content=chunk_content,
+                    filename=filename,
+                    section=f"Chunk {chunk_index + 1}",
+                    metadata={
+                        'chunk_method': 'sliding_window',
+                        'chunk_index': chunk_index,
+                        'chunk_size_words': self.chunk_size,
+                        'overlap_size_words': self.overlap_size,
+                        'start_word': i,
+                        'end_word': i + len(chunk_words)
+                    }
+                ))
+                chunk_index += 1
+
+        return chunks
+
+    def _chunk_by_paragraphs(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk content by paragraphs (split on double newlines)"""
+        if isinstance(content, list):
+            content = '\n'.join(content)
+
+        # Split on double newlines to get paragraphs
+        paragraphs = re.split(r'\n\s*\n', content)
+        paragraphs = [p.strip() for p in paragraphs if p.strip()]
+
+        chunks = []
+        for i, paragraph in enumerate(paragraphs):
+            if paragraph:
+                chunks.append(self._create_chunk(
+                    content=paragraph,
+                    filename=filename,
+                    section=f"Paragraph {i+1}",
+                    metadata={
+                        'chunk_method': 'paragraph_based',
+                        'chunk_index': i,
+                        'paragraph_number': i + 1
+                    }
+                ))
+
+        return chunks
+
+    def _chunk_by_pages(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """Chunk content by pages (for documents that have page boundaries)"""
+        if isinstance(content, list):
+            # If content is already a list (e.g., from PDF extraction), treat each item as a page
+            pages = [str(page).strip() for page in content if str(page).strip()]
+        else:
+            # For text content, try to detect page boundaries
+            # Look for form feed characters or page break indicators
+            if '\f' in content:
+                pages = content.split('\f')
+            elif '---PAGE---' in content:
+                pages = content.split('---PAGE---')
+            elif re.search(r'\n\s*Page\s+\d+\s*\n', content):
+                # Split on "Page N" patterns
+                pages = re.split(r'\n\s*Page\s+\d+\s*\n', content)
+            else:
+                # Fallback: split into roughly equal chunks
+                words = content.split()
+                words_per_page = max(500, len(words) // 10)  # Aim for ~10 pages
+                pages = []
+                for i in range(0, len(words), words_per_page):
+                    page_words = words[i:i + words_per_page]
+                    if page_words:
+                        pages.append(' '.join(page_words))
+
+        pages = [p.strip() for p in pages if p.strip()]
+
+        chunks = []
+        for i, page_content in enumerate(pages):
+            if page_content:
+                chunks.append(self._create_chunk(
+                    content=page_content,
+                    filename=filename,
+                    section=f"Page {i+1}",
+                    metadata={
+                        'chunk_method': 'page_based',
+                        'chunk_index': i,
+                        'page_number': i + 1
+                    }
+                ))
+
+        return chunks
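For orientation, here is a minimal, hypothetical usage sketch of the new DocumentProcessor class added in this release. It is not taken from the package's documentation; it assumes only the constructor arguments and the create_chunks signature shown in the hunk above, and the input file name (README.md) is an arbitrary example.

# Hypothetical usage sketch based on the diff above; not from the package docs.
from signalwire_agents.search.document_processor import DocumentProcessor

# Sliding-window chunking: 50-word chunks with a 10-word overlap (the defaults shown above)
processor = DocumentProcessor(chunking_strategy='sliding', chunk_size=50, overlap_size=10)

# create_chunks expects the document text itself, not a file path
with open('README.md', 'r', encoding='utf-8') as f:
    content = f.read()

chunks = processor.create_chunks(content, filename='README.md', file_type='md')
for chunk in chunks:
    print(chunk['section'], chunk['metadata']['word_count'])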