signalwire-agents 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. signalwire_agents/__init__.py +5 -1
  2. signalwire_agents/agent_server.py +222 -13
  3. signalwire_agents/cli/build_search.py +457 -0
  4. signalwire_agents/cli/test_swaig.py +177 -113
  5. signalwire_agents/core/agent_base.py +1 -1
  6. signalwire_agents/core/logging_config.py +232 -0
  7. signalwire_agents/search/__init__.py +131 -0
  8. signalwire_agents/search/document_processor.py +764 -0
  9. signalwire_agents/search/index_builder.py +534 -0
  10. signalwire_agents/search/query_processor.py +371 -0
  11. signalwire_agents/search/search_engine.py +383 -0
  12. signalwire_agents/search/search_service.py +251 -0
  13. signalwire_agents/skills/native_vector_search/__init__.py +1 -0
  14. signalwire_agents/skills/native_vector_search/skill.py +352 -0
  15. signalwire_agents/skills/registry.py +2 -15
  16. signalwire_agents/utils/__init__.py +13 -1
  17. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/METADATA +110 -3
  18. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/RECORD +23 -14
  19. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/entry_points.txt +1 -0
  20. signalwire_agents/utils/serverless.py +0 -38
  21. {signalwire_agents-0.1.11.data → signalwire_agents-0.1.12.data}/data/schema.json +0 -0
  22. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/WHEEL +0 -0
  23. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/licenses/LICENSE +0 -0
  24. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.12.dist-info}/top_level.txt +0 -0
signalwire_agents/search/document_processor.py (new file)
@@ -0,0 +1,764 @@
+ """
+ Copyright (c) 2025 SignalWire
+
+ This file is part of the SignalWire AI Agents SDK.
+
+ Licensed under the MIT License.
+ See LICENSE file in the project root for full license information.
+ """
+
+ import re
+ import hashlib
+ import json
+ import logging
+ from typing import List, Dict, Any, Optional
+ from pathlib import Path
+
+ # Document processing imports
+ try:
+     import pdfplumber
+ except ImportError:
+     pdfplumber = None
+
+ try:
+     from docx import Document as DocxDocument
+ except ImportError:
+     DocxDocument = None
+
+ try:
+     from bs4 import BeautifulSoup
+ except ImportError:
+     BeautifulSoup = None
+
+ try:
+     import markdown
+ except ImportError:
+     markdown = None
+
+ try:
+     from striprtf.striprtf import rtf_to_text
+ except ImportError:
+     rtf_to_text = None
+
+ try:
+     from openpyxl import load_workbook
+ except ImportError:
+     load_workbook = None
+
+ try:
+     from pptx import Presentation
+ except ImportError:
+     Presentation = None
+
+ try:
+     from nltk.tokenize import sent_tokenize
+     import nltk
+     # Ensure NLTK data is available
+     try:
+         nltk.data.find('tokenizers/punkt')
+     except LookupError:
+         nltk.download('punkt', quiet=True)
+ except ImportError:
+     sent_tokenize = None
+     nltk = None
+
+ try:
+     import magic
+ except ImportError:
+     magic = None
+
+ from .query_processor import preprocess_document_content
+
+ logger = logging.getLogger(__name__)
+
+ class DocumentProcessor:
+     """Enhanced document processor with smart chunking capabilities"""
+
+     def __init__(self, chunking_strategy: str = 'sentence',
+                  max_sentences_per_chunk: int = 50,
+                  chunk_size: int = 50,
+                  overlap_size: int = 10,
+                  split_newlines: Optional[int] = None):
+         """
+         Initialize document processor with chunking strategy
+
+         Args:
+             chunking_strategy: 'sentence', 'sliding', 'paragraph', or 'page'
+             max_sentences_per_chunk: For sentence strategy (default: 50)
+             chunk_size: For sliding strategy - words per chunk (default: 50)
+             overlap_size: For sliding strategy - overlap in words (default: 10)
+             split_newlines: For sentence strategy - split on multiple newlines (optional)
+         """
+         self.chunking_strategy = chunking_strategy
+         self.max_sentences_per_chunk = max_sentences_per_chunk
+         self.chunk_size = chunk_size
+         self.overlap_size = overlap_size
+         self.split_newlines = split_newlines
+
+         # Legacy support for old character-based chunking
+         self.chunk_overlap = overlap_size
+
+     def create_chunks(self, content: str, filename: str,
+                       file_type: str) -> List[Dict[str, Any]]:
+         """
+         Create chunks from document content using specified chunking strategy
+
+         Args:
+             content: Document content (string) - should be the actual content, not a file path
+             filename: Name of the file (for metadata)
+             file_type: File extension/type
+
+         Returns:
+             List of chunk dictionaries
+         """
+
+         # Apply chunking strategy
+         if self.chunking_strategy == 'sentence':
+             return self._chunk_by_sentences(content, filename, file_type)
+         elif self.chunking_strategy == 'sliding':
+             return self._chunk_by_sliding_window(content, filename, file_type)
+         elif self.chunking_strategy == 'paragraph':
+             return self._chunk_by_paragraphs(content, filename, file_type)
+         elif self.chunking_strategy == 'page':
+             return self._chunk_by_pages(content, filename, file_type)
+         else:
+             # Fallback to sentence-based chunking
+             return self._chunk_by_sentences(content, filename, file_type)
+
+     def _extract_text_from_file(self, file_path: str) -> Any:
+         """Extract text from various file formats"""
+         if not magic:
+             # Fallback to extension-based detection
+             file_path_obj = Path(file_path)
+             extension = file_path_obj.suffix.lower()
+
+             if extension == '.pdf':
+                 file_type = 'application/pdf'
+             elif extension == '.docx':
+                 file_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+             elif extension in ['.txt', '.md']:
+                 file_type = 'text/plain'
+             elif extension == '.html':
+                 file_type = 'text/html'
+             elif extension == '.rtf':
+                 file_type = 'application/rtf'
+             elif extension == '.xlsx':
+                 file_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+             elif extension == '.pptx':
+                 file_type = 'application/vnd.openxmlformats-officedocument.presentationml.presentation'
+             else:
+                 file_type = 'text/plain'
+         else:
+             mime = magic.Magic(mime=True)
+             file_type = mime.from_file(file_path)
+
+         if 'pdf' in file_type:
+             return self._extract_pdf(file_path)
+         elif 'vnd.openxmlformats-officedocument.wordprocessingml.document' in file_type:
+             return self._extract_docx(file_path)
+         elif 'plain' in file_type or 'text' in file_type:
+             return self._extract_text(file_path)
+         elif 'html' in file_type:
+             return self._extract_html(file_path)
+         elif 'markdown' in file_type or file_path.endswith('.md'):
+             return self._extract_markdown(file_path)
+         elif 'rtf' in file_type:
+             return self._extract_rtf(file_path)
+         elif 'vnd.openxmlformats-officedocument.spreadsheetml.sheet' in file_type:
+             return self._extract_excel(file_path)
+         elif 'vnd.openxmlformats-officedocument.presentationml.presentation' in file_type:
+             return self._extract_powerpoint(file_path)
+         else:
+             return json.dumps({"error": f"Unsupported file type: {file_type}"})
+
+     def _extract_pdf(self, file_path: str):
+         """Extract text from PDF files"""
+         if not pdfplumber:
+             return json.dumps({"error": "pdfplumber not available for PDF processing"})
+
+         try:
+             with pdfplumber.open(file_path) as pdf:
+                 pages = []
+                 for page in pdf.pages:
+                     text = page.extract_text()
+                     if text:
+                         # Remove page number from the beginning
+                         text = re.sub(r'^\d+\.\s*', '', text.strip())
+                         pages.append(text)
+                 return pages
+         except Exception as e:
+             return json.dumps({"error": f"Error processing PDF: {e}"})
+
+     def _extract_docx(self, file_path: str):
+         """Extract text from DOCX files"""
+         if not DocxDocument:
+             return json.dumps({"error": "python-docx not available for DOCX processing"})
+
+         try:
+             doc = DocxDocument(file_path)
+             return [para.text for para in doc.paragraphs if para.text.strip()]
+         except Exception as e:
+             return json.dumps({"error": f"Error processing DOCX: {e}"})
+
+     def _extract_text(self, file_path: str):
+         """Extract text from plain text files"""
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 return file.read()
+         except Exception as e:
+             return json.dumps({"error": f"Error processing TXT: {e}"})
+
+     def _extract_html(self, file_path: str):
+         """Extract text from HTML files"""
+         if not BeautifulSoup:
+             return json.dumps({"error": "beautifulsoup4 not available for HTML processing"})
+
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 soup = BeautifulSoup(file, 'html.parser')
+                 return soup.get_text(separator='\n')
+         except Exception as e:
+             return json.dumps({"error": f"Error processing HTML: {e}"})
+
+     def _extract_markdown(self, file_path: str):
+         """Extract text from Markdown files"""
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 content = file.read()
+                 if markdown and BeautifulSoup:
+                     html = markdown.markdown(content)
+                     soup = BeautifulSoup(html, 'html.parser')
+                     return soup.get_text(separator='\n')
+                 else:
+                     # Fallback to raw markdown
+                     return content
+         except Exception as e:
+             return json.dumps({"error": f"Error processing Markdown: {e}"})
+
+     def _extract_rtf(self, file_path: str):
+         """Extract text from RTF files"""
+         if not rtf_to_text:
+             return json.dumps({"error": "striprtf not available for RTF processing"})
+
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 return rtf_to_text(file.read())
+         except Exception as e:
+             return json.dumps({"error": f"Error processing RTF: {e}"})
+
+     def _extract_excel(self, file_path: str):
+         """Extract text from Excel files"""
+         if not load_workbook:
+             return json.dumps({"error": "openpyxl not available for Excel processing"})
+
+         try:
+             wb = load_workbook(file_path)
+             sheets_text = []
+             for sheet in wb.worksheets:
+                 for row in sheet.iter_rows(values_only=True):
+                     row_text = ' '.join([str(cell) for cell in row if cell is not None])
+                     sheets_text.append(row_text)
+             return "\n".join(sheets_text)
+         except Exception as e:
+             return json.dumps({"error": f"Error processing Excel: {e}"})
+
+     def _extract_powerpoint(self, file_path: str):
+         """Extract text from PowerPoint files"""
+         if not Presentation:
+             return json.dumps({"error": "python-pptx not available for PowerPoint processing"})
+
+         try:
+             prs = Presentation(file_path)
+             slides_text = []
+             for slide in prs.slides:
+                 slide_text = []
+                 for shape in slide.shapes:
+                     if hasattr(shape, "text"):
+                         slide_text.append(shape.text)
+                 slides_text.append("\n".join(slide_text))
+             return slides_text
+         except Exception as e:
+             return json.dumps({"error": f"Error processing PowerPoint: {e}"})
+
+     def _chunk_document_aware(self, content: Any, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Smart chunking for documents with natural structure"""
+         chunks = []
+
+         if isinstance(content, list):
+             # Handle page-based or paragraph-based content (PDF, DOCX, PPTX)
+             for i, page_content in enumerate(content):
+                 if not page_content or not page_content.strip():
+                     continue
+
+                 # For each page/slide, use sentence-based chunking if it's large
+                 if len(page_content) > self.chunk_size:
+                     page_chunks = self._sentence_based_chunking(
+                         page_content,
+                         max_sentences_per_chunk=self._calculate_sentences_per_chunk(page_content)
+                     )
+                     for j, chunk_content in enumerate(page_chunks):
+                         chunks.append(self._create_chunk(
+                             content=chunk_content,
+                             filename=filename,
+                             section=f"Page {i+1}" if file_type == 'pdf' else f"Slide {i+1}" if file_type == 'pptx' else f"Section {i+1}",
+                             metadata={'page_number': i+1, 'chunk_index': j}
+                         ))
+                 else:
+                     # Small page/slide - keep as single chunk
+                     chunks.append(self._create_chunk(
+                         content=page_content,
+                         filename=filename,
+                         section=f"Page {i+1}" if file_type == 'pdf' else f"Slide {i+1}" if file_type == 'pptx' else f"Section {i+1}",
+                         metadata={'page_number': i+1}
+                     ))
+         else:
+             # Single text content - use paragraph-aware chunking
+             chunks = self._chunk_text_enhanced(content, filename)
+
+         return chunks
+
+     def _chunk_markdown_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
+         """Enhanced markdown chunking with better header handling"""
+         chunks = []
+         lines = content.split('\n')
+
+         current_section = None
+         current_hierarchy = []  # Track header hierarchy
+         current_chunk = []
+         current_size = 0
+         line_start = 1
+
+         for line_num, line in enumerate(lines, 1):
+             # Check for headers with hierarchy tracking
+             header_match = re.match(r'^(#{1,6})\s+(.+)', line)
+             if header_match:
+                 header_level = len(header_match.group(1))
+                 header_text = header_match.group(2).strip()
+
+                 # Save current chunk if it exists
+                 if current_chunk:
+                     chunks.append(self._create_chunk(
+                         content='\n'.join(current_chunk),
+                         filename=filename,
+                         section=self._build_section_path(current_hierarchy),
+                         start_line=line_start,
+                         end_line=line_num - 1
+                     ))
+
+                 # Update hierarchy
+                 current_hierarchy = current_hierarchy[:header_level-1] + [header_text]
+                 current_section = header_text
+                 current_chunk = [line]
+                 current_size = len(line)
+                 line_start = line_num
+
+             else:
+                 current_chunk.append(line)
+                 current_size += len(line) + 1
+
+                 # Check if chunk is getting too large - use smart splitting
+                 if current_size >= self.chunk_size:
+                     # Try to split at paragraph boundary first
+                     split_point = self._find_best_split_point(current_chunk)
+
+                     chunk_to_save = current_chunk[:split_point]
+                     chunks.append(self._create_chunk(
+                         content='\n'.join(chunk_to_save),
+                         filename=filename,
+                         section=self._build_section_path(current_hierarchy),
+                         start_line=line_start,
+                         end_line=line_start + split_point - 1
+                     ))
+
+                     # Start new chunk with overlap
+                     overlap_lines = self._get_overlap_lines(chunk_to_save)
+                     remaining_lines = current_chunk[split_point:]
+                     current_chunk = overlap_lines + remaining_lines
+                     current_size = sum(len(line) + 1 for line in current_chunk)
+                     line_start = line_start + split_point - len(overlap_lines)
+
+         # Add final chunk
+         if current_chunk:
+             chunks.append(self._create_chunk(
+                 content='\n'.join(current_chunk),
+                 filename=filename,
+                 section=self._build_section_path(current_hierarchy),
+                 start_line=line_start,
+                 end_line=len(lines)
+             ))
+
+         return chunks
+
+     def _chunk_python_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
+         """Enhanced Python code chunking with better function/class detection"""
+         chunks = []
+         lines = content.split('\n')
+
+         current_function = None
+         current_class = None
+         current_chunk = []
+         current_size = 0
+         line_start = 1
+         indent_level = 0
+
+         for line_num, line in enumerate(lines, 1):
+             # Detect class definitions
+             class_match = re.match(r'^(\s*)(class\s+([^(:\s]+))', line)
+             if class_match:
+                 indent = len(class_match.group(1))
+                 class_name = class_match.group(3)
+
+                 # Save current chunk if switching context
+                 if current_chunk and (indent <= indent_level or current_class != class_name):
+                     chunks.append(self._create_chunk(
+                         content='\n'.join(current_chunk),
+                         filename=filename,
+                         section=self._build_python_section(current_class, current_function),
+                         start_line=line_start,
+                         end_line=line_num - 1
+                     ))
+                     current_chunk = []
+                     line_start = line_num
+
+                 current_class = class_name
+                 current_function = None
+                 indent_level = indent
+
+             # Detect function definitions
+             func_match = re.match(r'^(\s*)(def\s+([^(:\s]+)|async\s+def\s+([^(:\s]+))', line)
+             if func_match:
+                 indent = len(func_match.group(1))
+                 func_name = func_match.group(3) or func_match.group(4)
+
+                 # Save current chunk if switching to new function at same or lower level
+                 if current_chunk and indent <= indent_level:
+                     chunks.append(self._create_chunk(
+                         content='\n'.join(current_chunk),
+                         filename=filename,
+                         section=self._build_python_section(current_class, current_function),
+                         start_line=line_start,
+                         end_line=line_num - 1
+                     ))
+                     current_chunk = []
+                     line_start = line_num
+
+                 if indent >= indent_level:  # Method within class or nested function
+                     current_function = func_name
+                 else:  # Top-level function
+                     current_function = func_name
+                     current_class = None
+
+                 indent_level = indent
+
+             current_chunk.append(line)
+             current_size += len(line) + 1
+
+             # Handle oversized chunks
+             if current_size >= self.chunk_size:
+                 chunks.append(self._create_chunk(
+                     content='\n'.join(current_chunk),
+                     filename=filename,
+                     section=self._build_python_section(current_class, current_function),
+                     start_line=line_start,
+                     end_line=line_num
+                 ))
+
+                 # Start new chunk with minimal overlap for code
+                 overlap_lines = current_chunk[-2:] if len(current_chunk) > 2 else current_chunk
+                 current_chunk = overlap_lines
+                 current_size = sum(len(line) + 1 for line in overlap_lines)
+                 line_start = line_num - len(overlap_lines) + 1
+
+         # Add final chunk
+         if current_chunk:
+             chunks.append(self._create_chunk(
+                 content='\n'.join(current_chunk),
+                 filename=filename,
+                 section=self._build_python_section(current_class, current_function),
+                 start_line=line_start,
+                 end_line=len(lines)
+             ))
+
+         return chunks
+
+     def _chunk_text_enhanced(self, content: str, filename: str) -> List[Dict[str, Any]]:
+         """Enhanced text chunking using sentence-based approach"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Use sentence-based chunking for better coherence
+         max_sentences = self._calculate_sentences_per_chunk(content)
+         sentences = self._sentence_based_chunking(content, max_sentences)
+
+         chunks = []
+         for i, chunk_content in enumerate(sentences):
+             chunks.append(self._create_chunk(
+                 content=chunk_content,
+                 filename=filename,
+                 section=f"Section {i+1}",
+                 metadata={'chunk_method': 'sentence_based', 'chunk_index': i}
+             ))
+
+         return chunks
+
+     def _sentence_based_chunking(self, text: str, max_sentences_per_chunk: int, split_newlines: int = 2) -> List[str]:
+         """Sentence-based chunking with enhancements"""
+         if not sent_tokenize:
+             # Fallback to simple splitting
+             sentences = text.split('. ')
+             sentences = [s.strip() + '.' for s in sentences if s.strip()]
+         else:
+             sentences = []
+
+             if split_newlines > 0:
+                 # Create regex pattern for specified number of newlines
+                 newline_pattern = r'(\n{%d,})' % split_newlines
+                 parts = re.split(newline_pattern, text)
+
+                 for part in parts:
+                     part = part.strip()
+                     if part and not re.match(newline_pattern, part):
+                         sentences.extend(sent_tokenize(part))
+                     elif re.match(newline_pattern, part):
+                         sentences.append(part)
+             else:
+                 sentences = [sentence.strip() for sentence in sent_tokenize(text) if sentence.strip()]
+
+         # Create chunks of sentences with overlap
+         chunks = []
+         overlap_sentences = max(1, max_sentences_per_chunk // 4)  # 25% overlap
+
+         for i in range(0, len(sentences), max_sentences_per_chunk - overlap_sentences):
+             chunk_sentences = sentences[i:i + max_sentences_per_chunk]
+             if chunk_sentences:
+                 chunks.append(' '.join(chunk_sentences))
+
+         return chunks
+
+     def _calculate_sentences_per_chunk(self, text: str) -> int:
+         """Calculate optimal sentences per chunk based on average sentence length"""
+         if not sent_tokenize:
+             # Fallback calculation
+             sentences = text.split('. ')
+         else:
+             sentences = sent_tokenize(text)
+
+         if not sentences:
+             return 1
+
+         avg_sentence_length = sum(len(s) for s in sentences) / len(sentences)
+         # Target chunk size divided by average sentence length
+         optimal_sentences = max(1, int(self.chunk_size / avg_sentence_length))
+         return min(optimal_sentences, 10)  # Cap at 10 sentences for readability
+
+     def _build_section_path(self, hierarchy: List[str]) -> str:
+         """Build hierarchical section path from header hierarchy"""
+         return ' > '.join(hierarchy) if hierarchy else None
+
+     def _build_python_section(self, class_name: Optional[str], function_name: Optional[str]) -> str:
+         """Build section name for Python code"""
+         if class_name and function_name:
+             return f"{class_name}.{function_name}"
+         elif class_name:
+             return class_name
+         elif function_name:
+             return function_name
+         else:
+             return None
+
+     def _find_best_split_point(self, lines: List[str]) -> int:
+         """Find the best point to split a chunk (prefer paragraph boundaries)"""
+         # Look for empty lines (paragraph boundaries) in the last 25% of the chunk
+         start_search = max(1, len(lines) * 3 // 4)
+
+         for i in range(len(lines) - 1, start_search - 1, -1):
+             if not lines[i].strip():  # Empty line
+                 return i
+
+         # If no paragraph boundary found, split at 75% of chunk size
+         return max(1, len(lines) * 3 // 4)
+
+     def _create_chunk(self, content: str, filename: str,
+                       section: Optional[str] = None,
+                       start_line: Optional[int] = None,
+                       end_line: Optional[int] = None,
+                       metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+         """Create chunk dictionary with enhanced metadata"""
+         base_metadata = {
+             'file_type': Path(filename).suffix.lstrip('.'),
+             'chunk_size': len(content),
+             'word_count': len(content.split()),
+         }
+
+         # Handle sentence count with fallback
+         try:
+             if sent_tokenize and content.strip():
+                 base_metadata['sentence_count'] = len(sent_tokenize(content))
+             else:
+                 # Fallback: count sentences by periods
+                 base_metadata['sentence_count'] = len([s for s in content.split('.') if s.strip()])
+         except Exception as e:
+             logger.warning(f"Error counting sentences: {e}")
+             # Simple fallback: count periods
+             base_metadata['sentence_count'] = len([s for s in content.split('.') if s.strip()])
+
+         if metadata:
+             base_metadata.update(metadata)
+
+         return {
+             'content': content.strip(),
+             'filename': filename,
+             'section': section,
+             'start_line': start_line,
+             'end_line': end_line,
+             'metadata': base_metadata
+         }
+
+     def _get_overlap_lines(self, lines: List[str]) -> List[str]:
+         """Get overlap lines for chunk continuity"""
+         if not lines:
+             return []
+
+         # Calculate overlap size in characters
+         overlap_chars = self.chunk_overlap
+         overlap_lines = []
+         char_count = 0
+
+         # Take lines from the end until we reach overlap size
+         for line in reversed(lines):
+             if char_count + len(line) <= overlap_chars:
+                 overlap_lines.insert(0, line)
+                 char_count += len(line) + 1
+             else:
+                 break
+
+         return overlap_lines
+
+     def _chunk_by_sentences(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk content by sentences with specified max sentences per chunk"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Use sentence-based chunking
+         split_newlines = self.split_newlines if self.split_newlines is not None else 2
+         sentences = self._sentence_based_chunking(content, self.max_sentences_per_chunk, split_newlines)
+
+         chunks = []
+         for i, chunk_content in enumerate(sentences):
+             chunks.append(self._create_chunk(
+                 content=chunk_content,
+                 filename=filename,
+                 section=f"Section {i+1}",
+                 metadata={
+                     'chunk_method': 'sentence_based',
+                     'chunk_index': i,
+                     'max_sentences_per_chunk': self.max_sentences_per_chunk,
+                     'split_newlines': split_newlines
+                 }
+             ))
+
+         return chunks
+
+     def _chunk_by_sliding_window(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk content using sliding window approach with word-based chunks"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Split content into words
+         words = content.split()
+
+         if not words:
+             return []
+
+         chunks = []
+         chunk_index = 0
+
+         # Create overlapping chunks
+         for i in range(0, len(words), self.chunk_size - self.overlap_size):
+             chunk_words = words[i:i + self.chunk_size]
+             if chunk_words:
+                 chunk_content = ' '.join(chunk_words)
+                 chunks.append(self._create_chunk(
+                     content=chunk_content,
+                     filename=filename,
+                     section=f"Chunk {chunk_index + 1}",
+                     metadata={
+                         'chunk_method': 'sliding_window',
+                         'chunk_index': chunk_index,
+                         'chunk_size_words': self.chunk_size,
+                         'overlap_size_words': self.overlap_size,
+                         'start_word': i,
+                         'end_word': i + len(chunk_words)
+                     }
+                 ))
+                 chunk_index += 1
+
+         return chunks
+
+     def _chunk_by_paragraphs(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk content by paragraphs (split on double newlines)"""
+         if isinstance(content, list):
+             content = '\n'.join(content)
+
+         # Split on double newlines to get paragraphs
+         paragraphs = re.split(r'\n\s*\n', content)
+         paragraphs = [p.strip() for p in paragraphs if p.strip()]
+
+         chunks = []
+         for i, paragraph in enumerate(paragraphs):
+             if paragraph:
+                 chunks.append(self._create_chunk(
+                     content=paragraph,
+                     filename=filename,
+                     section=f"Paragraph {i+1}",
+                     metadata={
+                         'chunk_method': 'paragraph_based',
+                         'chunk_index': i,
+                         'paragraph_number': i + 1
+                     }
+                 ))
+
+         return chunks
+
+     def _chunk_by_pages(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+         """Chunk content by pages (for documents that have page boundaries)"""
+         if isinstance(content, list):
+             # If content is already a list (e.g., from PDF extraction), treat each item as a page
+             pages = [str(page).strip() for page in content if str(page).strip()]
+         else:
+             # For text content, try to detect page boundaries
+             # Look for form feed characters or page break indicators
+             if '\f' in content:
+                 pages = content.split('\f')
+             elif '---PAGE---' in content:
+                 pages = content.split('---PAGE---')
+             elif re.search(r'\n\s*Page\s+\d+\s*\n', content):
+                 # Split on "Page N" patterns
+                 pages = re.split(r'\n\s*Page\s+\d+\s*\n', content)
+             else:
+                 # Fallback: split into roughly equal chunks
+                 words = content.split()
+                 words_per_page = max(500, len(words) // 10)  # Aim for ~10 pages
+                 pages = []
+                 for i in range(0, len(words), words_per_page):
+                     page_words = words[i:i + words_per_page]
+                     if page_words:
+                         pages.append(' '.join(page_words))
+
+             pages = [p.strip() for p in pages if p.strip()]
+
+         chunks = []
+         for i, page_content in enumerate(pages):
+             if page_content:
+                 chunks.append(self._create_chunk(
+                     content=page_content,
+                     filename=filename,
+                     section=f"Page {i+1}",
+                     metadata={
+                         'chunk_method': 'page_based',
+                         'chunk_index': i,
+                         'page_number': i + 1
+                     }
+                 ))
+
+         return chunks
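
For readers evaluating the new search module, the following is a minimal usage sketch, not part of the released diff. It relies only on the DocumentProcessor constructor and create_chunks signature shown above; the sample text and file name are illustrative.

    from signalwire_agents.search.document_processor import DocumentProcessor

    # Sliding-window strategy: 50-word chunks with a 10-word overlap (the defaults shown above)
    processor = DocumentProcessor(chunking_strategy='sliding', chunk_size=50, overlap_size=10)

    sample_text = "SignalWire agents can search locally indexed documents. " * 40  # illustrative content
    chunks = processor.create_chunks(content=sample_text, filename='example.txt', file_type='txt')

    for chunk in chunks:
        # Each chunk dictionary carries a section label plus metadata such as chunk_method and word_count
        print(chunk['section'], chunk['metadata']['chunk_method'], chunk['metadata']['word_count'])

    # Sentence-based strategy (the default), capped at 5 sentences per chunk
    sentence_processor = DocumentProcessor(chunking_strategy='sentence', max_sentences_per_chunk=5)
    sentence_chunks = sentence_processor.create_chunks(sample_text, 'example.txt', 'txt')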