signalwire-agents 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. signalwire_agents/__init__.py +5 -1
  2. signalwire_agents/agent_server.py +222 -13
  3. signalwire_agents/cli/build_search.py +457 -0
  4. signalwire_agents/cli/test_swaig.py +177 -113
  5. signalwire_agents/core/agent_base.py +1 -3
  6. signalwire_agents/core/logging_config.py +232 -0
  7. signalwire_agents/core/swaig_function.py +2 -3
  8. signalwire_agents/core/swml_renderer.py +43 -28
  9. signalwire_agents/search/__init__.py +131 -0
  10. signalwire_agents/search/document_processor.py +764 -0
  11. signalwire_agents/search/index_builder.py +534 -0
  12. signalwire_agents/search/query_processor.py +371 -0
  13. signalwire_agents/search/search_engine.py +383 -0
  14. signalwire_agents/search/search_service.py +251 -0
  15. signalwire_agents/skills/native_vector_search/__init__.py +1 -0
  16. signalwire_agents/skills/native_vector_search/skill.py +352 -0
  17. signalwire_agents/skills/registry.py +2 -15
  18. signalwire_agents/utils/__init__.py +13 -1
  19. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/METADATA +110 -3
  20. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/RECORD +25 -16
  21. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/entry_points.txt +1 -0
  22. signalwire_agents/utils/serverless.py +0 -38
  23. {signalwire_agents-0.1.11.data → signalwire_agents-0.1.13.data}/data/schema.json +0 -0
  24. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/WHEEL +0 -0
  25. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/licenses/LICENSE +0 -0
  26. {signalwire_agents-0.1.11.dist-info → signalwire_agents-0.1.13.dist-info}/top_level.txt +0 -0
signalwire_agents/search/index_builder.py (new file)
@@ -0,0 +1,534 @@
+ """
+ Copyright (c) 2025 SignalWire
+
+ This file is part of the SignalWire AI Agents SDK.
+
+ Licensed under the MIT License.
+ See LICENSE file in the project root for full license information.
+ """
+
+ import os
+ import sqlite3
+ import json
+ import hashlib
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import List, Optional, Dict, Any
+ import fnmatch
+
+ try:
+     import numpy as np
+ except ImportError:
+     np = None
+
+ try:
+     from sentence_transformers import SentenceTransformer
+ except ImportError:
+     SentenceTransformer = None
+
+ from .document_processor import DocumentProcessor
+ from .query_processor import preprocess_document_content
+
+ logger = logging.getLogger(__name__)
+
+ class IndexBuilder:
+     """Build searchable indexes from document directories"""
+
+     def __init__(self, model_name: str = 'sentence-transformers/all-mpnet-base-v2',
+                  chunking_strategy: str = 'sentence',
+                  max_sentences_per_chunk: int = 50,
+                  chunk_size: int = 50,
+                  chunk_overlap: int = 10,
+                  split_newlines: Optional[int] = None,
+                  verbose: bool = False):
+         self.model_name = model_name
+         self.chunking_strategy = chunking_strategy
+         self.max_sentences_per_chunk = max_sentences_per_chunk
+         self.chunk_size = chunk_size
+         self.chunk_overlap = chunk_overlap
+         self.split_newlines = split_newlines
+         self.verbose = verbose
+         self.model = None
+         self.doc_processor = DocumentProcessor(
+             chunking_strategy=chunking_strategy,
+             max_sentences_per_chunk=max_sentences_per_chunk,
+             chunk_size=chunk_size,
+             overlap_size=chunk_overlap,
+             split_newlines=split_newlines
+         )
+
+     def _load_model(self):
+         """Load embedding model (lazy loading)"""
+         if self.model is None:
+             if not SentenceTransformer:
+                 raise ImportError("sentence-transformers is required for embedding generation. Install with: pip install sentence-transformers")
+
+             if self.verbose:
+                 print(f"Loading embedding model: {self.model_name}")
+
+             try:
+                 self.model = SentenceTransformer(self.model_name)
+             except Exception as e:
+                 logger.error(f"Failed to load model '{self.model_name}': {e}")
+                 raise
+
+     def build_index_from_sources(self, sources: List[Path], output_file: str,
+                                  file_types: List[str], exclude_patterns: Optional[List[str]] = None,
+                                  languages: List[str] = None, tags: Optional[List[str]] = None):
+         """
+         Build complete search index from multiple sources (files and directories)
+
+         Args:
+             sources: List of Path objects (files and/or directories)
+             output_file: Output .swsearch file path
+             file_types: List of file extensions to include for directories
+             exclude_patterns: Glob patterns to exclude
+             languages: List of languages to support
+             tags: Global tags to add to all chunks
+         """
+
+         # Discover files from all sources
+         files = self._discover_files_from_sources(sources, file_types, exclude_patterns)
+         if self.verbose:
+             print(f"Found {len(files)} files to process")
+
+         if not files:
+             print("No files found to process. Check your sources, file types and exclude patterns.")
+             return
+
+         # Process documents
+         chunks = []
+         for file_path in files:
+             try:
+                 # For individual files, use the file's parent as the base directory
+                 # For files from directories, use the original source directory
+                 base_dir = self._get_base_directory_for_file(file_path, sources)
+                 file_chunks = self._process_file(file_path, base_dir, tags)
+                 chunks.extend(file_chunks)
+                 if self.verbose:
+                     print(f"Processed {file_path}: {len(file_chunks)} chunks")
+             except Exception as e:
+                 logger.error(f"Error processing {file_path}: {e}")
+                 if self.verbose:
+                     print(f"Error processing {file_path}: {e}")
+
+         if not chunks:
+             print("No chunks created from documents. Check file contents and processing.")
+             return
+
+         if self.verbose:
+             print(f"Created {len(chunks)} total chunks")
+
+         # Generate embeddings
+         self._load_model()
+         if self.verbose:
+             print("Generating embeddings...")
+
+         for i, chunk in enumerate(chunks):
+             try:
+                 # Preprocess content for better search
+                 processed = preprocess_document_content(
+                     chunk['content'],
+                     language=chunk.get('language', 'en')
+                 )
+
+                 chunk['processed_content'] = processed['enhanced_text']
+                 chunk['keywords'] = processed.get('keywords', [])
+
+                 # Generate embedding (suppress progress bar)
+                 embedding = self.model.encode(processed['enhanced_text'], show_progress_bar=False)
+                 chunk['embedding'] = embedding.tobytes()
+
+                 if self.verbose and (i + 1) % 50 == 0:
+                     progress_pct = ((i + 1) / len(chunks)) * 100
+                     print(f"Generated embeddings: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")
+
+             except Exception as e:
+                 logger.error(f"Error processing chunk {i}: {e}")
+                 # Use original content as fallback
+                 chunk['processed_content'] = chunk['content']
+                 chunk['keywords'] = []
+                 # Create zero embedding as fallback
+                 if np:
+                     embedding = np.zeros(768, dtype=np.float32)
+                     chunk['embedding'] = embedding.tobytes()
+                 else:
+                     chunk['embedding'] = b''
+
+         # Create SQLite database
+         sources_info = [str(s) for s in sources]
+         self._create_database(output_file, chunks, languages or ['en'], sources_info, file_types)
+
+         if self.verbose:
+             print(f"Index created: {output_file}")
+             print(f"Total chunks: {len(chunks)}")
+
+     def build_index(self, source_dir: str, output_file: str,
+                     file_types: List[str], exclude_patterns: Optional[List[str]] = None,
+                     languages: List[str] = None, tags: Optional[List[str]] = None):
+         """
+         Build complete search index from a single directory (legacy method)
+
+         Args:
+             source_dir: Directory to scan for documents
+             output_file: Output .swsearch file path
+             file_types: List of file extensions to include
+             exclude_patterns: Glob patterns to exclude
+             languages: List of languages to support
+             tags: Global tags to add to all chunks
+         """
+
+         # Convert to new multi-source method
+         sources = [Path(source_dir)]
+         self.build_index_from_sources(sources, output_file, file_types, exclude_patterns, languages, tags)
+
+     def _get_base_directory_for_file(self, file_path: Path, sources: List[Path]) -> str:
+         """
+         Determine the appropriate base directory for a file to calculate relative paths
+
+         Args:
+             file_path: The file being processed
+             sources: List of original source paths
+
+         Returns:
+             Base directory path as string
+         """
+
+         # Check if this file was specified directly as a source
+         if file_path in sources:
+             # For individual files, use the parent directory
+             return str(file_path.parent)
+
+         # Check if this file is within any of the source directories
+         for source in sources:
+             if source.is_dir():
+                 try:
+                     # Check if file_path is relative to this source directory
+                     file_path.relative_to(source)
+                     return str(source)
+                 except ValueError:
+                     # file_path is not relative to this source
+                     continue
+
+         # Fallback: use the file's parent directory
+         return str(file_path.parent)
+
+     def _discover_files_from_sources(self, sources: List[Path], file_types: List[str],
+                                      exclude_patterns: Optional[List[str]] = None) -> List[Path]:
+         """
+         Discover files from multiple sources (files and directories)
+
+         Args:
+             sources: List of Path objects (files and/or directories)
+             file_types: List of file extensions to include for directories
+             exclude_patterns: Glob patterns to exclude
+
+         Returns:
+             List of file paths to process
+         """
+
+         files = []
+         supported_extensions = set(ft.lstrip('.').lower() for ft in file_types)
+
+         for source in sources:
+             if source.is_file():
+                 # Individual file - check if it's supported
+                 file_ext = source.suffix.lstrip('.').lower()
+                 if file_ext in supported_extensions or not file_ext:  # Allow extensionless files
+                     # Check exclusions
+                     if self._is_file_excluded(source, exclude_patterns):
+                         if self.verbose:
+                             print(f"Excluded file: {source}")
+                         continue
+
+                     files.append(source)
+                     if self.verbose:
+                         print(f"Added individual file: {source}")
+                 else:
+                     if self.verbose:
+                         print(f"Skipped unsupported file type: {source} (extension: {file_ext})")
+
+             elif source.is_dir():
+                 # Directory - use existing discovery logic
+                 dir_files = self._discover_files(str(source), file_types, exclude_patterns)
+                 files.extend(dir_files)
+                 if self.verbose:
+                     print(f"Added {len(dir_files)} files from directory: {source}")
+             else:
+                 if self.verbose:
+                     print(f"Skipped non-existent or invalid source: {source}")
+
+         # Remove duplicates while preserving order
+         seen = set()
+         unique_files = []
+         for file_path in files:
+             if file_path not in seen:
+                 seen.add(file_path)
+                 unique_files.append(file_path)
+
+         return unique_files
+
+     def _is_file_excluded(self, file_path: Path, exclude_patterns: Optional[List[str]] = None) -> bool:
+         """
+         Check if a file should be excluded based on exclude patterns
+
+         Args:
+             file_path: Path to check
+             exclude_patterns: List of glob patterns to exclude
+
+         Returns:
+             True if file should be excluded
+         """
+
+         if not exclude_patterns:
+             return False
+
+         import fnmatch
+
+         file_str = str(file_path)
+         for pattern in exclude_patterns:
+             if fnmatch.fnmatch(file_str, pattern):
+                 return True
+
+         return False
+
+     def _discover_files(self, source_dir: str, file_types: List[str],
+                         exclude_patterns: Optional[List[str]] = None) -> List[Path]:
+         """Discover files to index"""
+         files = []
+         source_path = Path(source_dir)
+
+         if not source_path.exists():
+             raise FileNotFoundError(f"Source directory does not exist: {source_dir}")
+
+         for file_type in file_types:
+             # Clean up file type (remove leading dots)
+             clean_type = file_type.lstrip('.')
+             pattern = f"**/*.{clean_type}"
+
+             for file_path in source_path.glob(pattern):
+                 # Skip directories
+                 if not file_path.is_file():
+                     continue
+
+                 # Check exclusions
+                 if exclude_patterns:
+                     excluded = False
+                     for pattern in exclude_patterns:
+                         if fnmatch.fnmatch(str(file_path), pattern):
+                             excluded = True
+                             break
+                     if excluded:
+                         if self.verbose:
+                             print(f"Excluded: {file_path}")
+                         continue
+
+                 files.append(file_path)
+
+         return files
+
+     def _process_file(self, file_path: Path, source_dir: str,
+                       global_tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+         """Process single file into chunks"""
+         try:
+             # Try to read as text first
+             try:
+                 content = file_path.read_text(encoding='utf-8')
+             except UnicodeDecodeError:
+                 if self.verbose:
+                     print(f"Skipping binary file: {file_path}")
+                 return []
+
+             relative_path = str(file_path.relative_to(source_dir))
+
+             # Create chunks using document processor - pass content directly, not file path
+             chunks = self.doc_processor.create_chunks(
+                 content=content,  # Pass the actual content, not the file path
+                 filename=relative_path,
+                 file_type=file_path.suffix.lstrip('.')
+             )
+
+             # Add global tags
+             if global_tags:
+                 for chunk in chunks:
+                     existing_tags = chunk.get('tags', [])
+                     if isinstance(existing_tags, str):
+                         existing_tags = [existing_tags]
+                     chunk['tags'] = existing_tags + global_tags
+
+             return chunks
+
+         except Exception as e:
+             logger.error(f"Error processing file {file_path}: {e}")
+             return []
+
+     def _create_database(self, output_file: str, chunks: List[Dict[str, Any]],
+                          languages: List[str], sources_info: List[str], file_types: List[str]):
+         """Create SQLite database with all data"""
+
+         # Remove existing file
+         if os.path.exists(output_file):
+             os.remove(output_file)
+
+         conn = sqlite3.connect(output_file)
+         cursor = conn.cursor()
+
+         try:
+             # Create schema
+             cursor.execute('''
+                 CREATE TABLE chunks (
+                     id INTEGER PRIMARY KEY AUTOINCREMENT,
+                     content TEXT NOT NULL,
+                     processed_content TEXT NOT NULL,
+                     keywords TEXT,
+                     language TEXT DEFAULT 'en',
+                     embedding BLOB NOT NULL,
+                     filename TEXT NOT NULL,
+                     section TEXT,
+                     start_line INTEGER,
+                     end_line INTEGER,
+                     tags TEXT,
+                     metadata TEXT,
+                     chunk_hash TEXT UNIQUE,
+                     created_at TEXT DEFAULT CURRENT_TIMESTAMP
+                 )
+             ''')
+
+             cursor.execute('''
+                 CREATE VIRTUAL TABLE chunks_fts USING fts5(
+                     processed_content,
+                     keywords,
+                     content='chunks',
+                     content_rowid='id'
+                 )
+             ''')
+
+             cursor.execute('''
+                 CREATE TABLE synonyms (
+                     word TEXT,
+                     pos_tag TEXT,
+                     synonyms TEXT,
+                     language TEXT DEFAULT 'en',
+                     PRIMARY KEY (word, pos_tag, language)
+                 )
+             ''')
+
+             cursor.execute('''
+                 CREATE TABLE config (
+                     key TEXT PRIMARY KEY,
+                     value TEXT
+                 )
+             ''')
+
+             # Create indexes for performance
+             cursor.execute('CREATE INDEX idx_chunks_filename ON chunks(filename)')
+             cursor.execute('CREATE INDEX idx_chunks_language ON chunks(language)')
+             cursor.execute('CREATE INDEX idx_chunks_tags ON chunks(tags)')
+
+             # Insert config
+             embedding_dimensions = 768  # Default for all-mpnet-base-v2
+             if chunks and chunks[0].get('embedding'):
+                 try:
+                     if np:
+                         embedding_array = np.frombuffer(chunks[0]['embedding'], dtype=np.float32)
+                         embedding_dimensions = len(embedding_array)
+                 except:
+                     pass
+
+             config_data = {
+                 'embedding_model': self.model_name,
+                 'embedding_dimensions': str(embedding_dimensions),
+                 'chunk_size': str(self.chunk_size),
+                 'chunk_overlap': str(self.chunk_overlap),
+                 'preprocessing_version': '1.0',
+                 'languages': json.dumps(languages),
+                 'created_at': datetime.now().isoformat(),
+                 'sources': json.dumps(sources_info),  # Store list of sources instead of single directory
+                 'file_types': json.dumps(file_types)
+             }
+
+             for key, value in config_data.items():
+                 cursor.execute('INSERT INTO config (key, value) VALUES (?, ?)', (key, value))
+
+             # Insert chunks
+             for chunk in chunks:
+                 # Create hash for deduplication - include filename, section, and line numbers for uniqueness
+                 hash_content = f"{chunk['filename']}:{chunk.get('section', '')}:{chunk.get('start_line', 0)}:{chunk.get('end_line', 0)}:{chunk['content']}"
+                 chunk_hash = hashlib.sha256(hash_content.encode()).hexdigest()[:16]
+
+                 # Prepare data
+                 keywords_json = json.dumps(chunk.get('keywords', []))
+                 tags_json = json.dumps(chunk.get('tags', []))
+                 metadata_json = json.dumps(chunk.get('metadata', {}))
+
+                 cursor.execute('''
+                     INSERT OR IGNORE INTO chunks (
+                         content, processed_content, keywords, language, embedding,
+                         filename, section, start_line, end_line, tags, metadata, chunk_hash
+                     ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                 ''', (
+                     chunk['content'],
+                     chunk.get('processed_content', chunk['content']),
+                     keywords_json,
+                     chunk.get('language', 'en'),
+                     chunk.get('embedding', b''),
+                     chunk['filename'],
+                     chunk.get('section'),
+                     chunk.get('start_line'),
+                     chunk.get('end_line'),
+                     tags_json,
+                     metadata_json,
+                     chunk_hash
+                 ))
+
+             conn.commit()
+
+         except Exception as e:
+             conn.rollback()
+             raise e
+         finally:
+             conn.close()
+
+     def validate_index(self, index_file: str) -> Dict[str, Any]:
+         """Validate an existing search index"""
+         if not os.path.exists(index_file):
+             return {"valid": False, "error": "Index file does not exist"}
+
+         try:
+             conn = sqlite3.connect(index_file)
+             cursor = conn.cursor()
+
+             # Check schema
+             cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+             tables = [row[0] for row in cursor.fetchall()]
+
+             required_tables = ['chunks', 'chunks_fts', 'synonyms', 'config']
+             missing_tables = [t for t in required_tables if t not in tables]
+
+             if missing_tables:
+                 return {"valid": False, "error": f"Missing tables: {missing_tables}"}
+
+             # Get config
+             cursor.execute("SELECT key, value FROM config")
+             config = dict(cursor.fetchall())
+
+             # Get chunk count
+             cursor.execute("SELECT COUNT(*) FROM chunks")
+             chunk_count = cursor.fetchone()[0]
+
+             # Get file count
+             cursor.execute("SELECT COUNT(DISTINCT filename) FROM chunks")
+             file_count = cursor.fetchone()[0]
+
+             conn.close()
+
+             return {
+                 "valid": True,
+                 "chunk_count": chunk_count,
+                 "file_count": file_count,
+                 "config": config
+             }
+
+         except Exception as e:
+             return {"valid": False, "error": str(e)}