signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. signalwire_agents/__init__.py +130 -4
  2. signalwire_agents/agent_server.py +438 -32
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +18 -0
  5. signalwire_agents/cli/build_search.py +1367 -0
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/execution/__init__.py +10 -0
  13. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  14. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  15. signalwire_agents/cli/init_project.py +1225 -0
  16. signalwire_agents/cli/output/__init__.py +10 -0
  17. signalwire_agents/cli/output/output_formatter.py +255 -0
  18. signalwire_agents/cli/output/swml_dump.py +186 -0
  19. signalwire_agents/cli/simulation/__init__.py +10 -0
  20. signalwire_agents/cli/simulation/data_generation.py +374 -0
  21. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  22. signalwire_agents/cli/simulation/mock_env.py +282 -0
  23. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  24. signalwire_agents/cli/test_swaig.py +809 -0
  25. signalwire_agents/cli/types.py +81 -0
  26. signalwire_agents/core/__init__.py +2 -2
  27. signalwire_agents/core/agent/__init__.py +12 -0
  28. signalwire_agents/core/agent/config/__init__.py +12 -0
  29. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  30. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  31. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  32. signalwire_agents/core/agent/prompt/manager.py +306 -0
  33. signalwire_agents/core/agent/routing/__init__.py +9 -0
  34. signalwire_agents/core/agent/security/__init__.py +9 -0
  35. signalwire_agents/core/agent/swml/__init__.py +9 -0
  36. signalwire_agents/core/agent/tools/__init__.py +15 -0
  37. signalwire_agents/core/agent/tools/decorator.py +97 -0
  38. signalwire_agents/core/agent/tools/registry.py +210 -0
  39. signalwire_agents/core/agent_base.py +959 -2166
  40. signalwire_agents/core/auth_handler.py +233 -0
  41. signalwire_agents/core/config_loader.py +259 -0
  42. signalwire_agents/core/contexts.py +707 -0
  43. signalwire_agents/core/data_map.py +487 -0
  44. signalwire_agents/core/function_result.py +1150 -1
  45. signalwire_agents/core/logging_config.py +376 -0
  46. signalwire_agents/core/mixins/__init__.py +28 -0
  47. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  48. signalwire_agents/core/mixins/auth_mixin.py +287 -0
  49. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  50. signalwire_agents/core/mixins/serverless_mixin.py +368 -0
  51. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  52. signalwire_agents/core/mixins/state_mixin.py +153 -0
  53. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  54. signalwire_agents/core/mixins/web_mixin.py +1134 -0
  55. signalwire_agents/core/security/session_manager.py +174 -86
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +200 -0
  58. signalwire_agents/core/skill_manager.py +244 -0
  59. signalwire_agents/core/swaig_function.py +33 -9
  60. signalwire_agents/core/swml_builder.py +212 -12
  61. signalwire_agents/core/swml_handler.py +43 -13
  62. signalwire_agents/core/swml_renderer.py +123 -297
  63. signalwire_agents/core/swml_service.py +277 -260
  64. signalwire_agents/prefabs/concierge.py +6 -2
  65. signalwire_agents/prefabs/info_gatherer.py +149 -33
  66. signalwire_agents/prefabs/receptionist.py +14 -22
  67. signalwire_agents/prefabs/survey.py +6 -2
  68. signalwire_agents/schema.json +9218 -5489
  69. signalwire_agents/search/__init__.py +137 -0
  70. signalwire_agents/search/document_processor.py +1223 -0
  71. signalwire_agents/search/index_builder.py +804 -0
  72. signalwire_agents/search/migration.py +418 -0
  73. signalwire_agents/search/models.py +30 -0
  74. signalwire_agents/search/pgvector_backend.py +752 -0
  75. signalwire_agents/search/query_processor.py +502 -0
  76. signalwire_agents/search/search_engine.py +1264 -0
  77. signalwire_agents/search/search_service.py +574 -0
  78. signalwire_agents/skills/README.md +452 -0
  79. signalwire_agents/skills/__init__.py +23 -0
  80. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  81. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  82. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  83. signalwire_agents/skills/datasphere/README.md +210 -0
  84. signalwire_agents/skills/datasphere/__init__.py +12 -0
  85. signalwire_agents/skills/datasphere/skill.py +310 -0
  86. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  87. signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
  88. signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
  89. signalwire_agents/skills/datetime/README.md +132 -0
  90. signalwire_agents/skills/datetime/__init__.py +10 -0
  91. signalwire_agents/skills/datetime/skill.py +126 -0
  92. signalwire_agents/skills/joke/README.md +149 -0
  93. signalwire_agents/skills/joke/__init__.py +10 -0
  94. signalwire_agents/skills/joke/skill.py +109 -0
  95. signalwire_agents/skills/math/README.md +161 -0
  96. signalwire_agents/skills/math/__init__.py +10 -0
  97. signalwire_agents/skills/math/skill.py +105 -0
  98. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  99. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  100. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  101. signalwire_agents/skills/native_vector_search/README.md +210 -0
  102. signalwire_agents/skills/native_vector_search/__init__.py +10 -0
  103. signalwire_agents/skills/native_vector_search/skill.py +820 -0
  104. signalwire_agents/skills/play_background_file/README.md +218 -0
  105. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  106. signalwire_agents/skills/play_background_file/skill.py +242 -0
  107. signalwire_agents/skills/registry.py +459 -0
  108. signalwire_agents/skills/spider/README.md +236 -0
  109. signalwire_agents/skills/spider/__init__.py +13 -0
  110. signalwire_agents/skills/spider/skill.py +598 -0
  111. signalwire_agents/skills/swml_transfer/README.md +395 -0
  112. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  113. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  114. signalwire_agents/skills/weather_api/README.md +178 -0
  115. signalwire_agents/skills/weather_api/__init__.py +12 -0
  116. signalwire_agents/skills/weather_api/skill.py +191 -0
  117. signalwire_agents/skills/web_search/README.md +163 -0
  118. signalwire_agents/skills/web_search/__init__.py +10 -0
  119. signalwire_agents/skills/web_search/skill.py +739 -0
  120. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  121. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  122. signalwire_agents/skills/wikipedia_search/skill.py +210 -0
  123. signalwire_agents/utils/__init__.py +14 -0
  124. signalwire_agents/utils/schema_utils.py +111 -44
  125. signalwire_agents/web/__init__.py +17 -0
  126. signalwire_agents/web/web_service.py +559 -0
  127. signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
  128. signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
  129. signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
  130. signalwire_agents-1.0.7.dist-info/METADATA +992 -0
  131. signalwire_agents-1.0.7.dist-info/RECORD +142 -0
  132. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
  133. signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
  134. signalwire_agents/core/state/file_state_manager.py +0 -219
  135. signalwire_agents/core/state/state_manager.py +0 -101
  136. signalwire_agents-0.1.6.data/data/schema.json +0 -5611
  137. signalwire_agents-0.1.6.dist-info/METADATA +0 -199
  138. signalwire_agents-0.1.6.dist-info/RECORD +0 -34
  139. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
  140. {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0
signalwire_agents/search/index_builder.py
@@ -0,0 +1,804 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+import os
+import sqlite3
+import json
+import hashlib
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+import fnmatch
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None
+
+from .document_processor import DocumentProcessor
+from .query_processor import preprocess_document_content
+
+logger = logging.getLogger(__name__)
+
+class IndexBuilder:
+    """Build searchable indexes from document directories"""
+
+    def __init__(
+        self,
+        model_name: str = 'sentence-transformers/all-mpnet-base-v2',
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3,
+        backend: str = 'sqlite',
+        connection_string: Optional[str] = None
+    ):
+        """
+        Initialize the index builder
+
+        Args:
+            model_name: Name of the sentence transformer model to use
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
+            chunk_size: For sliding strategy - words per chunk (default: 50)
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
+            split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
+            backend: Storage backend ('sqlite' or 'pgvector') (default: 'sqlite')
+            connection_string: PostgreSQL connection string for pgvector backend
+        """
+        self.model_name = model_name
+        self.chunking_strategy = chunking_strategy
+        self.max_sentences_per_chunk = max_sentences_per_chunk
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.split_newlines = split_newlines
+        self.index_nlp_backend = index_nlp_backend
+        self.verbose = verbose
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
+        self.backend = backend
+        self.connection_string = connection_string
+        self.model = None
+
+        # Validate backend
+        if self.backend not in ['sqlite', 'pgvector']:
+            raise ValueError(f"Invalid backend '{self.backend}'. Must be 'sqlite' or 'pgvector'")
+
+        # Validate NLP backend
+        if self.index_nlp_backend not in ['nltk', 'spacy']:
+            logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
+            self.index_nlp_backend = 'nltk'
+
+        self.doc_processor = DocumentProcessor(
+            chunking_strategy=chunking_strategy,
+            max_sentences_per_chunk=max_sentences_per_chunk,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            split_newlines=split_newlines,
+            index_nlp_backend=self.index_nlp_backend,
+            verbose=self.verbose,
+            semantic_threshold=self.semantic_threshold,
+            topic_threshold=self.topic_threshold
+        )
+
+    def _extract_metadata_from_json_content(self, content: str) -> tuple[Dict[str, Any], str]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            (metadata_dict, metadata_text)
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                # Look for metadata object in content
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        # Create searchable text from all metadata keys and values
+        metadata_text_parts = []
+        for key, value in metadata_dict.items():
+            # Add key
+            metadata_text_parts.append(str(key))
+            # Add value(s)
+            if isinstance(value, list):
+                metadata_text_parts.extend(str(v) for v in value)
+            else:
+                metadata_text_parts.append(str(value))
+
+        metadata_text = ' '.join(metadata_text_parts).lower()
+
+        return metadata_dict, metadata_text
+
+    def _load_model(self):
+        """Load embedding model (lazy loading)"""
+        if self.model is None:
+            if not SentenceTransformer:
+                raise ImportError("sentence-transformers is required for embedding generation. Install with: pip install sentence-transformers")
+
+            if self.verbose:
+                print(f"Loading embedding model: {self.model_name}")
+
+            try:
+                self.model = SentenceTransformer(self.model_name)
+            except Exception as e:
+                logger.error(f"Failed to load model '{self.model_name}': {e}")
+                raise
+
+    def build_index_from_sources(self, sources: List[Path], output_file: str,
+                                 file_types: List[str], exclude_patterns: Optional[List[str]] = None,
+                                 languages: List[str] = None, tags: Optional[List[str]] = None,
+                                 overwrite: bool = False):
+        """
+        Build complete search index from multiple sources (files and directories)
+
+        Args:
+            sources: List of Path objects (files and/or directories)
+            output_file: Output .swsearch file path
+            file_types: List of file extensions to include for directories
+            exclude_patterns: Glob patterns to exclude
+            languages: List of languages to support
+            tags: Global tags to add to all chunks
+        """
+
+        # Discover files from all sources
+        files = self._discover_files_from_sources(sources, file_types, exclude_patterns)
+        if self.verbose:
+            print(f"Found {len(files)} files to process")
+
+        if not files:
+            print("No files found to process. Check your sources, file types and exclude patterns.")
+            return
+
+        # Process documents
+        chunks = []
+        print(f"Processing {len(files)} files...")
+        for file_path in files:
+            try:
+                # For individual files, use the file's parent as the base directory
+                # For files from directories, use the original source directory
+                base_dir = self._get_base_directory_for_file(file_path, sources)
+                file_chunks = self._process_file(file_path, base_dir, tags)
+                chunks.extend(file_chunks)
+                if self.verbose or file_path.suffix == '.json':
+                    print(f"  {file_path}: {len(file_chunks)} chunks")
+            except Exception as e:
+                logger.error(f"Error processing {file_path}: {e}")
+                if self.verbose:
+                    print(f"Error processing {file_path}: {e}")
+
+        if not chunks:
+            print("No chunks created from documents. Check file contents and processing.")
+            return
+
+        if self.verbose:
+            print(f"Created {len(chunks)} total chunks")
+
+        # Generate embeddings
+        self._load_model()
+        if self.verbose:
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+        else:
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+
+        for i, chunk in enumerate(chunks):
+            try:
+                # Preprocess content for better search
+                processed = preprocess_document_content(
+                    chunk['content'],
+                    language=chunk.get('language', 'en'),
+                    index_nlp_backend=self.index_nlp_backend
+                )
+
+                chunk['processed_content'] = processed['enhanced_text']
+
+                # Include tags in keywords for better search matching
+                keywords = processed.get('keywords', [])
+                chunk_tags = chunk.get('tags', [])
+                if chunk_tags:
+                    # Add tags to keywords list for FTS matching
+                    keywords.extend(chunk_tags)
+                    # Remove duplicates while preserving order
+                    keywords = list(dict.fromkeys(keywords))
+
+                chunk['keywords'] = keywords
+
+                # For embedding, include tags in the text for better semantic matching
+                embedding_text = processed['enhanced_text']
+                if chunk_tags:
+                    # Append tags to the text for embedding generation
+                    embedding_text += " " + " ".join(chunk_tags)
+
+                # Generate embedding (suppress progress bar)
+                embedding = self.model.encode(embedding_text, show_progress_bar=False)
+                chunk['embedding'] = embedding.tobytes()
+
+                # Show progress more frequently
+                show_every = 50 if len(chunks) > 500 else max(10, len(chunks) // 10)
+                if (i + 1) % show_every == 0 or (i + 1) == len(chunks):
+                    progress_pct = ((i + 1) / len(chunks)) * 100
+                    print(f"  Progress: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")
+
+            except Exception as e:
+                logger.error(f"Error processing chunk {i}: {e}")
+                # Use original content as fallback
+                chunk['processed_content'] = chunk['content']
+                chunk['keywords'] = []
+                # Create zero embedding as fallback
+                if np:
+                    embedding = np.zeros(768, dtype=np.float32)
+                    chunk['embedding'] = embedding.tobytes()
+                else:
+                    chunk['embedding'] = b''
+
+        # Store chunks based on backend
+        if self.backend == 'sqlite':
+            # Create SQLite database
+            sources_info = [str(s) for s in sources]
+            self._create_database(output_file, chunks, languages or ['en'], sources_info, file_types)
+
+            if self.verbose:
+                print(f"Index created: {output_file}")
+                print(f"Total chunks: {len(chunks)}")
+        else:
+            # Use pgvector backend
+            self._store_chunks_pgvector(chunks, output_file, languages or ['en'], overwrite)
+
+    def build_index(self, source_dir: str, output_file: str,
+                    file_types: List[str], exclude_patterns: Optional[List[str]] = None,
+                    languages: List[str] = None, tags: Optional[List[str]] = None):
+        """
+        Build complete search index from a single directory
+
+        Args:
+            source_dir: Directory to scan for documents
+            output_file: Output .swsearch file path
+            file_types: List of file extensions to include
+            exclude_patterns: Glob patterns to exclude
+            languages: List of languages to support
+            tags: Global tags to add to all chunks
+        """
+
+        # Convert to new multi-source method
+        sources = [Path(source_dir)]
+        self.build_index_from_sources(sources, output_file, file_types, exclude_patterns, languages, tags)
+
+    def _get_base_directory_for_file(self, file_path: Path, sources: List[Path]) -> str:
+        """
+        Determine the appropriate base directory for a file to calculate relative paths
+
+        Args:
+            file_path: The file being processed
+            sources: List of original source paths
+
+        Returns:
+            Base directory path as string
+        """
+
+        # Check if this file was specified directly as a source
+        if file_path in sources:
+            # For individual files, use the parent directory
+            return str(file_path.parent)
+
+        # Check if this file is within any of the source directories
+        for source in sources:
+            if source.is_dir():
+                try:
+                    # Check if file_path is relative to this source directory
+                    file_path.relative_to(source)
+                    return str(source)
+                except ValueError:
+                    # file_path is not relative to this source
+                    continue
+
+        # Fallback: use the file's parent directory
+        return str(file_path.parent)
+
+    def _discover_files_from_sources(self, sources: List[Path], file_types: List[str],
+                                     exclude_patterns: Optional[List[str]] = None) -> List[Path]:
+        """
+        Discover files from multiple sources (files and directories)
+
+        Args:
+            sources: List of Path objects (files and/or directories)
+            file_types: List of file extensions to include for directories
+            exclude_patterns: Glob patterns to exclude
+
+        Returns:
+            List of file paths to process
+        """
+
+        files = []
+        supported_extensions = set(ft.lstrip('.').lower() for ft in file_types)
+
+        for source in sources:
+            if source.is_file():
+                # Individual file - check if it's supported
+                file_ext = source.suffix.lstrip('.').lower()
+                if file_ext in supported_extensions or not file_ext:  # Allow extensionless files
+                    # Check exclusions
+                    if self._is_file_excluded(source, exclude_patterns):
+                        if self.verbose:
+                            print(f"Excluded file: {source}")
+                        continue
+
+                    files.append(source)
+                    if self.verbose:
+                        print(f"Added individual file: {source}")
+                else:
+                    if self.verbose:
+                        print(f"Skipped unsupported file type: {source} (extension: {file_ext})")
+
+            elif source.is_dir():
+                # Directory - use existing discovery logic
+                dir_files = self._discover_files(str(source), file_types, exclude_patterns)
+                files.extend(dir_files)
+                if self.verbose:
+                    print(f"Added {len(dir_files)} files from directory: {source}")
+            else:
+                if self.verbose:
+                    print(f"Skipped non-existent or invalid source: {source}")
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_files = []
+        for file_path in files:
+            if file_path not in seen:
+                seen.add(file_path)
+                unique_files.append(file_path)
+
+        return unique_files
+
+    def _is_file_excluded(self, file_path: Path, exclude_patterns: Optional[List[str]] = None) -> bool:
+        """
+        Check if a file should be excluded based on exclude patterns
+
+        Args:
+            file_path: Path to check
+            exclude_patterns: List of glob patterns to exclude
+
+        Returns:
+            True if file should be excluded
+        """
+
+        if not exclude_patterns:
+            return False
+
+        import fnmatch
+
+        file_str = str(file_path)
+        for pattern in exclude_patterns:
+            if fnmatch.fnmatch(file_str, pattern):
+                return True
+
+        return False
+
+    def _discover_files(self, source_dir: str, file_types: List[str],
+                        exclude_patterns: Optional[List[str]] = None) -> List[Path]:
+        """Discover files to index"""
+        files = []
+        source_path = Path(source_dir)
+
+        if not source_path.exists():
+            raise FileNotFoundError(f"Source directory does not exist: {source_dir}")
+
+        for file_type in file_types:
+            # Clean up file type (remove leading dots)
+            clean_type = file_type.lstrip('.')
+            pattern = f"**/*.{clean_type}"
+
+            for file_path in source_path.glob(pattern):
+                # Skip directories
+                if not file_path.is_file():
+                    continue
+
+                # Check exclusions
+                if exclude_patterns:
+                    excluded = False
+                    for pattern in exclude_patterns:
+                        if fnmatch.fnmatch(str(file_path), pattern):
+                            excluded = True
+                            break
+                    if excluded:
+                        if self.verbose:
+                            print(f"Excluded: {file_path}")
+                        continue
+
+                files.append(file_path)
+
+        return files
+
+    def _process_file(self, file_path: Path, source_dir: str,
+                      global_tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """Process single file into chunks"""
+        try:
+            relative_path = str(file_path.relative_to(source_dir))
+            file_extension = file_path.suffix.lower()
+
+            # Handle different file types appropriately
+            if file_extension == '.pdf':
+                # Use document processor for PDF extraction
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping PDF file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension in ['.docx', '.xlsx', '.pptx']:
+                # Use document processor for Office documents
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping office document (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension == '.html':
+                # Use document processor for HTML
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping HTML file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension == '.rtf':
+                # Use document processor for RTF
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping RTF file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            else:
+                # Try to read as text file (markdown, txt, code, etc.)
+                try:
+                    content = file_path.read_text(encoding='utf-8')
+                except UnicodeDecodeError:
+                    if self.verbose:
+                        print(f"Skipping binary file: {file_path}")
+                    return []
+
+            # Validate content
+            if not content or (isinstance(content, str) and len(content.strip()) == 0):
+                if self.verbose:
+                    print(f"Skipping empty file: {file_path}")
+                return []
+
+            # Create chunks using document processor - pass content directly, not file path
+            chunks = self.doc_processor.create_chunks(
+                content=content,  # Pass the actual content, not the file path
+                filename=relative_path,
+                file_type=file_path.suffix.lstrip('.')
+            )
+
+            # Add global tags
+            if global_tags:
+                for chunk in chunks:
+                    existing_tags = chunk.get('tags', [])
+                    if isinstance(existing_tags, str):
+                        existing_tags = [existing_tags]
+                    chunk['tags'] = existing_tags + global_tags
+
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error processing file {file_path}: {e}")
+            return []
+
+    def _create_database(self, output_file: str, chunks: List[Dict[str, Any]],
+                         languages: List[str], sources_info: List[str], file_types: List[str]):
+        """Create SQLite database with all data"""
+
+        # Remove existing file
+        if os.path.exists(output_file):
+            os.remove(output_file)
+
+        conn = sqlite3.connect(output_file)
+        cursor = conn.cursor()
+
+        try:
+            # Create schema
+            cursor.execute('''
+                CREATE TABLE chunks (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    content TEXT NOT NULL,
+                    processed_content TEXT NOT NULL,
+                    keywords TEXT,
+                    language TEXT DEFAULT 'en',
+                    embedding BLOB NOT NULL,
+                    filename TEXT NOT NULL,
+                    section TEXT,
+                    start_line INTEGER,
+                    end_line INTEGER,
+                    tags TEXT,
+                    metadata TEXT,
+                    metadata_text TEXT,  -- Searchable text representation of all metadata
+                    chunk_hash TEXT UNIQUE,
+                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE VIRTUAL TABLE chunks_fts USING fts5(
+                    processed_content,
+                    keywords,
+                    metadata_text,
+                    content='chunks',
+                    content_rowid='id'
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE synonyms (
+                    word TEXT,
+                    pos_tag TEXT,
+                    synonyms TEXT,
+                    language TEXT DEFAULT 'en',
+                    PRIMARY KEY (word, pos_tag, language)
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE config (
+                    key TEXT PRIMARY KEY,
+                    value TEXT
+                )
+            ''')
+
+            # Create indexes for performance
+            cursor.execute('CREATE INDEX idx_chunks_filename ON chunks(filename)')
+            cursor.execute('CREATE INDEX idx_chunks_language ON chunks(language)')
+            cursor.execute('CREATE INDEX idx_chunks_tags ON chunks(tags)')
+
+            # Insert config
+            embedding_dimensions = 768  # Default for all-mpnet-base-v2
+            if chunks and chunks[0].get('embedding'):
+                try:
+                    if np:
+                        embedding_array = np.frombuffer(chunks[0]['embedding'], dtype=np.float32)
+                        embedding_dimensions = len(embedding_array)
+                except:
+                    pass
+
+            config_data = {
+                'embedding_model': self.model_name,
+                'embedding_dimensions': str(embedding_dimensions),
+                'chunk_size': str(self.chunk_size),
+                'chunk_overlap': str(self.chunk_overlap),
+                'preprocessing_version': '1.0',
+                'languages': json.dumps(languages),
+                'created_at': datetime.now().isoformat(),
+                'sources': json.dumps(sources_info),  # Store list of sources instead of single directory
+                'file_types': json.dumps(file_types)
+            }
+
+            for key, value in config_data.items():
+                cursor.execute('INSERT INTO config (key, value) VALUES (?, ?)', (key, value))
+
+            # Insert chunks
+            for chunk in chunks:
+                # Create hash for deduplication - include filename, section, and line numbers for uniqueness
+                hash_content = f"{chunk['filename']}:{chunk.get('section', '')}:{chunk.get('start_line', 0)}:{chunk.get('end_line', 0)}:{chunk['content']}"
+                chunk_hash = hashlib.sha256(hash_content.encode()).hexdigest()[:16]
+
+                # Prepare data
+                keywords_json = json.dumps(chunk.get('keywords', []))
+                tags_json = json.dumps(chunk.get('tags', []))
+
+                # Extract metadata from JSON content and merge with chunk metadata
+                json_metadata, json_metadata_text = self._extract_metadata_from_json_content(chunk['content'])
+                chunk_metadata = chunk.get('metadata', {})
+
+                # Merge metadata: chunk metadata takes precedence
+                merged_metadata = {**json_metadata, **chunk_metadata}
+                metadata_json = json.dumps(merged_metadata)
+
+                # Create comprehensive metadata_text including tags
+                metadata_text_parts = []
+
+                # Add metadata text from JSON content
+                if json_metadata_text:
+                    metadata_text_parts.append(json_metadata_text)
+
+                # Add tags
+                tags = chunk.get('tags', [])
+                if tags:
+                    metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+                # Add section if present
+                if chunk.get('section'):
+                    metadata_text_parts.append(chunk['section'].lower())
+
+                # Add any additional metadata values
+                for key, value in chunk_metadata.items():
+                    if key not in json_metadata:  # Avoid duplicates
+                        metadata_text_parts.append(str(key).lower())
+                        if isinstance(value, list):
+                            metadata_text_parts.extend(str(v).lower() for v in value)
+                        else:
+                            metadata_text_parts.append(str(value).lower())
+
+                metadata_text = ' '.join(metadata_text_parts)
+
+                cursor.execute('''
+                    INSERT OR IGNORE INTO chunks (
+                        content, processed_content, keywords, language, embedding,
+                        filename, section, start_line, end_line, tags, metadata, metadata_text, chunk_hash
+                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                ''', (
+                    chunk['content'],
+                    chunk.get('processed_content', chunk['content']),
+                    keywords_json,
+                    chunk.get('language', 'en'),
+                    chunk.get('embedding', b''),
+                    chunk['filename'],
+                    chunk.get('section'),
+                    chunk.get('start_line'),
+                    chunk.get('end_line'),
+                    tags_json,
+                    metadata_json,
+                    metadata_text,
+                    chunk_hash
+                ))
+
+            conn.commit()
+
+        except Exception as e:
+            conn.rollback()
+            raise e
+        finally:
+            conn.close()
+
+    def validate_index(self, index_file: str) -> Dict[str, Any]:
+        """Validate an existing search index"""
+        if not os.path.exists(index_file):
+            return {"valid": False, "error": "Index file does not exist"}
+
+        try:
+            conn = sqlite3.connect(index_file)
+            cursor = conn.cursor()
+
+            # Check schema
+            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+            tables = [row[0] for row in cursor.fetchall()]
+
+            required_tables = ['chunks', 'chunks_fts', 'synonyms', 'config']
+            missing_tables = [t for t in required_tables if t not in tables]
+
+            if missing_tables:
+                return {"valid": False, "error": f"Missing tables: {missing_tables}"}
+
+            # Get config
+            cursor.execute("SELECT key, value FROM config")
+            config = dict(cursor.fetchall())
+
+            # Get chunk count
+            cursor.execute("SELECT COUNT(*) FROM chunks")
+            chunk_count = cursor.fetchone()[0]
+
+            # Get file count
+            cursor.execute("SELECT COUNT(DISTINCT filename) FROM chunks")
+            file_count = cursor.fetchone()[0]
+
+            conn.close()
+
+            return {
+                "valid": True,
+                "chunk_count": chunk_count,
+                "file_count": file_count,
+                "config": config
+            }
+
+        except Exception as e:
+            return {"valid": False, "error": str(e)}
+
+    def _store_chunks_pgvector(self, chunks: List[Dict[str, Any]], collection_name: str,
+                               languages: List[str], overwrite: bool = False):
+        """
+        Store chunks in pgvector backend
+
+        Args:
+            chunks: List of processed chunks
+            collection_name: Name for the collection (from output_file parameter)
+            languages: List of supported languages
+        """
+        from .pgvector_backend import PgVectorBackend
+
+        # Extract collection name from the provided name
+        if collection_name.endswith('.swsearch'):
+            collection_name = collection_name[:-9]  # Remove .swsearch extension
+
+        # Clean collection name for PostgreSQL
+        import re
+        collection_name = re.sub(r'[^a-zA-Z0-9_]', '_', collection_name)
+
+        if self.verbose:
+            print(f"Storing chunks in pgvector collection: {collection_name}")
+
+        # Create backend instance
+        backend = PgVectorBackend(self.connection_string)
+
+        try:
+            # Get embedding dimensions from model
+            if self.model:
+                embedding_dim = self.model.get_sentence_embedding_dimension()
+            else:
+                embedding_dim = 768  # Default for all-mpnet-base-v2
+
+            # Delete existing collection if overwrite is requested
+            if overwrite:
+                if self.verbose:
+                    print(f"Dropping existing collection: {collection_name}")
+                backend.delete_collection(collection_name)
+
+            # Create schema
+            backend.create_schema(collection_name, embedding_dim)
+
+            # Convert embeddings from bytes to numpy arrays
+            for chunk in chunks:
+                if chunk.get('embedding') and isinstance(chunk['embedding'], bytes):
+                    if np:
+                        chunk['embedding'] = np.frombuffer(chunk['embedding'], dtype=np.float32)
+                    else:
+                        # If numpy not available, leave as bytes
+                        pass
+
+            # Prepare config
+            config = {
+                'model_name': self.model_name,
+                'embedding_dimensions': embedding_dim,
+                'chunking_strategy': self.chunking_strategy,
+                'languages': languages,
+                'metadata': {
+                    'max_sentences_per_chunk': self.max_sentences_per_chunk,
+                    'chunk_size': self.chunk_size,
+                    'chunk_overlap': self.chunk_overlap,
+                    'index_nlp_backend': self.index_nlp_backend
+                }
+            }
+
+            # Store chunks
+            backend.store_chunks(chunks, collection_name, config)
+
+            if self.verbose:
+                stats = backend.get_stats(collection_name)
+                print(f"Stored {stats['total_chunks']} chunks in pgvector")
+                print(f"Collection: {collection_name}")
+
+        finally:
+            backend.close()
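For orientation, the snippet below is a minimal usage sketch (not part of the package diff) based on the IndexBuilder API shown above. The docs/ and README.md paths, the file type list, the exclude pattern, and the tag values are illustrative placeholders; the constructor arguments and method signatures come from the code in this diff.

# Hypothetical example: build a .swsearch index from a directory and validate it.
from pathlib import Path
from signalwire_agents.search.index_builder import IndexBuilder

builder = IndexBuilder(
    model_name='sentence-transformers/all-mpnet-base-v2',  # default embedding model
    chunking_strategy='sentence',
    max_sentences_per_chunk=5,
    backend='sqlite',            # or 'pgvector' with connection_string=...
    verbose=True,
)

# Single-directory wrapper around build_index_from_sources
builder.build_index(
    source_dir='docs',                  # placeholder path
    output_file='docs.swsearch',
    file_types=['md', 'txt', 'pdf'],
    exclude_patterns=['**/drafts/**'],  # glob patterns matched with fnmatch
    tags=['documentation'],
)

# Mixed files and directories in one index
builder.build_index_from_sources(
    sources=[Path('docs'), Path('README.md')],
    output_file='combined.swsearch',
    file_types=['md', 'txt'],
)

print(builder.validate_index('docs.swsearch'))
# Expected shape: {'valid': True, 'chunk_count': ..., 'file_count': ..., 'config': {...}}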