signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. signalwire_agents/__init__.py +99 -15
  2. signalwire_agents/agent_server.py +248 -60
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +9 -0
  5. signalwire_agents/cli/build_search.py +951 -41
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/dokku.py +2320 -0
  13. signalwire_agents/cli/execution/__init__.py +10 -0
  14. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  15. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  16. signalwire_agents/cli/init_project.py +2636 -0
  17. signalwire_agents/cli/output/__init__.py +10 -0
  18. signalwire_agents/cli/output/output_formatter.py +255 -0
  19. signalwire_agents/cli/output/swml_dump.py +186 -0
  20. signalwire_agents/cli/simulation/__init__.py +10 -0
  21. signalwire_agents/cli/simulation/data_generation.py +374 -0
  22. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  23. signalwire_agents/cli/simulation/mock_env.py +282 -0
  24. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  25. signalwire_agents/cli/test_swaig.py +566 -2366
  26. signalwire_agents/cli/types.py +81 -0
  27. signalwire_agents/core/__init__.py +2 -2
  28. signalwire_agents/core/agent/__init__.py +12 -0
  29. signalwire_agents/core/agent/config/__init__.py +12 -0
  30. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  31. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  32. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  33. signalwire_agents/core/agent/prompt/manager.py +306 -0
  34. signalwire_agents/core/agent/routing/__init__.py +9 -0
  35. signalwire_agents/core/agent/security/__init__.py +9 -0
  36. signalwire_agents/core/agent/swml/__init__.py +9 -0
  37. signalwire_agents/core/agent/tools/__init__.py +15 -0
  38. signalwire_agents/core/agent/tools/decorator.py +97 -0
  39. signalwire_agents/core/agent/tools/registry.py +210 -0
  40. signalwire_agents/core/agent_base.py +845 -2916
  41. signalwire_agents/core/auth_handler.py +233 -0
  42. signalwire_agents/core/config_loader.py +259 -0
  43. signalwire_agents/core/contexts.py +418 -0
  44. signalwire_agents/core/data_map.py +3 -15
  45. signalwire_agents/core/function_result.py +116 -44
  46. signalwire_agents/core/logging_config.py +162 -18
  47. signalwire_agents/core/mixins/__init__.py +28 -0
  48. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  49. signalwire_agents/core/mixins/auth_mixin.py +280 -0
  50. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  51. signalwire_agents/core/mixins/serverless_mixin.py +460 -0
  52. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  53. signalwire_agents/core/mixins/state_mixin.py +153 -0
  54. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  55. signalwire_agents/core/mixins/web_mixin.py +1142 -0
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +84 -1
  58. signalwire_agents/core/skill_manager.py +62 -20
  59. signalwire_agents/core/swaig_function.py +18 -5
  60. signalwire_agents/core/swml_builder.py +207 -11
  61. signalwire_agents/core/swml_handler.py +27 -21
  62. signalwire_agents/core/swml_renderer.py +123 -312
  63. signalwire_agents/core/swml_service.py +171 -203
  64. signalwire_agents/mcp_gateway/__init__.py +29 -0
  65. signalwire_agents/mcp_gateway/gateway_service.py +564 -0
  66. signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
  67. signalwire_agents/mcp_gateway/session_manager.py +218 -0
  68. signalwire_agents/prefabs/concierge.py +0 -3
  69. signalwire_agents/prefabs/faq_bot.py +0 -3
  70. signalwire_agents/prefabs/info_gatherer.py +0 -3
  71. signalwire_agents/prefabs/receptionist.py +0 -3
  72. signalwire_agents/prefabs/survey.py +0 -3
  73. signalwire_agents/schema.json +9218 -5489
  74. signalwire_agents/search/__init__.py +7 -1
  75. signalwire_agents/search/document_processor.py +490 -31
  76. signalwire_agents/search/index_builder.py +307 -37
  77. signalwire_agents/search/migration.py +418 -0
  78. signalwire_agents/search/models.py +30 -0
  79. signalwire_agents/search/pgvector_backend.py +748 -0
  80. signalwire_agents/search/query_processor.py +162 -31
  81. signalwire_agents/search/search_engine.py +916 -35
  82. signalwire_agents/search/search_service.py +376 -53
  83. signalwire_agents/skills/README.md +452 -0
  84. signalwire_agents/skills/__init__.py +14 -2
  85. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  86. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  87. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  88. signalwire_agents/skills/datasphere/README.md +210 -0
  89. signalwire_agents/skills/datasphere/skill.py +84 -3
  90. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  91. signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
  92. signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
  93. signalwire_agents/skills/datetime/README.md +132 -0
  94. signalwire_agents/skills/datetime/__init__.py +9 -0
  95. signalwire_agents/skills/datetime/skill.py +20 -7
  96. signalwire_agents/skills/joke/README.md +149 -0
  97. signalwire_agents/skills/joke/__init__.py +9 -0
  98. signalwire_agents/skills/joke/skill.py +21 -0
  99. signalwire_agents/skills/math/README.md +161 -0
  100. signalwire_agents/skills/math/__init__.py +9 -0
  101. signalwire_agents/skills/math/skill.py +18 -4
  102. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  103. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  104. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  105. signalwire_agents/skills/native_vector_search/README.md +210 -0
  106. signalwire_agents/skills/native_vector_search/__init__.py +9 -0
  107. signalwire_agents/skills/native_vector_search/skill.py +569 -101
  108. signalwire_agents/skills/play_background_file/README.md +218 -0
  109. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  110. signalwire_agents/skills/play_background_file/skill.py +242 -0
  111. signalwire_agents/skills/registry.py +395 -40
  112. signalwire_agents/skills/spider/README.md +236 -0
  113. signalwire_agents/skills/spider/__init__.py +13 -0
  114. signalwire_agents/skills/spider/skill.py +598 -0
  115. signalwire_agents/skills/swml_transfer/README.md +395 -0
  116. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  117. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  118. signalwire_agents/skills/weather_api/README.md +178 -0
  119. signalwire_agents/skills/weather_api/__init__.py +12 -0
  120. signalwire_agents/skills/weather_api/skill.py +191 -0
  121. signalwire_agents/skills/web_search/README.md +163 -0
  122. signalwire_agents/skills/web_search/__init__.py +9 -0
  123. signalwire_agents/skills/web_search/skill.py +586 -112
  124. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  125. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  126. signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
  127. signalwire_agents/web/__init__.py +17 -0
  128. signalwire_agents/web/web_service.py +559 -0
  129. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
  130. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
  131. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
  132. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
  133. signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
  134. signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
  135. signalwire_agents/core/state/file_state_manager.py +0 -219
  136. signalwire_agents/core/state/state_manager.py +0 -101
  137. signalwire_agents/skills/wikipedia/__init__.py +0 -9
  138. signalwire_agents-0.1.13.data/data/schema.json +0 -5611
  139. signalwire_agents-0.1.13.dist-info/RECORD +0 -67
  140. signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
  141. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
  142. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
  143. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
signalwire_agents/search/index_builder.py

@@ -35,29 +35,117 @@ logger = logging.getLogger(__name__)
 class IndexBuilder:
     """Build searchable indexes from document directories"""
 
-    def __init__(self, model_name: str = 'sentence-transformers/all-mpnet-base-v2',
-                 chunking_strategy: str = 'sentence',
-                 max_sentences_per_chunk: int = 50,
-                 chunk_size: int = 50,
-                 chunk_overlap: int = 10,
-                 split_newlines: Optional[int] = None,
-                 verbose: bool = False):
+    def __init__(
+        self,
+        model_name: str = 'sentence-transformers/all-mpnet-base-v2',
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3,
+        backend: str = 'sqlite',
+        connection_string: Optional[str] = None
+    ):
+        """
+        Initialize the index builder
+
+        Args:
+            model_name: Name of the sentence transformer model to use
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
+            chunk_size: For sliding strategy - words per chunk (default: 50)
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
+            split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
+            backend: Storage backend ('sqlite' or 'pgvector') (default: 'sqlite')
+            connection_string: PostgreSQL connection string for pgvector backend
+        """
         self.model_name = model_name
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.index_nlp_backend = index_nlp_backend
         self.verbose = verbose
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
+        self.backend = backend
+        self.connection_string = connection_string
         self.model = None
+
+        # Validate backend
+        if self.backend not in ['sqlite', 'pgvector']:
+            raise ValueError(f"Invalid backend '{self.backend}'. Must be 'sqlite' or 'pgvector'")
+
+        # Validate NLP backend
+        if self.index_nlp_backend not in ['nltk', 'spacy']:
+            logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
+            self.index_nlp_backend = 'nltk'
+
         self.doc_processor = DocumentProcessor(
             chunking_strategy=chunking_strategy,
             max_sentences_per_chunk=max_sentences_per_chunk,
             chunk_size=chunk_size,
-            overlap_size=chunk_overlap,
-            split_newlines=split_newlines
+            chunk_overlap=chunk_overlap,
+            split_newlines=split_newlines,
+            index_nlp_backend=self.index_nlp_backend,
+            verbose=self.verbose,
+            semantic_threshold=self.semantic_threshold,
+            topic_threshold=self.topic_threshold
         )
 
+    def _extract_metadata_from_json_content(self, content: str) -> tuple[Dict[str, Any], str]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            (metadata_dict, metadata_text)
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                # Look for metadata object in content
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        # Create searchable text from all metadata keys and values
+        metadata_text_parts = []
+        for key, value in metadata_dict.items():
+            # Add key
+            metadata_text_parts.append(str(key))
+            # Add value(s)
+            if isinstance(value, list):
+                metadata_text_parts.extend(str(v) for v in value)
+            else:
+                metadata_text_parts.append(str(value))
+
+        metadata_text = ' '.join(metadata_text_parts).lower()
+
+        return metadata_dict, metadata_text
+
     def _load_model(self):
         """Load embedding model (lazy loading)"""
         if self.model is None:
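As a rough sketch (not part of the diff), the expanded constructor could be exercised like this; the connection string and database name are illustrative, while the parameter names and defaults come from the signature above:

    from signalwire_agents.search.index_builder import IndexBuilder

    # Default storage: a local SQLite .swsearch index
    builder = IndexBuilder(
        chunking_strategy='sentence',
        max_sentences_per_chunk=5,
        index_nlp_backend='nltk',
        verbose=True,
    )

    # Same pipeline, but chunks are stored in PostgreSQL/pgvector
    pg_builder = IndexBuilder(
        backend='pgvector',
        connection_string='postgresql://user:pass@localhost:5432/search',  # illustrative
    )
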
@@ -75,7 +163,8 @@ class IndexBuilder:
 
     def build_index_from_sources(self, sources: List[Path], output_file: str,
                                  file_types: List[str], exclude_patterns: Optional[List[str]] = None,
-                                 languages: List[str] = None, tags: Optional[List[str]] = None):
+                                 languages: List[str] = None, tags: Optional[List[str]] = None,
+                                 overwrite: bool = False):
         """
         Build complete search index from multiple sources (files and directories)
 
@@ -99,6 +188,7 @@
 
         # Process documents
         chunks = []
+        print(f"Processing {len(files)} files...")
         for file_path in files:
            try:
                # For individual files, use the file's parent as the base directory
@@ -106,8 +196,8 @@
                 base_dir = self._get_base_directory_for_file(file_path, sources)
                 file_chunks = self._process_file(file_path, base_dir, tags)
                 chunks.extend(file_chunks)
-                if self.verbose:
-                    print(f"Processed {file_path}: {len(file_chunks)} chunks")
+                if self.verbose or file_path.suffix == '.json':
+                    print(f" {file_path}: {len(file_chunks)} chunks")
             except Exception as e:
                 logger.error(f"Error processing {file_path}: {e}")
                 if self.verbose:
@@ -123,26 +213,47 @@
         # Generate embeddings
         self._load_model()
         if self.verbose:
-            print("Generating embeddings...")
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+        else:
+            print(f"Generating embeddings for {len(chunks)} chunks...")
 
         for i, chunk in enumerate(chunks):
             try:
                 # Preprocess content for better search
                 processed = preprocess_document_content(
                     chunk['content'],
-                    language=chunk.get('language', 'en')
+                    language=chunk.get('language', 'en'),
+                    index_nlp_backend=self.index_nlp_backend
                 )
 
                 chunk['processed_content'] = processed['enhanced_text']
-                chunk['keywords'] = processed.get('keywords', [])
+
+                # Include tags in keywords for better search matching
+                keywords = processed.get('keywords', [])
+                chunk_tags = chunk.get('tags', [])
+                if chunk_tags:
+                    # Add tags to keywords list for FTS matching
+                    keywords.extend(chunk_tags)
+                    # Remove duplicates while preserving order
+                    keywords = list(dict.fromkeys(keywords))
+
+                chunk['keywords'] = keywords
+
+                # For embedding, include tags in the text for better semantic matching
+                embedding_text = processed['enhanced_text']
+                if chunk_tags:
+                    # Append tags to the text for embedding generation
+                    embedding_text += " " + " ".join(chunk_tags)
 
                 # Generate embedding (suppress progress bar)
-                embedding = self.model.encode(processed['enhanced_text'], show_progress_bar=False)
+                embedding = self.model.encode(embedding_text, show_progress_bar=False)
                 chunk['embedding'] = embedding.tobytes()
 
-                if self.verbose and (i + 1) % 50 == 0:
+                # Show progress more frequently
+                show_every = 50 if len(chunks) > 500 else max(10, len(chunks) // 10)
+                if (i + 1) % show_every == 0 or (i + 1) == len(chunks):
                     progress_pct = ((i + 1) / len(chunks)) * 100
-                    print(f"Generated embeddings: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")
+                    print(f" Progress: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")
 
             except Exception as e:
                 logger.error(f"Error processing chunk {i}: {e}")
@@ -156,19 +267,24 @@
                 else:
                     chunk['embedding'] = b''
 
-        # Create SQLite database
-        sources_info = [str(s) for s in sources]
-        self._create_database(output_file, chunks, languages or ['en'], sources_info, file_types)
-
-        if self.verbose:
-            print(f"Index created: {output_file}")
-            print(f"Total chunks: {len(chunks)}")
+        # Store chunks based on backend
+        if self.backend == 'sqlite':
+            # Create SQLite database
+            sources_info = [str(s) for s in sources]
+            self._create_database(output_file, chunks, languages or ['en'], sources_info, file_types)
+
+            if self.verbose:
+                print(f"Index created: {output_file}")
+                print(f"Total chunks: {len(chunks)}")
+        else:
+            # Use pgvector backend
+            self._store_chunks_pgvector(chunks, output_file, languages or ['en'], overwrite)
 
     def build_index(self, source_dir: str, output_file: str,
                     file_types: List[str], exclude_patterns: Optional[List[str]] = None,
                     languages: List[str] = None, tags: Optional[List[str]] = None):
         """
-        Build complete search index from a single directory (legacy method)
+        Build complete search index from a single directory
 
         Args:
            source_dir: Directory to scan for documents
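Continuing the sketch above (again illustrative, not part of the diff), the new overwrite flag and the backend dispatch could be driven like this; the paths and file types are made up, and output_file doubles as the pgvector collection name, as _store_chunks_pgvector later in this diff shows:

    from pathlib import Path
    from signalwire_agents.search.index_builder import IndexBuilder

    builder = IndexBuilder(backend='pgvector',
                           connection_string='postgresql://user:pass@localhost:5432/search')
    builder.build_index_from_sources(
        sources=[Path('docs'), Path('README.md')],  # illustrative sources
        output_file='product_docs',                 # used as the pgvector collection name
        file_types=['md', 'txt'],
        languages=['en'],
        tags=['docs'],
        overwrite=True,                             # drop and recreate the collection first
    )
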
@@ -332,16 +448,57 @@
                       global_tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
         """Process single file into chunks"""
         try:
-            # Try to read as text first
-            try:
-                content = file_path.read_text(encoding='utf-8')
-            except UnicodeDecodeError:
+            relative_path = str(file_path.relative_to(source_dir))
+            file_extension = file_path.suffix.lower()
+
+            # Handle different file types appropriately
+            if file_extension == '.pdf':
+                # Use document processor for PDF extraction
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping PDF file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension in ['.docx', '.xlsx', '.pptx']:
+                # Use document processor for Office documents
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping office document (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension == '.html':
+                # Use document processor for HTML
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping HTML file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension == '.rtf':
+                # Use document processor for RTF
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping RTF file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            else:
+                # Try to read as text file (markdown, txt, code, etc.)
+                try:
+                    content = file_path.read_text(encoding='utf-8')
+                except UnicodeDecodeError:
+                    if self.verbose:
+                        print(f"Skipping binary file: {file_path}")
+                    return []
+
+            # Validate content
+            if not content or (isinstance(content, str) and len(content.strip()) == 0):
                 if self.verbose:
-                    print(f"Skipping binary file: {file_path}")
+                    print(f"Skipping empty file: {file_path}")
                 return []
 
-            relative_path = str(file_path.relative_to(source_dir))
-
             # Create chunks using document processor - pass content directly, not file path
             chunks = self.doc_processor.create_chunks(
                 content=content, # Pass the actual content, not the file path
@@ -390,6 +547,7 @@
                 end_line INTEGER,
                 tags TEXT,
                 metadata TEXT,
+                metadata_text TEXT, -- Searchable text representation of all metadata
                 chunk_hash TEXT UNIQUE,
                 created_at TEXT DEFAULT CURRENT_TIMESTAMP
             )
@@ -399,6 +557,7 @@
             CREATE VIRTUAL TABLE chunks_fts USING fts5(
                 processed_content,
                 keywords,
+                metadata_text,
                 content='chunks',
                 content_rowid='id'
             )
@@ -460,13 +619,47 @@
             # Prepare data
             keywords_json = json.dumps(chunk.get('keywords', []))
             tags_json = json.dumps(chunk.get('tags', []))
-            metadata_json = json.dumps(chunk.get('metadata', {}))
+
+            # Extract metadata from JSON content and merge with chunk metadata
+            json_metadata, json_metadata_text = self._extract_metadata_from_json_content(chunk['content'])
+            chunk_metadata = chunk.get('metadata', {})
+
+            # Merge metadata: chunk metadata takes precedence
+            merged_metadata = {**json_metadata, **chunk_metadata}
+            metadata_json = json.dumps(merged_metadata)
+
+            # Create comprehensive metadata_text including tags
+            metadata_text_parts = []
+
+            # Add metadata text from JSON content
+            if json_metadata_text:
+                metadata_text_parts.append(json_metadata_text)
+
+            # Add tags
+            tags = chunk.get('tags', [])
+            if tags:
+                metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+            # Add section if present
+            if chunk.get('section'):
+                metadata_text_parts.append(chunk['section'].lower())
+
+            # Add any additional metadata values
+            for key, value in chunk_metadata.items():
+                if key not in json_metadata: # Avoid duplicates
+                    metadata_text_parts.append(str(key).lower())
+                    if isinstance(value, list):
+                        metadata_text_parts.extend(str(v).lower() for v in value)
+                    else:
+                        metadata_text_parts.append(str(value).lower())
+
+            metadata_text = ' '.join(metadata_text_parts)
 
             cursor.execute('''
                 INSERT OR IGNORE INTO chunks (
                     content, processed_content, keywords, language, embedding,
-                    filename, section, start_line, end_line, tags, metadata, chunk_hash
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    filename, section, start_line, end_line, tags, metadata, metadata_text, chunk_hash
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
                 chunk['content'],
                 chunk.get('processed_content', chunk['content']),
@@ -479,6 +672,7 @@
                 chunk.get('end_line'),
                 tags_json,
                 metadata_json,
+                metadata_text,
                 chunk_hash
             ))
 
@@ -531,4 +725,80 @@
             }
 
         except Exception as e:
-            return {"valid": False, "error": str(e)}
+            return {"valid": False, "error": str(e)}
+
+    def _store_chunks_pgvector(self, chunks: List[Dict[str, Any]], collection_name: str,
+                               languages: List[str], overwrite: bool = False):
+        """
+        Store chunks in pgvector backend
+
+        Args:
+            chunks: List of processed chunks
+            collection_name: Name for the collection (from output_file parameter)
+            languages: List of supported languages
+        """
+        from .pgvector_backend import PgVectorBackend
+
+        # Extract collection name from the provided name
+        if collection_name.endswith('.swsearch'):
+            collection_name = collection_name[:-9] # Remove .swsearch extension
+
+        # Clean collection name for PostgreSQL
+        import re
+        collection_name = re.sub(r'[^a-zA-Z0-9_]', '_', collection_name)
+
+        if self.verbose:
+            print(f"Storing chunks in pgvector collection: {collection_name}")
+
+        # Create backend instance
+        backend = PgVectorBackend(self.connection_string)
+
+        try:
+            # Get embedding dimensions from model
+            if self.model:
+                embedding_dim = self.model.get_sentence_embedding_dimension()
+            else:
+                embedding_dim = 768 # Default for all-mpnet-base-v2
+
+            # Delete existing collection if overwrite is requested
+            if overwrite:
+                if self.verbose:
+                    print(f"Dropping existing collection: {collection_name}")
+                backend.delete_collection(collection_name)
+
+            # Create schema
+            backend.create_schema(collection_name, embedding_dim)
+
+            # Convert embeddings from bytes to numpy arrays
+            for chunk in chunks:
+                if chunk.get('embedding') and isinstance(chunk['embedding'], bytes):
+                    if np:
+                        chunk['embedding'] = np.frombuffer(chunk['embedding'], dtype=np.float32)
+                    else:
+                        # If numpy not available, leave as bytes
+                        pass
+
+            # Prepare config
+            config = {
+                'model_name': self.model_name,
+                'embedding_dimensions': embedding_dim,
+                'chunking_strategy': self.chunking_strategy,
+                'languages': languages,
+                'metadata': {
+                    'max_sentences_per_chunk': self.max_sentences_per_chunk,
+                    'chunk_size': self.chunk_size,
+                    'chunk_overlap': self.chunk_overlap,
+                    'index_nlp_backend': self.index_nlp_backend
+                }
+            }
+
+            # Store chunks
+            backend.store_chunks(chunks, collection_name, config)
+
+            if self.verbose:
+                stats = backend.get_stats(collection_name)
+                print(f"Stored {stats['total_chunks']} chunks in pgvector")
+                print(f"Collection: {collection_name}")
+
+        finally:
+            backend.close()
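
To close the sketch, an illustrative call to the private _extract_metadata_from_json_content helper added earlier in this diff; the input string is made up, and the expected results follow from the helper's regex matching and lowercasing:

    from signalwire_agents.search.index_builder import IndexBuilder

    builder = IndexBuilder()
    content = '{"question": "How do refunds work?", "metadata": {"category": "billing", "tags": ["refunds"]}}'
    meta, meta_text = builder._extract_metadata_from_json_content(content)
    # meta      == {"category": "billing", "tags": ["refunds"]}
    # meta_text == "category billing tags refunds"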