signalwire-agents 0.1.46__py3-none-any.whl → 0.1.48__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -250,20 +250,9 @@ class AgentBase(
         self._params = {}
         self._global_data = {}
         self._function_includes = []
-        # Initialize with default LLM params
-        self._prompt_llm_params = {
-            'temperature': 0.3,
-            'top_p': 1.0,
-            'barge_confidence': 0.0,
-            'presence_penalty': 0.1,
-            'frequency_penalty': 0.1
-        }
-        self._post_prompt_llm_params = {
-            'temperature': 0.0,
-            'top_p': 1.0,
-            'presence_penalty': 0.0,
-            'frequency_penalty': 0.0
-        }
+        # Initialize LLM params as empty - only send if explicitly set
+        self._prompt_llm_params = {}
+        self._post_prompt_llm_params = {}

         # Dynamic configuration callback
         self._dynamic_config_callback = None
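Because 0.1.48 no longer injects these defaults, agents that relied on them must now opt in explicitly. A minimal sketch of reproducing the old 0.1.46 prompt defaults (agent name is illustrative; the package-root import and `name` constructor argument are assumptions):

```python
from signalwire_agents import AgentBase  # assumed package-root export

class MyAgent(AgentBase):  # hypothetical agent
    def __init__(self):
        super().__init__(name="my-agent")  # constructor args assumed
        # Re-apply the old 0.1.46 prompt defaults explicitly if you relied on them
        self.set_prompt_llm_params(
            temperature=0.3,
            top_p=1.0,
            barge_confidence=0.0,
            presence_penalty=0.1,
            frequency_penalty=0.1,
        )
```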
@@ -692,7 +681,7 @@ class AgentBase(
                 "parameters": func._ensure_parameter_structure()
             }

-            # Add wait_file if present (SignalWire SWML expects wait_file, not fillers)
+            # Add wait_file if present (audio/video file URL)
             if hasattr(func, 'wait_file') and func.wait_file:
                 wait_file_url = func.wait_file
                 # If wait_file is a relative URL, convert it to absolute using agent's base URL
@@ -704,9 +693,10 @@ class AgentBase(
                        wait_file_url = '/' + wait_file_url
                    wait_file_url = f"{base_url}{wait_file_url}"
                function_entry["wait_file"] = wait_file_url
-            elif func.fillers:
-                # Backward compatibility: use fillers as wait_file if wait_file not specified
-                function_entry["wait_file"] = func.fillers
+
+            # Add fillers if present (text phrases to say while processing)
+            if hasattr(func, 'fillers') and func.fillers:
+                function_entry["fillers"] = func.fillers

             # Add wait_file_loops if present
             if hasattr(func, 'wait_file_loops') and func.wait_file_loops is not None:
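With this change, `wait_file` and `fillers` serialize as independent keys instead of `fillers` being folded into `wait_file`. A sketch of a resulting SWAIG function entry (function name, URL, and phrases are illustrative):

```python
# Illustrative SWAIG function entry as serialized by 0.1.48; in 0.1.46 the
# fillers list would instead have been emitted under "wait_file".
function_entry = {
    "function": "lookup_order",                          # hypothetical tool name
    "wait_file": "https://example.com/hold_music.mp3",   # audio/video played while waiting
    "fillers": ["Let me check on that.", "One moment please."],  # spoken while processing
}
```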
@@ -833,27 +823,29 @@ class AgentBase(

            # Always add LLM parameters to prompt
            if "prompt" in ai_config:
-                # Update existing prompt with LLM params
-                if isinstance(ai_config["prompt"], dict):
-                    ai_config["prompt"].update(agent_to_use._prompt_llm_params)
-                elif isinstance(ai_config["prompt"], str):
-                    # Convert string prompt to dict format
-                    ai_config["prompt"] = {
-                        "text": ai_config["prompt"],
-                        **agent_to_use._prompt_llm_params
-                    }
+                # Only add LLM params if explicitly set
+                if agent_to_use._prompt_llm_params:
+                    if isinstance(ai_config["prompt"], dict):
+                        ai_config["prompt"].update(agent_to_use._prompt_llm_params)
+                    elif isinstance(ai_config["prompt"], str):
+                        # Convert string prompt to dict format
+                        ai_config["prompt"] = {
+                            "text": ai_config["prompt"],
+                            **agent_to_use._prompt_llm_params
+                        }

-            # Always add LLM parameters to post_prompt if post_prompt exists
+            # Only add LLM parameters to post_prompt if explicitly set
            if post_prompt and "post_prompt" in ai_config:
-                # Update existing post_prompt with LLM params
-                if isinstance(ai_config["post_prompt"], dict):
-                    ai_config["post_prompt"].update(agent_to_use._post_prompt_llm_params)
-                elif isinstance(ai_config["post_prompt"], str):
-                    # Convert string post_prompt to dict format
-                    ai_config["post_prompt"] = {
-                        "text": ai_config["post_prompt"],
-                        **agent_to_use._post_prompt_llm_params
-                    }
+                # Only add LLM params if explicitly set
+                if agent_to_use._post_prompt_llm_params:
+                    if isinstance(ai_config["post_prompt"], dict):
+                        ai_config["post_prompt"].update(agent_to_use._post_prompt_llm_params)
+                    elif isinstance(ai_config["post_prompt"], str):
+                        # Convert string post_prompt to dict format
+                        ai_config["post_prompt"] = {
+                            "text": ai_config["post_prompt"],
+                            **agent_to_use._post_prompt_llm_params
+                        }

        except ValueError as e:
            if not agent_to_use._suppress_logs:
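A minimal sketch of the merge above for a string prompt (values illustrative):

```python
# How an explicitly-set param reshapes a string prompt during rendering.
ai_config = {"prompt": "You are a helpful agent."}
prompt_llm_params = {"temperature": 0.3}  # as set via set_prompt_llm_params()

if prompt_llm_params:  # with no params set, the prompt is left untouched
    if isinstance(ai_config["prompt"], str):
        ai_config["prompt"] = {"text": ai_config["prompt"], **prompt_llm_params}

assert ai_config["prompt"] == {"text": "You are a helpful agent.", "temperature": 0.3}
```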
@@ -372,28 +372,22 @@ class AIConfigMixin:
        self._function_includes = valid_includes
        return self

-    def set_prompt_llm_params(
-        self,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        barge_confidence: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        frequency_penalty: Optional[float] = None
-    ) -> 'AgentBase':
+    def set_prompt_llm_params(self, **params) -> 'AgentBase':
        """
        Set LLM parameters for the main prompt.

-        Args:
-            temperature: Randomness setting (0.0-1.5). Lower values make output more deterministic.
-                Default: 0.3
-            top_p: Alternative to temperature (0.0-1.0). Controls nucleus sampling.
-                Default: 1.0
-            barge_confidence: ASR confidence to interrupt (0.0-1.0). Higher values make it harder to interrupt.
-                Default: 0.0
-            presence_penalty: Topic diversity (-2.0 to 2.0). Positive values encourage new topics.
-                Default: 0.1
-            frequency_penalty: Repetition control (-2.0 to 2.0). Positive values reduce repetition.
-                Default: 0.1
+        Accepts any parameters which will be passed through to the SignalWire server.
+        The server will validate and apply parameters based on the target model's capabilities.
+
+        Common parameters include:
+            temperature: Randomness setting. Lower values make output more deterministic.
+            top_p: Alternative to temperature. Controls nucleus sampling.
+            barge_confidence: ASR confidence to interrupt. Higher values make it harder to interrupt.
+            presence_penalty: Topic diversity. Positive values encourage new topics.
+            frequency_penalty: Repetition control. Positive values reduce repetition.
+
+        Note: Parameters are model-specific and will be validated by the server.
+        Invalid parameters for the selected model will be handled/ignored by the server.

        Returns:
            Self for method chaining
@@ -405,57 +399,28 @@
                barge_confidence=0.6
            )
        """
-        # Validate and set temperature
-        if temperature is not None:
-            if not 0.0 <= temperature <= 1.5:
-                raise ValueError("temperature must be between 0.0 and 1.5")
-            self._prompt_llm_params['temperature'] = temperature
-
-        # Validate and set top_p
-        if top_p is not None:
-            if not 0.0 <= top_p <= 1.0:
-                raise ValueError("top_p must be between 0.0 and 1.0")
-            self._prompt_llm_params['top_p'] = top_p
-
-        # Validate and set barge_confidence
-        if barge_confidence is not None:
-            if not 0.0 <= barge_confidence <= 1.0:
-                raise ValueError("barge_confidence must be between 0.0 and 1.0")
-            self._prompt_llm_params['barge_confidence'] = barge_confidence
-
-        # Validate and set presence_penalty
-        if presence_penalty is not None:
-            if not -2.0 <= presence_penalty <= 2.0:
-                raise ValueError("presence_penalty must be between -2.0 and 2.0")
-            self._prompt_llm_params['presence_penalty'] = presence_penalty
-
-        # Validate and set frequency_penalty
-        if frequency_penalty is not None:
-            if not -2.0 <= frequency_penalty <= 2.0:
-                raise ValueError("frequency_penalty must be between -2.0 and 2.0")
-            self._prompt_llm_params['frequency_penalty'] = frequency_penalty
+        # Accept any parameters without validation
+        if params:
+            self._prompt_llm_params.update(params)

        return self

-    def set_post_prompt_llm_params(
-        self,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        frequency_penalty: Optional[float] = None
-    ) -> 'AgentBase':
+    def set_post_prompt_llm_params(self, **params) -> 'AgentBase':
        """
        Set LLM parameters for the post-prompt.

-        Args:
-            temperature: Randomness setting (0.0-1.5). Lower values make output more deterministic.
-                Default: 0.0
-            top_p: Alternative to temperature (0.0-1.0). Controls nucleus sampling.
-                Default: 1.0
-            presence_penalty: Topic diversity (-2.0 to 2.0). Positive values encourage new topics.
-                Default: 0.0
-            frequency_penalty: Repetition control (-2.0 to 2.0). Positive values reduce repetition.
-                Default: 0.0
+        Accepts any parameters which will be passed through to the SignalWire server.
+        The server will validate and apply parameters based on the target model's capabilities.
+
+        Common parameters include:
+            temperature: Randomness setting. Lower values make output more deterministic.
+            top_p: Alternative to temperature. Controls nucleus sampling.
+            presence_penalty: Topic diversity. Positive values encourage new topics.
+            frequency_penalty: Repetition control. Positive values reduce repetition.
+
+        Note: Parameters are model-specific and will be validated by the server.
+        Invalid parameters for the selected model will be handled/ignored by the server.
+        barge_confidence is not applicable to post-prompt.

        Returns:
            Self for method chaining
@@ -466,28 +431,8 @@
                top_p=0.9
            )
        """
-        # Validate and set temperature
-        if temperature is not None:
-            if not 0.0 <= temperature <= 1.5:
-                raise ValueError("temperature must be between 0.0 and 1.5")
-            self._post_prompt_llm_params['temperature'] = temperature
-
-        # Validate and set top_p
-        if top_p is not None:
-            if not 0.0 <= top_p <= 1.0:
-                raise ValueError("top_p must be between 0.0 and 1.0")
-            self._post_prompt_llm_params['top_p'] = top_p
-
-        # Validate and set presence_penalty
-        if presence_penalty is not None:
-            if not -2.0 <= presence_penalty <= 2.0:
-                raise ValueError("presence_penalty must be between -2.0 and 2.0")
-            self._post_prompt_llm_params['presence_penalty'] = presence_penalty
-
-        # Validate and set frequency_penalty
-        if frequency_penalty is not None:
-            if not -2.0 <= frequency_penalty <= 2.0:
-                raise ValueError("frequency_penalty must be between -2.0 and 2.0")
-            self._post_prompt_llm_params['frequency_penalty'] = frequency_penalty
+        # Accept any parameters without validation
+        if params:
+            self._post_prompt_llm_params.update(params)

        return self
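Since both setters now accept arbitrary keywords and return self, model-specific keys pass straight through to the server. A usage sketch (`max_tokens` is an illustrative key, not one documented in this diff):

```python
# Given an AgentBase instance (construction elided), configuration chains;
# unknown keys are no longer rejected client-side.
agent.set_prompt_llm_params(temperature=0.7, barge_confidence=0.6) \
     .set_post_prompt_llm_params(temperature=0.0, max_tokens=256)  # max_tokens: illustrative
```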
@@ -57,8 +57,8 @@ class SWAIGFunction:
        self.description = description
        self.parameters = parameters or {}
        self.secure = secure
-        self.fillers = fillers  # Keep for backward compatibility
-        self.wait_file = wait_file or fillers  # Use wait_file if provided, else fall back to fillers
+        self.fillers = fillers  # Text phrases to say while processing
+        self.wait_file = wait_file  # URL to audio/video file to play while waiting
        self.wait_file_loops = wait_file_loops
        self.webhook_url = webhook_url
        self.required = required or []
@@ -68,6 +68,8 @@ if _SEARCH_AVAILABLE:
        from .index_builder import IndexBuilder
        from .search_engine import SearchEngine
        from .search_service import SearchService
+        from .models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
+        from .migration import SearchIndexMigrator

        __all__ = [
            'preprocess_query',
@@ -75,7 +77,11 @@
            'DocumentProcessor',
            'IndexBuilder',
            'SearchEngine',
-            'SearchService'
+            'SearchService',
+            'MODEL_ALIASES',
+            'DEFAULT_MODEL',
+            'resolve_model_alias',
+            'SearchIndexMigrator'
        ]
    except ImportError as e:
        # Some search components failed to import
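The new exports point to named model aliases and an index-migration helper. A heavily hedged sketch (the module path and the alias string are assumptions; only the symbol names appear in this diff):

```python
from signalwire_agents.search import DEFAULT_MODEL, resolve_model_alias  # assumed path

model_name = resolve_model_alias("mini")  # hypothetical alias -> full model name (assumed)
print(DEFAULT_MODEL)  # the package-level default embedding model
```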
@@ -140,6 +140,8 @@ class DocumentProcessor:
            return self._chunk_by_topics(content, filename, file_type)
        elif self.chunking_strategy == 'qa':
            return self._chunk_by_qa_optimization(content, filename, file_type)
+        elif self.chunking_strategy == 'json':
+            return self._chunk_from_json(content, filename, file_type)
        else:
            # Fallback to sentence-based chunking
            return self._chunk_by_sentences(content, filename, file_type)
@@ -1022,4 +1024,106 @@ class DocumentProcessor:
            ))

        return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
-                                      metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
+                                      metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
+
+    def _chunk_from_json(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """
+        Create chunks from pre-processed JSON content
+
+        This strategy expects content to be a JSON string with the following structure:
+        {
+            "chunks": [
+                {
+                    "chunk_id": "unique_id",
+                    "type": "content|toc",
+                    "content": "text content",
+                    "metadata": {
+                        "url": "https://...",
+                        "section_number": 1,
+                        "related_toc": "toc_id",
+                        ...
+                    }
+                },
+                ...
+            ]
+        }
+
+        Args:
+            content: JSON string containing pre-chunked content
+            filename: Name of the source file
+            file_type: Should be 'json'
+
+        Returns:
+            List of chunk dictionaries formatted for the search index
+        """
+        try:
+            # Parse JSON content
+            data = json.loads(content)
+
+            if not isinstance(data, dict) or 'chunks' not in data:
+                logger.error(f"Invalid JSON structure in {filename}: expected 'chunks' key")
+                # Fallback to treating it as plain text
+                return self._chunk_by_sentences(content, filename, file_type)
+
+            chunks = []
+            for idx, json_chunk in enumerate(data['chunks']):
+                if not isinstance(json_chunk, dict) or 'content' not in json_chunk:
+                    logger.warning(f"Skipping invalid chunk {idx} in {filename}")
+                    continue
+
+                # Extract metadata from JSON chunk
+                json_metadata = json_chunk.get('metadata', {})
+                chunk_type = json_chunk.get('type', 'content')
+
+                # Build chunk metadata (excluding tags which go at top level)
+                metadata = {
+                    'chunk_method': 'json',
+                    'chunk_index': idx,
+                    'chunk_type': chunk_type,
+                    'original_chunk_id': json_chunk.get('chunk_id', f'chunk_{idx}')
+                }
+
+                # Extract tags before merging metadata
+                tags = json_metadata.get('tags', [])
+
+                # Merge JSON metadata (this includes all fields including tags)
+                # We'll keep tags in metadata for backward compatibility but also set at top level
+                metadata.update(json_metadata)
+
+                # Determine section name
+                if chunk_type == 'toc':
+                    section = f"TOC: {json_chunk.get('content', '')[:50]}"
+                else:
+                    section = json_metadata.get('section', f"Section {json_metadata.get('section_number', idx + 1)}")
+
+                # Create chunk with proper structure
+                chunk = self._create_chunk(
+                    content=json_chunk['content'],
+                    filename=filename,
+                    section=section,
+                    metadata=metadata
+                )
+
+                # Set tags at the top level for proper tag filtering
+                if tags:
+                    chunk['tags'] = tags
+                elif chunk_type == 'toc':
+                    # For TOC entries, add special tags if none provided
+                    chunk['tags'] = ['toc', 'navigation']
+
+                chunks.append(chunk)
+
+            if not chunks:
+                logger.warning(f"No valid chunks found in JSON file {filename}")
+                return self._chunk_by_sentences(str(data), filename, file_type)
+
+            logger.info(f"Created {len(chunks)} chunks from JSON file {filename}")
+            return chunks
+
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse JSON in {filename}: {e}")
+            # Fallback to sentence chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+        except Exception as e:
+            logger.error(f"Unexpected error processing JSON chunks in {filename}: {e}")
+            return self._chunk_by_sentences(content, filename, file_type)
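Following the docstring above, a minimal pre-chunked input for the 'json' strategy could look like this (ids, URL, and text are illustrative; the import path and `chunking_strategy` constructor argument are assumptions, and `_chunk_from_json` is called directly only for illustration):

```python
import json

from signalwire_agents.search import DocumentProcessor  # assumed module path

doc = {
    "chunks": [
        {
            "chunk_id": "intro_001",
            "type": "content",
            "content": "SignalWire agents expose SWAIG functions over SWML.",
            "metadata": {
                "url": "https://example.com/docs/intro",
                "section_number": 1,
                "tags": ["intro", "swml"],
            },
        },
        {
            "chunk_id": "toc_001",
            "type": "toc",
            "content": "1. Introduction",
            "metadata": {},  # TOC entries default to ['toc', 'navigation'] tags
        },
    ]
}

processor = DocumentProcessor(chunking_strategy="json")  # kwarg assumed
chunks = processor._chunk_from_json(json.dumps(doc), "docs.json", "json")
```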
@@ -55,7 +55,7 @@ class IndexBuilder:

        Args:
            model_name: Name of the sentence transformer model to use
-            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json')
            max_sentences_per_chunk: For sentence strategy (default: 5)
            chunk_size: For sliding strategy - words per chunk (default: 50)
            chunk_overlap: For sliding strategy - overlap in words (default: 10)
@@ -85,9 +85,6 @@
        if self.backend not in ['sqlite', 'pgvector']:
            raise ValueError(f"Invalid backend '{self.backend}'. Must be 'sqlite' or 'pgvector'")

-        if self.backend == 'pgvector' and not self.connection_string:
-            raise ValueError("connection_string is required for pgvector backend")
-
        # Validate NLP backend
        if self.index_nlp_backend not in ['nltk', 'spacy']:
            logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
@@ -105,6 +102,50 @@
            topic_threshold=self.topic_threshold
        )

+    def _extract_metadata_from_json_content(self, content: str) -> tuple[Dict[str, Any], str]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            (metadata_dict, metadata_text)
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                # Look for metadata object in content
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        # Create searchable text from all metadata keys and values
+        metadata_text_parts = []
+        for key, value in metadata_dict.items():
+            # Add key
+            metadata_text_parts.append(str(key))
+            # Add value(s)
+            if isinstance(value, list):
+                metadata_text_parts.extend(str(v) for v in value)
+            else:
+                metadata_text_parts.append(str(value))
+
+        metadata_text = ' '.join(metadata_text_parts).lower()
+
+        return metadata_dict, metadata_text
+
    def _load_model(self):
        """Load embedding model (lazy loading)"""
        if self.model is None:
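The regex tolerates one level of nested braces inside the metadata object; deeper nesting would be truncated. A quick check of the pattern on illustrative content:

```python
import json
import re

pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
content = '{"chunks": [{"content": "...", "metadata": {"url": "https://example.com", "tags": ["a", "b"]}}]}'

match = re.search(pattern, content)
print(json.loads(match.group(1)))
# -> {'url': 'https://example.com', 'tags': ['a', 'b']}
```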
@@ -147,6 +188,7 @@

        # Process documents
        chunks = []
+        print(f"Processing {len(files)} files...")
        for file_path in files:
            try:
                # For individual files, use the file's parent as the base directory
@@ -154,8 +196,8 @@
                base_dir = self._get_base_directory_for_file(file_path, sources)
                file_chunks = self._process_file(file_path, base_dir, tags)
                chunks.extend(file_chunks)
-                if self.verbose:
-                    print(f"Processed {file_path}: {len(file_chunks)} chunks")
+                if self.verbose or file_path.suffix == '.json':
+                    print(f" {file_path}: {len(file_chunks)} chunks")
            except Exception as e:
                logger.error(f"Error processing {file_path}: {e}")
                if self.verbose:
@@ -171,7 +213,9 @@
        # Generate embeddings
        self._load_model()
        if self.verbose:
-            print("Generating embeddings...")
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+        else:
+            print(f"Generating embeddings for {len(chunks)} chunks...")

        for i, chunk in enumerate(chunks):
            try:
@@ -183,15 +227,33 @@
                )

                chunk['processed_content'] = processed['enhanced_text']
-                chunk['keywords'] = processed.get('keywords', [])
+
+                # Include tags in keywords for better search matching
+                keywords = processed.get('keywords', [])
+                chunk_tags = chunk.get('tags', [])
+                if chunk_tags:
+                    # Add tags to keywords list for FTS matching
+                    keywords.extend(chunk_tags)
+                    # Remove duplicates while preserving order
+                    keywords = list(dict.fromkeys(keywords))
+
+                chunk['keywords'] = keywords
+
+                # For embedding, include tags in the text for better semantic matching
+                embedding_text = processed['enhanced_text']
+                if chunk_tags:
+                    # Append tags to the text for embedding generation
+                    embedding_text += " " + " ".join(chunk_tags)

                # Generate embedding (suppress progress bar)
-                embedding = self.model.encode(processed['enhanced_text'], show_progress_bar=False)
+                embedding = self.model.encode(embedding_text, show_progress_bar=False)
                chunk['embedding'] = embedding.tobytes()

-                if self.verbose and (i + 1) % 50 == 0:
+                # Show progress more frequently
+                show_every = 50 if len(chunks) > 500 else max(10, len(chunks) // 10)
+                if (i + 1) % show_every == 0 or (i + 1) == len(chunks):
                    progress_pct = ((i + 1) / len(chunks)) * 100
-                    print(f"Generated embeddings: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")
+                    print(f" Progress: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")

            except Exception as e:
                logger.error(f"Error processing chunk {i}: {e}")
@@ -485,6 +547,7 @@
                end_line INTEGER,
                tags TEXT,
                metadata TEXT,
+                metadata_text TEXT, -- Searchable text representation of all metadata
                chunk_hash TEXT UNIQUE,
                created_at TEXT DEFAULT CURRENT_TIMESTAMP
            )
@@ -494,6 +557,7 @@
            CREATE VIRTUAL TABLE chunks_fts USING fts5(
                processed_content,
                keywords,
+                metadata_text,
                content='chunks',
                content_rowid='id'
            )
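With metadata_text in the FTS table, tags and metadata values become keyword-searchable. A hedged sketch using FTS5 column-filter syntax against an existing index (the index path is illustrative):

```python
import sqlite3

conn = sqlite3.connect("index.swsearch")  # illustrative path to a built index
rows = conn.execute(
    "SELECT rowid FROM chunks_fts WHERE chunks_fts MATCH ?",
    ("metadata_text:navigation",),  # restrict the match to the new column
).fetchall()
```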
@@ -555,13 +619,47 @@
            # Prepare data
            keywords_json = json.dumps(chunk.get('keywords', []))
            tags_json = json.dumps(chunk.get('tags', []))
-            metadata_json = json.dumps(chunk.get('metadata', {}))
+
+            # Extract metadata from JSON content and merge with chunk metadata
+            json_metadata, json_metadata_text = self._extract_metadata_from_json_content(chunk['content'])
+            chunk_metadata = chunk.get('metadata', {})
+
+            # Merge metadata: chunk metadata takes precedence
+            merged_metadata = {**json_metadata, **chunk_metadata}
+            metadata_json = json.dumps(merged_metadata)
+
+            # Create comprehensive metadata_text including tags
+            metadata_text_parts = []
+
+            # Add metadata text from JSON content
+            if json_metadata_text:
+                metadata_text_parts.append(json_metadata_text)
+
+            # Add tags
+            tags = chunk.get('tags', [])
+            if tags:
+                metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+            # Add section if present
+            if chunk.get('section'):
+                metadata_text_parts.append(chunk['section'].lower())
+
+            # Add any additional metadata values
+            for key, value in chunk_metadata.items():
+                if key not in json_metadata:  # Avoid duplicates
+                    metadata_text_parts.append(str(key).lower())
+                    if isinstance(value, list):
+                        metadata_text_parts.extend(str(v).lower() for v in value)
+                    else:
+                        metadata_text_parts.append(str(value).lower())
+
+            metadata_text = ' '.join(metadata_text_parts)

            cursor.execute('''
                INSERT OR IGNORE INTO chunks (
                    content, processed_content, keywords, language, embedding,
-                    filename, section, start_line, end_line, tags, metadata, chunk_hash
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    filename, section, start_line, end_line, tags, metadata, metadata_text, chunk_hash
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                chunk['content'],
                chunk.get('processed_content', chunk['content']),
@@ -574,6 +672,7 @@
                chunk.get('end_line'),
                tags_json,
                metadata_json,
+                metadata_text,
                chunk_hash
            ))