signalwire-agents 0.1.46__py3-none-any.whl → 0.1.48__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +1 -1
- signalwire_agents/cli/build_search.py +522 -13
- signalwire_agents/core/agent_base.py +29 -37
- signalwire_agents/core/mixins/ai_config_mixin.py +32 -87
- signalwire_agents/core/swaig_function.py +2 -2
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +105 -1
- signalwire_agents/search/index_builder.py +113 -14
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +236 -13
- signalwire_agents/search/query_processor.py +87 -9
- signalwire_agents/search/search_engine.py +835 -31
- signalwire_agents/search/search_service.py +56 -6
- signalwire_agents/skills/native_vector_search/skill.py +208 -33
- signalwire_agents/skills/weather_api/skill.py +2 -2
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/METADATA +12 -7
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/RECORD +22 -20
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/entry_points.txt +0 -0
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.46.dist-info → signalwire_agents-0.1.48.dist-info}/top_level.txt +0 -0
@@ -250,20 +250,9 @@ class AgentBase(
         self._params = {}
         self._global_data = {}
         self._function_includes = []
-        # Initialize
-        self._prompt_llm_params = {
-
-            'top_p': 1.0,
-            'barge_confidence': 0.0,
-            'presence_penalty': 0.1,
-            'frequency_penalty': 0.1
-        }
-        self._post_prompt_llm_params = {
-            'temperature': 0.0,
-            'top_p': 1.0,
-            'presence_penalty': 0.0,
-            'frequency_penalty': 0.0
-        }
+        # Initialize LLM params as empty - only send if explicitly set
+        self._prompt_llm_params = {}
+        self._post_prompt_llm_params = {}

         # Dynamic configuration callback
         self._dynamic_config_callback = None
@@ -692,7 +681,7 @@ class AgentBase(
                 "parameters": func._ensure_parameter_structure()
             }

-            # Add wait_file if present (
+            # Add wait_file if present (audio/video file URL)
            if hasattr(func, 'wait_file') and func.wait_file:
                wait_file_url = func.wait_file
                # If wait_file is a relative URL, convert it to absolute using agent's base URL
@@ -704,9 +693,10 @@ class AgentBase(
                        wait_file_url = '/' + wait_file_url
                    wait_file_url = f"{base_url}{wait_file_url}"
                function_entry["wait_file"] = wait_file_url
-
-
-
+
+            # Add fillers if present (text phrases to say while processing)
+            if hasattr(func, 'fillers') and func.fillers:
+                function_entry["fillers"] = func.fillers

            # Add wait_file_loops if present
            if hasattr(func, 'wait_file_loops') and func.wait_file_loops is not None:
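Taken together, the wait_file and fillers hunks above mean a function registered with both now serializes them side by side in the SWAIG section of the generated document. A rough sketch of the resulting entry (the tool name, URL, and values are invented for illustration):

```python
# Hypothetical serialized SWAIG function entry after these changes.
function_entry = {
    "function": "lookup_order",  # hypothetical tool name
    "description": "Look up an order by ID",
    "parameters": {"type": "object", "properties": {}},
    # wait_file: relative URLs are converted to absolute using the agent's base URL
    "wait_file": "https://agent.example.com/static/hold.mp3",
    "wait_file_loops": 3,
    # fillers: text phrases spoken while the function executes
    "fillers": ["Let me check that for you", "One moment please"],
}
```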
@@ -833,27 +823,29 @@ class AgentBase(

            # Always add LLM parameters to prompt
            if "prompt" in ai_config:
-                #
-                if
-                    ai_config["prompt"]
-
-
-
-
-
-
+                # Only add LLM params if explicitly set
+                if agent_to_use._prompt_llm_params:
+                    if isinstance(ai_config["prompt"], dict):
+                        ai_config["prompt"].update(agent_to_use._prompt_llm_params)
+                    elif isinstance(ai_config["prompt"], str):
+                        # Convert string prompt to dict format
+                        ai_config["prompt"] = {
+                            "text": ai_config["prompt"],
+                            **agent_to_use._prompt_llm_params
+                        }

-            #
+            # Only add LLM parameters to post_prompt if explicitly set
            if post_prompt and "post_prompt" in ai_config:
-                #
-                if
-                    ai_config["post_prompt"]
-
-
-
-
-
-
+                # Only add LLM params if explicitly set
+                if agent_to_use._post_prompt_llm_params:
+                    if isinstance(ai_config["post_prompt"], dict):
+                        ai_config["post_prompt"].update(agent_to_use._post_prompt_llm_params)
+                    elif isinstance(ai_config["post_prompt"], str):
+                        # Convert string post_prompt to dict format
+                        ai_config["post_prompt"] = {
+                            "text": ai_config["post_prompt"],
+                            **agent_to_use._post_prompt_llm_params
+                        }

        except ValueError as e:
            if not agent_to_use._suppress_logs:
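The net effect of this hunk: LLM params now ride along only when explicitly set, and a plain-string prompt is promoted to dict form to carry them. A minimal standalone sketch of the merge (not the library code itself):

```python
prompt_llm_params = {"temperature": 0.7, "top_p": 0.9}  # explicitly set by the caller

ai_config = {"prompt": "You are a helpful assistant."}
if prompt_llm_params:
    if isinstance(ai_config["prompt"], dict):
        ai_config["prompt"].update(prompt_llm_params)
    elif isinstance(ai_config["prompt"], str):
        # String prompts are promoted to dict form so the params can ride along
        ai_config["prompt"] = {"text": ai_config["prompt"], **prompt_llm_params}

assert ai_config["prompt"] == {
    "text": "You are a helpful assistant.",
    "temperature": 0.7,
    "top_p": 0.9,
}
```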
@@ -372,28 +372,22 @@ class AIConfigMixin:
        self._function_includes = valid_includes
        return self

-    def set_prompt_llm_params(
-        self,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        barge_confidence: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        frequency_penalty: Optional[float] = None
-    ) -> 'AgentBase':
+    def set_prompt_llm_params(self, **params) -> 'AgentBase':
        """
        Set LLM parameters for the main prompt.

-
-
-
-
-
-
-
-            presence_penalty: Topic diversity
-
-
-
+        Accepts any parameters which will be passed through to the SignalWire server.
+        The server will validate and apply parameters based on the target model's capabilities.
+
+        Common parameters include:
+            temperature: Randomness setting. Lower values make output more deterministic.
+            top_p: Alternative to temperature. Controls nucleus sampling.
+            barge_confidence: ASR confidence to interrupt. Higher values make it harder to interrupt.
+            presence_penalty: Topic diversity. Positive values encourage new topics.
+            frequency_penalty: Repetition control. Positive values reduce repetition.
+
+        Note: Parameters are model-specific and will be validated by the server.
+        Invalid parameters for the selected model will be handled/ignored by the server.

        Returns:
            Self for method chaining
@@ -405,57 +399,28 @@ class AIConfigMixin:
                barge_confidence=0.6
            )
        """
-        #
-        if
-
-                raise ValueError("temperature must be between 0.0 and 1.5")
-            self._prompt_llm_params['temperature'] = temperature
-
-        # Validate and set top_p
-        if top_p is not None:
-            if not 0.0 <= top_p <= 1.0:
-                raise ValueError("top_p must be between 0.0 and 1.0")
-            self._prompt_llm_params['top_p'] = top_p
-
-        # Validate and set barge_confidence
-        if barge_confidence is not None:
-            if not 0.0 <= barge_confidence <= 1.0:
-                raise ValueError("barge_confidence must be between 0.0 and 1.0")
-            self._prompt_llm_params['barge_confidence'] = barge_confidence
-
-        # Validate and set presence_penalty
-        if presence_penalty is not None:
-            if not -2.0 <= presence_penalty <= 2.0:
-                raise ValueError("presence_penalty must be between -2.0 and 2.0")
-            self._prompt_llm_params['presence_penalty'] = presence_penalty
-
-        # Validate and set frequency_penalty
-        if frequency_penalty is not None:
-            if not -2.0 <= frequency_penalty <= 2.0:
-                raise ValueError("frequency_penalty must be between -2.0 and 2.0")
-            self._prompt_llm_params['frequency_penalty'] = frequency_penalty
+        # Accept any parameters without validation
+        if params:
+            self._prompt_llm_params.update(params)

        return self

-    def set_post_prompt_llm_params(
-        self,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        frequency_penalty: Optional[float] = None
-    ) -> 'AgentBase':
+    def set_post_prompt_llm_params(self, **params) -> 'AgentBase':
        """
        Set LLM parameters for the post-prompt.

-
-
-
-
-
-
-
-            frequency_penalty: Repetition control
-
+        Accepts any parameters which will be passed through to the SignalWire server.
+        The server will validate and apply parameters based on the target model's capabilities.
+
+        Common parameters include:
+            temperature: Randomness setting. Lower values make output more deterministic.
+            top_p: Alternative to temperature. Controls nucleus sampling.
+            presence_penalty: Topic diversity. Positive values encourage new topics.
+            frequency_penalty: Repetition control. Positive values reduce repetition.
+
+        Note: Parameters are model-specific and will be validated by the server.
+        Invalid parameters for the selected model will be handled/ignored by the server.
+        barge_confidence is not applicable to post-prompt.

        Returns:
            Self for method chaining
@@ -466,28 +431,8 @@ class AIConfigMixin:
                top_p=0.9
            )
        """
-        #
-        if
-
-                raise ValueError("temperature must be between 0.0 and 1.5")
-            self._post_prompt_llm_params['temperature'] = temperature
-
-        # Validate and set top_p
-        if top_p is not None:
-            if not 0.0 <= top_p <= 1.0:
-                raise ValueError("top_p must be between 0.0 and 1.0")
-            self._post_prompt_llm_params['top_p'] = top_p
-
-        # Validate and set presence_penalty
-        if presence_penalty is not None:
-            if not -2.0 <= presence_penalty <= 2.0:
-                raise ValueError("presence_penalty must be between -2.0 and 2.0")
-            self._post_prompt_llm_params['presence_penalty'] = presence_penalty
-
-        # Validate and set frequency_penalty
-        if frequency_penalty is not None:
-            if not -2.0 <= frequency_penalty <= 2.0:
-                raise ValueError("frequency_penalty must be between -2.0 and 2.0")
-            self._post_prompt_llm_params['frequency_penalty'] = frequency_penalty
+        # Accept any parameters without validation
+        if params:
+            self._post_prompt_llm_params.update(params)

        return self
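With both setters reduced to **params pass-throughs, arbitrary keys now flow to the server unvalidated. A usage sketch based on the docstring examples above (the agent name and parameter values are illustrative, and the AgentBase constructor arguments are assumed):

```python
from signalwire_agents import AgentBase

agent = AgentBase(name="support")  # constructor args assumed for illustration
agent.set_prompt_llm_params(
    temperature=0.7,
    barge_confidence=0.6,
    some_future_param=1.0,  # unknown keys are no longer rejected client-side
).set_post_prompt_llm_params(
    temperature=0.0,
    top_p=0.9,
)
```

Since both methods return self, the calls chain as shown; validation responsibility moves entirely to the SignalWire server.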
@@ -57,8 +57,8 @@ class SWAIGFunction:
        self.description = description
        self.parameters = parameters or {}
        self.secure = secure
-        self.fillers = fillers #
-        self.wait_file = wait_file
+        self.fillers = fillers # Text phrases to say while processing
+        self.wait_file = wait_file # URL to audio/video file to play while waiting
        self.wait_file_loops = wait_file_loops
        self.webhook_url = webhook_url
        self.required = required or []
@@ -68,6 +68,8 @@ if _SEARCH_AVAILABLE:
        from .index_builder import IndexBuilder
        from .search_engine import SearchEngine
        from .search_service import SearchService
+        from .models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
+        from .migration import SearchIndexMigrator

        __all__ = [
            'preprocess_query',
@@ -75,7 +77,11 @@ if _SEARCH_AVAILABLE:
            'DocumentProcessor',
            'IndexBuilder',
            'SearchEngine',
-            'SearchService'
+            'SearchService',
+            'MODEL_ALIASES',
+            'DEFAULT_MODEL',
+            'resolve_model_alias',
+            'SearchIndexMigrator'
        ]
    except ImportError as e:
        # Some search components failed to import
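A sketch of the expanded public surface (the alias string is hypothetical, and resolve_model_alias's behavior is assumed from its name: mapping a short alias to a full embedding model name):

```python
from signalwire_agents.search import (
    DEFAULT_MODEL,
    MODEL_ALIASES,
    SearchIndexMigrator,
    resolve_model_alias,
)

print(DEFAULT_MODEL)                # the package's default embedding model
print(resolve_model_alias("mini"))  # hypothetical alias -> full model name
```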
@@ -140,6 +140,8 @@ class DocumentProcessor:
            return self._chunk_by_topics(content, filename, file_type)
        elif self.chunking_strategy == 'qa':
            return self._chunk_by_qa_optimization(content, filename, file_type)
+        elif self.chunking_strategy == 'json':
+            return self._chunk_from_json(content, filename, file_type)
        else:
            # Fallback to sentence-based chunking
            return self._chunk_by_sentences(content, filename, file_type)
@@ -1022,4 +1024,106 @@ class DocumentProcessor:
            ))

        return chunks if chunks else [self._create_chunk(content, filename, "QA Section 1",
-                                        metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
+                                        metadata={'chunk_method': 'qa_optimized', 'chunk_index': 0})]
+
+    def _chunk_from_json(self, content: str, filename: str, file_type: str) -> List[Dict[str, Any]]:
+        """
+        Create chunks from pre-processed JSON content
+
+        This strategy expects content to be a JSON string with the following structure:
+        {
+            "chunks": [
+                {
+                    "chunk_id": "unique_id",
+                    "type": "content|toc",
+                    "content": "text content",
+                    "metadata": {
+                        "url": "https://...",
+                        "section_number": 1,
+                        "related_toc": "toc_id",
+                        ...
+                    }
+                },
+                ...
+            ]
+        }
+
+        Args:
+            content: JSON string containing pre-chunked content
+            filename: Name of the source file
+            file_type: Should be 'json'
+
+        Returns:
+            List of chunk dictionaries formatted for the search index
+        """
+        try:
+            # Parse JSON content
+            data = json.loads(content)
+
+            if not isinstance(data, dict) or 'chunks' not in data:
+                logger.error(f"Invalid JSON structure in {filename}: expected 'chunks' key")
+                # Fallback to treating it as plain text
+                return self._chunk_by_sentences(content, filename, file_type)
+
+            chunks = []
+            for idx, json_chunk in enumerate(data['chunks']):
+                if not isinstance(json_chunk, dict) or 'content' not in json_chunk:
+                    logger.warning(f"Skipping invalid chunk {idx} in {filename}")
+                    continue
+
+                # Extract metadata from JSON chunk
+                json_metadata = json_chunk.get('metadata', {})
+                chunk_type = json_chunk.get('type', 'content')
+
+                # Build chunk metadata (excluding tags which go at top level)
+                metadata = {
+                    'chunk_method': 'json',
+                    'chunk_index': idx,
+                    'chunk_type': chunk_type,
+                    'original_chunk_id': json_chunk.get('chunk_id', f'chunk_{idx}')
+                }
+
+                # Extract tags before merging metadata
+                tags = json_metadata.get('tags', [])
+
+                # Merge JSON metadata (this includes all fields including tags)
+                # We'll keep tags in metadata for backward compatibility but also set at top level
+                metadata.update(json_metadata)
+
+                # Determine section name
+                if chunk_type == 'toc':
+                    section = f"TOC: {json_chunk.get('content', '')[:50]}"
+                else:
+                    section = json_metadata.get('section', f"Section {json_metadata.get('section_number', idx + 1)}")
+
+                # Create chunk with proper structure
+                chunk = self._create_chunk(
+                    content=json_chunk['content'],
+                    filename=filename,
+                    section=section,
+                    metadata=metadata
+                )
+
+                # Set tags at the top level for proper tag filtering
+                if tags:
+                    chunk['tags'] = tags
+                elif chunk_type == 'toc':
+                    # For TOC entries, add special tags if none provided
+                    chunk['tags'] = ['toc', 'navigation']
+
+                chunks.append(chunk)
+
+            if not chunks:
+                logger.warning(f"No valid chunks found in JSON file {filename}")
+                return self._chunk_by_sentences(str(data), filename, file_type)
+
+            logger.info(f"Created {len(chunks)} chunks from JSON file {filename}")
+            return chunks
+
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to parse JSON in {filename}: {e}")
+            # Fallback to sentence chunking
+            return self._chunk_by_sentences(content, filename, file_type)
+        except Exception as e:
+            logger.error(f"Unexpected error processing JSON chunks in {filename}: {e}")
+            return self._chunk_by_sentences(content, filename, file_type)
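For clarity, here is a sketch of an input document the new 'json' strategy would accept, shaped after the docstring above (the chunk values are invented):

```python
import json

pre_chunked = {
    "chunks": [
        {
            "chunk_id": "intro_1",
            "type": "content",
            "content": "SignalWire agents can search local document indexes.",
            "metadata": {
                "url": "https://example.com/docs/intro",
                "section_number": 1,
                "tags": ["intro", "search"],
            },
        },
        {
            "chunk_id": "toc_main",
            "type": "toc",
            "content": "1. Introduction  2. Installation  3. Usage",
            "metadata": {},
        },
    ]
}

# _chunk_from_json receives this as a string; the second chunk carries no tags,
# so it is assigned ['toc', 'navigation'] per the fallback above.
content = json.dumps(pre_chunked)
```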
@@ -55,7 +55,7 @@ class IndexBuilder:

        Args:
            model_name: Name of the sentence transformer model to use
-            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa')
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json')
            max_sentences_per_chunk: For sentence strategy (default: 5)
            chunk_size: For sliding strategy - words per chunk (default: 50)
            chunk_overlap: For sliding strategy - overlap in words (default: 10)
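A construction sketch using the new strategy value; the keyword names follow the Args list above, and the model name is illustrative:

```python
from signalwire_agents.search import IndexBuilder

builder = IndexBuilder(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # illustrative
    chunking_strategy="json",  # new in this release
)
```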
@@ -85,9 +85,6 @@ class IndexBuilder:
        if self.backend not in ['sqlite', 'pgvector']:
            raise ValueError(f"Invalid backend '{self.backend}'. Must be 'sqlite' or 'pgvector'")

-        if self.backend == 'pgvector' and not self.connection_string:
-            raise ValueError("connection_string is required for pgvector backend")
-
        # Validate NLP backend
        if self.index_nlp_backend not in ['nltk', 'spacy']:
            logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
@@ -105,6 +102,50 @@ class IndexBuilder:
            topic_threshold=self.topic_threshold
        )

+    def _extract_metadata_from_json_content(self, content: str) -> tuple[Dict[str, Any], str]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            (metadata_dict, metadata_text)
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                # Look for metadata object in content
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        # Create searchable text from all metadata keys and values
+        metadata_text_parts = []
+        for key, value in metadata_dict.items():
+            # Add key
+            metadata_text_parts.append(str(key))
+            # Add value(s)
+            if isinstance(value, list):
+                metadata_text_parts.extend(str(v) for v in value)
+            else:
+                metadata_text_parts.append(str(value))
+
+        metadata_text = ' '.join(metadata_text_parts).lower()
+
+        return metadata_dict, metadata_text
+
    def _load_model(self):
        """Load embedding model (lazy loading)"""
        if self.model is None:
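A worked example of the regex approach in _extract_metadata_from_json_content, reimplemented standalone for illustration:

```python
import json
import re

content = '{"chunks": [{"content": "hi", "metadata": {"url": "https://example.com", "tags": ["faq", "billing"]}}]}'
pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'

metadata_dict = {}
for match in re.finditer(pattern, content):
    obj = json.loads(match.group(1))
    if isinstance(obj, dict):
        metadata_dict.update(obj)

print(metadata_dict)  # {'url': 'https://example.com', 'tags': ['faq', 'billing']}
# The searchable text then becomes: 'url https://example.com tags faq billing'
```

Note the pattern handles at most one level of nested braces inside the metadata object; deeper nesting would not match, which is presumably why failures fall through silently to an empty dict.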
@@ -147,6 +188,7 @@ class IndexBuilder:

        # Process documents
        chunks = []
+        print(f"Processing {len(files)} files...")
        for file_path in files:
            try:
                # For individual files, use the file's parent as the base directory
@@ -154,8 +196,8 @@ class IndexBuilder:
                base_dir = self._get_base_directory_for_file(file_path, sources)
                file_chunks = self._process_file(file_path, base_dir, tags)
                chunks.extend(file_chunks)
-                if self.verbose:
-                    print(f"
+                if self.verbose or file_path.suffix == '.json':
+                    print(f"  {file_path}: {len(file_chunks)} chunks")
            except Exception as e:
                logger.error(f"Error processing {file_path}: {e}")
                if self.verbose:
@@ -171,7 +213,9 @@ class IndexBuilder:
        # Generate embeddings
        self._load_model()
        if self.verbose:
-            print("Generating embeddings...")
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+        else:
+            print(f"Generating embeddings for {len(chunks)} chunks...")

        for i, chunk in enumerate(chunks):
            try:
@@ -183,15 +227,33 @@ class IndexBuilder:
                )

                chunk['processed_content'] = processed['enhanced_text']
-
+
+                # Include tags in keywords for better search matching
+                keywords = processed.get('keywords', [])
+                chunk_tags = chunk.get('tags', [])
+                if chunk_tags:
+                    # Add tags to keywords list for FTS matching
+                    keywords.extend(chunk_tags)
+                    # Remove duplicates while preserving order
+                    keywords = list(dict.fromkeys(keywords))
+
+                chunk['keywords'] = keywords
+
+                # For embedding, include tags in the text for better semantic matching
+                embedding_text = processed['enhanced_text']
+                if chunk_tags:
+                    # Append tags to the text for embedding generation
+                    embedding_text += " " + " ".join(chunk_tags)

                # Generate embedding (suppress progress bar)
-                embedding = self.model.encode(
+                embedding = self.model.encode(embedding_text, show_progress_bar=False)
                chunk['embedding'] = embedding.tobytes()

-
+                # Show progress more frequently
+                show_every = 50 if len(chunks) > 500 else max(10, len(chunks) // 10)
+                if (i + 1) % show_every == 0 or (i + 1) == len(chunks):
                    progress_pct = ((i + 1) / len(chunks)) * 100
-                    print(f"
+                    print(f"  Progress: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")

            except Exception as e:
                logger.error(f"Error processing chunk {i}: {e}")
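One detail worth noting in the hunk above: dict.fromkeys dedups while keeping first-seen order, which a plain set would not. A tiny worked example:

```python
keywords = ["invoice", "billing"]
chunk_tags = ["billing", "faq"]

keywords.extend(chunk_tags)
keywords = list(dict.fromkeys(keywords))  # dict keys preserve insertion order
print(keywords)  # ['invoice', 'billing', 'faq']
```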
@@ -485,6 +547,7 @@ class IndexBuilder:
                    end_line INTEGER,
                    tags TEXT,
                    metadata TEXT,
+                    metadata_text TEXT, -- Searchable text representation of all metadata
                    chunk_hash TEXT UNIQUE,
                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
                )
@@ -494,6 +557,7 @@ class IndexBuilder:
            CREATE VIRTUAL TABLE chunks_fts USING fts5(
                processed_content,
                keywords,
+                metadata_text,
                content='chunks',
                content_rowid='id'
            )
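Adding metadata_text to the FTS5 table is what makes tags and metadata terms matchable by full-text queries. A standalone sqlite3 sketch with a simplified schema (not the library's actual table definition; assumes an FTS5-enabled SQLite build):

```python
import sqlite3

db = sqlite3.connect(":memory:")
db.execute(
    "CREATE VIRTUAL TABLE chunks_fts USING fts5(processed_content, keywords, metadata_text)"
)
db.execute(
    "INSERT INTO chunks_fts VALUES (?, ?, ?)",
    ("how to pay an invoice", "invoice payment", "billing faq example.com"),
)

# A term that appears only in metadata_text still matches the row.
rows = db.execute(
    "SELECT processed_content FROM chunks_fts WHERE chunks_fts MATCH ?",
    ("billing",),
).fetchall()
print(rows)  # [('how to pay an invoice',)]
```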
@@ -555,13 +619,47 @@ class IndexBuilder:
            # Prepare data
            keywords_json = json.dumps(chunk.get('keywords', []))
            tags_json = json.dumps(chunk.get('tags', []))
-
+
+            # Extract metadata from JSON content and merge with chunk metadata
+            json_metadata, json_metadata_text = self._extract_metadata_from_json_content(chunk['content'])
+            chunk_metadata = chunk.get('metadata', {})
+
+            # Merge metadata: chunk metadata takes precedence
+            merged_metadata = {**json_metadata, **chunk_metadata}
+            metadata_json = json.dumps(merged_metadata)
+
+            # Create comprehensive metadata_text including tags
+            metadata_text_parts = []
+
+            # Add metadata text from JSON content
+            if json_metadata_text:
+                metadata_text_parts.append(json_metadata_text)
+
+            # Add tags
+            tags = chunk.get('tags', [])
+            if tags:
+                metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+            # Add section if present
+            if chunk.get('section'):
+                metadata_text_parts.append(chunk['section'].lower())
+
+            # Add any additional metadata values
+            for key, value in chunk_metadata.items():
+                if key not in json_metadata:  # Avoid duplicates
+                    metadata_text_parts.append(str(key).lower())
+                    if isinstance(value, list):
+                        metadata_text_parts.extend(str(v).lower() for v in value)
+                    else:
+                        metadata_text_parts.append(str(value).lower())
+
+            metadata_text = ' '.join(metadata_text_parts)

            cursor.execute('''
                INSERT OR IGNORE INTO chunks (
                    content, processed_content, keywords, language, embedding,
-                    filename, section, start_line, end_line, tags, metadata, chunk_hash
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    filename, section, start_line, end_line, tags, metadata, metadata_text, chunk_hash
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                chunk['content'],
                chunk.get('processed_content', chunk['content']),
@@ -574,6 +672,7 @@ class IndexBuilder:
                chunk.get('end_line'),
                tags_json,
                metadata_json,
+                metadata_text,
                chunk_hash
            ))
