signalwire-agents 0.1.47__py3-none-any.whl → 0.1.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +1 -1
- signalwire_agents/cli/build_search.py +516 -12
- signalwire_agents/core/mixins/ai_config_mixin.py +4 -0
- signalwire_agents/schema.json +57 -1
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +11 -8
- signalwire_agents/search/index_builder.py +112 -13
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +236 -13
- signalwire_agents/search/query_processor.py +87 -9
- signalwire_agents/search/search_engine.py +835 -31
- signalwire_agents/search/search_service.py +56 -6
- signalwire_agents/skills/native_vector_search/skill.py +208 -33
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/METADATA +1 -1
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/RECORD +20 -18
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/entry_points.txt +0 -0
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.47.dist-info → signalwire_agents-0.1.49.dist-info}/top_level.txt +0 -0
signalwire_agents/core/mixins/ai_config_mixin.py
CHANGED
@@ -380,6 +380,7 @@ class AIConfigMixin:
         The server will validate and apply parameters based on the target model's capabilities.
 
         Common parameters include:
+            model: The AI model to use (gpt-4o-mini, gpt-4.1-mini, gpt-4.1-nano, nova-micro, nova-lite)
             temperature: Randomness setting. Lower values make output more deterministic.
             top_p: Alternative to temperature. Controls nucleus sampling.
             barge_confidence: ASR confidence to interrupt. Higher values make it harder to interrupt.
@@ -394,6 +395,7 @@ class AIConfigMixin:
 
         Example:
             agent.set_prompt_llm_params(
+                model="nova-micro",  # Using Amazon's nova-micro model
                 temperature=0.7,
                 top_p=0.9,
                 barge_confidence=0.6
@@ -413,6 +415,7 @@ class AIConfigMixin:
         The server will validate and apply parameters based on the target model's capabilities.
 
         Common parameters include:
+            model: The AI model to use (gpt-4o-mini, gpt-4.1-mini, gpt-4.1-nano, nova-micro, nova-lite)
             temperature: Randomness setting. Lower values make output more deterministic.
             top_p: Alternative to temperature. Controls nucleus sampling.
             presence_penalty: Topic diversity. Positive values encourage new topics.
@@ -427,6 +430,7 @@ class AIConfigMixin:
 
         Example:
            agent.set_post_prompt_llm_params(
+                model="gpt-4o-mini",
                temperature=0.5,  # More deterministic for post-prompt
                top_p=0.9
            )
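Taken together, these docstring hunks document a new optional model keyword on both prompt-parameter setters. A minimal usage sketch, assuming an AgentBase subclass as in the SDK examples (the agent name and constructor arguments below are illustrative, not from this diff):

    from signalwire_agents import AgentBase

    class SupportAgent(AgentBase):
        def __init__(self):
            super().__init__(name="support")
            # Main-prompt LLM; allowed values per the schema change below:
            # gpt-4o-mini, gpt-4.1-mini, gpt-4.1-nano, nova-micro, nova-lite
            self.set_prompt_llm_params(
                model="nova-micro",
                temperature=0.7,
                top_p=0.9,
                barge_confidence=0.6,
            )
            # The post-prompt can target a different model
            self.set_post_prompt_llm_params(
                model="gpt-4o-mini",
                temperature=0.5,
                top_p=0.9,
            )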
signalwire_agents/schema.json
CHANGED
@@ -1925,9 +1925,17 @@
         {
           "type": "string",
           "const": "gpt-4.1-nano"
+        },
+        {
+          "type": "string",
+          "const": "nova-micro"
+        },
+        {
+          "type": "string",
+          "const": "nova-lite"
         }
       ],
-      "description": "The model to use for the AI. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`,
+      "description": "The model to use for the AI. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`, `gpt-4.1-nano`, `nova-micro`, and `nova-lite`."
     },
     "ai_volume": {
       "anyOf": [
@@ -4005,6 +4013,18 @@
           "maximum": 2,
           "description": "Aversion to repeating lines. Float value between -2.0 and 2.0. Positive values decrease the model's likelihood to repeat the same line verbatim."
         },
+        "model": {
+          "anyOf": [
+            {
+              "type": "string",
+              "enum": ["gpt-4o-mini", "gpt-4.1-mini", "gpt-4.1-nano", "nova-micro", "nova-lite"]
+            },
+            {
+              "$ref": "#/$defs/SWMLVar"
+            }
+          ],
+          "description": "The model to use for the post-prompt. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`, `gpt-4.1-nano`, `nova-micro`, and `nova-lite`."
+        },
         "text": {
           "type": "string",
           "description": "The instructions to send to the agent."
@@ -4084,6 +4104,18 @@
           "maximum": 2,
           "description": "Aversion to repeating lines. Float value between -2.0 and 2.0. Positive values decrease the model's likelihood to repeat the same line verbatim."
         },
+        "model": {
+          "anyOf": [
+            {
+              "type": "string",
+              "enum": ["gpt-4o-mini", "gpt-4.1-mini", "gpt-4.1-nano", "nova-micro", "nova-lite"]
+            },
+            {
+              "$ref": "#/$defs/SWMLVar"
+            }
+          ],
+          "description": "The model to use for the post-prompt. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`, `gpt-4.1-nano`, `nova-micro`, and `nova-lite`."
+        },
         "pom": {
           "type": "array",
           "items": {
@@ -4166,6 +4198,18 @@
           "maximum": 2,
           "description": "Aversion to repeating lines. Float value between -2.0 and 2.0. Positive values decrease the model's likelihood to repeat the same line verbatim."
         },
+        "model": {
+          "anyOf": [
+            {
+              "type": "string",
+              "enum": ["gpt-4o-mini", "gpt-4.1-mini", "gpt-4.1-nano", "nova-micro", "nova-lite"]
+            },
+            {
+              "$ref": "#/$defs/SWMLVar"
+            }
+          ],
+          "description": "The model to use for the prompt. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`, `gpt-4.1-nano`, `nova-micro`, and `nova-lite`."
+        },
         "text": {
           "type": "string",
           "description": "The instructions to send to the agent."
@@ -4249,6 +4293,18 @@
           "maximum": 2,
           "description": "Aversion to repeating lines. Float value between -2.0 and 2.0. Positive values decrease the model's likelihood to repeat the same line verbatim."
         },
+        "model": {
+          "anyOf": [
+            {
+              "type": "string",
+              "enum": ["gpt-4o-mini", "gpt-4.1-mini", "gpt-4.1-nano", "nova-micro", "nova-lite"]
+            },
+            {
+              "$ref": "#/$defs/SWMLVar"
+            }
+          ],
+          "description": "The model to use for the prompt. Allowed values are `gpt-4o-mini`, `gpt-4.1-mini`, `gpt-4.1-nano`, `nova-micro`, and `nova-lite`."
+        },
         "pom": {
           "type": "array",
           "items": {
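Each prompt/post-prompt params block now accepts the same model property (a fixed enum or a SWML substitution variable). A small sketch of validating a payload against just that fragment with the jsonschema package; the fragment is hand-copied from the hunks above, and the SWMLVar $ref branch is omitted to keep the sketch standalone:

    import jsonschema

    # Hand-copied "model" fragment from the schema hunks above (enum branch only)
    params_fragment = {
        "type": "object",
        "properties": {
            "model": {
                "type": "string",
                "enum": ["gpt-4o-mini", "gpt-4.1-mini", "gpt-4.1-nano", "nova-micro", "nova-lite"],
            }
        },
    }

    jsonschema.validate({"model": "nova-lite"}, params_fragment)   # accepted
    # jsonschema.validate({"model": "gpt-3.5"}, params_fragment)   # would raise ValidationError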
signalwire_agents/search/__init__.py
CHANGED
@@ -68,6 +68,8 @@ if _SEARCH_AVAILABLE:
         from .index_builder import IndexBuilder
         from .search_engine import SearchEngine
         from .search_service import SearchService
+        from .models import MODEL_ALIASES, DEFAULT_MODEL, resolve_model_alias
+        from .migration import SearchIndexMigrator
 
         __all__ = [
             'preprocess_query',
@@ -75,7 +77,11 @@ if _SEARCH_AVAILABLE:
             'DocumentProcessor',
             'IndexBuilder',
             'SearchEngine',
-            'SearchService'
+            'SearchService',
+            'MODEL_ALIASES',
+            'DEFAULT_MODEL',
+            'resolve_model_alias',
+            'SearchIndexMigrator'
         ]
     except ImportError as e:
         # Some search components failed to import
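The new names are only importable when the optional search dependencies are installed (the _SEARCH_AVAILABLE guard above). A quick sketch of picking them up from the package; treating MODEL_ALIASES as a mapping and calling resolve_model_alias with a string alias are assumptions here, since neither signature appears in this diff:

    from signalwire_agents.search import (
        MODEL_ALIASES,
        DEFAULT_MODEL,
        resolve_model_alias,
        SearchIndexMigrator,
    )

    print(DEFAULT_MODEL)           # default embedding model name
    print(sorted(MODEL_ALIASES))   # registered alias names, assuming a dict-like mapping

    # Hypothetical call shape, not confirmed by this diff:
    # model_name = resolve_model_alias("mini")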
signalwire_agents/search/document_processor.py
CHANGED
@@ -1075,7 +1075,7 @@ class DocumentProcessor:
                 json_metadata = json_chunk.get('metadata', {})
                 chunk_type = json_chunk.get('type', 'content')
 
-                # Build chunk metadata
+                # Build chunk metadata (excluding tags which go at top level)
                 metadata = {
                     'chunk_method': 'json',
                     'chunk_index': idx,
@@ -1083,7 +1083,11 @@ class DocumentProcessor:
                     'original_chunk_id': json_chunk.get('chunk_id', f'chunk_{idx}')
                 }
 
-                #
+                # Extract tags before merging metadata
+                tags = json_metadata.get('tags', [])
+
+                # Merge JSON metadata (this includes all fields including tags)
+                # We'll keep tags in metadata for backward compatibility but also set at top level
                 metadata.update(json_metadata)
 
                 # Determine section name
@@ -1100,12 +1104,11 @@ class DocumentProcessor:
                     metadata=metadata
                 )
 
-                #
-                if
-                chunk['tags'] =
-
-
-                if chunk_type == 'toc' and 'tags' not in chunk:
+                # Set tags at the top level for proper tag filtering
+                if tags:
+                    chunk['tags'] = tags
+                elif chunk_type == 'toc':
+                    # For TOC entries, add special tags if none provided
                     chunk['tags'] = ['toc', 'navigation']
 
                 chunks.append(chunk)
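The net effect of this change is that tags declared inside a JSON chunk's metadata are surfaced at the top level of the produced chunk, with ['toc', 'navigation'] as a fallback for TOC entries. A standalone sketch of that logic, simplified but using the same field names as the hunks above:

    def promote_tags(json_chunk: dict, chunk: dict) -> dict:
        """Mirror of the new tag handling: tags stay in metadata for backward
        compatibility but are also exposed at the top level of the chunk."""
        json_metadata = json_chunk.get('metadata', {})
        chunk_type = json_chunk.get('type', 'content')

        tags = json_metadata.get('tags', [])
        if tags:
            chunk['tags'] = tags
        elif chunk_type == 'toc':
            # TOC entries get navigation tags when none are provided
            chunk['tags'] = ['toc', 'navigation']
        return chunk

    chunk = promote_tags({'type': 'toc', 'metadata': {}}, {'content': 'Index'})
    # chunk['tags'] == ['toc', 'navigation']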
signalwire_agents/search/index_builder.py
CHANGED
@@ -85,9 +85,6 @@ class IndexBuilder:
         if self.backend not in ['sqlite', 'pgvector']:
             raise ValueError(f"Invalid backend '{self.backend}'. Must be 'sqlite' or 'pgvector'")
 
-        if self.backend == 'pgvector' and not self.connection_string:
-            raise ValueError("connection_string is required for pgvector backend")
-
         # Validate NLP backend
         if self.index_nlp_backend not in ['nltk', 'spacy']:
             logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
@@ -105,6 +102,50 @@ class IndexBuilder:
             topic_threshold=self.topic_threshold
         )
 
+    def _extract_metadata_from_json_content(self, content: str) -> tuple[Dict[str, Any], str]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            (metadata_dict, metadata_text)
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                # Look for metadata object in content
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        # Create searchable text from all metadata keys and values
+        metadata_text_parts = []
+        for key, value in metadata_dict.items():
+            # Add key
+            metadata_text_parts.append(str(key))
+            # Add value(s)
+            if isinstance(value, list):
+                metadata_text_parts.extend(str(v) for v in value)
+            else:
+                metadata_text_parts.append(str(value))
+
+        metadata_text = ' '.join(metadata_text_parts).lower()
+
+        return metadata_dict, metadata_text
+
     def _load_model(self):
         """Load embedding model (lazy loading)"""
         if self.model is None:
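The new _extract_metadata_from_json_content helper scans raw chunk text for embedded "metadata" objects and flattens their keys and values into a lowercase searchable string. A rough standalone illustration of the same regex approach, outside the class (the sample content string is invented for the sketch):

    import json
    import re

    content = '{"text": "Reset your password", "metadata": {"tags": ["auth", "howto"], "product": "console"}}'

    pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
    metadata = {}
    for match in re.finditer(pattern, content):
        try:
            found = json.loads(match.group(1))
            if isinstance(found, dict):
                metadata.update(found)
        except json.JSONDecodeError:
            continue

    # Flatten keys and values into one lowercase string, as the helper does
    parts = []
    for key, value in metadata.items():
        parts.append(str(key))
        if isinstance(value, list):
            parts.extend(str(v) for v in value)
        else:
            parts.append(str(value))

    metadata_text = ' '.join(parts).lower()
    # metadata_text == 'tags auth howto product console'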
@@ -147,6 +188,7 @@ class IndexBuilder:
 
         # Process documents
         chunks = []
+        print(f"Processing {len(files)} files...")
         for file_path in files:
             try:
                 # For individual files, use the file's parent as the base directory
@@ -154,8 +196,8 @@ class IndexBuilder:
                 base_dir = self._get_base_directory_for_file(file_path, sources)
                 file_chunks = self._process_file(file_path, base_dir, tags)
                 chunks.extend(file_chunks)
-                if self.verbose:
-                    print(f"
+                if self.verbose or file_path.suffix == '.json':
+                    print(f" {file_path}: {len(file_chunks)} chunks")
             except Exception as e:
                 logger.error(f"Error processing {file_path}: {e}")
                 if self.verbose:
@@ -171,7 +213,9 @@ class IndexBuilder:
         # Generate embeddings
         self._load_model()
         if self.verbose:
-            print("Generating embeddings...")
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+        else:
+            print(f"Generating embeddings for {len(chunks)} chunks...")
 
         for i, chunk in enumerate(chunks):
             try:
@@ -183,15 +227,33 @@ class IndexBuilder:
                 )
 
                 chunk['processed_content'] = processed['enhanced_text']
-
+
+                # Include tags in keywords for better search matching
+                keywords = processed.get('keywords', [])
+                chunk_tags = chunk.get('tags', [])
+                if chunk_tags:
+                    # Add tags to keywords list for FTS matching
+                    keywords.extend(chunk_tags)
+                    # Remove duplicates while preserving order
+                    keywords = list(dict.fromkeys(keywords))
+
+                chunk['keywords'] = keywords
+
+                # For embedding, include tags in the text for better semantic matching
+                embedding_text = processed['enhanced_text']
+                if chunk_tags:
+                    # Append tags to the text for embedding generation
+                    embedding_text += " " + " ".join(chunk_tags)
 
                 # Generate embedding (suppress progress bar)
-                embedding = self.model.encode(
+                embedding = self.model.encode(embedding_text, show_progress_bar=False)
                 chunk['embedding'] = embedding.tobytes()
 
-
+                # Show progress more frequently
+                show_every = 50 if len(chunks) > 500 else max(10, len(chunks) // 10)
+                if (i + 1) % show_every == 0 or (i + 1) == len(chunks):
                     progress_pct = ((i + 1) / len(chunks)) * 100
-                    print(f"
+                    print(f" Progress: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")
 
             except Exception as e:
                 logger.error(f"Error processing chunk {i}: {e}")
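This hunk folds chunk tags into both the keyword list used for FTS matching and the text that gets embedded, so tag-only queries can match lexically and semantically. A condensed sketch of that step; the function name and signature are invented for illustration, and model.encode is assumed to be a sentence-transformers style encoder as used elsewhere in the builder:

    def prepare_chunk_for_index(chunk: dict, enhanced_text: str, keywords: list, model) -> dict:
        chunk_tags = chunk.get('tags', [])

        if chunk_tags:
            # Tags participate in keyword (FTS) matching, de-duplicated in order
            keywords = list(dict.fromkeys(keywords + chunk_tags))
        chunk['keywords'] = keywords

        # Tags are appended to the embedded text for semantic matching
        embedding_text = enhanced_text
        if chunk_tags:
            embedding_text += " " + " ".join(chunk_tags)

        embedding = model.encode(embedding_text, show_progress_bar=False)
        chunk['embedding'] = embedding.tobytes()
        return chunk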
@@ -485,6 +547,7 @@ class IndexBuilder:
                 end_line INTEGER,
                 tags TEXT,
                 metadata TEXT,
+                metadata_text TEXT, -- Searchable text representation of all metadata
                 chunk_hash TEXT UNIQUE,
                 created_at TEXT DEFAULT CURRENT_TIMESTAMP
             )
@@ -494,6 +557,7 @@ class IndexBuilder:
             CREATE VIRTUAL TABLE chunks_fts USING fts5(
                 processed_content,
                 keywords,
+                metadata_text,
                 content='chunks',
                 content_rowid='id'
             )
@@ -555,13 +619,47 @@ class IndexBuilder:
             # Prepare data
             keywords_json = json.dumps(chunk.get('keywords', []))
             tags_json = json.dumps(chunk.get('tags', []))
-
+
+            # Extract metadata from JSON content and merge with chunk metadata
+            json_metadata, json_metadata_text = self._extract_metadata_from_json_content(chunk['content'])
+            chunk_metadata = chunk.get('metadata', {})
+
+            # Merge metadata: chunk metadata takes precedence
+            merged_metadata = {**json_metadata, **chunk_metadata}
+            metadata_json = json.dumps(merged_metadata)
+
+            # Create comprehensive metadata_text including tags
+            metadata_text_parts = []
+
+            # Add metadata text from JSON content
+            if json_metadata_text:
+                metadata_text_parts.append(json_metadata_text)
+
+            # Add tags
+            tags = chunk.get('tags', [])
+            if tags:
+                metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+            # Add section if present
+            if chunk.get('section'):
+                metadata_text_parts.append(chunk['section'].lower())
+
+            # Add any additional metadata values
+            for key, value in chunk_metadata.items():
+                if key not in json_metadata:  # Avoid duplicates
+                    metadata_text_parts.append(str(key).lower())
+                    if isinstance(value, list):
+                        metadata_text_parts.extend(str(v).lower() for v in value)
+                    else:
+                        metadata_text_parts.append(str(value).lower())
+
+            metadata_text = ' '.join(metadata_text_parts)
 
             cursor.execute('''
                 INSERT OR IGNORE INTO chunks (
                     content, processed_content, keywords, language, embedding,
-                    filename, section, start_line, end_line, tags, metadata, chunk_hash
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    filename, section, start_line, end_line, tags, metadata, metadata_text, chunk_hash
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
                 chunk['content'],
                 chunk.get('processed_content', chunk['content']),
@@ -574,6 +672,7 @@ class IndexBuilder:
                 chunk.get('end_line'),
                 tags_json,
                 metadata_json,
+                metadata_text,
                 chunk_hash
             ))
 
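Net effect on the SQLite backend: a new metadata_text column on chunks, mirrored into the FTS5 table and the INSERT statement. A minimal sketch of the resulting schema, reduced to the columns shown in these hunks (the real table carries more columns, e.g. content, language, embedding, filename); it assumes an SQLite build with FTS5, which stock CPython builds generally include:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.executescript("""
        CREATE TABLE chunks (
            id INTEGER PRIMARY KEY,
            content TEXT,
            processed_content TEXT,
            keywords TEXT,
            tags TEXT,
            metadata TEXT,
            metadata_text TEXT,   -- searchable text representation of all metadata
            chunk_hash TEXT UNIQUE,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP
        );
        CREATE VIRTUAL TABLE chunks_fts USING fts5(
            processed_content,
            keywords,
            metadata_text,
            content='chunks',
            content_rowid='id'
        );
    """)

    # Illustrative row; values are invented, and populating chunks_fts from the
    # content table would still require the builder's own sync logic or triggers.
    conn.execute(
        "INSERT INTO chunks (content, keywords, tags, metadata, metadata_text, chunk_hash) "
        "VALUES (?, ?, ?, ?, ?, ?)",
        ("Reset your password", '["auth"]', '["auth", "howto"]',
         '{"product": "console"}', 'auth howto product console', 'hash-1'),
    )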