signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +99 -15
- signalwire_agents/agent_server.py +248 -60
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +9 -0
- signalwire_agents/cli/build_search.py +951 -41
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/dokku.py +2320 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +2636 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +566 -2366
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +845 -2916
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +418 -0
- signalwire_agents/core/data_map.py +3 -15
- signalwire_agents/core/function_result.py +116 -44
- signalwire_agents/core/logging_config.py +162 -18
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +280 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +460 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1142 -0
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +84 -1
- signalwire_agents/core/skill_manager.py +62 -20
- signalwire_agents/core/swaig_function.py +18 -5
- signalwire_agents/core/swml_builder.py +207 -11
- signalwire_agents/core/swml_handler.py +27 -21
- signalwire_agents/core/swml_renderer.py +123 -312
- signalwire_agents/core/swml_service.py +171 -203
- signalwire_agents/mcp_gateway/__init__.py +29 -0
- signalwire_agents/mcp_gateway/gateway_service.py +564 -0
- signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
- signalwire_agents/mcp_gateway/session_manager.py +218 -0
- signalwire_agents/prefabs/concierge.py +0 -3
- signalwire_agents/prefabs/faq_bot.py +0 -3
- signalwire_agents/prefabs/info_gatherer.py +0 -3
- signalwire_agents/prefabs/receptionist.py +0 -3
- signalwire_agents/prefabs/survey.py +0 -3
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +490 -31
- signalwire_agents/search/index_builder.py +307 -37
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +748 -0
- signalwire_agents/search/query_processor.py +162 -31
- signalwire_agents/search/search_engine.py +916 -35
- signalwire_agents/search/search_service.py +376 -53
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +14 -2
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/skill.py +84 -3
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +9 -0
- signalwire_agents/skills/datetime/skill.py +20 -7
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +9 -0
- signalwire_agents/skills/joke/skill.py +21 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +9 -0
- signalwire_agents/skills/math/skill.py +18 -4
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +9 -0
- signalwire_agents/skills/native_vector_search/skill.py +569 -101
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +395 -40
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +9 -0
- signalwire_agents/skills/web_search/skill.py +586 -112
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
- signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
- signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents/skills/wikipedia/__init__.py +0 -9
- signalwire_agents-0.1.13.data/data/schema.json +0 -5611
- signalwire_agents-0.1.13.dist-info/RECORD +0 -67
- signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
signalwire_agents/search/pgvector_backend.py (new file)
@@ -0,0 +1,748 @@
"""
Copyright (c) 2025 SignalWire

This file is part of the SignalWire AI Agents SDK.

Licensed under the MIT License.
See LICENSE file in the project root for full license information.
"""

import json
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime

try:
    import psycopg2
    from psycopg2.extras import execute_values
    from pgvector.psycopg2 import register_vector
    PGVECTOR_AVAILABLE = True
except ImportError:
    PGVECTOR_AVAILABLE = False
    psycopg2 = None
    register_vector = None

try:
    import numpy as np
except ImportError:
    np = None

logger = logging.getLogger(__name__)


class PgVectorBackend:
    """PostgreSQL pgvector backend for search indexing and retrieval"""

    def __init__(self, connection_string: str):
        """
        Initialize pgvector backend

        Args:
            connection_string: PostgreSQL connection string
        """
        if not PGVECTOR_AVAILABLE:
            raise ImportError(
                "pgvector dependencies not available. Install with: "
                "pip install psycopg2-binary pgvector"
            )

        self.connection_string = connection_string
        self.conn = None
        self._connect()

    def _connect(self):
        """Establish database connection"""
        try:
            self.conn = psycopg2.connect(self.connection_string)
            register_vector(self.conn)
            logger.info("Connected to PostgreSQL database")
        except Exception as e:
            error_msg = str(e)
            if "vector type not found" in error_msg:
                logger.error(
                    "pgvector extension not installed in database. "
                    "Run: CREATE EXTENSION IF NOT EXISTS vector;"
                )
            else:
                logger.error(f"Failed to connect to database: {e}")
            raise

    def _ensure_connection(self):
        """Ensure database connection is active"""
        if self.conn is None or self.conn.closed:
            self._connect()

    def create_schema(self, collection_name: str, embedding_dim: int = 768):
        """
        Create database schema for a collection

        Args:
            collection_name: Name of the collection
            embedding_dim: Dimension of embeddings
        """
        self._ensure_connection()

        with self.conn.cursor() as cursor:
            # Create extensions
            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
            cursor.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")

            # Create table
            table_name = f"chunks_{collection_name}"
            cursor.execute(f"""
                CREATE TABLE IF NOT EXISTS {table_name} (
                    id SERIAL PRIMARY KEY,
                    content TEXT NOT NULL,
                    processed_content TEXT,
                    embedding vector({embedding_dim}),
                    filename TEXT,
                    section TEXT,
                    tags JSONB DEFAULT '[]'::jsonb,
                    metadata JSONB DEFAULT '{{}}'::jsonb,
                    metadata_text TEXT, -- Searchable text representation of all metadata
                    created_at TIMESTAMP DEFAULT NOW()
                )
            """)

            # Create indexes
            cursor.execute(f"""
                CREATE INDEX IF NOT EXISTS idx_{table_name}_embedding
                ON {table_name} USING ivfflat (embedding vector_cosine_ops)
                WITH (lists = 100)
            """)

            cursor.execute(f"""
                CREATE INDEX IF NOT EXISTS idx_{table_name}_content
                ON {table_name} USING gin (content gin_trgm_ops)
            """)

            cursor.execute(f"""
                CREATE INDEX IF NOT EXISTS idx_{table_name}_tags
                ON {table_name} USING gin (tags)
            """)

            cursor.execute(f"""
                CREATE INDEX IF NOT EXISTS idx_{table_name}_metadata
                ON {table_name} USING gin (metadata)
            """)

            cursor.execute(f"""
                CREATE INDEX IF NOT EXISTS idx_{table_name}_metadata_text
                ON {table_name} USING gin (metadata_text gin_trgm_ops)
            """)

            # Create config table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS collection_config (
                    collection_name TEXT PRIMARY KEY,
                    model_name TEXT,
                    embedding_dimensions INTEGER,
                    chunking_strategy TEXT,
                    languages JSONB,
                    created_at TIMESTAMP DEFAULT NOW(),
                    metadata JSONB DEFAULT '{}'::jsonb
                )
            """)

            self.conn.commit()
            logger.info(f"Created schema for collection '{collection_name}'")

    def _extract_metadata_from_json_content(self, content: str) -> Dict[str, Any]:
        """
        Extract metadata from JSON content if present

        Returns:
            metadata_dict
        """
        metadata_dict = {}

        # Try to extract metadata from JSON structure in content
        if '"metadata":' in content:
            try:
                import re
                # Find all metadata objects
                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
                matches = re.finditer(pattern, content)

                for match in matches:
                    try:
                        json_metadata = json.loads(match.group(1))
                        # Merge all found metadata
                        if isinstance(json_metadata, dict):
                            metadata_dict.update(json_metadata)
                    except:
                        pass
            except Exception as e:
                logger.debug(f"Error extracting JSON metadata: {e}")

        return metadata_dict

    def store_chunks(self, chunks: List[Dict[str, Any]], collection_name: str,
                     config: Dict[str, Any]):
        """
        Store document chunks in the database

        Args:
            chunks: List of processed chunks with embeddings
            collection_name: Name of the collection
            config: Configuration metadata
        """
        self._ensure_connection()

        table_name = f"chunks_{collection_name}"

        # Prepare data for batch insert
        data = []
        for chunk in chunks:
            embedding = chunk.get('embedding')
            if embedding is not None:
                # Convert to list if it's a numpy array
                if hasattr(embedding, 'tolist'):
                    embedding = embedding.tolist()

            metadata = chunk.get('metadata', {})

            # Extract fields - they might be at top level or in metadata
            filename = chunk.get('filename') or metadata.get('filename', '')
            section = chunk.get('section') or metadata.get('section', '')
            tags = chunk.get('tags', []) or metadata.get('tags', [])

            # Extract metadata from JSON content and merge with chunk metadata
            json_metadata = self._extract_metadata_from_json_content(chunk['content'])

            # Build metadata from all fields except the ones we store separately
            chunk_metadata = {}
            for key, value in chunk.items():
                if key not in ['content', 'processed_content', 'embedding', 'filename', 'section', 'tags']:
                    chunk_metadata[key] = value
            # Also include any extra metadata
            for key, value in metadata.items():
                if key not in ['filename', 'section', 'tags']:
                    chunk_metadata[key] = value

            # Merge metadata: chunk metadata takes precedence over JSON metadata
            merged_metadata = {**json_metadata, **chunk_metadata}

            # Create searchable metadata text
            metadata_text_parts = []

            # Add all metadata keys and values
            for key, value in merged_metadata.items():
                metadata_text_parts.append(str(key).lower())
                if isinstance(value, list):
                    metadata_text_parts.extend(str(v).lower() for v in value)
                else:
                    metadata_text_parts.append(str(value).lower())

            # Add tags
            if tags:
                metadata_text_parts.extend(str(tag).lower() for tag in tags)

            # Add section if present
            if section:
                metadata_text_parts.append(section.lower())

            metadata_text = ' '.join(metadata_text_parts)

            data.append((
                chunk['content'],
                chunk.get('processed_content', chunk['content']),
                embedding,
                filename,
                section,
                json.dumps(tags),
                json.dumps(merged_metadata),
                metadata_text
            ))

        # Batch insert chunks
        with self.conn.cursor() as cursor:
            execute_values(
                cursor,
                f"""
                INSERT INTO {table_name}
                (content, processed_content, embedding, filename, section, tags, metadata, metadata_text)
                VALUES %s
                """,
                data,
                template="(%s, %s, %s, %s, %s, %s::jsonb, %s::jsonb, %s)"
            )

            # Update or insert config
            cursor.execute("""
                INSERT INTO collection_config
                (collection_name, model_name, embedding_dimensions, chunking_strategy,
                 languages, metadata)
                VALUES (%s, %s, %s, %s, %s, %s)
                ON CONFLICT (collection_name)
                DO UPDATE SET
                    model_name = EXCLUDED.model_name,
                    embedding_dimensions = EXCLUDED.embedding_dimensions,
                    chunking_strategy = EXCLUDED.chunking_strategy,
                    languages = EXCLUDED.languages,
                    metadata = EXCLUDED.metadata
            """, (
                collection_name,
                config.get('model_name'),
                config.get('embedding_dimensions'),
                config.get('chunking_strategy'),
                json.dumps(config.get('languages', [])),
                json.dumps(config.get('metadata', {}))
            ))

            self.conn.commit()
            logger.info(f"Stored {len(chunks)} chunks in collection '{collection_name}'")

    def get_stats(self, collection_name: str) -> Dict[str, Any]:
        """Get statistics for a collection"""
        self._ensure_connection()

        table_name = f"chunks_{collection_name}"

        with self.conn.cursor() as cursor:
            # Get chunk count
            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
            total_chunks = cursor.fetchone()[0]

            # Get unique files
            cursor.execute(f"SELECT COUNT(DISTINCT filename) FROM {table_name}")
            total_files = cursor.fetchone()[0]

            # Get config
            cursor.execute(
                "SELECT * FROM collection_config WHERE collection_name = %s",
                (collection_name,)
            )
            config_row = cursor.fetchone()

            if config_row:
                config = {
                    'model_name': config_row[1],
                    'embedding_dimensions': config_row[2],
                    'chunking_strategy': config_row[3],
                    'languages': config_row[4],
                    'created_at': config_row[5].isoformat() if config_row[5] else None,
                    'metadata': config_row[6]
                }
            else:
                config = {}

            return {
                'total_chunks': total_chunks,
                'total_files': total_files,
                'config': config
            }

    def list_collections(self) -> List[str]:
        """List all collections in the database"""
        self._ensure_connection()

        with self.conn.cursor() as cursor:
            cursor.execute("SELECT collection_name FROM collection_config ORDER BY collection_name")
            return [row[0] for row in cursor.fetchall()]

    def delete_collection(self, collection_name: str):
        """Delete a collection and its data"""
        self._ensure_connection()

        table_name = f"chunks_{collection_name}"

        with self.conn.cursor() as cursor:
            cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
            cursor.execute(
                "DELETE FROM collection_config WHERE collection_name = %s",
                (collection_name,)
            )
            self.conn.commit()
            logger.info(f"Deleted collection '{collection_name}'")

    def close(self):
        """Close database connection"""
        if self.conn and not self.conn.closed:
            self.conn.close()
            logger.info("Closed database connection")
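For orientation, a minimal indexing sketch against PgVectorBackend follows. It is illustrative, not code from the package: it assumes a reachable PostgreSQL database that already has the vector extension installed (the class logs a hint if it does not), and the connection string, collection name, model name, and toy 3-dimension embedding are placeholder assumptions (the schema defaults to 768 dimensions):

# Illustrative indexing sketch; all names and values below are placeholders.
from signalwire_agents.search.pgvector_backend import PgVectorBackend

backend = PgVectorBackend("postgresql://user:pass@localhost:5432/search")  # placeholder DSN
backend.create_schema("docs", embedding_dim=3)  # toy dimension; default is 768

chunks = [{
    "content": "Example chunk text for the docs collection.",
    "embedding": [0.1, 0.2, 0.3],               # would come from an embedding model
    "filename": "example.md",
    "section": "overview",
    "tags": ["docs"],
}]
config = {
    "model_name": "example-embedding-model",     # placeholder
    "embedding_dimensions": 3,
    "chunking_strategy": "sentence",
    "languages": ["en"],
}
backend.store_chunks(chunks, "docs", config)
backend.close()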
class PgVectorSearchBackend:
    """PostgreSQL pgvector backend for search operations"""

    def __init__(self, connection_string: str, collection_name: str):
        """
        Initialize search backend

        Args:
            connection_string: PostgreSQL connection string
            collection_name: Name of the collection to search
        """
        if not PGVECTOR_AVAILABLE:
            raise ImportError(
                "pgvector dependencies not available. Install with: "
                "pip install psycopg2-binary pgvector"
            )

        self.connection_string = connection_string
        self.collection_name = collection_name
        self.table_name = f"chunks_{collection_name}"
        self.conn = None
        self._connect()
        self.config = self._load_config()

    def _connect(self):
        """Establish database connection"""
        try:
            self.conn = psycopg2.connect(self.connection_string)
            register_vector(self.conn)
        except Exception as e:
            logger.error(f"Failed to connect to database: {e}")
            raise

    def _ensure_connection(self):
        """Ensure database connection is active"""
        if self.conn is None or self.conn.closed:
            self._connect()

    def _load_config(self) -> Dict[str, Any]:
        """Load collection configuration"""
        self._ensure_connection()

        with self.conn.cursor() as cursor:
            cursor.execute(
                "SELECT * FROM collection_config WHERE collection_name = %s",
                (self.collection_name,)
            )
            row = cursor.fetchone()

            if row:
                return {
                    'model_name': row[1],
                    'embedding_dimensions': row[2],
                    'chunking_strategy': row[3],
                    'languages': row[4],
                    'metadata': row[6]
                }
            return {}

    def search(self, query_vector: List[float], enhanced_text: str,
               count: int = 5, similarity_threshold: float = 0.0,
               tags: Optional[List[str]] = None,
               keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
        """
        Perform hybrid search (vector + keyword + metadata)

        Args:
            query_vector: Embedding vector for the query
            enhanced_text: Processed query text for keyword search
            count: Number of results to return
            similarity_threshold: Minimum similarity score
            tags: Filter by tags
            keyword_weight: Manual keyword weight (0.0-1.0). If None, uses default weighting

        Returns:
            List of search results with scores and metadata
        """
        self._ensure_connection()

        # Extract query terms for metadata search
        query_terms = enhanced_text.lower().split()

        # Vector search
        vector_results = self._vector_search(query_vector, count * 2, tags)

        # Apply similarity threshold to raw vector scores BEFORE weighting
        # This ensures threshold behaves intuitively (filters on actual similarity, not weighted score)
        if similarity_threshold > 0:
            vector_results = [r for r in vector_results if r['score'] >= similarity_threshold]

        # Keyword search
        keyword_results = self._keyword_search(enhanced_text, count * 2, tags)

        # Metadata search
        metadata_results = self._metadata_search(query_terms, count * 2, tags)

        # Merge all results (threshold already applied to vector results)
        merged_results = self._merge_all_results(vector_results, keyword_results, metadata_results, keyword_weight)

        # Ensure 'score' field exists for CLI compatibility
        for r in merged_results:
            if 'score' not in r:
                r['score'] = r.get('final_score', 0.0)

        return merged_results[:count]

    def _vector_search(self, query_vector: List[float], count: int,
                       tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Perform vector similarity search"""
        with self.conn.cursor() as cursor:
            # Set ef_search for HNSW index to ensure we get enough results
            # ef_search must be at least as large as the LIMIT
            cursor.execute(f"SET LOCAL hnsw.ef_search = {max(count, 40)}")
            # Build query
            query = f"""
                SELECT id, content, filename, section, tags, metadata,
                       1 - (embedding <=> %s::vector) as similarity
                FROM {self.table_name}
                WHERE embedding IS NOT NULL
            """

            params = [query_vector]

            # Add tag filter if specified
            if tags:
                query += " AND tags ?| %s"
                params.append(tags)

            query += " ORDER BY embedding <=> %s::vector LIMIT %s"
            params.extend([query_vector, count])

            cursor.execute(query, params)

            results = []
            for row in cursor.fetchall():
                chunk_id, content, filename, section, tags_json, metadata_json, similarity = row

                results.append({
                    'id': chunk_id,
                    'content': content,
                    'score': float(similarity),
                    'metadata': {
                        'filename': filename,
                        'section': section,
                        'tags': tags_json if isinstance(tags_json, list) else [],
                        **metadata_json
                    },
                    'search_type': 'vector'
                })

            return results

    def _keyword_search(self, enhanced_text: str, count: int,
                        tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """Perform full-text search"""
        with self.conn.cursor() as cursor:
            # Use PostgreSQL text search
            query = f"""
                SELECT id, content, filename, section, tags, metadata,
                       ts_rank(to_tsvector('english', content),
                               plainto_tsquery('english', %s)) as rank
                FROM {self.table_name}
                WHERE to_tsvector('english', content) @@ plainto_tsquery('english', %s)
            """

            params = [enhanced_text, enhanced_text]

            # Add tag filter if specified
            if tags:
                query += " AND tags ?| %s"
                params.append(tags)

            query += " ORDER BY rank DESC LIMIT %s"
            params.append(count)

            cursor.execute(query, params)

            results = []
            for row in cursor.fetchall():
                chunk_id, content, filename, section, tags_json, metadata_json, rank = row

                # Normalize rank to 0-1 score
                score = min(1.0, rank / 10.0)

                results.append({
                    'id': chunk_id,
                    'content': content,
                    'score': float(score),
                    'metadata': {
                        'filename': filename,
                        'section': section,
                        'tags': tags_json if isinstance(tags_json, list) else [],
                        **metadata_json
                    },
                    'search_type': 'keyword'
                })

            return results

    def _metadata_search(self, query_terms: List[str], count: int,
                         tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Perform metadata search using JSONB operators and metadata_text
        """
        with self.conn.cursor() as cursor:
            # Build WHERE conditions
            where_conditions = []
            params = []

            # Use metadata_text for trigram search
            if query_terms:
                # Create AND conditions for all terms
                for term in query_terms:
                    where_conditions.append(f"metadata_text ILIKE %s")
                    params.append(f'%{term}%')

            # Add tag filter if specified
            if tags:
                where_conditions.append("tags ?| %s")
                params.append(tags)

            # Build query
            where_clause = " AND ".join(where_conditions) if where_conditions else "1=1"

            query = f"""
                SELECT id, content, filename, section, tags, metadata,
                       metadata_text
                FROM {self.table_name}
                WHERE {where_clause}
                LIMIT %s
            """

            params.append(count)

            cursor.execute(query, params)

            results = []
            for row in cursor.fetchall():
                chunk_id, content, filename, section, tags_json, metadata_json, metadata_text = row

                # Calculate score based on term matches
                score = 0.0
                if metadata_text:
                    metadata_lower = metadata_text.lower()
                    for term in query_terms:
                        if term.lower() in metadata_lower:
                            score += 0.3  # Base score for each match

                # Bonus for exact matches in JSONB keys/values
                if metadata_json:
                    json_str = json.dumps(metadata_json).lower()
                    for term in query_terms:
                        if term.lower() in json_str:
                            score += 0.2

                # Normalize score
                score = min(1.0, score)

                results.append({
                    'id': chunk_id,
                    'content': content,
                    'score': float(score),
                    'metadata': {
                        'filename': filename,
                        'section': section,
                        'tags': tags_json if isinstance(tags_json, list) else [],
                        **metadata_json
                    },
                    'search_type': 'metadata'
                })

            # Sort by score
            results.sort(key=lambda x: x['score'], reverse=True)
            return results[:count]

    def _merge_results(self, vector_results: List[Dict[str, Any]],
                       keyword_results: List[Dict[str, Any]],
                       keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
        """Merge and rank results from vector and keyword search"""
        # Use provided weights or defaults
        if keyword_weight is None:
            keyword_weight = 0.3
        vector_weight = 1.0 - keyword_weight

        # Create a map to track unique results
        results_map = {}

        # Add vector results
        for result in vector_results:
            chunk_id = result['id']
            if chunk_id not in results_map:
                results_map[chunk_id] = result
                results_map[chunk_id]['score'] *= vector_weight
            else:
                # Combine scores if result appears in both
                results_map[chunk_id]['score'] += result['score'] * vector_weight

        # Add keyword results
        for result in keyword_results:
            chunk_id = result['id']
            if chunk_id not in results_map:
                results_map[chunk_id] = result
                results_map[chunk_id]['score'] *= keyword_weight
            else:
                # Combine scores if result appears in both
                results_map[chunk_id]['score'] += result['score'] * keyword_weight

        # Sort by combined score
        merged = list(results_map.values())
        merged.sort(key=lambda x: x['score'], reverse=True)

        return merged

    def _merge_all_results(self, vector_results: List[Dict[str, Any]],
                           keyword_results: List[Dict[str, Any]],
                           metadata_results: List[Dict[str, Any]],
                           keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
        """Merge and rank results from vector, keyword, and metadata search"""
        # Use provided weights or defaults
        if keyword_weight is None:
            keyword_weight = 0.3
        vector_weight = 0.5
        metadata_weight = 0.2

        # Create a map to track unique results
        results_map = {}
        all_sources = {}

        # Add vector results
        for result in vector_results:
            chunk_id = result['id']
            if chunk_id not in results_map:
                results_map[chunk_id] = result.copy()
                results_map[chunk_id]['score'] = result['score'] * vector_weight
                all_sources[chunk_id] = {'vector': result['score']}
            else:
                results_map[chunk_id]['score'] += result['score'] * vector_weight
                all_sources[chunk_id]['vector'] = result['score']

        # Add keyword results
        for result in keyword_results:
            chunk_id = result['id']
            if chunk_id not in results_map:
                results_map[chunk_id] = result.copy()
                results_map[chunk_id]['score'] = result['score'] * keyword_weight
                all_sources.setdefault(chunk_id, {})['keyword'] = result['score']
            else:
                results_map[chunk_id]['score'] += result['score'] * keyword_weight
                all_sources[chunk_id]['keyword'] = result['score']

        # Add metadata results
        for result in metadata_results:
            chunk_id = result['id']
            if chunk_id not in results_map:
                results_map[chunk_id] = result.copy()
                results_map[chunk_id]['score'] = result['score'] * metadata_weight
                all_sources.setdefault(chunk_id, {})['metadata'] = result['score']
            else:
                results_map[chunk_id]['score'] += result['score'] * metadata_weight
                all_sources[chunk_id]['metadata'] = result['score']

        # Add sources to results for transparency
        for chunk_id, result in results_map.items():
            result['sources'] = all_sources.get(chunk_id, {})
            result['final_score'] = result['score']

        # Sort by combined score
        merged = list(results_map.values())
        merged.sort(key=lambda x: x['score'], reverse=True)

        return merged

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics for the collection"""
        backend = PgVectorBackend(self.connection_string)
        stats = backend.get_stats(self.collection_name)
        backend.close()
        return stats

    def close(self):
        """Close database connection"""
        if self.conn and not self.conn.closed:
            self.conn.close()
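A matching retrieval sketch for PgVectorSearchBackend, again illustrative rather than from the package, with the same placeholder DSN and collection as above; the query vector must match the collection's embedding dimensions:

# Illustrative query sketch; DSN, collection, and vector are placeholders.
from signalwire_agents.search.pgvector_backend import PgVectorSearchBackend

search = PgVectorSearchBackend("postgresql://user:pass@localhost:5432/search", "docs")
results = search.search(
    query_vector=[0.1, 0.2, 0.3],   # same toy dimensionality as the indexing sketch
    enhanced_text="example docs",
    count=3,
    tags=["docs"],                  # optional JSONB tag filter
)
for r in results:
    print(r["score"], r["metadata"]["filename"], r["content"][:60])
search.close()

For a sense of the hybrid weighting in _merge_all_results: with the defaults (vector 0.5, keyword 0.3, metadata 0.2), a chunk scoring 0.9 on vector similarity and 0.5 on keyword rank, with no metadata hit, merges to 0.9 * 0.5 + 0.5 * 0.3 = 0.60 before final ranking.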