signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (143)
  1. signalwire_agents/__init__.py +99 -15
  2. signalwire_agents/agent_server.py +248 -60
  3. signalwire_agents/agents/bedrock.py +296 -0
  4. signalwire_agents/cli/__init__.py +9 -0
  5. signalwire_agents/cli/build_search.py +951 -41
  6. signalwire_agents/cli/config.py +80 -0
  7. signalwire_agents/cli/core/__init__.py +10 -0
  8. signalwire_agents/cli/core/agent_loader.py +470 -0
  9. signalwire_agents/cli/core/argparse_helpers.py +179 -0
  10. signalwire_agents/cli/core/dynamic_config.py +71 -0
  11. signalwire_agents/cli/core/service_loader.py +303 -0
  12. signalwire_agents/cli/dokku.py +2320 -0
  13. signalwire_agents/cli/execution/__init__.py +10 -0
  14. signalwire_agents/cli/execution/datamap_exec.py +446 -0
  15. signalwire_agents/cli/execution/webhook_exec.py +134 -0
  16. signalwire_agents/cli/init_project.py +2636 -0
  17. signalwire_agents/cli/output/__init__.py +10 -0
  18. signalwire_agents/cli/output/output_formatter.py +255 -0
  19. signalwire_agents/cli/output/swml_dump.py +186 -0
  20. signalwire_agents/cli/simulation/__init__.py +10 -0
  21. signalwire_agents/cli/simulation/data_generation.py +374 -0
  22. signalwire_agents/cli/simulation/data_overrides.py +200 -0
  23. signalwire_agents/cli/simulation/mock_env.py +282 -0
  24. signalwire_agents/cli/swaig_test_wrapper.py +52 -0
  25. signalwire_agents/cli/test_swaig.py +566 -2366
  26. signalwire_agents/cli/types.py +81 -0
  27. signalwire_agents/core/__init__.py +2 -2
  28. signalwire_agents/core/agent/__init__.py +12 -0
  29. signalwire_agents/core/agent/config/__init__.py +12 -0
  30. signalwire_agents/core/agent/deployment/__init__.py +9 -0
  31. signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
  32. signalwire_agents/core/agent/prompt/__init__.py +14 -0
  33. signalwire_agents/core/agent/prompt/manager.py +306 -0
  34. signalwire_agents/core/agent/routing/__init__.py +9 -0
  35. signalwire_agents/core/agent/security/__init__.py +9 -0
  36. signalwire_agents/core/agent/swml/__init__.py +9 -0
  37. signalwire_agents/core/agent/tools/__init__.py +15 -0
  38. signalwire_agents/core/agent/tools/decorator.py +97 -0
  39. signalwire_agents/core/agent/tools/registry.py +210 -0
  40. signalwire_agents/core/agent_base.py +845 -2916
  41. signalwire_agents/core/auth_handler.py +233 -0
  42. signalwire_agents/core/config_loader.py +259 -0
  43. signalwire_agents/core/contexts.py +418 -0
  44. signalwire_agents/core/data_map.py +3 -15
  45. signalwire_agents/core/function_result.py +116 -44
  46. signalwire_agents/core/logging_config.py +162 -18
  47. signalwire_agents/core/mixins/__init__.py +28 -0
  48. signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
  49. signalwire_agents/core/mixins/auth_mixin.py +280 -0
  50. signalwire_agents/core/mixins/prompt_mixin.py +358 -0
  51. signalwire_agents/core/mixins/serverless_mixin.py +460 -0
  52. signalwire_agents/core/mixins/skill_mixin.py +55 -0
  53. signalwire_agents/core/mixins/state_mixin.py +153 -0
  54. signalwire_agents/core/mixins/tool_mixin.py +230 -0
  55. signalwire_agents/core/mixins/web_mixin.py +1142 -0
  56. signalwire_agents/core/security_config.py +333 -0
  57. signalwire_agents/core/skill_base.py +84 -1
  58. signalwire_agents/core/skill_manager.py +62 -20
  59. signalwire_agents/core/swaig_function.py +18 -5
  60. signalwire_agents/core/swml_builder.py +207 -11
  61. signalwire_agents/core/swml_handler.py +27 -21
  62. signalwire_agents/core/swml_renderer.py +123 -312
  63. signalwire_agents/core/swml_service.py +171 -203
  64. signalwire_agents/mcp_gateway/__init__.py +29 -0
  65. signalwire_agents/mcp_gateway/gateway_service.py +564 -0
  66. signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
  67. signalwire_agents/mcp_gateway/session_manager.py +218 -0
  68. signalwire_agents/prefabs/concierge.py +0 -3
  69. signalwire_agents/prefabs/faq_bot.py +0 -3
  70. signalwire_agents/prefabs/info_gatherer.py +0 -3
  71. signalwire_agents/prefabs/receptionist.py +0 -3
  72. signalwire_agents/prefabs/survey.py +0 -3
  73. signalwire_agents/schema.json +9218 -5489
  74. signalwire_agents/search/__init__.py +7 -1
  75. signalwire_agents/search/document_processor.py +490 -31
  76. signalwire_agents/search/index_builder.py +307 -37
  77. signalwire_agents/search/migration.py +418 -0
  78. signalwire_agents/search/models.py +30 -0
  79. signalwire_agents/search/pgvector_backend.py +748 -0
  80. signalwire_agents/search/query_processor.py +162 -31
  81. signalwire_agents/search/search_engine.py +916 -35
  82. signalwire_agents/search/search_service.py +376 -53
  83. signalwire_agents/skills/README.md +452 -0
  84. signalwire_agents/skills/__init__.py +14 -2
  85. signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
  86. signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
  87. signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
  88. signalwire_agents/skills/datasphere/README.md +210 -0
  89. signalwire_agents/skills/datasphere/skill.py +84 -3
  90. signalwire_agents/skills/datasphere_serverless/README.md +258 -0
  91. signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
  92. signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
  93. signalwire_agents/skills/datetime/README.md +132 -0
  94. signalwire_agents/skills/datetime/__init__.py +9 -0
  95. signalwire_agents/skills/datetime/skill.py +20 -7
  96. signalwire_agents/skills/joke/README.md +149 -0
  97. signalwire_agents/skills/joke/__init__.py +9 -0
  98. signalwire_agents/skills/joke/skill.py +21 -0
  99. signalwire_agents/skills/math/README.md +161 -0
  100. signalwire_agents/skills/math/__init__.py +9 -0
  101. signalwire_agents/skills/math/skill.py +18 -4
  102. signalwire_agents/skills/mcp_gateway/README.md +230 -0
  103. signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
  104. signalwire_agents/skills/mcp_gateway/skill.py +421 -0
  105. signalwire_agents/skills/native_vector_search/README.md +210 -0
  106. signalwire_agents/skills/native_vector_search/__init__.py +9 -0
  107. signalwire_agents/skills/native_vector_search/skill.py +569 -101
  108. signalwire_agents/skills/play_background_file/README.md +218 -0
  109. signalwire_agents/skills/play_background_file/__init__.py +12 -0
  110. signalwire_agents/skills/play_background_file/skill.py +242 -0
  111. signalwire_agents/skills/registry.py +395 -40
  112. signalwire_agents/skills/spider/README.md +236 -0
  113. signalwire_agents/skills/spider/__init__.py +13 -0
  114. signalwire_agents/skills/spider/skill.py +598 -0
  115. signalwire_agents/skills/swml_transfer/README.md +395 -0
  116. signalwire_agents/skills/swml_transfer/__init__.py +10 -0
  117. signalwire_agents/skills/swml_transfer/skill.py +359 -0
  118. signalwire_agents/skills/weather_api/README.md +178 -0
  119. signalwire_agents/skills/weather_api/__init__.py +12 -0
  120. signalwire_agents/skills/weather_api/skill.py +191 -0
  121. signalwire_agents/skills/web_search/README.md +163 -0
  122. signalwire_agents/skills/web_search/__init__.py +9 -0
  123. signalwire_agents/skills/web_search/skill.py +586 -112
  124. signalwire_agents/skills/wikipedia_search/README.md +228 -0
  125. signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
  126. signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
  127. signalwire_agents/web/__init__.py +17 -0
  128. signalwire_agents/web/web_service.py +559 -0
  129. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
  130. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
  131. signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
  132. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
  133. signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
  134. signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
  135. signalwire_agents/core/state/file_state_manager.py +0 -219
  136. signalwire_agents/core/state/state_manager.py +0 -101
  137. signalwire_agents/skills/wikipedia/__init__.py +0 -9
  138. signalwire_agents-0.1.13.data/data/schema.json +0 -5611
  139. signalwire_agents-0.1.13.dist-info/RECORD +0 -67
  140. signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
  141. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
  142. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
  143. {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
signalwire_agents/search/pgvector_backend.py
@@ -0,0 +1,748 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+import json
+import logging
+from typing import List, Dict, Any, Optional
+from datetime import datetime
+
+try:
+    import psycopg2
+    from psycopg2.extras import execute_values
+    from pgvector.psycopg2 import register_vector
+    PGVECTOR_AVAILABLE = True
+except ImportError:
+    PGVECTOR_AVAILABLE = False
+    psycopg2 = None
+    register_vector = None
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+logger = logging.getLogger(__name__)
+
+
+class PgVectorBackend:
+    """PostgreSQL pgvector backend for search indexing and retrieval"""
+
+    def __init__(self, connection_string: str):
+        """
+        Initialize pgvector backend
+
+        Args:
+            connection_string: PostgreSQL connection string
+        """
+        if not PGVECTOR_AVAILABLE:
+            raise ImportError(
+                "pgvector dependencies not available. Install with: "
+                "pip install psycopg2-binary pgvector"
+            )
+
+        self.connection_string = connection_string
+        self.conn = None
+        self._connect()
+
+    def _connect(self):
+        """Establish database connection"""
+        try:
+            self.conn = psycopg2.connect(self.connection_string)
+            register_vector(self.conn)
+            logger.info("Connected to PostgreSQL database")
+        except Exception as e:
+            error_msg = str(e)
+            if "vector type not found" in error_msg:
+                logger.error(
+                    "pgvector extension not installed in database. "
+                    "Run: CREATE EXTENSION IF NOT EXISTS vector;"
+                )
+            else:
+                logger.error(f"Failed to connect to database: {e}")
+            raise
+
+    def _ensure_connection(self):
+        """Ensure database connection is active"""
+        if self.conn is None or self.conn.closed:
+            self._connect()
+
+    def create_schema(self, collection_name: str, embedding_dim: int = 768):
+        """
+        Create database schema for a collection
+
+        Args:
+            collection_name: Name of the collection
+            embedding_dim: Dimension of embeddings
+        """
+        self._ensure_connection()
+
+        with self.conn.cursor() as cursor:
+            # Create extensions
+            cursor.execute("CREATE EXTENSION IF NOT EXISTS vector")
+            cursor.execute("CREATE EXTENSION IF NOT EXISTS pg_trgm")
+
+            # Create table
+            table_name = f"chunks_{collection_name}"
+            cursor.execute(f"""
+                CREATE TABLE IF NOT EXISTS {table_name} (
+                    id SERIAL PRIMARY KEY,
+                    content TEXT NOT NULL,
+                    processed_content TEXT,
+                    embedding vector({embedding_dim}),
+                    filename TEXT,
+                    section TEXT,
+                    tags JSONB DEFAULT '[]'::jsonb,
+                    metadata JSONB DEFAULT '{{}}'::jsonb,
+                    metadata_text TEXT, -- Searchable text representation of all metadata
+                    created_at TIMESTAMP DEFAULT NOW()
+                )
+            """)
+
+            # Create indexes
+            cursor.execute(f"""
+                CREATE INDEX IF NOT EXISTS idx_{table_name}_embedding
+                ON {table_name} USING ivfflat (embedding vector_cosine_ops)
+                WITH (lists = 100)
+            """)
+
+            cursor.execute(f"""
+                CREATE INDEX IF NOT EXISTS idx_{table_name}_content
+                ON {table_name} USING gin (content gin_trgm_ops)
+            """)
+
+            cursor.execute(f"""
+                CREATE INDEX IF NOT EXISTS idx_{table_name}_tags
+                ON {table_name} USING gin (tags)
+            """)
+
+            cursor.execute(f"""
+                CREATE INDEX IF NOT EXISTS idx_{table_name}_metadata
+                ON {table_name} USING gin (metadata)
+            """)
+
+            cursor.execute(f"""
+                CREATE INDEX IF NOT EXISTS idx_{table_name}_metadata_text
+                ON {table_name} USING gin (metadata_text gin_trgm_ops)
+            """)
+
+            # Create config table
+            cursor.execute("""
+                CREATE TABLE IF NOT EXISTS collection_config (
+                    collection_name TEXT PRIMARY KEY,
+                    model_name TEXT,
+                    embedding_dimensions INTEGER,
+                    chunking_strategy TEXT,
+                    languages JSONB,
+                    created_at TIMESTAMP DEFAULT NOW(),
+                    metadata JSONB DEFAULT '{}'::jsonb
+                )
+            """)
+
+        self.conn.commit()
+        logger.info(f"Created schema for collection '{collection_name}'")
+
+    def _extract_metadata_from_json_content(self, content: str) -> Dict[str, Any]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            metadata_dict
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        return metadata_dict
+
+    def store_chunks(self, chunks: List[Dict[str, Any]], collection_name: str,
+                     config: Dict[str, Any]):
+        """
+        Store document chunks in the database
+
+        Args:
+            chunks: List of processed chunks with embeddings
+            collection_name: Name of the collection
+            config: Configuration metadata
+        """
+        self._ensure_connection()
+
+        table_name = f"chunks_{collection_name}"
+
+        # Prepare data for batch insert
+        data = []
+        for chunk in chunks:
+            embedding = chunk.get('embedding')
+            if embedding is not None:
+                # Convert to list if it's a numpy array
+                if hasattr(embedding, 'tolist'):
+                    embedding = embedding.tolist()
+
+            metadata = chunk.get('metadata', {})
+
+            # Extract fields - they might be at top level or in metadata
+            filename = chunk.get('filename') or metadata.get('filename', '')
+            section = chunk.get('section') or metadata.get('section', '')
+            tags = chunk.get('tags', []) or metadata.get('tags', [])
+
+            # Extract metadata from JSON content and merge with chunk metadata
+            json_metadata = self._extract_metadata_from_json_content(chunk['content'])
+
+            # Build metadata from all fields except the ones we store separately
+            chunk_metadata = {}
+            for key, value in chunk.items():
+                if key not in ['content', 'processed_content', 'embedding', 'filename', 'section', 'tags']:
+                    chunk_metadata[key] = value
+            # Also include any extra metadata
+            for key, value in metadata.items():
+                if key not in ['filename', 'section', 'tags']:
+                    chunk_metadata[key] = value
+
+            # Merge metadata: chunk metadata takes precedence over JSON metadata
+            merged_metadata = {**json_metadata, **chunk_metadata}
+
+            # Create searchable metadata text
+            metadata_text_parts = []
+
+            # Add all metadata keys and values
+            for key, value in merged_metadata.items():
+                metadata_text_parts.append(str(key).lower())
+                if isinstance(value, list):
+                    metadata_text_parts.extend(str(v).lower() for v in value)
+                else:
+                    metadata_text_parts.append(str(value).lower())
+
+            # Add tags
+            if tags:
+                metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+            # Add section if present
+            if section:
+                metadata_text_parts.append(section.lower())
+
+            metadata_text = ' '.join(metadata_text_parts)
+
+            data.append((
+                chunk['content'],
+                chunk.get('processed_content', chunk['content']),
+                embedding,
+                filename,
+                section,
+                json.dumps(tags),
+                json.dumps(merged_metadata),
+                metadata_text
+            ))
+
+        # Batch insert chunks
+        with self.conn.cursor() as cursor:
+            execute_values(
+                cursor,
+                f"""
+                INSERT INTO {table_name}
+                (content, processed_content, embedding, filename, section, tags, metadata, metadata_text)
+                VALUES %s
+                """,
+                data,
+                template="(%s, %s, %s, %s, %s, %s::jsonb, %s::jsonb, %s)"
+            )
+
+            # Update or insert config
+            cursor.execute("""
+                INSERT INTO collection_config
+                (collection_name, model_name, embedding_dimensions, chunking_strategy,
+                 languages, metadata)
+                VALUES (%s, %s, %s, %s, %s, %s)
+                ON CONFLICT (collection_name)
+                DO UPDATE SET
+                    model_name = EXCLUDED.model_name,
+                    embedding_dimensions = EXCLUDED.embedding_dimensions,
+                    chunking_strategy = EXCLUDED.chunking_strategy,
+                    languages = EXCLUDED.languages,
+                    metadata = EXCLUDED.metadata
+            """, (
+                collection_name,
+                config.get('model_name'),
+                config.get('embedding_dimensions'),
+                config.get('chunking_strategy'),
+                json.dumps(config.get('languages', [])),
+                json.dumps(config.get('metadata', {}))
+            ))
+
+        self.conn.commit()
+        logger.info(f"Stored {len(chunks)} chunks in collection '{collection_name}'")
+
+    def get_stats(self, collection_name: str) -> Dict[str, Any]:
+        """Get statistics for a collection"""
+        self._ensure_connection()
+
+        table_name = f"chunks_{collection_name}"
+
+        with self.conn.cursor() as cursor:
+            # Get chunk count
+            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
+            total_chunks = cursor.fetchone()[0]
+
+            # Get unique files
+            cursor.execute(f"SELECT COUNT(DISTINCT filename) FROM {table_name}")
+            total_files = cursor.fetchone()[0]
+
+            # Get config
+            cursor.execute(
+                "SELECT * FROM collection_config WHERE collection_name = %s",
+                (collection_name,)
+            )
+            config_row = cursor.fetchone()
+
+            if config_row:
+                config = {
+                    'model_name': config_row[1],
+                    'embedding_dimensions': config_row[2],
+                    'chunking_strategy': config_row[3],
+                    'languages': config_row[4],
+                    'created_at': config_row[5].isoformat() if config_row[5] else None,
+                    'metadata': config_row[6]
+                }
+            else:
+                config = {}
+
+        return {
+            'total_chunks': total_chunks,
+            'total_files': total_files,
+            'config': config
+        }
+
+    def list_collections(self) -> List[str]:
+        """List all collections in the database"""
+        self._ensure_connection()
+
+        with self.conn.cursor() as cursor:
+            cursor.execute("SELECT collection_name FROM collection_config ORDER BY collection_name")
+            return [row[0] for row in cursor.fetchall()]
+
+    def delete_collection(self, collection_name: str):
+        """Delete a collection and its data"""
+        self._ensure_connection()
+
+        table_name = f"chunks_{collection_name}"
+
+        with self.conn.cursor() as cursor:
+            cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
+            cursor.execute(
+                "DELETE FROM collection_config WHERE collection_name = %s",
+                (collection_name,)
+            )
+        self.conn.commit()
+        logger.info(f"Deleted collection '{collection_name}'")
+
+    def close(self):
+        """Close database connection"""
+        if self.conn and not self.conn.closed:
+            self.conn.close()
+            logger.info("Closed database connection")
+
+
+class PgVectorSearchBackend:
+    """PostgreSQL pgvector backend for search operations"""
+
+    def __init__(self, connection_string: str, collection_name: str):
+        """
+        Initialize search backend
+
+        Args:
+            connection_string: PostgreSQL connection string
+            collection_name: Name of the collection to search
+        """
+        if not PGVECTOR_AVAILABLE:
+            raise ImportError(
+                "pgvector dependencies not available. Install with: "
+                "pip install psycopg2-binary pgvector"
+            )
+
+        self.connection_string = connection_string
+        self.collection_name = collection_name
+        self.table_name = f"chunks_{collection_name}"
+        self.conn = None
+        self._connect()
+        self.config = self._load_config()
+
+    def _connect(self):
+        """Establish database connection"""
+        try:
+            self.conn = psycopg2.connect(self.connection_string)
+            register_vector(self.conn)
+        except Exception as e:
+            logger.error(f"Failed to connect to database: {e}")
+            raise
+
+    def _ensure_connection(self):
+        """Ensure database connection is active"""
+        if self.conn is None or self.conn.closed:
+            self._connect()
+
+    def _load_config(self) -> Dict[str, Any]:
+        """Load collection configuration"""
+        self._ensure_connection()
+
+        with self.conn.cursor() as cursor:
+            cursor.execute(
+                "SELECT * FROM collection_config WHERE collection_name = %s",
+                (self.collection_name,)
+            )
+            row = cursor.fetchone()
+
+            if row:
+                return {
+                    'model_name': row[1],
+                    'embedding_dimensions': row[2],
+                    'chunking_strategy': row[3],
+                    'languages': row[4],
+                    'metadata': row[6]
+                }
+        return {}
+
+    def search(self, query_vector: List[float], enhanced_text: str,
+               count: int = 5, similarity_threshold: float = 0.0,
+               tags: Optional[List[str]] = None,
+               keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
+        """
+        Perform hybrid search (vector + keyword + metadata)
+
+        Args:
+            query_vector: Embedding vector for the query
+            enhanced_text: Processed query text for keyword search
+            count: Number of results to return
+            similarity_threshold: Minimum similarity score
+            tags: Filter by tags
+            keyword_weight: Manual keyword weight (0.0-1.0). If None, uses default weighting
+
+        Returns:
+            List of search results with scores and metadata
+        """
+        self._ensure_connection()
+
+        # Extract query terms for metadata search
+        query_terms = enhanced_text.lower().split()
+
+        # Vector search
+        vector_results = self._vector_search(query_vector, count * 2, tags)
+
+        # Apply similarity threshold to raw vector scores BEFORE weighting
+        # This ensures threshold behaves intuitively (filters on actual similarity, not weighted score)
+        if similarity_threshold > 0:
+            vector_results = [r for r in vector_results if r['score'] >= similarity_threshold]
+
+        # Keyword search
+        keyword_results = self._keyword_search(enhanced_text, count * 2, tags)
+
+        # Metadata search
+        metadata_results = self._metadata_search(query_terms, count * 2, tags)
+
+        # Merge all results (threshold already applied to vector results)
+        merged_results = self._merge_all_results(vector_results, keyword_results, metadata_results, keyword_weight)
+
+        # Ensure 'score' field exists for CLI compatibility
+        for r in merged_results:
+            if 'score' not in r:
+                r['score'] = r.get('final_score', 0.0)
+
+        return merged_results[:count]
+
+    def _vector_search(self, query_vector: List[float], count: int,
+                       tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """Perform vector similarity search"""
+        with self.conn.cursor() as cursor:
+            # Set ef_search for HNSW index to ensure we get enough results
+            # ef_search must be at least as large as the LIMIT
+            cursor.execute(f"SET LOCAL hnsw.ef_search = {max(count, 40)}")
+            # Build query
+            query = f"""
+                SELECT id, content, filename, section, tags, metadata,
+                       1 - (embedding <=> %s::vector) as similarity
+                FROM {self.table_name}
+                WHERE embedding IS NOT NULL
+            """
+
+            params = [query_vector]
+
+            # Add tag filter if specified
+            if tags:
+                query += " AND tags ?| %s"
+                params.append(tags)
+
+            query += " ORDER BY embedding <=> %s::vector LIMIT %s"
+            params.extend([query_vector, count])
+
+            cursor.execute(query, params)
+
+            results = []
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json, similarity = row
+
+                results.append({
+                    'id': chunk_id,
+                    'content': content,
+                    'score': float(similarity),
+                    'metadata': {
+                        'filename': filename,
+                        'section': section,
+                        'tags': tags_json if isinstance(tags_json, list) else [],
+                        **metadata_json
+                    },
+                    'search_type': 'vector'
+                })
+
+            return results
+
+    def _keyword_search(self, enhanced_text: str, count: int,
+                        tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """Perform full-text search"""
+        with self.conn.cursor() as cursor:
+            # Use PostgreSQL text search
+            query = f"""
+                SELECT id, content, filename, section, tags, metadata,
+                       ts_rank(to_tsvector('english', content),
+                               plainto_tsquery('english', %s)) as rank
+                FROM {self.table_name}
+                WHERE to_tsvector('english', content) @@ plainto_tsquery('english', %s)
+            """
+
+            params = [enhanced_text, enhanced_text]
+
+            # Add tag filter if specified
+            if tags:
+                query += " AND tags ?| %s"
+                params.append(tags)
+
+            query += " ORDER BY rank DESC LIMIT %s"
+            params.append(count)
+
+            cursor.execute(query, params)
+
+            results = []
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json, rank = row
+
+                # Normalize rank to 0-1 score
+                score = min(1.0, rank / 10.0)
+
+                results.append({
+                    'id': chunk_id,
+                    'content': content,
+                    'score': float(score),
+                    'metadata': {
+                        'filename': filename,
+                        'section': section,
+                        'tags': tags_json if isinstance(tags_json, list) else [],
+                        **metadata_json
+                    },
+                    'search_type': 'keyword'
+                })
+
+            return results
+
+    def _metadata_search(self, query_terms: List[str], count: int,
+                         tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """
+        Perform metadata search using JSONB operators and metadata_text
+        """
+        with self.conn.cursor() as cursor:
+            # Build WHERE conditions
+            where_conditions = []
+            params = []
+
+            # Use metadata_text for trigram search
+            if query_terms:
+                # Create AND conditions for all terms
+                for term in query_terms:
+                    where_conditions.append(f"metadata_text ILIKE %s")
+                    params.append(f'%{term}%')
+
+            # Add tag filter if specified
+            if tags:
+                where_conditions.append("tags ?| %s")
+                params.append(tags)
+
+            # Build query
+            where_clause = " AND ".join(where_conditions) if where_conditions else "1=1"
+
+            query = f"""
+                SELECT id, content, filename, section, tags, metadata,
+                       metadata_text
+                FROM {self.table_name}
+                WHERE {where_clause}
+                LIMIT %s
+            """
+
+            params.append(count)
+
+            cursor.execute(query, params)
+
+            results = []
+            for row in cursor.fetchall():
+                chunk_id, content, filename, section, tags_json, metadata_json, metadata_text = row
+
+                # Calculate score based on term matches
+                score = 0.0
+                if metadata_text:
+                    metadata_lower = metadata_text.lower()
+                    for term in query_terms:
+                        if term.lower() in metadata_lower:
+                            score += 0.3  # Base score for each match
+
+                # Bonus for exact matches in JSONB keys/values
+                if metadata_json:
+                    json_str = json.dumps(metadata_json).lower()
+                    for term in query_terms:
+                        if term.lower() in json_str:
+                            score += 0.2
+
+                # Normalize score
+                score = min(1.0, score)
+
+                results.append({
+                    'id': chunk_id,
+                    'content': content,
+                    'score': float(score),
+                    'metadata': {
+                        'filename': filename,
+                        'section': section,
+                        'tags': tags_json if isinstance(tags_json, list) else [],
+                        **metadata_json
+                    },
+                    'search_type': 'metadata'
+                })
+
+            # Sort by score
+            results.sort(key=lambda x: x['score'], reverse=True)
+            return results[:count]
+
+    def _merge_results(self, vector_results: List[Dict[str, Any]],
+                       keyword_results: List[Dict[str, Any]],
+                       keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
+        """Merge and rank results from vector and keyword search"""
+        # Use provided weights or defaults
+        if keyword_weight is None:
+            keyword_weight = 0.3
+        vector_weight = 1.0 - keyword_weight
+
+        # Create a map to track unique results
+        results_map = {}
+
+        # Add vector results
+        for result in vector_results:
+            chunk_id = result['id']
+            if chunk_id not in results_map:
+                results_map[chunk_id] = result
+                results_map[chunk_id]['score'] *= vector_weight
+            else:
+                # Combine scores if result appears in both
+                results_map[chunk_id]['score'] += result['score'] * vector_weight
+
+        # Add keyword results
+        for result in keyword_results:
+            chunk_id = result['id']
+            if chunk_id not in results_map:
+                results_map[chunk_id] = result
+                results_map[chunk_id]['score'] *= keyword_weight
+            else:
+                # Combine scores if result appears in both
+                results_map[chunk_id]['score'] += result['score'] * keyword_weight
+
+        # Sort by combined score
+        merged = list(results_map.values())
+        merged.sort(key=lambda x: x['score'], reverse=True)
+
+        return merged
+
+    def _merge_all_results(self, vector_results: List[Dict[str, Any]],
+                           keyword_results: List[Dict[str, Any]],
+                           metadata_results: List[Dict[str, Any]],
+                           keyword_weight: Optional[float] = None) -> List[Dict[str, Any]]:
+        """Merge and rank results from vector, keyword, and metadata search"""
+        # Use provided weights or defaults
+        if keyword_weight is None:
+            keyword_weight = 0.3
+        vector_weight = 0.5
+        metadata_weight = 0.2
+
+        # Create a map to track unique results
+        results_map = {}
+        all_sources = {}
+
+        # Add vector results
+        for result in vector_results:
+            chunk_id = result['id']
+            if chunk_id not in results_map:
+                results_map[chunk_id] = result.copy()
+                results_map[chunk_id]['score'] = result['score'] * vector_weight
+                all_sources[chunk_id] = {'vector': result['score']}
+            else:
+                results_map[chunk_id]['score'] += result['score'] * vector_weight
+                all_sources[chunk_id]['vector'] = result['score']
+
+        # Add keyword results
+        for result in keyword_results:
+            chunk_id = result['id']
+            if chunk_id not in results_map:
+                results_map[chunk_id] = result.copy()
+                results_map[chunk_id]['score'] = result['score'] * keyword_weight
+                all_sources.setdefault(chunk_id, {})['keyword'] = result['score']
+            else:
+                results_map[chunk_id]['score'] += result['score'] * keyword_weight
+                all_sources[chunk_id]['keyword'] = result['score']
+
+        # Add metadata results
+        for result in metadata_results:
+            chunk_id = result['id']
+            if chunk_id not in results_map:
+                results_map[chunk_id] = result.copy()
+                results_map[chunk_id]['score'] = result['score'] * metadata_weight
+                all_sources.setdefault(chunk_id, {})['metadata'] = result['score']
+            else:
+                results_map[chunk_id]['score'] += result['score'] * metadata_weight
+                all_sources[chunk_id]['metadata'] = result['score']
+
+        # Add sources to results for transparency
+        for chunk_id, result in results_map.items():
+            result['sources'] = all_sources.get(chunk_id, {})
+            result['final_score'] = result['score']
+
+        # Sort by combined score
+        merged = list(results_map.values())
+        merged.sort(key=lambda x: x['score'], reverse=True)
+
+        return merged
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get statistics for the collection"""
+        backend = PgVectorBackend(self.connection_string)
+        stats = backend.get_stats(self.collection_name)
+        backend.close()
+        return stats
+
+    def close(self):
+        """Close database connection"""
+        if self.conn and not self.conn.closed:
+            self.conn.close()
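
For orientation, here is a minimal sketch of how the two classes above fit together, based only on the signatures shown in this diff. The DSN, collection name, chunk contents, and the zeroed 768-dimension vectors are illustrative placeholders: a real pipeline would supply embeddings from the configured model, and a reachable PostgreSQL server with the pgvector extension is assumed.

    # Minimal usage sketch (illustrative only; not from the package docs).
    from signalwire_agents.search.pgvector_backend import (
        PgVectorBackend,
        PgVectorSearchBackend,
    )

    dsn = "postgresql://user:secret@localhost:5432/search"  # placeholder DSN

    # Indexing side: create the collection schema and store one chunk.
    indexer = PgVectorBackend(dsn)
    indexer.create_schema("docs", embedding_dim=768)
    indexer.store_chunks(
        chunks=[{
            "content": "SignalWire agents can expose SWAIG functions.",
            "embedding": [0.0] * 768,  # stand-in for a real model embedding
            "filename": "agents.md",
            "section": "functions",
            "tags": ["swaig"],
        }],
        collection_name="docs",
        config={"model_name": "example-model", "embedding_dimensions": 768},
    )
    indexer.close()

    # Query side: hybrid vector + keyword + metadata search over the collection.
    searcher = PgVectorSearchBackend(dsn, collection_name="docs")
    results = searcher.search(
        query_vector=[0.0] * 768,  # embed the query with the same model
        enhanced_text="swaig functions",
        count=3,
    )
    for r in results:
        # Each result carries the blended score plus per-pass 'sources' scores.
        print(r["score"], r["metadata"]["filename"], r["sources"])
    searcher.close()

Note the defaults in _merge_all_results: with keyword_weight left as None, a chunk found by all three passes scores 0.5 × vector + 0.3 × keyword + 0.2 × metadata, and similarity_threshold is applied to the raw vector similarity before these weights, so it filters on actual cosine similarity rather than the blended score.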