signalwire-agents 0.1.6__py3-none-any.whl → 1.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +130 -4
- signalwire_agents/agent_server.py +438 -32
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +18 -0
- signalwire_agents/cli/build_search.py +1367 -0
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +1225 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +809 -0
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +959 -2166
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +707 -0
- signalwire_agents/core/data_map.py +487 -0
- signalwire_agents/core/function_result.py +1150 -1
- signalwire_agents/core/logging_config.py +376 -0
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +287 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +368 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1134 -0
- signalwire_agents/core/security/session_manager.py +174 -86
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +200 -0
- signalwire_agents/core/skill_manager.py +244 -0
- signalwire_agents/core/swaig_function.py +33 -9
- signalwire_agents/core/swml_builder.py +212 -12
- signalwire_agents/core/swml_handler.py +43 -13
- signalwire_agents/core/swml_renderer.py +123 -297
- signalwire_agents/core/swml_service.py +277 -260
- signalwire_agents/prefabs/concierge.py +6 -2
- signalwire_agents/prefabs/info_gatherer.py +149 -33
- signalwire_agents/prefabs/receptionist.py +14 -22
- signalwire_agents/prefabs/survey.py +6 -2
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +137 -0
- signalwire_agents/search/document_processor.py +1223 -0
- signalwire_agents/search/index_builder.py +804 -0
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +752 -0
- signalwire_agents/search/query_processor.py +502 -0
- signalwire_agents/search/search_engine.py +1264 -0
- signalwire_agents/search/search_service.py +574 -0
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +23 -0
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/__init__.py +12 -0
- signalwire_agents/skills/datasphere/skill.py +310 -0
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +10 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +237 -0
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +10 -0
- signalwire_agents/skills/datetime/skill.py +126 -0
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +10 -0
- signalwire_agents/skills/joke/skill.py +109 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +10 -0
- signalwire_agents/skills/math/skill.py +105 -0
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +10 -0
- signalwire_agents/skills/native_vector_search/skill.py +820 -0
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +459 -0
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +10 -0
- signalwire_agents/skills/web_search/skill.py +739 -0
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/wikipedia_search/skill.py +210 -0
- signalwire_agents/utils/__init__.py +14 -0
- signalwire_agents/utils/schema_utils.py +111 -44
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-agent-init.1 +307 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.7.data/data/share/man/man1/swaig-test.1 +308 -0
- signalwire_agents-1.0.7.dist-info/METADATA +992 -0
- signalwire_agents-1.0.7.dist-info/RECORD +142 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/WHEEL +1 -1
- signalwire_agents-1.0.7.dist-info/entry_points.txt +4 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents-0.1.6.data/data/schema.json +0 -5611
- signalwire_agents-0.1.6.dist-info/METADATA +0 -199
- signalwire_agents-0.1.6.dist-info/RECORD +0 -34
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.6.dist-info → signalwire_agents-1.0.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,804 @@
+"""
+Copyright (c) 2025 SignalWire
+
+This file is part of the SignalWire AI Agents SDK.
+
+Licensed under the MIT License.
+See LICENSE file in the project root for full license information.
+"""
+
+import os
+import sqlite3
+import json
+import hashlib
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+import fnmatch
+
+try:
+    import numpy as np
+except ImportError:
+    np = None
+
+try:
+    from sentence_transformers import SentenceTransformer
+except ImportError:
+    SentenceTransformer = None
+
+from .document_processor import DocumentProcessor
+from .query_processor import preprocess_document_content
+
+logger = logging.getLogger(__name__)
+
+class IndexBuilder:
+    """Build searchable indexes from document directories"""
+
+    def __init__(
+        self,
+        model_name: str = 'sentence-transformers/all-mpnet-base-v2',
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3,
+        backend: str = 'sqlite',
+        connection_string: Optional[str] = None
+    ):
+        """
+        Initialize the index builder
+
+        Args:
+            model_name: Name of the sentence transformer model to use
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
+            chunk_size: For sliding strategy - words per chunk (default: 50)
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
+            split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
+            backend: Storage backend ('sqlite' or 'pgvector') (default: 'sqlite')
+            connection_string: PostgreSQL connection string for pgvector backend
+        """
+        self.model_name = model_name
+        self.chunking_strategy = chunking_strategy
+        self.max_sentences_per_chunk = max_sentences_per_chunk
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.split_newlines = split_newlines
+        self.index_nlp_backend = index_nlp_backend
+        self.verbose = verbose
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
+        self.backend = backend
+        self.connection_string = connection_string
+        self.model = None
+
+        # Validate backend
+        if self.backend not in ['sqlite', 'pgvector']:
+            raise ValueError(f"Invalid backend '{self.backend}'. Must be 'sqlite' or 'pgvector'")
+
+        # Validate NLP backend
+        if self.index_nlp_backend not in ['nltk', 'spacy']:
+            logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
+            self.index_nlp_backend = 'nltk'
+
+        self.doc_processor = DocumentProcessor(
+            chunking_strategy=chunking_strategy,
+            max_sentences_per_chunk=max_sentences_per_chunk,
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            split_newlines=split_newlines,
+            index_nlp_backend=self.index_nlp_backend,
+            verbose=self.verbose,
+            semantic_threshold=self.semantic_threshold,
+            topic_threshold=self.topic_threshold
+        )
+
+    def _extract_metadata_from_json_content(self, content: str) -> tuple[Dict[str, Any], str]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            (metadata_dict, metadata_text)
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                # Look for metadata object in content
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        # Create searchable text from all metadata keys and values
+        metadata_text_parts = []
+        for key, value in metadata_dict.items():
+            # Add key
+            metadata_text_parts.append(str(key))
+            # Add value(s)
+            if isinstance(value, list):
+                metadata_text_parts.extend(str(v) for v in value)
+            else:
+                metadata_text_parts.append(str(value))
+
+        metadata_text = ' '.join(metadata_text_parts).lower()
+
+        return metadata_dict, metadata_text
+
+    def _load_model(self):
+        """Load embedding model (lazy loading)"""
+        if self.model is None:
+            if not SentenceTransformer:
+                raise ImportError("sentence-transformers is required for embedding generation. Install with: pip install sentence-transformers")
+
+            if self.verbose:
+                print(f"Loading embedding model: {self.model_name}")
+
+            try:
+                self.model = SentenceTransformer(self.model_name)
+            except Exception as e:
+                logger.error(f"Failed to load model '{self.model_name}': {e}")
+                raise
+
+    def build_index_from_sources(self, sources: List[Path], output_file: str,
+                                 file_types: List[str], exclude_patterns: Optional[List[str]] = None,
+                                 languages: List[str] = None, tags: Optional[List[str]] = None,
+                                 overwrite: bool = False):
+        """
+        Build complete search index from multiple sources (files and directories)
+
+        Args:
+            sources: List of Path objects (files and/or directories)
+            output_file: Output .swsearch file path
+            file_types: List of file extensions to include for directories
+            exclude_patterns: Glob patterns to exclude
+            languages: List of languages to support
+            tags: Global tags to add to all chunks
+        """
+
+        # Discover files from all sources
+        files = self._discover_files_from_sources(sources, file_types, exclude_patterns)
+        if self.verbose:
+            print(f"Found {len(files)} files to process")
+
+        if not files:
+            print("No files found to process. Check your sources, file types and exclude patterns.")
+            return
+
+        # Process documents
+        chunks = []
+        print(f"Processing {len(files)} files...")
+        for file_path in files:
+            try:
+                # For individual files, use the file's parent as the base directory
+                # For files from directories, use the original source directory
+                base_dir = self._get_base_directory_for_file(file_path, sources)
+                file_chunks = self._process_file(file_path, base_dir, tags)
+                chunks.extend(file_chunks)
+                if self.verbose or file_path.suffix == '.json':
+                    print(f"  {file_path}: {len(file_chunks)} chunks")
+            except Exception as e:
+                logger.error(f"Error processing {file_path}: {e}")
+                if self.verbose:
+                    print(f"Error processing {file_path}: {e}")
+
+        if not chunks:
+            print("No chunks created from documents. Check file contents and processing.")
+            return
+
+        if self.verbose:
+            print(f"Created {len(chunks)} total chunks")
+
+        # Generate embeddings
+        self._load_model()
+        if self.verbose:
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+        else:
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+
+        for i, chunk in enumerate(chunks):
+            try:
+                # Preprocess content for better search
+                processed = preprocess_document_content(
+                    chunk['content'],
+                    language=chunk.get('language', 'en'),
+                    index_nlp_backend=self.index_nlp_backend
+                )
+
+                chunk['processed_content'] = processed['enhanced_text']
+
+                # Include tags in keywords for better search matching
+                keywords = processed.get('keywords', [])
+                chunk_tags = chunk.get('tags', [])
+                if chunk_tags:
+                    # Add tags to keywords list for FTS matching
+                    keywords.extend(chunk_tags)
+                    # Remove duplicates while preserving order
+                    keywords = list(dict.fromkeys(keywords))
+
+                chunk['keywords'] = keywords
+
+                # For embedding, include tags in the text for better semantic matching
+                embedding_text = processed['enhanced_text']
+                if chunk_tags:
+                    # Append tags to the text for embedding generation
+                    embedding_text += " " + " ".join(chunk_tags)
+
+                # Generate embedding (suppress progress bar)
+                embedding = self.model.encode(embedding_text, show_progress_bar=False)
+                chunk['embedding'] = embedding.tobytes()
+
+                # Show progress more frequently
+                show_every = 50 if len(chunks) > 500 else max(10, len(chunks) // 10)
+                if (i + 1) % show_every == 0 or (i + 1) == len(chunks):
+                    progress_pct = ((i + 1) / len(chunks)) * 100
+                    print(f"  Progress: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")
+
+            except Exception as e:
+                logger.error(f"Error processing chunk {i}: {e}")
+                # Use original content as fallback
+                chunk['processed_content'] = chunk['content']
+                chunk['keywords'] = []
+                # Create zero embedding as fallback
+                if np:
+                    embedding = np.zeros(768, dtype=np.float32)
+                    chunk['embedding'] = embedding.tobytes()
+                else:
+                    chunk['embedding'] = b''
+
+        # Store chunks based on backend
+        if self.backend == 'sqlite':
+            # Create SQLite database
+            sources_info = [str(s) for s in sources]
+            self._create_database(output_file, chunks, languages or ['en'], sources_info, file_types)
+
+            if self.verbose:
+                print(f"Index created: {output_file}")
+                print(f"Total chunks: {len(chunks)}")
+        else:
+            # Use pgvector backend
+            self._store_chunks_pgvector(chunks, output_file, languages or ['en'], overwrite)
+
+    def build_index(self, source_dir: str, output_file: str,
+                    file_types: List[str], exclude_patterns: Optional[List[str]] = None,
+                    languages: List[str] = None, tags: Optional[List[str]] = None):
+        """
+        Build complete search index from a single directory
+
+        Args:
+            source_dir: Directory to scan for documents
+            output_file: Output .swsearch file path
+            file_types: List of file extensions to include
+            exclude_patterns: Glob patterns to exclude
+            languages: List of languages to support
+            tags: Global tags to add to all chunks
+        """
+
+        # Convert to new multi-source method
+        sources = [Path(source_dir)]
+        self.build_index_from_sources(sources, output_file, file_types, exclude_patterns, languages, tags)
+
+    def _get_base_directory_for_file(self, file_path: Path, sources: List[Path]) -> str:
+        """
+        Determine the appropriate base directory for a file to calculate relative paths
+
+        Args:
+            file_path: The file being processed
+            sources: List of original source paths
+
+        Returns:
+            Base directory path as string
+        """
+
+        # Check if this file was specified directly as a source
+        if file_path in sources:
+            # For individual files, use the parent directory
+            return str(file_path.parent)
+
+        # Check if this file is within any of the source directories
+        for source in sources:
+            if source.is_dir():
+                try:
+                    # Check if file_path is relative to this source directory
+                    file_path.relative_to(source)
+                    return str(source)
+                except ValueError:
+                    # file_path is not relative to this source
+                    continue
+
+        # Fallback: use the file's parent directory
+        return str(file_path.parent)
+
+    def _discover_files_from_sources(self, sources: List[Path], file_types: List[str],
+                                     exclude_patterns: Optional[List[str]] = None) -> List[Path]:
+        """
+        Discover files from multiple sources (files and directories)
+
+        Args:
+            sources: List of Path objects (files and/or directories)
+            file_types: List of file extensions to include for directories
+            exclude_patterns: Glob patterns to exclude
+
+        Returns:
+            List of file paths to process
+        """
+
+        files = []
+        supported_extensions = set(ft.lstrip('.').lower() for ft in file_types)
+
+        for source in sources:
+            if source.is_file():
+                # Individual file - check if it's supported
+                file_ext = source.suffix.lstrip('.').lower()
+                if file_ext in supported_extensions or not file_ext:  # Allow extensionless files
+                    # Check exclusions
+                    if self._is_file_excluded(source, exclude_patterns):
+                        if self.verbose:
+                            print(f"Excluded file: {source}")
+                        continue
+
+                    files.append(source)
+                    if self.verbose:
+                        print(f"Added individual file: {source}")
+                else:
+                    if self.verbose:
+                        print(f"Skipped unsupported file type: {source} (extension: {file_ext})")
+
+            elif source.is_dir():
+                # Directory - use existing discovery logic
+                dir_files = self._discover_files(str(source), file_types, exclude_patterns)
+                files.extend(dir_files)
+                if self.verbose:
+                    print(f"Added {len(dir_files)} files from directory: {source}")
+            else:
+                if self.verbose:
+                    print(f"Skipped non-existent or invalid source: {source}")
+
+        # Remove duplicates while preserving order
+        seen = set()
+        unique_files = []
+        for file_path in files:
+            if file_path not in seen:
+                seen.add(file_path)
+                unique_files.append(file_path)
+
+        return unique_files
+
+    def _is_file_excluded(self, file_path: Path, exclude_patterns: Optional[List[str]] = None) -> bool:
+        """
+        Check if a file should be excluded based on exclude patterns
+
+        Args:
+            file_path: Path to check
+            exclude_patterns: List of glob patterns to exclude
+
+        Returns:
+            True if file should be excluded
+        """
+
+        if not exclude_patterns:
+            return False
+
+        import fnmatch
+
+        file_str = str(file_path)
+        for pattern in exclude_patterns:
+            if fnmatch.fnmatch(file_str, pattern):
+                return True
+
+        return False
+
+    def _discover_files(self, source_dir: str, file_types: List[str],
+                        exclude_patterns: Optional[List[str]] = None) -> List[Path]:
+        """Discover files to index"""
+        files = []
+        source_path = Path(source_dir)
+
+        if not source_path.exists():
+            raise FileNotFoundError(f"Source directory does not exist: {source_dir}")
+
+        for file_type in file_types:
+            # Clean up file type (remove leading dots)
+            clean_type = file_type.lstrip('.')
+            pattern = f"**/*.{clean_type}"
+
+            for file_path in source_path.glob(pattern):
+                # Skip directories
+                if not file_path.is_file():
+                    continue
+
+                # Check exclusions
+                if exclude_patterns:
+                    excluded = False
+                    for pattern in exclude_patterns:
+                        if fnmatch.fnmatch(str(file_path), pattern):
+                            excluded = True
+                            break
+                    if excluded:
+                        if self.verbose:
+                            print(f"Excluded: {file_path}")
+                        continue
+
+                files.append(file_path)
+
+        return files
+
+    def _process_file(self, file_path: Path, source_dir: str,
+                      global_tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
+        """Process single file into chunks"""
+        try:
+            relative_path = str(file_path.relative_to(source_dir))
+            file_extension = file_path.suffix.lower()
+
+            # Handle different file types appropriately
+            if file_extension == '.pdf':
+                # Use document processor for PDF extraction
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping PDF file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension in ['.docx', '.xlsx', '.pptx']:
+                # Use document processor for Office documents
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping office document (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension == '.html':
+                # Use document processor for HTML
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping HTML file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension == '.rtf':
+                # Use document processor for RTF
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping RTF file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            else:
+                # Try to read as text file (markdown, txt, code, etc.)
+                try:
+                    content = file_path.read_text(encoding='utf-8')
+                except UnicodeDecodeError:
+                    if self.verbose:
+                        print(f"Skipping binary file: {file_path}")
+                    return []
+
+            # Validate content
+            if not content or (isinstance(content, str) and len(content.strip()) == 0):
+                if self.verbose:
+                    print(f"Skipping empty file: {file_path}")
+                return []
+
+            # Create chunks using document processor - pass content directly, not file path
+            chunks = self.doc_processor.create_chunks(
+                content=content,  # Pass the actual content, not the file path
+                filename=relative_path,
+                file_type=file_path.suffix.lstrip('.')
+            )
+
+            # Add global tags
+            if global_tags:
+                for chunk in chunks:
+                    existing_tags = chunk.get('tags', [])
+                    if isinstance(existing_tags, str):
+                        existing_tags = [existing_tags]
+                    chunk['tags'] = existing_tags + global_tags
+
+            return chunks
+
+        except Exception as e:
+            logger.error(f"Error processing file {file_path}: {e}")
+            return []
+
+    def _create_database(self, output_file: str, chunks: List[Dict[str, Any]],
+                         languages: List[str], sources_info: List[str], file_types: List[str]):
+        """Create SQLite database with all data"""
+
+        # Remove existing file
+        if os.path.exists(output_file):
+            os.remove(output_file)
+
+        conn = sqlite3.connect(output_file)
+        cursor = conn.cursor()
+
+        try:
+            # Create schema
+            cursor.execute('''
+                CREATE TABLE chunks (
+                    id INTEGER PRIMARY KEY AUTOINCREMENT,
+                    content TEXT NOT NULL,
+                    processed_content TEXT NOT NULL,
+                    keywords TEXT,
+                    language TEXT DEFAULT 'en',
+                    embedding BLOB NOT NULL,
+                    filename TEXT NOT NULL,
+                    section TEXT,
+                    start_line INTEGER,
+                    end_line INTEGER,
+                    tags TEXT,
+                    metadata TEXT,
+                    metadata_text TEXT,  -- Searchable text representation of all metadata
+                    chunk_hash TEXT UNIQUE,
+                    created_at TEXT DEFAULT CURRENT_TIMESTAMP
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE VIRTUAL TABLE chunks_fts USING fts5(
+                    processed_content,
+                    keywords,
+                    metadata_text,
+                    content='chunks',
+                    content_rowid='id'
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE synonyms (
+                    word TEXT,
+                    pos_tag TEXT,
+                    synonyms TEXT,
+                    language TEXT DEFAULT 'en',
+                    PRIMARY KEY (word, pos_tag, language)
+                )
+            ''')
+
+            cursor.execute('''
+                CREATE TABLE config (
+                    key TEXT PRIMARY KEY,
+                    value TEXT
+                )
+            ''')
+
+            # Create indexes for performance
+            cursor.execute('CREATE INDEX idx_chunks_filename ON chunks(filename)')
+            cursor.execute('CREATE INDEX idx_chunks_language ON chunks(language)')
+            cursor.execute('CREATE INDEX idx_chunks_tags ON chunks(tags)')
+
+            # Insert config
+            embedding_dimensions = 768  # Default for all-mpnet-base-v2
+            if chunks and chunks[0].get('embedding'):
+                try:
+                    if np:
+                        embedding_array = np.frombuffer(chunks[0]['embedding'], dtype=np.float32)
+                        embedding_dimensions = len(embedding_array)
+                except:
+                    pass
+
+            config_data = {
+                'embedding_model': self.model_name,
+                'embedding_dimensions': str(embedding_dimensions),
+                'chunk_size': str(self.chunk_size),
+                'chunk_overlap': str(self.chunk_overlap),
+                'preprocessing_version': '1.0',
+                'languages': json.dumps(languages),
+                'created_at': datetime.now().isoformat(),
+                'sources': json.dumps(sources_info),  # Store list of sources instead of single directory
+                'file_types': json.dumps(file_types)
+            }
+
+            for key, value in config_data.items():
+                cursor.execute('INSERT INTO config (key, value) VALUES (?, ?)', (key, value))
+
+            # Insert chunks
+            for chunk in chunks:
+                # Create hash for deduplication - include filename, section, and line numbers for uniqueness
+                hash_content = f"{chunk['filename']}:{chunk.get('section', '')}:{chunk.get('start_line', 0)}:{chunk.get('end_line', 0)}:{chunk['content']}"
+                chunk_hash = hashlib.sha256(hash_content.encode()).hexdigest()[:16]
+
+                # Prepare data
+                keywords_json = json.dumps(chunk.get('keywords', []))
+                tags_json = json.dumps(chunk.get('tags', []))
+
+                # Extract metadata from JSON content and merge with chunk metadata
+                json_metadata, json_metadata_text = self._extract_metadata_from_json_content(chunk['content'])
+                chunk_metadata = chunk.get('metadata', {})
+
+                # Merge metadata: chunk metadata takes precedence
+                merged_metadata = {**json_metadata, **chunk_metadata}
+                metadata_json = json.dumps(merged_metadata)
+
+                # Create comprehensive metadata_text including tags
+                metadata_text_parts = []
+
+                # Add metadata text from JSON content
+                if json_metadata_text:
+                    metadata_text_parts.append(json_metadata_text)
+
+                # Add tags
+                tags = chunk.get('tags', [])
+                if tags:
+                    metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+                # Add section if present
+                if chunk.get('section'):
+                    metadata_text_parts.append(chunk['section'].lower())
+
+                # Add any additional metadata values
+                for key, value in chunk_metadata.items():
+                    if key not in json_metadata:  # Avoid duplicates
+                        metadata_text_parts.append(str(key).lower())
+                        if isinstance(value, list):
+                            metadata_text_parts.extend(str(v).lower() for v in value)
+                        else:
+                            metadata_text_parts.append(str(value).lower())
+
+                metadata_text = ' '.join(metadata_text_parts)
+
+                cursor.execute('''
+                    INSERT OR IGNORE INTO chunks (
+                        content, processed_content, keywords, language, embedding,
+                        filename, section, start_line, end_line, tags, metadata, metadata_text, chunk_hash
+                    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                ''', (
+                    chunk['content'],
+                    chunk.get('processed_content', chunk['content']),
+                    keywords_json,
+                    chunk.get('language', 'en'),
+                    chunk.get('embedding', b''),
+                    chunk['filename'],
+                    chunk.get('section'),
+                    chunk.get('start_line'),
+                    chunk.get('end_line'),
+                    tags_json,
+                    metadata_json,
+                    metadata_text,
+                    chunk_hash
+                ))
+
+            conn.commit()
+
+        except Exception as e:
+            conn.rollback()
+            raise e
+        finally:
+            conn.close()
+
+    def validate_index(self, index_file: str) -> Dict[str, Any]:
+        """Validate an existing search index"""
+        if not os.path.exists(index_file):
+            return {"valid": False, "error": "Index file does not exist"}
+
+        try:
+            conn = sqlite3.connect(index_file)
+            cursor = conn.cursor()
+
+            # Check schema
+            cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")
+            tables = [row[0] for row in cursor.fetchall()]
+
+            required_tables = ['chunks', 'chunks_fts', 'synonyms', 'config']
+            missing_tables = [t for t in required_tables if t not in tables]
+
+            if missing_tables:
+                return {"valid": False, "error": f"Missing tables: {missing_tables}"}
+
+            # Get config
+            cursor.execute("SELECT key, value FROM config")
+            config = dict(cursor.fetchall())
+
+            # Get chunk count
+            cursor.execute("SELECT COUNT(*) FROM chunks")
+            chunk_count = cursor.fetchone()[0]
+
+            # Get file count
+            cursor.execute("SELECT COUNT(DISTINCT filename) FROM chunks")
+            file_count = cursor.fetchone()[0]
+
+            conn.close()
+
+            return {
+                "valid": True,
+                "chunk_count": chunk_count,
+                "file_count": file_count,
+                "config": config
+            }
+
+        except Exception as e:
+            return {"valid": False, "error": str(e)}
+
+    def _store_chunks_pgvector(self, chunks: List[Dict[str, Any]], collection_name: str,
+                               languages: List[str], overwrite: bool = False):
+        """
+        Store chunks in pgvector backend
+
+        Args:
+            chunks: List of processed chunks
+            collection_name: Name for the collection (from output_file parameter)
+            languages: List of supported languages
+        """
+        from .pgvector_backend import PgVectorBackend
+
+        # Extract collection name from the provided name
+        if collection_name.endswith('.swsearch'):
+            collection_name = collection_name[:-9]  # Remove .swsearch extension
+
+        # Clean collection name for PostgreSQL
+        import re
+        collection_name = re.sub(r'[^a-zA-Z0-9_]', '_', collection_name)
+
+        if self.verbose:
+            print(f"Storing chunks in pgvector collection: {collection_name}")
+
+        # Create backend instance
+        backend = PgVectorBackend(self.connection_string)
+
+        try:
+            # Get embedding dimensions from model
+            if self.model:
+                embedding_dim = self.model.get_sentence_embedding_dimension()
+            else:
+                embedding_dim = 768  # Default for all-mpnet-base-v2
+
+            # Delete existing collection if overwrite is requested
+            if overwrite:
+                if self.verbose:
+                    print(f"Dropping existing collection: {collection_name}")
+                backend.delete_collection(collection_name)
+
+            # Create schema
+            backend.create_schema(collection_name, embedding_dim)
+
+            # Convert embeddings from bytes to numpy arrays
+            for chunk in chunks:
+                if chunk.get('embedding') and isinstance(chunk['embedding'], bytes):
+                    if np:
+                        chunk['embedding'] = np.frombuffer(chunk['embedding'], dtype=np.float32)
+                    else:
+                        # If numpy not available, leave as bytes
+                        pass
+
+            # Prepare config
+            config = {
+                'model_name': self.model_name,
+                'embedding_dimensions': embedding_dim,
+                'chunking_strategy': self.chunking_strategy,
+                'languages': languages,
+                'metadata': {
+                    'max_sentences_per_chunk': self.max_sentences_per_chunk,
+                    'chunk_size': self.chunk_size,
+                    'chunk_overlap': self.chunk_overlap,
+                    'index_nlp_backend': self.index_nlp_backend
+                }
+            }
+
+            # Store chunks
+            backend.store_chunks(chunks, collection_name, config)
+
+            if self.verbose:
+                stats = backend.get_stats(collection_name)
+                print(f"Stored {stats['total_chunks']} chunks in pgvector")
+                print(f"Collection: {collection_name}")
+
+        finally:
+            backend.close()