signalwire-agents 0.1.13__py3-none-any.whl → 1.0.17.dev4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalwire_agents/__init__.py +99 -15
- signalwire_agents/agent_server.py +248 -60
- signalwire_agents/agents/bedrock.py +296 -0
- signalwire_agents/cli/__init__.py +9 -0
- signalwire_agents/cli/build_search.py +951 -41
- signalwire_agents/cli/config.py +80 -0
- signalwire_agents/cli/core/__init__.py +10 -0
- signalwire_agents/cli/core/agent_loader.py +470 -0
- signalwire_agents/cli/core/argparse_helpers.py +179 -0
- signalwire_agents/cli/core/dynamic_config.py +71 -0
- signalwire_agents/cli/core/service_loader.py +303 -0
- signalwire_agents/cli/dokku.py +2320 -0
- signalwire_agents/cli/execution/__init__.py +10 -0
- signalwire_agents/cli/execution/datamap_exec.py +446 -0
- signalwire_agents/cli/execution/webhook_exec.py +134 -0
- signalwire_agents/cli/init_project.py +2636 -0
- signalwire_agents/cli/output/__init__.py +10 -0
- signalwire_agents/cli/output/output_formatter.py +255 -0
- signalwire_agents/cli/output/swml_dump.py +186 -0
- signalwire_agents/cli/simulation/__init__.py +10 -0
- signalwire_agents/cli/simulation/data_generation.py +374 -0
- signalwire_agents/cli/simulation/data_overrides.py +200 -0
- signalwire_agents/cli/simulation/mock_env.py +282 -0
- signalwire_agents/cli/swaig_test_wrapper.py +52 -0
- signalwire_agents/cli/test_swaig.py +566 -2366
- signalwire_agents/cli/types.py +81 -0
- signalwire_agents/core/__init__.py +2 -2
- signalwire_agents/core/agent/__init__.py +12 -0
- signalwire_agents/core/agent/config/__init__.py +12 -0
- signalwire_agents/core/agent/deployment/__init__.py +9 -0
- signalwire_agents/core/agent/deployment/handlers/__init__.py +9 -0
- signalwire_agents/core/agent/prompt/__init__.py +14 -0
- signalwire_agents/core/agent/prompt/manager.py +306 -0
- signalwire_agents/core/agent/routing/__init__.py +9 -0
- signalwire_agents/core/agent/security/__init__.py +9 -0
- signalwire_agents/core/agent/swml/__init__.py +9 -0
- signalwire_agents/core/agent/tools/__init__.py +15 -0
- signalwire_agents/core/agent/tools/decorator.py +97 -0
- signalwire_agents/core/agent/tools/registry.py +210 -0
- signalwire_agents/core/agent_base.py +845 -2916
- signalwire_agents/core/auth_handler.py +233 -0
- signalwire_agents/core/config_loader.py +259 -0
- signalwire_agents/core/contexts.py +418 -0
- signalwire_agents/core/data_map.py +3 -15
- signalwire_agents/core/function_result.py +116 -44
- signalwire_agents/core/logging_config.py +162 -18
- signalwire_agents/core/mixins/__init__.py +28 -0
- signalwire_agents/core/mixins/ai_config_mixin.py +442 -0
- signalwire_agents/core/mixins/auth_mixin.py +280 -0
- signalwire_agents/core/mixins/prompt_mixin.py +358 -0
- signalwire_agents/core/mixins/serverless_mixin.py +460 -0
- signalwire_agents/core/mixins/skill_mixin.py +55 -0
- signalwire_agents/core/mixins/state_mixin.py +153 -0
- signalwire_agents/core/mixins/tool_mixin.py +230 -0
- signalwire_agents/core/mixins/web_mixin.py +1142 -0
- signalwire_agents/core/security_config.py +333 -0
- signalwire_agents/core/skill_base.py +84 -1
- signalwire_agents/core/skill_manager.py +62 -20
- signalwire_agents/core/swaig_function.py +18 -5
- signalwire_agents/core/swml_builder.py +207 -11
- signalwire_agents/core/swml_handler.py +27 -21
- signalwire_agents/core/swml_renderer.py +123 -312
- signalwire_agents/core/swml_service.py +171 -203
- signalwire_agents/mcp_gateway/__init__.py +29 -0
- signalwire_agents/mcp_gateway/gateway_service.py +564 -0
- signalwire_agents/mcp_gateway/mcp_manager.py +513 -0
- signalwire_agents/mcp_gateway/session_manager.py +218 -0
- signalwire_agents/prefabs/concierge.py +0 -3
- signalwire_agents/prefabs/faq_bot.py +0 -3
- signalwire_agents/prefabs/info_gatherer.py +0 -3
- signalwire_agents/prefabs/receptionist.py +0 -3
- signalwire_agents/prefabs/survey.py +0 -3
- signalwire_agents/schema.json +9218 -5489
- signalwire_agents/search/__init__.py +7 -1
- signalwire_agents/search/document_processor.py +490 -31
- signalwire_agents/search/index_builder.py +307 -37
- signalwire_agents/search/migration.py +418 -0
- signalwire_agents/search/models.py +30 -0
- signalwire_agents/search/pgvector_backend.py +748 -0
- signalwire_agents/search/query_processor.py +162 -31
- signalwire_agents/search/search_engine.py +916 -35
- signalwire_agents/search/search_service.py +376 -53
- signalwire_agents/skills/README.md +452 -0
- signalwire_agents/skills/__init__.py +14 -2
- signalwire_agents/skills/api_ninjas_trivia/README.md +215 -0
- signalwire_agents/skills/api_ninjas_trivia/__init__.py +12 -0
- signalwire_agents/skills/api_ninjas_trivia/skill.py +237 -0
- signalwire_agents/skills/datasphere/README.md +210 -0
- signalwire_agents/skills/datasphere/skill.py +84 -3
- signalwire_agents/skills/datasphere_serverless/README.md +258 -0
- signalwire_agents/skills/datasphere_serverless/__init__.py +9 -0
- signalwire_agents/skills/datasphere_serverless/skill.py +82 -1
- signalwire_agents/skills/datetime/README.md +132 -0
- signalwire_agents/skills/datetime/__init__.py +9 -0
- signalwire_agents/skills/datetime/skill.py +20 -7
- signalwire_agents/skills/joke/README.md +149 -0
- signalwire_agents/skills/joke/__init__.py +9 -0
- signalwire_agents/skills/joke/skill.py +21 -0
- signalwire_agents/skills/math/README.md +161 -0
- signalwire_agents/skills/math/__init__.py +9 -0
- signalwire_agents/skills/math/skill.py +18 -4
- signalwire_agents/skills/mcp_gateway/README.md +230 -0
- signalwire_agents/skills/mcp_gateway/__init__.py +10 -0
- signalwire_agents/skills/mcp_gateway/skill.py +421 -0
- signalwire_agents/skills/native_vector_search/README.md +210 -0
- signalwire_agents/skills/native_vector_search/__init__.py +9 -0
- signalwire_agents/skills/native_vector_search/skill.py +569 -101
- signalwire_agents/skills/play_background_file/README.md +218 -0
- signalwire_agents/skills/play_background_file/__init__.py +12 -0
- signalwire_agents/skills/play_background_file/skill.py +242 -0
- signalwire_agents/skills/registry.py +395 -40
- signalwire_agents/skills/spider/README.md +236 -0
- signalwire_agents/skills/spider/__init__.py +13 -0
- signalwire_agents/skills/spider/skill.py +598 -0
- signalwire_agents/skills/swml_transfer/README.md +395 -0
- signalwire_agents/skills/swml_transfer/__init__.py +10 -0
- signalwire_agents/skills/swml_transfer/skill.py +359 -0
- signalwire_agents/skills/weather_api/README.md +178 -0
- signalwire_agents/skills/weather_api/__init__.py +12 -0
- signalwire_agents/skills/weather_api/skill.py +191 -0
- signalwire_agents/skills/web_search/README.md +163 -0
- signalwire_agents/skills/web_search/__init__.py +9 -0
- signalwire_agents/skills/web_search/skill.py +586 -112
- signalwire_agents/skills/wikipedia_search/README.md +228 -0
- signalwire_agents/{core/state → skills/wikipedia_search}/__init__.py +5 -4
- signalwire_agents/skills/{wikipedia → wikipedia_search}/skill.py +33 -3
- signalwire_agents/web/__init__.py +17 -0
- signalwire_agents/web/web_service.py +559 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-agent-init.1 +400 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/sw-search.1 +483 -0
- signalwire_agents-1.0.17.dev4.data/data/share/man/man1/swaig-test.1 +308 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/METADATA +347 -215
- signalwire_agents-1.0.17.dev4.dist-info/RECORD +147 -0
- signalwire_agents-1.0.17.dev4.dist-info/entry_points.txt +6 -0
- signalwire_agents/core/state/file_state_manager.py +0 -219
- signalwire_agents/core/state/state_manager.py +0 -101
- signalwire_agents/skills/wikipedia/__init__.py +0 -9
- signalwire_agents-0.1.13.data/data/schema.json +0 -5611
- signalwire_agents-0.1.13.dist-info/RECORD +0 -67
- signalwire_agents-0.1.13.dist-info/entry_points.txt +0 -3
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/WHEEL +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/licenses/LICENSE +0 -0
- {signalwire_agents-0.1.13.dist-info → signalwire_agents-1.0.17.dev4.dist-info}/top_level.txt +0 -0
signalwire_agents/search/index_builder.py

@@ -35,29 +35,117 @@ logger = logging.getLogger(__name__)
 class IndexBuilder:
     """Build searchable indexes from document directories"""
 
-    def __init__(
-
-
-
-
-
-
+    def __init__(
+        self,
+        model_name: str = 'sentence-transformers/all-mpnet-base-v2',
+        chunking_strategy: str = 'sentence',
+        max_sentences_per_chunk: int = 5,
+        chunk_size: int = 50,
+        chunk_overlap: int = 10,
+        split_newlines: Optional[int] = None,
+        index_nlp_backend: str = 'nltk',
+        verbose: bool = False,
+        semantic_threshold: float = 0.5,
+        topic_threshold: float = 0.3,
+        backend: str = 'sqlite',
+        connection_string: Optional[str] = None
+    ):
+        """
+        Initialize the index builder
+
+        Args:
+            model_name: Name of the sentence transformer model to use
+            chunking_strategy: Strategy for chunking documents ('sentence', 'sliding', 'paragraph', 'page', 'semantic', 'topic', 'qa', 'json')
+            max_sentences_per_chunk: For sentence strategy (default: 5)
+            chunk_size: For sliding strategy - words per chunk (default: 50)
+            chunk_overlap: For sliding strategy - overlap in words (default: 10)
+            split_newlines: For sentence strategy - split on multiple newlines (optional)
+            index_nlp_backend: NLP backend for indexing (default: 'nltk')
+            verbose: Whether to enable verbose logging (default: False)
+            semantic_threshold: Similarity threshold for semantic chunking (default: 0.5)
+            topic_threshold: Similarity threshold for topic chunking (default: 0.3)
+            backend: Storage backend ('sqlite' or 'pgvector') (default: 'sqlite')
+            connection_string: PostgreSQL connection string for pgvector backend
+        """
         self.model_name = model_name
         self.chunking_strategy = chunking_strategy
         self.max_sentences_per_chunk = max_sentences_per_chunk
         self.chunk_size = chunk_size
         self.chunk_overlap = chunk_overlap
         self.split_newlines = split_newlines
+        self.index_nlp_backend = index_nlp_backend
         self.verbose = verbose
+        self.semantic_threshold = semantic_threshold
+        self.topic_threshold = topic_threshold
+        self.backend = backend
+        self.connection_string = connection_string
         self.model = None
+
+        # Validate backend
+        if self.backend not in ['sqlite', 'pgvector']:
+            raise ValueError(f"Invalid backend '{self.backend}'. Must be 'sqlite' or 'pgvector'")
+
+        # Validate NLP backend
+        if self.index_nlp_backend not in ['nltk', 'spacy']:
+            logger.warning(f"Invalid index_nlp_backend '{self.index_nlp_backend}', using 'nltk'")
+            self.index_nlp_backend = 'nltk'
+
         self.doc_processor = DocumentProcessor(
             chunking_strategy=chunking_strategy,
             max_sentences_per_chunk=max_sentences_per_chunk,
             chunk_size=chunk_size,
-
-            split_newlines=split_newlines
+            chunk_overlap=chunk_overlap,
+            split_newlines=split_newlines,
+            index_nlp_backend=self.index_nlp_backend,
+            verbose=self.verbose,
+            semantic_threshold=self.semantic_threshold,
+            topic_threshold=self.topic_threshold
         )
 
+    def _extract_metadata_from_json_content(self, content: str) -> tuple[Dict[str, Any], str]:
+        """
+        Extract metadata from JSON content if present
+
+        Returns:
+            (metadata_dict, metadata_text)
+        """
+        metadata_dict = {}
+
+        # Try to extract metadata from JSON structure in content
+        if '"metadata":' in content:
+            try:
+                # Look for metadata object in content
+                import re
+                # Find all metadata objects
+                pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
+                matches = re.finditer(pattern, content)
+
+                for match in matches:
+                    try:
+                        json_metadata = json.loads(match.group(1))
+                        # Merge all found metadata
+                        if isinstance(json_metadata, dict):
+                            metadata_dict.update(json_metadata)
+                    except:
+                        pass
+            except Exception as e:
+                logger.debug(f"Error extracting JSON metadata: {e}")
+
+        # Create searchable text from all metadata keys and values
+        metadata_text_parts = []
+        for key, value in metadata_dict.items():
+            # Add key
+            metadata_text_parts.append(str(key))
+            # Add value(s)
+            if isinstance(value, list):
+                metadata_text_parts.extend(str(v) for v in value)
+            else:
+                metadata_text_parts.append(str(value))
+
+        metadata_text = ' '.join(metadata_text_parts).lower()
+
+        return metadata_dict, metadata_text
+
     def _load_model(self):
         """Load embedding model (lazy loading)"""
         if self.model is None:
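The new `_extract_metadata_from_json_content` helper pulls embedded `"metadata"` objects out of chunk text with a regex and flattens them into a searchable string. A minimal standalone sketch of that extraction, using a hypothetical input string, shows what ends up in the metadata text:

```python
import json
import re

# Hypothetical chunk content containing an embedded "metadata" object
content = '{"text": "refund policy", "metadata": {"category": "billing", "tags": ["refunds", "policy"]}}'

pattern = r'"metadata"\s*:\s*(\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\})'
metadata_dict = {}
for match in re.finditer(pattern, content):
    try:
        found = json.loads(match.group(1))
        if isinstance(found, dict):
            metadata_dict.update(found)
    except json.JSONDecodeError:
        pass

# Flatten keys and values into one lowercase searchable string, as the helper does
parts = []
for key, value in metadata_dict.items():
    parts.append(str(key))
    if isinstance(value, list):
        parts.extend(str(v) for v in value)
    else:
        parts.append(str(value))

print(metadata_dict)            # {'category': 'billing', 'tags': ['refunds', 'policy']}
print(' '.join(parts).lower())  # category billing tags refunds policy
```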
@@ -75,7 +163,8 @@ class IndexBuilder:
 
     def build_index_from_sources(self, sources: List[Path], output_file: str,
                                  file_types: List[str], exclude_patterns: Optional[List[str]] = None,
-                                 languages: List[str] = None, tags: Optional[List[str]] = None
+                                 languages: List[str] = None, tags: Optional[List[str]] = None,
+                                 overwrite: bool = False):
         """
         Build complete search index from multiple sources (files and directories)
 
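Taken together with the new constructor arguments, the widened `build_index_from_sources` signature lets one build straight into a pgvector collection. A hedged usage sketch follows; the paths, file-type strings, and connection string are illustrative assumptions, not values from the package documentation:

```python
from pathlib import Path
from signalwire_agents.search.index_builder import IndexBuilder

# Illustrative only: argument values below are assumptions, not documented defaults
builder = IndexBuilder(
    chunking_strategy='sentence',
    backend='pgvector',                # new in this release; 'sqlite' remains the default
    connection_string='postgresql://user:pass@localhost:5432/knowledge',
    verbose=True,
)
builder.build_index_from_sources(
    sources=[Path('docs')],
    output_file='product_docs',        # used as the collection name on the pgvector path
    file_types=['md', 'txt'],
    languages=['en'],
    overwrite=True,                    # new parameter: replace an existing collection
)
```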
@@ -99,6 +188,7 @@ class IndexBuilder:
 
         # Process documents
         chunks = []
+        print(f"Processing {len(files)} files...")
         for file_path in files:
             try:
                 # For individual files, use the file's parent as the base directory
@@ -106,8 +196,8 @@ class IndexBuilder:
                 base_dir = self._get_base_directory_for_file(file_path, sources)
                 file_chunks = self._process_file(file_path, base_dir, tags)
                 chunks.extend(file_chunks)
-                if self.verbose:
-                    print(f"
+                if self.verbose or file_path.suffix == '.json':
+                    print(f"  {file_path}: {len(file_chunks)} chunks")
             except Exception as e:
                 logger.error(f"Error processing {file_path}: {e}")
                 if self.verbose:
@@ -123,26 +213,47 @@ class IndexBuilder:
         # Generate embeddings
         self._load_model()
         if self.verbose:
-            print("Generating embeddings...")
+            print(f"Generating embeddings for {len(chunks)} chunks...")
+        else:
+            print(f"Generating embeddings for {len(chunks)} chunks...")
 
         for i, chunk in enumerate(chunks):
             try:
                 # Preprocess content for better search
                 processed = preprocess_document_content(
                     chunk['content'],
-                    language=chunk.get('language', 'en')
+                    language=chunk.get('language', 'en'),
+                    index_nlp_backend=self.index_nlp_backend
                 )
 
                 chunk['processed_content'] = processed['enhanced_text']
-
+
+                # Include tags in keywords for better search matching
+                keywords = processed.get('keywords', [])
+                chunk_tags = chunk.get('tags', [])
+                if chunk_tags:
+                    # Add tags to keywords list for FTS matching
+                    keywords.extend(chunk_tags)
+                    # Remove duplicates while preserving order
+                    keywords = list(dict.fromkeys(keywords))
+
+                chunk['keywords'] = keywords
+
+                # For embedding, include tags in the text for better semantic matching
+                embedding_text = processed['enhanced_text']
+                if chunk_tags:
+                    # Append tags to the text for embedding generation
+                    embedding_text += " " + " ".join(chunk_tags)
 
                 # Generate embedding (suppress progress bar)
-                embedding = self.model.encode(
+                embedding = self.model.encode(embedding_text, show_progress_bar=False)
                 chunk['embedding'] = embedding.tobytes()
 
-
+                # Show progress more frequently
+                show_every = 50 if len(chunks) > 500 else max(10, len(chunks) // 10)
+                if (i + 1) % show_every == 0 or (i + 1) == len(chunks):
                     progress_pct = ((i + 1) / len(chunks)) * 100
-                    print(f"
+                    print(f"  Progress: {i + 1}/{len(chunks)} chunks ({progress_pct:.1f}%)")
 
             except Exception as e:
                 logger.error(f"Error processing chunk {i}: {e}")
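A minimal sketch of the tag handling added above, with invented sample values: tags are merged into the keyword list (for FTS matching) and appended to the text that gets embedded (for semantic matching), with order-preserving de-duplication.

```python
keywords = ['refund', 'policy', 'billing']
chunk_tags = ['billing', 'support']

# Fold tags into keywords, dropping duplicates while keeping order
keywords.extend(chunk_tags)
keywords = list(dict.fromkeys(keywords))   # ['refund', 'policy', 'billing', 'support']

# Append tags to the text used for embedding generation
embedding_text = "How do refunds work?"
if chunk_tags:
    embedding_text += " " + " ".join(chunk_tags)

print(keywords)
print(embedding_text)                      # How do refunds work? billing support
```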
@@ -156,19 +267,24 @@ class IndexBuilder:
             else:
                 chunk['embedding'] = b''
 
-        #
-
-
-
-
-
-
+        # Store chunks based on backend
+        if self.backend == 'sqlite':
+            # Create SQLite database
+            sources_info = [str(s) for s in sources]
+            self._create_database(output_file, chunks, languages or ['en'], sources_info, file_types)
+
+            if self.verbose:
+                print(f"Index created: {output_file}")
+                print(f"Total chunks: {len(chunks)}")
+        else:
+            # Use pgvector backend
+            self._store_chunks_pgvector(chunks, output_file, languages or ['en'], overwrite)
 
     def build_index(self, source_dir: str, output_file: str,
                     file_types: List[str], exclude_patterns: Optional[List[str]] = None,
                     languages: List[str] = None, tags: Optional[List[str]] = None):
         """
-        Build complete search index from a single directory
+        Build complete search index from a single directory
 
         Args:
             source_dir: Directory to scan for documents
@@ -332,16 +448,57 @@ class IndexBuilder:
                       global_tags: Optional[List[str]] = None) -> List[Dict[str, Any]]:
         """Process single file into chunks"""
         try:
-
-
-
-
+            relative_path = str(file_path.relative_to(source_dir))
+            file_extension = file_path.suffix.lower()
+
+            # Handle different file types appropriately
+            if file_extension == '.pdf':
+                # Use document processor for PDF extraction
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping PDF file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension in ['.docx', '.xlsx', '.pptx']:
+                # Use document processor for Office documents
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping office document (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension == '.html':
+                # Use document processor for HTML
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping HTML file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            elif file_extension == '.rtf':
+                # Use document processor for RTF
+                content_result = self.doc_processor._extract_text_from_file(str(file_path))
+                if isinstance(content_result, str) and content_result.startswith('{"error"'):
+                    if self.verbose:
+                        print(f"Skipping RTF file (extraction failed): {file_path}")
+                    return []
+                content = content_result
+            else:
+                # Try to read as text file (markdown, txt, code, etc.)
+                try:
+                    content = file_path.read_text(encoding='utf-8')
+                except UnicodeDecodeError:
+                    if self.verbose:
+                        print(f"Skipping binary file: {file_path}")
+                    return []
+
+            # Validate content
+            if not content or (isinstance(content, str) and len(content.strip()) == 0):
                 if self.verbose:
-                    print(f"Skipping
+                    print(f"Skipping empty file: {file_path}")
                 return []
 
-            relative_path = str(file_path.relative_to(source_dir))
-
             # Create chunks using document processor - pass content directly, not file path
             chunks = self.doc_processor.create_chunks(
                 content=content,  # Pass the actual content, not the file path
@@ -390,6 +547,7 @@ class IndexBuilder:
                     end_line INTEGER,
                     tags TEXT,
                     metadata TEXT,
+                    metadata_text TEXT,  -- Searchable text representation of all metadata
                     chunk_hash TEXT UNIQUE,
                     created_at TEXT DEFAULT CURRENT_TIMESTAMP
                 )
@@ -399,6 +557,7 @@ class IndexBuilder:
             CREATE VIRTUAL TABLE chunks_fts USING fts5(
                 processed_content,
                 keywords,
+                metadata_text,
                 content='chunks',
                 content_rowid='id'
             )
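Because `metadata_text` is now both a column on `chunks` and a column of the `chunks_fts` virtual table, full-text queries can be restricted to metadata and tag terms. A small sketch against a built `.swsearch` file; the file name and query term are hypothetical, while the column names follow the schema above:

```python
import sqlite3

conn = sqlite3.connect('product_docs.swsearch')
rows = conn.execute(
    """
    SELECT c.filename, c.section
    FROM chunks_fts
    JOIN chunks AS c ON c.id = chunks_fts.rowid
    WHERE chunks_fts MATCH 'metadata_text:billing'
    LIMIT 5
    """
).fetchall()
print(rows)
conn.close()
```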
@@ -460,13 +619,47 @@ class IndexBuilder:
             # Prepare data
             keywords_json = json.dumps(chunk.get('keywords', []))
             tags_json = json.dumps(chunk.get('tags', []))
-
+
+            # Extract metadata from JSON content and merge with chunk metadata
+            json_metadata, json_metadata_text = self._extract_metadata_from_json_content(chunk['content'])
+            chunk_metadata = chunk.get('metadata', {})
+
+            # Merge metadata: chunk metadata takes precedence
+            merged_metadata = {**json_metadata, **chunk_metadata}
+            metadata_json = json.dumps(merged_metadata)
+
+            # Create comprehensive metadata_text including tags
+            metadata_text_parts = []
+
+            # Add metadata text from JSON content
+            if json_metadata_text:
+                metadata_text_parts.append(json_metadata_text)
+
+            # Add tags
+            tags = chunk.get('tags', [])
+            if tags:
+                metadata_text_parts.extend(str(tag).lower() for tag in tags)
+
+            # Add section if present
+            if chunk.get('section'):
+                metadata_text_parts.append(chunk['section'].lower())
+
+            # Add any additional metadata values
+            for key, value in chunk_metadata.items():
+                if key not in json_metadata:  # Avoid duplicates
+                    metadata_text_parts.append(str(key).lower())
+                    if isinstance(value, list):
+                        metadata_text_parts.extend(str(v).lower() for v in value)
+                    else:
+                        metadata_text_parts.append(str(value).lower())
+
+            metadata_text = ' '.join(metadata_text_parts)
 
             cursor.execute('''
                 INSERT OR IGNORE INTO chunks (
                     content, processed_content, keywords, language, embedding,
-                    filename, section, start_line, end_line, tags, metadata, chunk_hash
-                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    filename, section, start_line, end_line, tags, metadata, metadata_text, chunk_hash
+                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
                 chunk['content'],
                 chunk.get('processed_content', chunk['content']),
@@ -479,6 +672,7 @@ class IndexBuilder:
                 chunk.get('end_line'),
                 tags_json,
                 metadata_json,
+                metadata_text,
                 chunk_hash
             ))
 
@@ -531,4 +725,80 @@ class IndexBuilder:
             }
 
         except Exception as e:
-            return {"valid": False, "error": str(e)}
+            return {"valid": False, "error": str(e)}
+
+    def _store_chunks_pgvector(self, chunks: List[Dict[str, Any]], collection_name: str,
+                               languages: List[str], overwrite: bool = False):
+        """
+        Store chunks in pgvector backend
+
+        Args:
+            chunks: List of processed chunks
+            collection_name: Name for the collection (from output_file parameter)
+            languages: List of supported languages
+        """
+        from .pgvector_backend import PgVectorBackend
+
+        # Extract collection name from the provided name
+        if collection_name.endswith('.swsearch'):
+            collection_name = collection_name[:-9]  # Remove .swsearch extension
+
+        # Clean collection name for PostgreSQL
+        import re
+        collection_name = re.sub(r'[^a-zA-Z0-9_]', '_', collection_name)
+
+        if self.verbose:
+            print(f"Storing chunks in pgvector collection: {collection_name}")
+
+        # Create backend instance
+        backend = PgVectorBackend(self.connection_string)
+
+        try:
+            # Get embedding dimensions from model
+            if self.model:
+                embedding_dim = self.model.get_sentence_embedding_dimension()
+            else:
+                embedding_dim = 768  # Default for all-mpnet-base-v2
+
+            # Delete existing collection if overwrite is requested
+            if overwrite:
+                if self.verbose:
+                    print(f"Dropping existing collection: {collection_name}")
+                backend.delete_collection(collection_name)
+
+            # Create schema
+            backend.create_schema(collection_name, embedding_dim)
+
+            # Convert embeddings from bytes to numpy arrays
+            for chunk in chunks:
+                if chunk.get('embedding') and isinstance(chunk['embedding'], bytes):
+                    if np:
+                        chunk['embedding'] = np.frombuffer(chunk['embedding'], dtype=np.float32)
+                    else:
+                        # If numpy not available, leave as bytes
+                        pass
+
+            # Prepare config
+            config = {
+                'model_name': self.model_name,
+                'embedding_dimensions': embedding_dim,
+                'chunking_strategy': self.chunking_strategy,
+                'languages': languages,
+                'metadata': {
+                    'max_sentences_per_chunk': self.max_sentences_per_chunk,
+                    'chunk_size': self.chunk_size,
+                    'chunk_overlap': self.chunk_overlap,
+                    'index_nlp_backend': self.index_nlp_backend
+                }
+            }
+
+            # Store chunks
+            backend.store_chunks(chunks, collection_name, config)
+
+            if self.verbose:
+                stats = backend.get_stats(collection_name)
+                print(f"Stored {stats['total_chunks']} chunks in pgvector")
+                print(f"Collection: {collection_name}")
+
+        finally:
+            backend.close()