wikigen-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wikigen/__init__.py +7 -0
- wikigen/cli.py +690 -0
- wikigen/config.py +526 -0
- wikigen/defaults.py +78 -0
- wikigen/flows/__init__.py +1 -0
- wikigen/flows/flow.py +38 -0
- wikigen/formatter/help_formatter.py +194 -0
- wikigen/formatter/init_formatter.py +56 -0
- wikigen/formatter/output_formatter.py +290 -0
- wikigen/mcp/__init__.py +12 -0
- wikigen/mcp/chunking.py +127 -0
- wikigen/mcp/embeddings.py +69 -0
- wikigen/mcp/output_resources.py +65 -0
- wikigen/mcp/search_index.py +826 -0
- wikigen/mcp/server.py +232 -0
- wikigen/mcp/vector_index.py +297 -0
- wikigen/metadata/__init__.py +35 -0
- wikigen/metadata/logo.py +28 -0
- wikigen/metadata/project.py +28 -0
- wikigen/metadata/version.py +17 -0
- wikigen/nodes/__init__.py +1 -0
- wikigen/nodes/nodes.py +1080 -0
- wikigen/utils/__init__.py +0 -0
- wikigen/utils/adjust_headings.py +72 -0
- wikigen/utils/call_llm.py +271 -0
- wikigen/utils/crawl_github_files.py +450 -0
- wikigen/utils/crawl_local_files.py +151 -0
- wikigen/utils/llm_providers.py +101 -0
- wikigen/utils/version_check.py +84 -0
- wikigen-1.0.0.dist-info/METADATA +352 -0
- wikigen-1.0.0.dist-info/RECORD +35 -0
- wikigen-1.0.0.dist-info/WHEEL +5 -0
- wikigen-1.0.0.dist-info/entry_points.txt +2 -0
- wikigen-1.0.0.dist-info/licenses/LICENSE +21 -0
- wikigen-1.0.0.dist-info/top_level.txt +1 -0
wikigen/mcp/search_index.py
@@ -0,0 +1,826 @@
"""Fast file indexing system using SQLite FTS for machine-wide markdown search.

This module provides indexed search capabilities across multiple directories using
SQLite FTS5 for full-text search of file paths, names, and resource names.
Also supports semantic search using FAISS for chunk-based retrieval.
"""

import sqlite3
import time
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Any
from threading import Lock
import hashlib

from ..config import CONFIG_DIR
from ..defaults import DEFAULT_CONFIG
from .chunking import chunk_markdown
from .embeddings import get_embeddings_batch
from .vector_index import VectorIndex


class FileIndexer:
    """
    Fast file indexer using SQLite FTS5 for efficient full-text search.

    Indexes markdown files across configured directories and provides
    fast search capabilities through SQLite's full-text search engine.
    Also supports semantic search using FAISS for chunk-based retrieval.
    """

    def __init__(
        self,
        index_db_path: Optional[Path] = None,
        enable_semantic_search: Optional[bool] = None,
        vector_index_path: Optional[Path] = None,
    ):
        """
        Initialize the file indexer.

        Args:
            index_db_path: Path to SQLite database. Defaults to config_dir/file_index.db
            enable_semantic_search: Enable semantic search. Defaults to config value.
            vector_index_path: Path to FAISS vector index. Defaults to config_dir/vector_index.faiss
        """
        if index_db_path is None:
            index_db_path = CONFIG_DIR / "file_index.db"

        self.db_path = index_db_path
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        self._lock = Lock()

        # Load semantic search config
        if enable_semantic_search is None:
            config = DEFAULT_CONFIG.copy()
            enable_semantic_search = config.get("semantic_search_enabled", True)

        self.enable_semantic_search = enable_semantic_search

        # Initialize vector index if semantic search is enabled
        self.vector_index: Optional[VectorIndex] = None
        if self.enable_semantic_search:
            try:
                # Get embedding model dimension (384 for all-MiniLM-L6-v2)
                embedding_model = DEFAULT_CONFIG.get(
                    "embedding_model", "all-MiniLM-L6-v2"
                )
                embedding_dim = 384  # all-MiniLM-L6-v2 dimension
                self.vector_index = VectorIndex(
                    embedding_dim=embedding_dim, index_path=vector_index_path
                )
            except ImportError:
                # FAISS not available, disable semantic search
                self.enable_semantic_search = False
                self.vector_index = None

        self._init_database()

    def _init_database(self):
        """Initialize SQLite database with FTS5 table for full-text search."""
        with self._lock:
            conn = sqlite3.connect(str(self.db_path))
            try:
                cursor = conn.cursor()

                # Create main files table
                cursor.execute(
                    """
                    CREATE TABLE IF NOT EXISTS files (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        file_path TEXT NOT NULL UNIQUE,
                        file_name TEXT NOT NULL,
                        resource_name TEXT NOT NULL,
                        directory TEXT NOT NULL,
                        size INTEGER,
                        modified_time REAL,
                        indexed_time REAL NOT NULL,
                        content_hash TEXT
                    )
                    """
                )

                # Create indexes separately
                cursor.execute(
                    "CREATE INDEX IF NOT EXISTS idx_file_path ON files(file_path)"
                )
                cursor.execute(
                    "CREATE INDEX IF NOT EXISTS idx_file_name ON files(file_name)"
                )
                cursor.execute(
                    "CREATE INDEX IF NOT EXISTS idx_directory ON files(directory)"
                )

                # Create FTS5 virtual table for full-text search
                # FTS5 allows fast full-text search on multiple columns
                cursor.execute(
                    """
                    CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
                        file_path,
                        file_name,
                        resource_name,
                        directory,
                        content='files',
                        content_rowid='id'
                    )
                    """
                )

                # Create triggers to keep FTS5 in sync with the main table
                cursor.execute(
                    """
                    CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
                        INSERT INTO files_fts(rowid, file_path, file_name, resource_name, directory)
                        VALUES (new.id, new.file_path, new.file_name, new.resource_name, new.directory);
                    END
                    """
                )

                cursor.execute(
                    """
                    CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
                        INSERT INTO files_fts(files_fts, rowid, file_path, file_name, resource_name, directory)
                        VALUES('delete', old.id, old.file_path, old.file_name, old.resource_name, old.directory);
                    END
                    """
                )

                cursor.execute(
                    """
                    CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
                        INSERT INTO files_fts(files_fts, rowid, file_path, file_name, resource_name, directory)
                        VALUES('delete', old.id, old.file_path, old.file_name, old.resource_name, old.directory);
                        INSERT INTO files_fts(rowid, file_path, file_name, resource_name, directory)
                        VALUES (new.id, new.file_path, new.file_name, new.resource_name, new.directory);
                    END
                    """
                )

                # Migration: Populate FTS5 from existing files if needed
                cursor.execute(
                    """
                    INSERT INTO files_fts(rowid, file_path, file_name, resource_name, directory)
                    SELECT id, file_path, file_name, resource_name, directory
                    FROM files
                    WHERE NOT EXISTS (
                        SELECT 1 FROM files_fts WHERE files_fts.rowid = files.id
                    )
                    """
                )

                conn.commit()
            finally:
                conn.close()

    def _calculate_content_hash(self, file_path: Path) -> str:
        """Calculate SHA256 hash of file content for change detection."""
        try:
            with open(file_path, "rb") as f:
                return hashlib.sha256(f.read()).hexdigest()
        except Exception:
            return ""

    def index_directory(
        self,
        directory: Path,
        pattern: str = "*.md",
        exclude_hidden: bool = True,
        max_depth: Optional[int] = None,
    ) -> Tuple[int, int, int]:
        """
        Index all markdown files in a directory recursively.

        Args:
            directory: Directory to index
            pattern: File pattern to match (default: "*.md")
            exclude_hidden: Skip hidden files/directories
            max_depth: Maximum recursion depth (None = unlimited)

        Returns:
            Tuple of (files_added, files_updated, files_skipped)
        """
        if not directory.exists() or not directory.is_dir():
            return (0, 0, 0)

        files_added = 0
        files_updated = 0
        files_skipped = 0
        indexed_time = time.time()

        with self._lock:
            conn = sqlite3.connect(str(self.db_path))
            try:
                cursor = conn.cursor()

                # Find all matching files
                for md_file in directory.rglob(pattern):
                    # Skip if exceeds max_depth
                    if max_depth is not None:
                        depth = len(md_file.relative_to(directory).parts) - 1
                        if depth > max_depth:
                            continue

                    # Skip hidden files/directories
                    if exclude_hidden:
                        relative_path = md_file.relative_to(directory)
                        if any(
                            part.startswith(".") for part in relative_path.parts[:-1]
                        ):
                            continue

                    try:
                        # Get file metadata
                        stat = md_file.stat()
                        file_size = stat.st_size
                        modified_time = stat.st_mtime

                        # Calculate resource name (path without extension)
                        try:
                            relative_path = md_file.relative_to(directory)
                        except ValueError:
                            # File not relative to directory (shouldn't happen)
                            files_skipped += 1
                            continue

                        resource_name = str(relative_path.with_suffix(""))
                        file_name = md_file.name
                        file_dir = str(md_file.parent)
                        file_path_str = str(md_file.absolute())

                        # Calculate content hash
                        content_hash = self._calculate_content_hash(md_file)

                        # Check if file already indexed
                        cursor.execute(
                            "SELECT id, content_hash, modified_time FROM files WHERE file_path = ?",
                            (file_path_str,),
                        )
                        existing = cursor.fetchone()

                        file_changed = False
                        if existing:
                            file_id, old_hash, old_mtime = existing
                            # Update if file changed
                            if content_hash != old_hash or modified_time > old_mtime:
                                cursor.execute(
                                    """
                                    UPDATE files
                                    SET file_name = ?, resource_name = ?, directory = ?,
                                        size = ?, modified_time = ?, indexed_time = ?,
                                        content_hash = ?
                                    WHERE id = ?
                                    """,
                                    (
                                        file_name,
                                        resource_name,
                                        file_dir,
                                        file_size,
                                        modified_time,
                                        indexed_time,
                                        content_hash,
                                        file_id,
                                    ),
                                )
                                files_updated += 1
                                file_changed = True
                            else:
                                files_skipped += 1
                        else:
                            # Insert new file
                            cursor.execute(
                                """
                                INSERT INTO files (
                                    file_path, file_name, resource_name, directory,
                                    size, modified_time, indexed_time, content_hash
                                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                                """,
                                (
                                    file_path_str,
                                    file_name,
                                    resource_name,
                                    file_dir,
                                    file_size,
                                    modified_time,
                                    indexed_time,
                                    content_hash,
                                ),
                            )
                            files_added += 1
                            file_changed = True

                        # Index chunks for semantic search if enabled and file changed
                        if (
                            self.enable_semantic_search
                            and self.vector_index
                            and file_changed
                        ):
                            try:
                                self._index_file_chunks(md_file, file_path_str)
                            except Exception as e:
                                # Log error but don't fail indexing
                                print(
                                    f"Warning: Could not index chunks for {file_path_str}: {e}"
                                )

                    except Exception:
                        # Skip files we can't read or process
                        files_skipped += 1
                        continue

                conn.commit()
            finally:
                conn.close()

        # Save vector index after indexing
        if self.enable_semantic_search and self.vector_index:
            try:
                self.vector_index.save()
            except Exception as e:
                print(f"Warning: Could not save vector index: {e}")

        return (files_added, files_updated, files_skipped)

    def _index_file_chunks(self, file_path: Path, file_path_str: str) -> None:
        """
        Index chunks for a file in the vector index.

        Args:
            file_path: Path to the file
            file_path_str: String representation of the file path
        """
        if not self.vector_index:
            return

        try:
            # Read file content
            content = file_path.read_text(encoding="utf-8")

            # Get chunking config
            config = DEFAULT_CONFIG.copy()
            chunk_size = config.get("chunk_size", 500)
            chunk_overlap = config.get("chunk_overlap", 50)

            # Chunk the content
            chunks = chunk_markdown(
                content, chunk_size=chunk_size, overlap=chunk_overlap
            )

            if not chunks:
                return

            # Generate embeddings for chunks
            chunk_texts = [chunk["content"] for chunk in chunks]
            embedding_model = config.get("embedding_model", "all-MiniLM-L6-v2")
            embeddings = get_embeddings_batch(chunk_texts, model_name=embedding_model)

            # Add chunks to vector index
            self.vector_index.add_chunks(file_path_str, chunks, embeddings)
        except Exception as e:
            # Log error but don't fail
            print(f"Warning: Could not index chunks for {file_path_str}: {e}")

    def search(
        self, query: str, limit: int = 50, directory_filter: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for files using full-text search.

        Args:
            query: Search query (supports FTS5 syntax)
            limit: Maximum number of results
            directory_filter: Optional directory path to filter results

        Returns:
            List of dictionaries with file information
        """
        with self._lock:
            conn = sqlite3.connect(str(self.db_path))
            try:
                conn.row_factory = sqlite3.Row
                cursor = conn.cursor()

                # Build FTS5 query
                # Handle empty query
                if not query or not query.strip():
                    fts_query = "*"  # Match all
                else:
                    # Escape FTS5 special characters
                    # FTS5 special characters: " ' \ and operators: AND OR NOT
                    # For simplicity, we'll use a simple word search
                    def escape_fts5_token(word):
                        # Remove FTS5 special characters that cause syntax errors
                        # Replace with space to split into multiple tokens
                        word = (
                            word.replace('"', " ").replace("'", " ").replace("\\", " ")
                        )
                        word = word.replace("(", " ").replace(")", " ")
                        word = word.replace("[", " ").replace("]", " ")
                        word = word.replace("?", " ")  # Remove question marks
                        word = word.replace("-", " ")  # Split hyphenated words
                        # Remove extra spaces
                        word = " ".join(word.split())
                        return word

                    # Split query into words and escape each
                    words = query.strip().split()
                    escaped_words = []
                    for word in words:
                        word = word.strip()
                        if word:
                            # Escape special characters
                            escaped = escape_fts5_token(word)
                            if escaped:  # Only add if word is not empty after escaping
                                # Split if multiple words after escaping
                                for token in escaped.split():
                                    if token:
                                        # Use prefix matching (*) to match partial tokens
                                        # Remove any existing * to avoid double wildcards
                                        token = token.rstrip("*")
                                        escaped_words.append(f"{token}*")

                    # If no valid words after escaping, use wildcard
                    if not escaped_words:
                        fts_query = "*"
                    else:
                        # Join with OR for any-word matching
                        fts_query = " OR ".join(escaped_words)

                # Build SQL query
                # Note: FTS5 MATCH doesn't support parameterized queries in some SQLite versions
                # We embed the query directly after proper escaping
                # Escape single quotes in fts_query for SQL embedding
                fts_query_escaped = fts_query.replace("'", "''")

                if directory_filter:
                    sql = f"""
                        SELECT f.id, f.file_path, f.file_name, f.resource_name,
                               f.directory, f.size, f.modified_time
                        FROM files_fts
                        JOIN files f ON files_fts.rowid = f.id
                        WHERE files_fts MATCH '{fts_query_escaped}' AND f.directory LIKE ?
                        ORDER BY files_fts.rank
                        LIMIT ?
                    """
                    cursor.execute(sql, (f"%{directory_filter}%", limit))
                else:
                    sql = f"""
                        SELECT f.id, f.file_path, f.file_name, f.resource_name,
                               f.directory, f.size, f.modified_time
                        FROM files_fts
                        JOIN files f ON files_fts.rowid = f.id
                        WHERE files_fts MATCH '{fts_query_escaped}'
                        ORDER BY files_fts.rank
                        LIMIT ?
                    """
                    cursor.execute(sql, (limit,))

                results = []
                rows = cursor.fetchall()
                for row in rows:
                    results.append(
                        {
                            "id": row["id"],
                            "file_path": row["file_path"],
                            "file_name": row["file_name"],
                            "resource_name": row["resource_name"],
                            "directory": row["directory"],
                            "size": row["size"],
                            "modified_time": row["modified_time"],
                        }
                    )

                # Fallback: if FTS returns no matches, try LIKE on filenames/paths
                if not results and query and query.strip():
                    like = f"%{query.strip()}%"
                    if directory_filter:
                        cursor.execute(
                            """
                            SELECT id, file_path, file_name, resource_name,
                                   directory, size, modified_time
                            FROM files
                            WHERE (file_name LIKE ? OR file_path LIKE ?)
                              AND directory LIKE ?
                            LIMIT ?
                            """,
                            (like, like, f"%{directory_filter}%", limit),
                        )
                    else:
                        cursor.execute(
                            """
                            SELECT id, file_path, file_name, resource_name,
                                   directory, size, modified_time
                            FROM files
                            WHERE file_name LIKE ? OR file_path LIKE ?
                            LIMIT ?
                            """,
                            (like, like, limit),
                        )
                    for row in cursor.fetchall():
                        results.append(
                            {
                                "id": row["id"],
                                "file_path": row["file_path"],
                                "file_name": row["file_name"],
                                "resource_name": row["resource_name"],
                                "directory": row["directory"],
                                "size": row["size"],
                                "modified_time": row["modified_time"],
                            }
                        )

                return results
            finally:
                conn.close()

    def search_semantic(
        self,
        query: str,
        limit: int = 10,
        directory_filter: Optional[str] = None,
        max_chunks_per_file: int = 5,
    ) -> List[Dict[str, Any]]:
        """
        Hybrid semantic search: Use FTS5 to find candidate files, then FAISS to find relevant chunks.

        Args:
            query: Search query
            limit: Maximum number of chunks to return
            directory_filter: Optional directory path to filter results
            max_chunks_per_file: Maximum chunks to return per file

        Returns:
            List of dictionaries with chunk information:
            - 'file_path': Path to the file
            - 'file_name': Name of the file
            - 'resource_name': Resource name
            - 'chunk_index': Index of the chunk
            - 'content': Chunk content
            - 'start_pos': Start position in file
            - 'end_pos': End position in file
            - 'score': Relevance score (distance)
        """
        if not self.enable_semantic_search or not self.vector_index:
            # Fallback to keyword search
            return self.search(query, limit=limit, directory_filter=directory_filter)

        # Step 1: Use FTS5 to find candidate files
        candidate_files = self.search(
            query, limit=50, directory_filter=directory_filter
        )  # Get more candidates

        # If no candidate files found, search all files (semantic search can find relevant content)
        if not candidate_files:
            # Get all files instead of returning empty
            candidate_files = self.get_all_files(directory_filter=directory_filter)
            if not candidate_files:
                return []

        # Step 2: Generate query embedding
        try:
            from .embeddings import get_embedding
            from ..defaults import DEFAULT_CONFIG

            config = DEFAULT_CONFIG.copy()
            embedding_model = config.get("embedding_model", "all-MiniLM-L6-v2")
            query_embedding = get_embedding(query, model_name=embedding_model)
        except Exception as e:
            # If embedding fails, fallback to keyword search
            print(f"Warning: Could not generate query embedding: {e}")
            return self.search(query, limit=limit, directory_filter=directory_filter)

        # Step 3: Search FAISS for relevant chunks in candidate files
        file_paths = [f["file_path"] for f in candidate_files]
        chunk_results = self.vector_index.search(
            query_embedding, k=limit * 2, file_filter=file_paths
        )

        # Step 4: Format results
        results = []
        seen_files = {}  # Track chunks per file

        for chunk_id, distance, metadata in chunk_results:
            file_path = metadata["file_path"]

            # Limit chunks per file
            if file_path not in seen_files:
                seen_files[file_path] = 0
            if seen_files[file_path] >= max_chunks_per_file:
                continue
            seen_files[file_path] += 1

            # Find file metadata
            file_meta = next(
                (f for f in candidate_files if f["file_path"] == file_path), None
            )
            if not file_meta:
                continue

            results.append(
                {
                    "file_path": file_path,
                    "file_name": file_meta.get("file_name", ""),
                    "resource_name": file_meta.get("resource_name", ""),
                    "directory": file_meta.get("directory", ""),
                    "chunk_index": metadata.get("chunk_index", 0),
                    "content": metadata.get("content", ""),
                    "start_pos": metadata.get("start_pos", 0),
                    "end_pos": metadata.get("end_pos", 0),
                    "score": distance,
                }
            )

            if len(results) >= limit:
                break

        return results

    def get_file_by_path(self, file_path: str) -> Optional[Dict[str, Any]]:
        """Get file information by absolute path."""
        with self._lock:
            conn = sqlite3.connect(str(self.db_path))
            try:
                conn.row_factory = sqlite3.Row
                cursor = conn.cursor()

                cursor.execute(
                    """
                    SELECT id, file_path, file_name, resource_name,
                           directory, size, modified_time
                    FROM files
                    WHERE file_path = ?
                    """,
                    (file_path,),
                )

                row = cursor.fetchone()
                if row:
                    return {
                        "id": row["id"],
                        "file_path": row["file_path"],
                        "file_name": row["file_name"],
                        "resource_name": row["resource_name"],
                        "directory": row["directory"],
                        "size": row["size"],
                        "modified_time": row["modified_time"],
                    }
                return None
            finally:
                conn.close()

    def get_all_files(
        self, directory_filter: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Get all indexed files, optionally filtered by directory."""
        with self._lock:
            conn = sqlite3.connect(str(self.db_path))
            try:
                conn.row_factory = sqlite3.Row
                cursor = conn.cursor()

                if directory_filter:
                    cursor.execute(
                        """
                        SELECT id, file_path, file_name, resource_name,
                               directory, size, modified_time
                        FROM files
                        WHERE directory LIKE ?
                        ORDER BY file_path
                        """,
                        (f"%{directory_filter}%",),
                    )
                else:
                    cursor.execute(
                        """
                        SELECT id, file_path, file_name, resource_name,
                               directory, size, modified_time
                        FROM files
                        ORDER BY file_path
                        """
                    )

                results = []
                for row in cursor.fetchall():
                    results.append(
                        {
                            "id": row["id"],
                            "file_path": row["file_path"],
                            "file_name": row["file_name"],
                            "resource_name": row["resource_name"],
                            "directory": row["directory"],
                            "size": row["size"],
                            "modified_time": row["modified_time"],
                        }
                    )

                return results
            finally:
                conn.close()

    def remove_directory(self, directory: Path) -> int:
        """
        Remove all files from index that are in the specified directory.

        Returns:
            Number of files removed
        """
        directory_str = str(directory.absolute())

        with self._lock:
            conn = sqlite3.connect(str(self.db_path))
            try:
                cursor = conn.cursor()

                # Get file paths to remove from vector index
                cursor.execute(
                    """
                    SELECT file_path FROM files
                    WHERE file_path LIKE ?
                    """,
                    (f"{directory_str}%",),
                )
                file_paths = [row[0] for row in cursor.fetchall()]

                # Delete files in this directory
                cursor.execute(
                    """
                    DELETE FROM files
                    WHERE file_path LIKE ?
                    """,
                    (f"{directory_str}%",),
                )

                removed = cursor.rowcount
                conn.commit()

                # Remove from vector index
                if self.enable_semantic_search and self.vector_index:
                    for file_path in file_paths:
                        try:
                            self.vector_index.remove_file(file_path)
                        except Exception:
                            pass
                    self.vector_index.save()

                return removed
            finally:
                conn.close()

    def clear_index(self):
        """Clear all indexed files."""
        with self._lock:
            conn = sqlite3.connect(str(self.db_path))
            try:
                cursor = conn.cursor()
                cursor.execute("DELETE FROM files")
                cursor.execute("DELETE FROM files_fts")
                conn.commit()
            finally:
                conn.close()

        # Clear vector index
        if self.enable_semantic_search and self.vector_index:
            try:
                # Reinitialize vector index
                self.vector_index._init_index()
                self.vector_index.save()
            except Exception:
                pass

    def get_stats(self) -> Dict[str, Any]:
        """Get statistics about the index."""
        with self._lock:
            conn = sqlite3.connect(str(self.db_path))
            try:
                cursor = conn.cursor()

                cursor.execute("SELECT COUNT(*) FROM files")
                total_files = cursor.fetchone()[0]

                cursor.execute("SELECT SUM(size) FROM files")
                total_size = cursor.fetchone()[0] or 0

                cursor.execute(
                    """
                    SELECT COUNT(DISTINCT directory) FROM files
                    """
                )
                total_directories = cursor.fetchone()[0]

                stats = {
                    "total_files": total_files,
                    "total_size": total_size,
                    "total_directories": total_directories,
                    "database_path": str(self.db_path),
                    "semantic_search_enabled": self.enable_semantic_search,
                }

                # Add vector index stats if available
                if self.enable_semantic_search and self.vector_index:
                    try:
                        vector_stats = self.vector_index.get_stats()
                        stats.update(vector_stats)
                    except Exception:
                        pass

                return stats
            finally:
                conn.close()