wikigen-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,826 @@
+ """Fast file indexing system using SQLite FTS for machine-wide markdown search.
+
+ This module provides indexed search capabilities across multiple directories using
+ SQLite FTS5 for full-text search of file paths, names, and resource names.
+ Also supports semantic search using FAISS for chunk-based retrieval.
+ """
+
+ import hashlib
+ import os
+ import sqlite3
+ import time
+ from pathlib import Path
+ from threading import Lock
+ from typing import Any, Dict, List, Optional, Tuple
+
+ from ..config import CONFIG_DIR
+ from ..defaults import DEFAULT_CONFIG
+ from .chunking import chunk_markdown
+ from .embeddings import get_embeddings_batch
+ from .vector_index import VectorIndex
+
+
+ class FileIndexer:
+     """
+     Fast file indexer using SQLite FTS5 for efficient full-text search.
+
+     Indexes markdown files across configured directories and provides
+     fast search capabilities through SQLite's full-text search engine.
+     Also supports semantic search using FAISS for chunk-based retrieval.
+     """
+
+     def __init__(
+         self,
+         index_db_path: Optional[Path] = None,
+         enable_semantic_search: Optional[bool] = None,
+         vector_index_path: Optional[Path] = None,
+     ):
+         """
+         Initialize the file indexer.
+
+         Args:
+             index_db_path: Path to SQLite database. Defaults to config_dir/file_index.db
+             enable_semantic_search: Enable semantic search. Defaults to config value.
+             vector_index_path: Path to FAISS vector index. Defaults to config_dir/vector_index.faiss
+         """
+         if index_db_path is None:
+             index_db_path = CONFIG_DIR / "file_index.db"
+
+         self.db_path = index_db_path
+         self.db_path.parent.mkdir(parents=True, exist_ok=True)
+         self._lock = Lock()
+
+         # Load semantic search config
+         if enable_semantic_search is None:
+             config = DEFAULT_CONFIG.copy()
+             enable_semantic_search = config.get("semantic_search_enabled", True)
+
+         self.enable_semantic_search = enable_semantic_search
+
+         # Initialize vector index if semantic search is enabled
+         self.vector_index: Optional[VectorIndex] = None
+         if self.enable_semantic_search:
+             try:
+                 # all-MiniLM-L6-v2 embeddings are 384-dimensional
+                 embedding_dim = 384
+                 self.vector_index = VectorIndex(
+                     embedding_dim=embedding_dim, index_path=vector_index_path
+                 )
+             except ImportError:
+                 # FAISS not available, disable semantic search
+                 self.enable_semantic_search = False
+                 self.vector_index = None
+
+         self._init_database()
+
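A minimal construction sketch. The import path is hypothetical, since the module's placement inside the package is not shown in this diff; only the class name and parameters come from the code above.

```python
from pathlib import Path

# Hypothetical import path; adjust to wherever the package installs
# this module. Only FileIndexer itself is taken from the diff.
from wikigen.indexing.file_indexer import FileIndexer

# Default locations under CONFIG_DIR:
indexer = FileIndexer()

# Or fully explicit, with semantic search forced off (no FAISS needed):
indexer = FileIndexer(
    index_db_path=Path("/tmp/file_index.db"),
    enable_semantic_search=False,
)
```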
+     def _init_database(self):
+         """Initialize SQLite database with FTS5 table for full-text search."""
+         with self._lock:
+             conn = sqlite3.connect(str(self.db_path))
+             try:
+                 cursor = conn.cursor()
+
+                 # Create main files table
+                 cursor.execute(
+                     """
+                     CREATE TABLE IF NOT EXISTS files (
+                         id INTEGER PRIMARY KEY AUTOINCREMENT,
+                         file_path TEXT NOT NULL UNIQUE,
+                         file_name TEXT NOT NULL,
+                         resource_name TEXT NOT NULL,
+                         directory TEXT NOT NULL,
+                         size INTEGER,
+                         modified_time REAL,
+                         indexed_time REAL NOT NULL,
+                         content_hash TEXT
+                     )
+                     """
+                 )
+
+                 # Create indexes separately
+                 cursor.execute(
+                     "CREATE INDEX IF NOT EXISTS idx_file_path ON files(file_path)"
+                 )
+                 cursor.execute(
+                     "CREATE INDEX IF NOT EXISTS idx_file_name ON files(file_name)"
+                 )
+                 cursor.execute(
+                     "CREATE INDEX IF NOT EXISTS idx_directory ON files(directory)"
+                 )
+
+                 # Create FTS5 virtual table for full-text search
+                 # FTS5 allows fast full-text search on multiple columns
+                 cursor.execute(
+                     """
+                     CREATE VIRTUAL TABLE IF NOT EXISTS files_fts USING fts5(
+                         file_path,
+                         file_name,
+                         resource_name,
+                         directory,
+                         content='files',
+                         content_rowid='id'
+                     )
+                     """
+                 )
+
+                 # Create triggers to keep FTS5 in sync with the main table
+                 cursor.execute(
+                     """
+                     CREATE TRIGGER IF NOT EXISTS files_ai AFTER INSERT ON files BEGIN
+                         INSERT INTO files_fts(rowid, file_path, file_name, resource_name, directory)
+                         VALUES (new.id, new.file_path, new.file_name, new.resource_name, new.directory);
+                     END
+                     """
+                 )
+
+                 cursor.execute(
+                     """
+                     CREATE TRIGGER IF NOT EXISTS files_ad AFTER DELETE ON files BEGIN
+                         INSERT INTO files_fts(files_fts, rowid, file_path, file_name, resource_name, directory)
+                         VALUES('delete', old.id, old.file_path, old.file_name, old.resource_name, old.directory);
+                     END
+                     """
+                 )
+
+                 cursor.execute(
+                     """
+                     CREATE TRIGGER IF NOT EXISTS files_au AFTER UPDATE ON files BEGIN
+                         INSERT INTO files_fts(files_fts, rowid, file_path, file_name, resource_name, directory)
+                         VALUES('delete', old.id, old.file_path, old.file_name, old.resource_name, old.directory);
+                         INSERT INTO files_fts(rowid, file_path, file_name, resource_name, directory)
+                         VALUES (new.id, new.file_path, new.file_name, new.resource_name, new.directory);
+                     END
+                     """
+                 )
+
+                 # Migration: populate FTS5 from existing files if needed
+                 cursor.execute(
+                     """
+                     INSERT INTO files_fts(rowid, file_path, file_name, resource_name, directory)
+                     SELECT id, file_path, file_name, resource_name, directory
+                     FROM files
+                     WHERE NOT EXISTS (
+                         SELECT 1 FROM files_fts WHERE files_fts.rowid = files.id
+                     )
+                     """
+                 )
+
+                 conn.commit()
+             finally:
+                 conn.close()
+
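The triggers above follow the standard pattern for external-content FTS5 tables: the index stores no text of its own, so every change to `files` must be mirrored into `files_fts`, and removals use the special `'delete'` command with the old column values. A standalone sketch of that pattern, using an in-memory database and assuming an SQLite build with FTS5 (the build bundled with CPython usually has it):

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE files (id INTEGER PRIMARY KEY, file_name TEXT);
    CREATE VIRTUAL TABLE files_fts USING fts5(
        file_name, content='files', content_rowid='id'
    );
    """
)
conn.execute("INSERT INTO files VALUES (1, 'notes.md')")
# Mirror the insert into the index (what the files_ai trigger does)
conn.execute("INSERT INTO files_fts(rowid, file_name) VALUES (1, 'notes.md')")
print(conn.execute(
    "SELECT rowid FROM files_fts WHERE files_fts MATCH 'notes'"
).fetchall())  # -> [(1,)]

# Un-index with the special 'delete' command (what files_ad does);
# the old column values must be supplied so FTS5 can remove entries.
conn.execute(
    "INSERT INTO files_fts(files_fts, rowid, file_name) "
    "VALUES ('delete', 1, 'notes.md')"
)
conn.execute("DELETE FROM files WHERE id = 1")
```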
+     def _calculate_content_hash(self, file_path: Path) -> str:
+         """Calculate SHA256 hash of file content for change detection."""
+         try:
+             with open(file_path, "rb") as f:
+                 return hashlib.sha256(f.read()).hexdigest()
+         except Exception:
+             return ""
+
+     def index_directory(
+         self,
+         directory: Path,
+         pattern: str = "*.md",
+         exclude_hidden: bool = True,
+         max_depth: Optional[int] = None,
+     ) -> Tuple[int, int, int]:
+         """
+         Index all markdown files in a directory recursively.
+
+         Args:
+             directory: Directory to index
+             pattern: File pattern to match (default: "*.md")
+             exclude_hidden: Skip hidden files/directories
+             max_depth: Maximum recursion depth (None = unlimited)
+
+         Returns:
+             Tuple of (files_added, files_updated, files_skipped)
+         """
+         if not directory.exists() or not directory.is_dir():
+             return (0, 0, 0)
+
+         files_added = 0
+         files_updated = 0
+         files_skipped = 0
+         indexed_time = time.time()
+
+         with self._lock:
+             conn = sqlite3.connect(str(self.db_path))
+             try:
+                 cursor = conn.cursor()
+
+                 # Find all matching files
+                 for md_file in directory.rglob(pattern):
+                     # Skip files deeper than max_depth
+                     if max_depth is not None:
+                         depth = len(md_file.relative_to(directory).parts) - 1
+                         if depth > max_depth:
+                             continue
+
+                     # Skip hidden files/directories
+                     if exclude_hidden:
+                         relative_path = md_file.relative_to(directory)
+                         if any(
+                             part.startswith(".") for part in relative_path.parts[:-1]
+                         ):
+                             continue
+
+                     try:
+                         # Get file metadata
+                         stat = md_file.stat()
+                         file_size = stat.st_size
+                         modified_time = stat.st_mtime
+
+                         # Calculate resource name (path without extension)
+                         try:
+                             relative_path = md_file.relative_to(directory)
+                         except ValueError:
+                             # File not relative to directory (shouldn't happen)
+                             files_skipped += 1
+                             continue
+
+                         resource_name = str(relative_path.with_suffix(""))
+                         file_name = md_file.name
+                         file_dir = str(md_file.parent)
+                         file_path_str = str(md_file.absolute())
+
+                         # Calculate content hash
+                         content_hash = self._calculate_content_hash(md_file)
+
+                         # Check if file already indexed
+                         cursor.execute(
+                             "SELECT id, content_hash, modified_time FROM files WHERE file_path = ?",
+                             (file_path_str,),
+                         )
+                         existing = cursor.fetchone()
+
+                         file_changed = False
+                         if existing:
+                             file_id, old_hash, old_mtime = existing
+                             # Update if file changed
+                             if content_hash != old_hash or modified_time > old_mtime:
+                                 cursor.execute(
+                                     """
+                                     UPDATE files
+                                     SET file_name = ?, resource_name = ?, directory = ?,
+                                         size = ?, modified_time = ?, indexed_time = ?,
+                                         content_hash = ?
+                                     WHERE id = ?
+                                     """,
+                                     (
+                                         file_name,
+                                         resource_name,
+                                         file_dir,
+                                         file_size,
+                                         modified_time,
+                                         indexed_time,
+                                         content_hash,
+                                         file_id,
+                                     ),
+                                 )
+                                 files_updated += 1
+                                 file_changed = True
+                             else:
+                                 files_skipped += 1
+                         else:
+                             # Insert new file
+                             cursor.execute(
+                                 """
+                                 INSERT INTO files (
+                                     file_path, file_name, resource_name, directory,
+                                     size, modified_time, indexed_time, content_hash
+                                 ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                                 """,
+                                 (
+                                     file_path_str,
+                                     file_name,
+                                     resource_name,
+                                     file_dir,
+                                     file_size,
+                                     modified_time,
+                                     indexed_time,
+                                     content_hash,
+                                 ),
+                             )
+                             files_added += 1
+                             file_changed = True
+
+                         # Index chunks for semantic search if enabled and file changed
+                         if (
+                             self.enable_semantic_search
+                             and self.vector_index
+                             and file_changed
+                         ):
+                             try:
+                                 self._index_file_chunks(md_file, file_path_str)
+                             except Exception as e:
+                                 # Log error but don't fail indexing
+                                 print(
+                                     f"Warning: Could not index chunks for {file_path_str}: {e}"
+                                 )
+
+                     except Exception:
+                         # Skip files we can't read or process
+                         files_skipped += 1
+                         continue
+
+                 conn.commit()
+             finally:
+                 conn.close()
+
+         # Save vector index after indexing
+         if self.enable_semantic_search and self.vector_index:
+             try:
+                 self.vector_index.save()
+             except Exception as e:
+                 print(f"Warning: Could not save vector index: {e}")
+
+         return (files_added, files_updated, files_skipped)
+
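A usage sketch for the indexing pass, continuing the `indexer` from the earlier sketch; the directory path is illustrative:

```python
from pathlib import Path

added, updated, skipped = indexer.index_directory(
    Path.home() / "notes",   # placeholder path
    pattern="*.md",
    exclude_hidden=True,
)
print(f"added={added} updated={updated} skipped={skipped}")

# Re-running immediately should report everything as skipped,
# since content hashes and mtimes are unchanged.
```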
+     def _index_file_chunks(self, file_path: Path, file_path_str: str) -> None:
+         """
+         Index chunks for a file in the vector index.
+
+         Args:
+             file_path: Path to the file
+             file_path_str: String representation of the file path
+         """
+         if not self.vector_index:
+             return
+
+         try:
+             # Read file content
+             content = file_path.read_text(encoding="utf-8")
+
+             # Get chunking config
+             config = DEFAULT_CONFIG.copy()
+             chunk_size = config.get("chunk_size", 500)
+             chunk_overlap = config.get("chunk_overlap", 50)
+
+             # Chunk the content
+             chunks = chunk_markdown(
+                 content, chunk_size=chunk_size, overlap=chunk_overlap
+             )
+
+             if not chunks:
+                 return
+
+             # Generate embeddings for chunks
+             chunk_texts = [chunk["content"] for chunk in chunks]
+             embedding_model = config.get("embedding_model", "all-MiniLM-L6-v2")
+             embeddings = get_embeddings_batch(chunk_texts, model_name=embedding_model)
+
+             # Add chunks to vector index
+             self.vector_index.add_chunks(file_path_str, chunks, embeddings)
+         except Exception as e:
+             # Log error but don't fail
+             print(f"Warning: Could not index chunks for {file_path_str}: {e}")
+
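`chunk_markdown` and `get_embeddings_batch` are internal helpers that this diff does not show. From how their outputs are consumed, each chunk is a dict carrying at least a `content` key, and the vector index later surfaces `chunk_index`, `start_pos`, and `end_pos` in chunk metadata (see `search_semantic` below). A shape sketch of that inferred contract, with illustrative values only:

```python
# Inferred shapes only; the real helpers live in .chunking and .embeddings.
chunks = [
    {"content": "# Title\nIntro text ...", "chunk_index": 0,
     "start_pos": 0, "end_pos": 120},
    {"content": "... text continues ...", "chunk_index": 1,
     "start_pos": 70, "end_pos": 190},  # overlap per chunk_overlap
]
embeddings = [[0.0] * 384 for _ in chunks]  # one 384-dim vector per chunk
# indexer.vector_index.add_chunks("/abs/path/notes.md", chunks, embeddings)
```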
+     def search(
+         self, query: str, limit: int = 50, directory_filter: Optional[str] = None
+     ) -> List[Dict[str, Any]]:
+         """
+         Search for files using full-text search.
+
+         Args:
+             query: Search query (supports FTS5 syntax)
+             limit: Maximum number of results
+             directory_filter: Optional directory path to filter results
+
+         Returns:
+             List of dictionaries with file information
+         """
+         with self._lock:
+             conn = sqlite3.connect(str(self.db_path))
+             try:
+                 conn.row_factory = sqlite3.Row
+                 cursor = conn.cursor()
+
+                 # Build FTS5 query. FTS5 has no bare match-all token, so an
+                 # empty (or fully escaped-away) query is handled below with
+                 # a plain SELECT instead of a MATCH.
+                 fts_query = None
+                 if query and query.strip():
+                     # Escape FTS5 special characters
+                     # FTS5 special characters: " ' \ and operators: AND OR NOT
+                     # For simplicity, we use a simple word search
+                     def escape_fts5_token(word):
+                         # Remove FTS5 special characters that cause syntax errors,
+                         # replacing them with spaces to split into multiple tokens
+                         word = (
+                             word.replace('"', " ").replace("'", " ").replace("\\", " ")
+                         )
+                         word = word.replace("(", " ").replace(")", " ")
+                         word = word.replace("[", " ").replace("]", " ")
+                         word = word.replace("?", " ")  # Remove question marks
+                         word = word.replace("-", " ")  # Split hyphenated words
+                         # Collapse runs of spaces
+                         word = " ".join(word.split())
+                         return word
+
+                     # Split query into words and escape each
+                     escaped_words = []
+                     for word in query.strip().split():
+                         escaped = escape_fts5_token(word)
+                         # A word may become several tokens (or none) after escaping
+                         for token in escaped.split():
+                             # Use prefix matching (*) to match partial tokens;
+                             # strip any existing * to avoid double wildcards
+                             token = token.rstrip("*")
+                             if token:
+                                 escaped_words.append(f"{token}*")
+
+                     if escaped_words:
+                         # Join with OR for any-word matching
+                         fts_query = " OR ".join(escaped_words)
+
+                 results = []
+                 if fts_query is not None:
+                     # The FTS5 query string is embedded directly in the SQL;
+                     # single quotes are doubled so it remains a valid SQL
+                     # string literal (the tokens above are already stripped
+                     # of quote characters).
+                     fts_query_escaped = fts_query.replace("'", "''")
+
+                     if directory_filter:
+                         sql = f"""
+                             SELECT f.id, f.file_path, f.file_name, f.resource_name,
+                                    f.directory, f.size, f.modified_time
+                             FROM files_fts
+                             JOIN files f ON files_fts.rowid = f.id
+                             WHERE files_fts MATCH '{fts_query_escaped}' AND f.directory LIKE ?
+                             ORDER BY files_fts.rank
+                             LIMIT ?
+                         """
+                         cursor.execute(sql, (f"%{directory_filter}%", limit))
+                     else:
+                         sql = f"""
+                             SELECT f.id, f.file_path, f.file_name, f.resource_name,
+                                    f.directory, f.size, f.modified_time
+                             FROM files_fts
+                             JOIN files f ON files_fts.rowid = f.id
+                             WHERE files_fts MATCH '{fts_query_escaped}'
+                             ORDER BY files_fts.rank
+                             LIMIT ?
+                         """
+                         cursor.execute(sql, (limit,))
+                 else:
+                     # Empty query: list indexed files directly
+                     if directory_filter:
+                         cursor.execute(
+                             """
+                             SELECT id, file_path, file_name, resource_name,
+                                    directory, size, modified_time
+                             FROM files
+                             WHERE directory LIKE ?
+                             ORDER BY file_path
+                             LIMIT ?
+                             """,
+                             (f"%{directory_filter}%", limit),
+                         )
+                     else:
+                         cursor.execute(
+                             """
+                             SELECT id, file_path, file_name, resource_name,
+                                    directory, size, modified_time
+                             FROM files
+                             ORDER BY file_path
+                             LIMIT ?
+                             """,
+                             (limit,),
+                         )
+
+                 for row in cursor.fetchall():
+                     results.append(
+                         {
+                             "id": row["id"],
+                             "file_path": row["file_path"],
+                             "file_name": row["file_name"],
+                             "resource_name": row["resource_name"],
+                             "directory": row["directory"],
+                             "size": row["size"],
+                             "modified_time": row["modified_time"],
+                         }
+                     )
+
+                 # Fallback: if FTS returns no matches, try LIKE on filenames/paths
+                 if not results and query and query.strip():
+                     like = f"%{query.strip()}%"
+                     if directory_filter:
+                         cursor.execute(
+                             """
+                             SELECT id, file_path, file_name, resource_name,
+                                    directory, size, modified_time
+                             FROM files
+                             WHERE (file_name LIKE ? OR file_path LIKE ?)
+                               AND directory LIKE ?
+                             LIMIT ?
+                             """,
+                             (like, like, f"%{directory_filter}%", limit),
+                         )
+                     else:
+                         cursor.execute(
+                             """
+                             SELECT id, file_path, file_name, resource_name,
+                                    directory, size, modified_time
+                             FROM files
+                             WHERE file_name LIKE ? OR file_path LIKE ?
+                             LIMIT ?
+                             """,
+                             (like, like, limit),
+                         )
+                     for row in cursor.fetchall():
+                         results.append(
+                             {
+                                 "id": row["id"],
+                                 "file_path": row["file_path"],
+                                 "file_name": row["file_name"],
+                                 "resource_name": row["resource_name"],
+                                 "directory": row["directory"],
+                                 "size": row["size"],
+                                 "modified_time": row["modified_time"],
+                             }
+                         )
+
+                 return results
+             finally:
+                 conn.close()
+
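A keyword-search sketch. Note how the tokenizer above treats punctuation: hyphens and quotes are stripped, and every surviving token gets a trailing `*` for prefix matching:

```python
# "file-indexer" is rewritten to the FTS5 query 'file* OR indexer*',
# so it matches a file named file_indexer.md (underscores are also
# token separators for FTS5's default tokenizer).
for hit in indexer.search("file-indexer", limit=10):
    print(hit["resource_name"], hit["file_path"])

# Restrict to one subtree; the filter is a substring match on the
# stored directory column, not an exact path.
hits = indexer.search("notes", directory_filter="wiki")
```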
+     def search_semantic(
+         self,
+         query: str,
+         limit: int = 10,
+         directory_filter: Optional[str] = None,
+         max_chunks_per_file: int = 5,
+     ) -> List[Dict[str, Any]]:
+         """
+         Hybrid semantic search: use FTS5 to find candidate files, then FAISS to find relevant chunks.
+
+         Args:
+             query: Search query
+             limit: Maximum number of chunks to return
+             directory_filter: Optional directory path to filter results
+             max_chunks_per_file: Maximum chunks to return per file
+
+         Returns:
+             List of dictionaries with chunk information:
+                 - 'file_path': Path to the file
+                 - 'file_name': Name of the file
+                 - 'resource_name': Resource name
+                 - 'chunk_index': Index of the chunk
+                 - 'content': Chunk content
+                 - 'start_pos': Start position in file
+                 - 'end_pos': End position in file
+                 - 'score': Relevance score (distance; lower is better)
+
+             If semantic search is unavailable or the query embedding fails,
+             falls back to keyword search and returns file-level dictionaries
+             (see search()) instead of chunk dictionaries.
+         """
+         if not self.enable_semantic_search or not self.vector_index:
+             # Fallback to keyword search
+             return self.search(query, limit=limit, directory_filter=directory_filter)
+
+         # Step 1: Use FTS5 to find candidate files (fetch extra candidates)
+         candidate_files = self.search(
+             query, limit=50, directory_filter=directory_filter
+         )
+
+         # If no candidate files found, search all files (semantic search can
+         # still find relevant content that keyword search misses)
+         if not candidate_files:
+             candidate_files = self.get_all_files(directory_filter=directory_filter)
+             if not candidate_files:
+                 return []
+
+         # Step 2: Generate query embedding
+         try:
+             from .embeddings import get_embedding
+
+             config = DEFAULT_CONFIG.copy()
+             embedding_model = config.get("embedding_model", "all-MiniLM-L6-v2")
+             query_embedding = get_embedding(query, model_name=embedding_model)
+         except Exception as e:
+             # If embedding fails, fall back to keyword search
+             print(f"Warning: Could not generate query embedding: {e}")
+             return self.search(query, limit=limit, directory_filter=directory_filter)
+
+         # Step 3: Search FAISS for relevant chunks in candidate files
+         file_paths = [f["file_path"] for f in candidate_files]
+         chunk_results = self.vector_index.search(
+             query_embedding, k=limit * 2, file_filter=file_paths
+         )
+
+         # Step 4: Format results
+         results = []
+         seen_files = {}  # Track chunks returned per file
+
+         for chunk_id, distance, metadata in chunk_results:
+             file_path = metadata["file_path"]
+
+             # Limit chunks per file
+             if file_path not in seen_files:
+                 seen_files[file_path] = 0
+             if seen_files[file_path] >= max_chunks_per_file:
+                 continue
+             seen_files[file_path] += 1
+
+             # Find file metadata
+             file_meta = next(
+                 (f for f in candidate_files if f["file_path"] == file_path), None
+             )
+             if not file_meta:
+                 continue
+
+             results.append(
+                 {
+                     "file_path": file_path,
+                     "file_name": file_meta.get("file_name", ""),
+                     "resource_name": file_meta.get("resource_name", ""),
+                     "directory": file_meta.get("directory", ""),
+                     "chunk_index": metadata.get("chunk_index", 0),
+                     "content": metadata.get("content", ""),
+                     "start_pos": metadata.get("start_pos", 0),
+                     "end_pos": metadata.get("end_pos", 0),
+                     "score": distance,
+                 }
+             )
+
+             if len(results) >= limit:
+                 break
+
+         return results
+
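A sketch of consuming chunk-level results. The `score` is a FAISS distance, so smaller values generally mean closer matches, though the exact metric lives in the package's VectorIndex, which this diff does not show:

```python
for chunk in indexer.search_semantic("how is authentication configured", limit=5):
    print(f"{chunk['file_name']}#{chunk['chunk_index']} "
          f"(score={chunk['score']:.3f})")
    print(chunk["content"][:200])
```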
+     def get_file_by_path(self, file_path: str) -> Optional[Dict[str, Any]]:
+         """Get file information by absolute path."""
+         with self._lock:
+             conn = sqlite3.connect(str(self.db_path))
+             try:
+                 conn.row_factory = sqlite3.Row
+                 cursor = conn.cursor()
+
+                 cursor.execute(
+                     """
+                     SELECT id, file_path, file_name, resource_name,
+                            directory, size, modified_time
+                     FROM files
+                     WHERE file_path = ?
+                     """,
+                     (file_path,),
+                 )
+
+                 row = cursor.fetchone()
+                 if row:
+                     return {
+                         "id": row["id"],
+                         "file_path": row["file_path"],
+                         "file_name": row["file_name"],
+                         "resource_name": row["resource_name"],
+                         "directory": row["directory"],
+                         "size": row["size"],
+                         "modified_time": row["modified_time"],
+                     }
+                 return None
+             finally:
+                 conn.close()
+
+     def get_all_files(
+         self, directory_filter: Optional[str] = None
+     ) -> List[Dict[str, Any]]:
+         """Get all indexed files, optionally filtered by directory."""
+         with self._lock:
+             conn = sqlite3.connect(str(self.db_path))
+             try:
+                 conn.row_factory = sqlite3.Row
+                 cursor = conn.cursor()
+
+                 if directory_filter:
+                     cursor.execute(
+                         """
+                         SELECT id, file_path, file_name, resource_name,
+                                directory, size, modified_time
+                         FROM files
+                         WHERE directory LIKE ?
+                         ORDER BY file_path
+                         """,
+                         (f"%{directory_filter}%",),
+                     )
+                 else:
+                     cursor.execute(
+                         """
+                         SELECT id, file_path, file_name, resource_name,
+                                directory, size, modified_time
+                         FROM files
+                         ORDER BY file_path
+                         """
+                     )
+
+                 results = []
+                 for row in cursor.fetchall():
+                     results.append(
+                         {
+                             "id": row["id"],
+                             "file_path": row["file_path"],
+                             "file_name": row["file_name"],
+                             "resource_name": row["resource_name"],
+                             "directory": row["directory"],
+                             "size": row["size"],
+                             "modified_time": row["modified_time"],
+                         }
+                     )
+
+                 return results
+             finally:
+                 conn.close()
+
+     def remove_directory(self, directory: Path) -> int:
+         """
+         Remove all files from index that are in the specified directory.
+
+         Returns:
+             Number of files removed
+         """
+         # Match only paths inside this directory; the trailing separator
+         # prevents matching sibling directories that share the same
+         # prefix (e.g. /docs vs /docs2)
+         prefix = str(directory.absolute())
+         if not prefix.endswith(os.sep):
+             prefix += os.sep
+
+         with self._lock:
+             conn = sqlite3.connect(str(self.db_path))
+             try:
+                 cursor = conn.cursor()
+
+                 # Get file paths to remove from vector index
+                 cursor.execute(
+                     """
+                     SELECT file_path FROM files
+                     WHERE file_path LIKE ?
+                     """,
+                     (f"{prefix}%",),
+                 )
+                 file_paths = [row[0] for row in cursor.fetchall()]
+
+                 # Delete files in this directory
+                 cursor.execute(
+                     """
+                     DELETE FROM files
+                     WHERE file_path LIKE ?
+                     """,
+                     (f"{prefix}%",),
+                 )
+
+                 removed = cursor.rowcount
+                 conn.commit()
+
+                 # Remove from vector index
+                 if self.enable_semantic_search and self.vector_index:
+                     for file_path in file_paths:
+                         try:
+                             self.vector_index.remove_file(file_path)
+                         except Exception:
+                             pass
+                     self.vector_index.save()
+
+                 return removed
+             finally:
+                 conn.close()
+
+     def clear_index(self):
+         """Clear all indexed files."""
+         with self._lock:
+             conn = sqlite3.connect(str(self.db_path))
+             try:
+                 cursor = conn.cursor()
+                 cursor.execute("DELETE FROM files")
+                 # 'delete-all' is the supported way to empty an
+                 # external-content FTS5 table
+                 cursor.execute("INSERT INTO files_fts(files_fts) VALUES('delete-all')")
+                 conn.commit()
+             finally:
+                 conn.close()
+
+         # Clear vector index
+         if self.enable_semantic_search and self.vector_index:
+             try:
+                 # Reinitialize vector index
+                 self.vector_index._init_index()
+                 self.vector_index.save()
+             except Exception:
+                 pass
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Get statistics about the index."""
+         with self._lock:
+             conn = sqlite3.connect(str(self.db_path))
+             try:
+                 cursor = conn.cursor()
+
+                 cursor.execute("SELECT COUNT(*) FROM files")
+                 total_files = cursor.fetchone()[0]
+
+                 cursor.execute("SELECT SUM(size) FROM files")
+                 total_size = cursor.fetchone()[0] or 0
+
+                 cursor.execute("SELECT COUNT(DISTINCT directory) FROM files")
+                 total_directories = cursor.fetchone()[0]
+
+                 stats = {
+                     "total_files": total_files,
+                     "total_size": total_size,
+                     "total_directories": total_directories,
+                     "database_path": str(self.db_path),
+                     "semantic_search_enabled": self.enable_semantic_search,
+                 }
+
+                 # Add vector index stats if available
+                 if self.enable_semantic_search and self.vector_index:
+                     try:
+                         vector_stats = self.vector_index.get_stats()
+                         stats.update(vector_stats)
+                     except Exception:
+                         pass
+
+                 return stats
+             finally:
+                 conn.close()
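Finally, a maintenance sketch tying the remaining methods together, again with illustrative paths and the `indexer` from the earlier sketches:

```python
from pathlib import Path

stats = indexer.get_stats()
print(stats["total_files"], "files across",
      stats["total_directories"], "directories")

# Drop one subtree from the index (and its vectors, if enabled)
removed = indexer.remove_directory(Path.home() / "notes" / "archive")
print(f"removed {removed} files")

# Or start over entirely
indexer.clear_index()
```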