claude-code-workflow 6.3.2 → 6.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. package/.claude/CLAUDE.md +9 -1
  2. package/.claude/commands/workflow/lite-plan.md +1 -1
  3. package/.claude/workflows/cli-tools-usage.md +515 -516
  4. package/ccw/dist/cli.d.ts.map +1 -1
  5. package/ccw/dist/cli.js +6 -1
  6. package/ccw/dist/cli.js.map +1 -1
  7. package/ccw/dist/commands/cli.d.ts +1 -1
  8. package/ccw/dist/commands/cli.d.ts.map +1 -1
  9. package/ccw/dist/commands/cli.js +71 -7
  10. package/ccw/dist/commands/cli.js.map +1 -1
  11. package/ccw/dist/tools/cli-executor.d.ts.map +1 -1
  12. package/ccw/dist/tools/cli-executor.js +19 -7
  13. package/ccw/dist/tools/cli-executor.js.map +1 -1
  14. package/ccw/dist/tools/cli-history-store.d.ts +33 -0
  15. package/ccw/dist/tools/cli-history-store.d.ts.map +1 -1
  16. package/ccw/dist/tools/cli-history-store.js +89 -5
  17. package/ccw/dist/tools/cli-history-store.js.map +1 -1
  18. package/ccw/src/cli.ts +263 -258
  19. package/ccw/src/commands/cli.ts +967 -884
  20. package/ccw/src/tools/cli-executor.ts +20 -7
  21. package/ccw/src/tools/cli-history-store.ts +125 -5
  22. package/codex-lens/src/codexlens/__pycache__/config.cpython-313.pyc +0 -0
  23. package/codex-lens/src/codexlens/config.py +3 -0
  24. package/codex-lens/src/codexlens/search/__pycache__/chain_search.cpython-313.pyc +0 -0
  25. package/codex-lens/src/codexlens/search/__pycache__/hybrid_search.cpython-313.pyc +0 -0
  26. package/codex-lens/src/codexlens/search/__pycache__/ranking.cpython-313.pyc +0 -0
  27. package/codex-lens/src/codexlens/search/chain_search.py +71 -1
  28. package/codex-lens/src/codexlens/search/ranking.py +274 -274
  29. package/codex-lens/src/codexlens/semantic/__pycache__/chunker.cpython-313.pyc +0 -0
  30. package/codex-lens/src/codexlens/storage/__pycache__/dir_index.cpython-313.pyc +0 -0
  31. package/codex-lens/src/codexlens/storage/__pycache__/global_index.cpython-313.pyc +0 -0
  32. package/codex-lens/src/codexlens/storage/__pycache__/index_tree.cpython-313.pyc +0 -0
  33. package/codex-lens/src/codexlens/storage/dir_index.py +1888 -1850
  34. package/codex-lens/src/codexlens/storage/global_index.py +365 -0
  35. package/codex-lens/src/codexlens/storage/index_tree.py +83 -10
  36. package/package.json +1 -1
@@ -1,1850 +1,1888 @@
1
- """Single-directory index storage with hierarchical linking.
2
-
3
- Each directory maintains its own _index.db with:
4
- - Files in the current directory
5
- - Links to subdirectory indexes
6
- - Full-text search via FTS5
7
- - Symbol table for code navigation
8
- """
9
-
10
- from __future__ import annotations
11
-
12
- import logging
13
- import re
14
- import sqlite3
15
- import threading
16
- from dataclasses import dataclass
17
- from pathlib import Path
18
- from typing import Any, Dict, List, Optional, Tuple
19
-
20
- from codexlens.entities import SearchResult, Symbol
21
- from codexlens.errors import StorageError
22
-
23
-
24
- @dataclass
25
- class SubdirLink:
26
- """Link to a subdirectory's index database."""
27
-
28
- id: int
29
- name: str
30
- index_path: Path
31
- files_count: int
32
- last_updated: float
33
-
34
-
35
- @dataclass
36
- class FileEntry:
37
- """Metadata for an indexed file in current directory."""
38
-
39
- id: int
40
- name: str
41
- full_path: Path
42
- language: str
43
- mtime: float
44
- line_count: int
45
-
46
-
47
- class DirIndexStore:
48
- """Single-directory index storage with hierarchical subdirectory linking.
49
-
50
- Each directory has an independent _index.db containing:
51
- - Files table: Files in this directory only
52
- - Subdirs table: Links to child directory indexes
53
- - Symbols table: Code symbols from files
54
- - FTS5 index: Full-text search on file content
55
-
56
- Thread-safe operations with WAL mode enabled.
57
- """
58
-
59
- # Schema version for migration tracking
60
- # Increment this when schema changes require migration
61
- SCHEMA_VERSION = 5
62
-
63
- def __init__(self, db_path: str | Path) -> None:
64
- """Initialize directory index store.
65
-
66
- Args:
67
- db_path: Path to _index.db file for this directory
68
- """
69
- self.db_path = Path(db_path).resolve()
70
- self._lock = threading.RLock()
71
- self._conn: Optional[sqlite3.Connection] = None
72
- self.logger = logging.getLogger(__name__)
73
-
74
- def initialize(self) -> None:
75
- """Create database and schema if not exists."""
76
- with self._lock:
77
- self.db_path.parent.mkdir(parents=True, exist_ok=True)
78
- conn = self._get_connection()
79
-
80
- # Check current schema version
81
- current_version = self._get_schema_version(conn)
82
-
83
- # Fail gracefully if database is from a newer version
84
- if current_version > self.SCHEMA_VERSION:
85
- raise StorageError(
86
- f"Database schema version {current_version} is newer than "
87
- f"supported version {self.SCHEMA_VERSION}. "
88
- f"Please update the application or use a compatible database.",
89
- db_path=str(self.db_path),
90
- operation="initialize",
91
- details={
92
- "current_version": current_version,
93
- "supported_version": self.SCHEMA_VERSION
94
- }
95
- )
96
-
97
- # Create or migrate schema
98
- if current_version == 0:
99
- # New database - create schema directly
100
- self._create_schema(conn)
101
- self._create_fts_triggers(conn)
102
- self._set_schema_version(conn, self.SCHEMA_VERSION)
103
- elif current_version < self.SCHEMA_VERSION:
104
- # Existing database - apply migrations
105
- self._apply_migrations(conn, current_version)
106
- self._set_schema_version(conn, self.SCHEMA_VERSION)
107
-
108
- conn.commit()
109
-
110
- def _get_schema_version(self, conn: sqlite3.Connection) -> int:
111
- """Get current schema version from database."""
112
- try:
113
- row = conn.execute("PRAGMA user_version").fetchone()
114
- return row[0] if row else 0
115
- except Exception:
116
- return 0
117
-
118
- def _set_schema_version(self, conn: sqlite3.Connection, version: int) -> None:
119
- """Set schema version in database."""
120
- conn.execute(f"PRAGMA user_version = {version}")
121
-
122
- def _apply_migrations(self, conn: sqlite3.Connection, from_version: int) -> None:
123
- """Apply schema migrations from current version to latest.
124
-
125
- Args:
126
- conn: Database connection
127
- from_version: Current schema version
128
- """
129
- # Migration v0/v1 -> v2: Add 'name' column to files table
130
- if from_version < 2:
131
- self._migrate_v2_add_name_column(conn)
132
-
133
- # Migration v2 -> v4: Add dual FTS tables (exact + fuzzy)
134
- if from_version < 4:
135
- from codexlens.storage.migrations.migration_004_dual_fts import upgrade
136
- upgrade(conn)
137
-
138
- # Migration v4 -> v5: Remove unused/redundant fields
139
- if from_version < 5:
140
- from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
141
- upgrade(conn)
142
-
143
- def close(self) -> None:
144
- """Close database connection."""
145
- with self._lock:
146
- if self._conn is not None:
147
- try:
148
- self._conn.close()
149
- except Exception:
150
- pass
151
- finally:
152
- self._conn = None
153
-
154
- def __enter__(self) -> DirIndexStore:
155
- """Context manager entry."""
156
- self.initialize()
157
- return self
158
-
159
- def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
160
- """Context manager exit."""
161
- self.close()
162
-
163
- # === File Operations ===
164
-
165
- def add_file(
166
- self,
167
- name: str,
168
- full_path: str | Path,
169
- content: str,
170
- language: str,
171
- symbols: Optional[List[Symbol]] = None,
172
- ) -> int:
173
- """Add or update a file in the current directory index.
174
-
175
- Args:
176
- name: Filename without path
177
- full_path: Complete source file path
178
- content: File content for indexing
179
- language: Programming language identifier
180
- symbols: List of Symbol objects from the file
181
-
182
- Returns:
183
- Database file_id
184
-
185
- Raises:
186
- StorageError: If database operations fail
187
- """
188
- with self._lock:
189
- conn = self._get_connection()
190
- full_path_str = str(Path(full_path).resolve())
191
- mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None
192
- line_count = content.count('\n') + 1
193
-
194
- try:
195
- conn.execute(
196
- """
197
- INSERT INTO files(name, full_path, language, content, mtime, line_count)
198
- VALUES(?, ?, ?, ?, ?, ?)
199
- ON CONFLICT(full_path) DO UPDATE SET
200
- name=excluded.name,
201
- language=excluded.language,
202
- content=excluded.content,
203
- mtime=excluded.mtime,
204
- line_count=excluded.line_count
205
- """,
206
- (name, full_path_str, language, content, mtime, line_count),
207
- )
208
-
209
- row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone()
210
- if not row:
211
- raise StorageError(f"Failed to retrieve file_id for {full_path_str}")
212
-
213
- file_id = int(row["id"])
214
-
215
- # Replace symbols
216
- conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
217
- if symbols:
218
- # Insert symbols without token_count and symbol_type
219
- symbol_rows = []
220
- for s in symbols:
221
- symbol_rows.append(
222
- (file_id, s.name, s.kind, s.range[0], s.range[1])
223
- )
224
-
225
- conn.executemany(
226
- """
227
- INSERT INTO symbols(file_id, name, kind, start_line, end_line)
228
- VALUES(?, ?, ?, ?, ?)
229
- """,
230
- symbol_rows,
231
- )
232
-
233
- conn.commit()
234
- return file_id
235
-
236
- except sqlite3.DatabaseError as exc:
237
- conn.rollback()
238
- raise StorageError(f"Failed to add file {name}: {exc}") from exc
239
-
240
- def add_files_batch(
241
- self, files: List[Tuple[str, Path, str, str, Optional[List[Symbol]]]]
242
- ) -> int:
243
- """Add multiple files in a single transaction.
244
-
245
- Args:
246
- files: List of (name, full_path, content, language, symbols) tuples
247
-
248
- Returns:
249
- Number of files added
250
-
251
- Raises:
252
- StorageError: If batch operation fails
253
- """
254
- with self._lock:
255
- conn = self._get_connection()
256
- count = 0
257
-
258
- try:
259
- conn.execute("BEGIN")
260
-
261
- for name, full_path, content, language, symbols in files:
262
- full_path_str = str(Path(full_path).resolve())
263
- mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None
264
- line_count = content.count('\n') + 1
265
-
266
- conn.execute(
267
- """
268
- INSERT INTO files(name, full_path, language, content, mtime, line_count)
269
- VALUES(?, ?, ?, ?, ?, ?)
270
- ON CONFLICT(full_path) DO UPDATE SET
271
- name=excluded.name,
272
- language=excluded.language,
273
- content=excluded.content,
274
- mtime=excluded.mtime,
275
- line_count=excluded.line_count
276
- """,
277
- (name, full_path_str, language, content, mtime, line_count),
278
- )
279
-
280
- row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone()
281
- if not row:
282
- raise StorageError(f"Failed to retrieve file_id for {full_path_str}")
283
-
284
- file_id = int(row["id"])
285
- count += 1
286
-
287
- conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
288
- if symbols:
289
- # Insert symbols
290
- symbol_rows = []
291
- for s in symbols:
292
- symbol_rows.append(
293
- (file_id, s.name, s.kind, s.range[0], s.range[1])
294
- )
295
-
296
- conn.executemany(
297
- """
298
- INSERT INTO symbols(file_id, name, kind, start_line, end_line)
299
- VALUES(?, ?, ?, ?, ?)
300
- """,
301
- symbol_rows,
302
- )
303
-
304
- conn.commit()
305
- return count
306
-
307
- except sqlite3.DatabaseError as exc:
308
- conn.rollback()
309
- raise StorageError(f"Batch insert failed: {exc}") from exc
310
-
311
- def remove_file(self, full_path: str | Path) -> bool:
312
- """Remove a file from the index.
313
-
314
- Args:
315
- full_path: Complete source file path
316
-
317
- Returns:
318
- True if file was removed, False if not found
319
- """
320
- with self._lock:
321
- conn = self._get_connection()
322
- full_path_str = str(Path(full_path).resolve())
323
-
324
- row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone()
325
- if not row:
326
- return False
327
-
328
- file_id = int(row["id"])
329
- conn.execute("DELETE FROM files WHERE id=?", (file_id,))
330
- conn.commit()
331
- return True
332
-
333
- def get_file(self, full_path: str | Path) -> Optional[FileEntry]:
334
- """Get file metadata.
335
-
336
- Args:
337
- full_path: Complete source file path
338
-
339
- Returns:
340
- FileEntry if found, None otherwise
341
- """
342
- with self._lock:
343
- conn = self._get_connection()
344
- full_path_str = str(Path(full_path).resolve())
345
-
346
- row = conn.execute(
347
- """
348
- SELECT id, name, full_path, language, mtime, line_count
349
- FROM files WHERE full_path=?
350
- """,
351
- (full_path_str,),
352
- ).fetchone()
353
-
354
- if not row:
355
- return None
356
-
357
- return FileEntry(
358
- id=int(row["id"]),
359
- name=row["name"],
360
- full_path=Path(row["full_path"]),
361
- language=row["language"],
362
- mtime=float(row["mtime"]) if row["mtime"] else 0.0,
363
- line_count=int(row["line_count"]) if row["line_count"] else 0,
364
- )
365
-
366
- def get_file_mtime(self, full_path: str | Path) -> Optional[float]:
367
- """Get stored modification time for a file.
368
-
369
- Args:
370
- full_path: Complete source file path
371
-
372
- Returns:
373
- Modification time as float, or None if not found
374
- """
375
- with self._lock:
376
- conn = self._get_connection()
377
- full_path_str = str(Path(full_path).resolve())
378
-
379
- row = conn.execute(
380
- "SELECT mtime FROM files WHERE full_path=?", (full_path_str,)
381
- ).fetchone()
382
-
383
- return float(row["mtime"]) if row and row["mtime"] else None
384
-
385
- def needs_reindex(self, full_path: str | Path) -> bool:
386
- """Check if a file needs reindexing based on mtime comparison.
387
-
388
- Uses 1ms tolerance to handle filesystem timestamp precision variations.
389
-
390
- Args:
391
- full_path: Complete source file path
392
-
393
- Returns:
394
- True if file should be reindexed (new, modified, or missing from index)
395
- """
396
- full_path_obj = Path(full_path).resolve()
397
- if not full_path_obj.exists():
398
- return False # File doesn't exist, skip indexing
399
-
400
- # Get current filesystem mtime
401
- try:
402
- current_mtime = full_path_obj.stat().st_mtime
403
- except OSError:
404
- return False # Can't read file stats, skip
405
-
406
- # Get stored mtime from database
407
- stored_mtime = self.get_file_mtime(full_path_obj)
408
-
409
- # File not in index, needs indexing
410
- if stored_mtime is None:
411
- return True
412
-
413
- # Compare with 1ms tolerance for floating point precision
414
- MTIME_TOLERANCE = 0.001
415
- return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
416
-
417
- def add_file_incremental(
418
- self,
419
- name: str,
420
- full_path: str | Path,
421
- content: str,
422
- language: str,
423
- symbols: Optional[List[Symbol]] = None,
424
- ) -> Optional[int]:
425
- """Add or update a file only if it has changed (incremental indexing).
426
-
427
- Checks mtime before indexing to skip unchanged files.
428
-
429
- Args:
430
- name: Filename without path
431
- full_path: Complete source file path
432
- content: File content for indexing
433
- language: Programming language identifier
434
- symbols: List of Symbol objects from the file
435
-
436
- Returns:
437
- Database file_id if indexed, None if skipped (unchanged)
438
-
439
- Raises:
440
- StorageError: If database operations fail
441
- """
442
- # Check if reindexing is needed
443
- if not self.needs_reindex(full_path):
444
- return None # Skip unchanged file
445
-
446
- # File changed or new, perform full indexing
447
- return self.add_file(name, full_path, content, language, symbols)
448
-
449
- def cleanup_deleted_files(self, source_dir: Path) -> int:
450
- """Remove indexed files that no longer exist in the source directory.
451
-
452
- Scans the source directory and removes database entries for deleted files.
453
-
454
- Args:
455
- source_dir: Source directory to scan
456
-
457
- Returns:
458
- Number of deleted file entries removed
459
-
460
- Raises:
461
- StorageError: If cleanup operations fail
462
- """
463
- with self._lock:
464
- conn = self._get_connection()
465
- source_dir = source_dir.resolve()
466
-
467
- try:
468
- # Get all indexed file paths
469
- rows = conn.execute("SELECT full_path FROM files").fetchall()
470
- indexed_paths = {row["full_path"] for row in rows}
471
-
472
- # Build set of existing files in source directory
473
- existing_paths = set()
474
- for file_path in source_dir.rglob("*"):
475
- if file_path.is_file():
476
- existing_paths.add(str(file_path.resolve()))
477
-
478
- # Find orphaned entries (indexed but no longer exist)
479
- deleted_paths = indexed_paths - existing_paths
480
-
481
- # Remove orphaned entries
482
- deleted_count = 0
483
- for deleted_path in deleted_paths:
484
- conn.execute("DELETE FROM files WHERE full_path=?", (deleted_path,))
485
- deleted_count += 1
486
-
487
- if deleted_count > 0:
488
- conn.commit()
489
-
490
- return deleted_count
491
-
492
- except Exception as exc:
493
- conn.rollback()
494
- raise StorageError(f"Failed to cleanup deleted files: {exc}") from exc
495
-
496
- def list_files(self) -> List[FileEntry]:
497
- """List all files in current directory.
498
-
499
- Returns:
500
- List of FileEntry objects
501
- """
502
- with self._lock:
503
- conn = self._get_connection()
504
- rows = conn.execute(
505
- """
506
- SELECT id, name, full_path, language, mtime, line_count
507
- FROM files
508
- ORDER BY name
509
- """
510
- ).fetchall()
511
-
512
- return [
513
- FileEntry(
514
- id=int(row["id"]),
515
- name=row["name"],
516
- full_path=Path(row["full_path"]),
517
- language=row["language"],
518
- mtime=float(row["mtime"]) if row["mtime"] else 0.0,
519
- line_count=int(row["line_count"]) if row["line_count"] else 0,
520
- )
521
- for row in rows
522
- ]
523
-
524
- def file_count(self) -> int:
525
- """Get number of files in current directory.
526
-
527
- Returns:
528
- File count
529
- """
530
- with self._lock:
531
- conn = self._get_connection()
532
- row = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()
533
- return int(row["c"]) if row else 0
534
-
535
- # === Semantic Metadata ===
536
-
537
- def add_semantic_metadata(
538
- self,
539
- file_id: int,
540
- summary: str,
541
- keywords: List[str],
542
- purpose: str,
543
- llm_tool: str
544
- ) -> None:
545
- """Add or update semantic metadata for a file.
546
-
547
- Args:
548
- file_id: File ID from files table
549
- summary: LLM-generated summary
550
- keywords: List of keywords
551
- purpose: Purpose/role of the file
552
- llm_tool: Tool used to generate metadata (gemini/qwen)
553
- """
554
- with self._lock:
555
- conn = self._get_connection()
556
-
557
- import time
558
-
559
- generated_at = time.time()
560
-
561
- # Write to semantic_metadata table (without keywords column)
562
- conn.execute(
563
- """
564
- INSERT INTO semantic_metadata(file_id, summary, purpose, llm_tool, generated_at)
565
- VALUES(?, ?, ?, ?, ?)
566
- ON CONFLICT(file_id) DO UPDATE SET
567
- summary=excluded.summary,
568
- purpose=excluded.purpose,
569
- llm_tool=excluded.llm_tool,
570
- generated_at=excluded.generated_at
571
- """,
572
- (file_id, summary, purpose, llm_tool, generated_at),
573
- )
574
-
575
- # Write to normalized keywords tables for optimized search
576
- # First, remove existing keyword associations
577
- conn.execute("DELETE FROM file_keywords WHERE file_id = ?", (file_id,))
578
-
579
- # Then add new keywords
580
- for keyword in keywords:
581
- keyword = keyword.strip()
582
- if not keyword:
583
- continue
584
-
585
- # Insert keyword if it doesn't exist
586
- conn.execute(
587
- "INSERT OR IGNORE INTO keywords(keyword) VALUES(?)",
588
- (keyword,)
589
- )
590
-
591
- # Get keyword_id
592
- row = conn.execute(
593
- "SELECT id FROM keywords WHERE keyword = ?",
594
- (keyword,)
595
- ).fetchone()
596
-
597
- if row:
598
- keyword_id = row["id"]
599
- # Link file to keyword
600
- conn.execute(
601
- "INSERT OR IGNORE INTO file_keywords(file_id, keyword_id) VALUES(?, ?)",
602
- (file_id, keyword_id)
603
- )
604
-
605
- conn.commit()
606
-
607
- def get_semantic_metadata(self, file_id: int) -> Optional[Dict[str, Any]]:
608
- """Get semantic metadata for a file.
609
-
610
- Args:
611
- file_id: File ID from files table
612
-
613
- Returns:
614
- Dict with summary, keywords, purpose, llm_tool, generated_at, or None if not found
615
- """
616
- with self._lock:
617
- conn = self._get_connection()
618
-
619
- # Get semantic metadata (without keywords column)
620
- row = conn.execute(
621
- """
622
- SELECT summary, purpose, llm_tool, generated_at
623
- FROM semantic_metadata WHERE file_id=?
624
- """,
625
- (file_id,),
626
- ).fetchone()
627
-
628
- if not row:
629
- return None
630
-
631
- # Get keywords from normalized file_keywords table
632
- keyword_rows = conn.execute(
633
- """
634
- SELECT k.keyword
635
- FROM file_keywords fk
636
- JOIN keywords k ON fk.keyword_id = k.id
637
- WHERE fk.file_id = ?
638
- ORDER BY k.keyword
639
- """,
640
- (file_id,),
641
- ).fetchall()
642
-
643
- keywords = [kw["keyword"] for kw in keyword_rows]
644
-
645
- return {
646
- "summary": row["summary"],
647
- "keywords": keywords,
648
- "purpose": row["purpose"],
649
- "llm_tool": row["llm_tool"],
650
- "generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
651
- }
652
-
653
- def get_files_without_semantic(self) -> List[FileEntry]:
654
- """Get all files that don't have semantic metadata.
655
-
656
- Returns:
657
- List of FileEntry objects without semantic metadata
658
- """
659
- with self._lock:
660
- conn = self._get_connection()
661
-
662
- rows = conn.execute(
663
- """
664
- SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
665
- FROM files f
666
- LEFT JOIN semantic_metadata sm ON f.id = sm.file_id
667
- WHERE sm.id IS NULL
668
- ORDER BY f.name
669
- """
670
- ).fetchall()
671
-
672
- return [
673
- FileEntry(
674
- id=int(row["id"]),
675
- name=row["name"],
676
- full_path=Path(row["full_path"]),
677
- language=row["language"],
678
- mtime=float(row["mtime"]) if row["mtime"] else 0.0,
679
- line_count=int(row["line_count"]) if row["line_count"] else 0,
680
- )
681
- for row in rows
682
- ]
683
-
684
- def search_semantic_keywords(self, keyword: str, use_normalized: bool = True) -> List[Tuple[FileEntry, List[str]]]:
685
- """Search files by semantic keywords.
686
-
687
- Args:
688
- keyword: Keyword to search for (case-insensitive)
689
- use_normalized: Use optimized normalized tables (default: True)
690
-
691
- Returns:
692
- List of (FileEntry, keywords) tuples where keyword matches
693
- """
694
- with self._lock:
695
- conn = self._get_connection()
696
-
697
- if use_normalized:
698
- # Optimized query using normalized tables with indexed lookup
699
- # Use prefix search (keyword%) for better index utilization
700
- keyword_pattern = f"{keyword}%"
701
-
702
- rows = conn.execute(
703
- """
704
- SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count,
705
- GROUP_CONCAT(k.keyword, ',') as keywords
706
- FROM files f
707
- JOIN file_keywords fk ON f.id = fk.file_id
708
- JOIN keywords k ON fk.keyword_id = k.id
709
- WHERE k.keyword LIKE ? COLLATE NOCASE
710
- GROUP BY f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
711
- ORDER BY f.name
712
- """,
713
- (keyword_pattern,),
714
- ).fetchall()
715
-
716
- results = []
717
- for row in rows:
718
- file_entry = FileEntry(
719
- id=int(row["id"]),
720
- name=row["name"],
721
- full_path=Path(row["full_path"]),
722
- language=row["language"],
723
- mtime=float(row["mtime"]) if row["mtime"] else 0.0,
724
- line_count=int(row["line_count"]) if row["line_count"] else 0,
725
- )
726
- keywords = row["keywords"].split(',') if row["keywords"] else []
727
- results.append((file_entry, keywords))
728
-
729
- return results
730
-
731
- else:
732
- # Fallback using normalized tables with contains matching (slower but more flexible)
733
- keyword_pattern = f"%{keyword}%"
734
-
735
- rows = conn.execute(
736
- """
737
- SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count,
738
- GROUP_CONCAT(k.keyword, ',') as keywords
739
- FROM files f
740
- JOIN file_keywords fk ON f.id = fk.file_id
741
- JOIN keywords k ON fk.keyword_id = k.id
742
- WHERE k.keyword LIKE ? COLLATE NOCASE
743
- GROUP BY f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
744
- ORDER BY f.name
745
- """,
746
- (keyword_pattern,),
747
- ).fetchall()
748
-
749
- results = []
750
- for row in rows:
751
- file_entry = FileEntry(
752
- id=int(row["id"]),
753
- name=row["name"],
754
- full_path=Path(row["full_path"]),
755
- language=row["language"],
756
- mtime=float(row["mtime"]) if row["mtime"] else 0.0,
757
- line_count=int(row["line_count"]) if row["line_count"] else 0,
758
- )
759
- keywords = row["keywords"].split(',') if row["keywords"] else []
760
- results.append((file_entry, keywords))
761
-
762
- return results
763
-
764
- def list_semantic_metadata(
765
- self,
766
- offset: int = 0,
767
- limit: int = 50,
768
- llm_tool: Optional[str] = None,
769
- ) -> Tuple[List[Dict[str, Any]], int]:
770
- """List all semantic metadata with file information.
771
-
772
- Args:
773
- offset: Number of records to skip (for pagination)
774
- limit: Maximum records to return (max 100)
775
- llm_tool: Optional filter by LLM tool used
776
-
777
- Returns:
778
- Tuple of (list of metadata dicts, total count)
779
- """
780
- with self._lock:
781
- conn = self._get_connection()
782
-
783
- # Query semantic metadata without keywords column
784
- base_query = """
785
- SELECT f.id as file_id, f.name as file_name, f.full_path,
786
- f.language, f.line_count,
787
- sm.summary, sm.purpose,
788
- sm.llm_tool, sm.generated_at
789
- FROM files f
790
- JOIN semantic_metadata sm ON f.id = sm.file_id
791
- """
792
- count_query = """
793
- SELECT COUNT(*) as total
794
- FROM files f
795
- JOIN semantic_metadata sm ON f.id = sm.file_id
796
- """
797
-
798
- params: List[Any] = []
799
- if llm_tool:
800
- base_query += " WHERE sm.llm_tool = ?"
801
- count_query += " WHERE sm.llm_tool = ?"
802
- params.append(llm_tool)
803
-
804
- base_query += " ORDER BY sm.generated_at DESC LIMIT ? OFFSET ?"
805
- params.extend([min(limit, 100), offset])
806
-
807
- count_params = [llm_tool] if llm_tool else []
808
- total_row = conn.execute(count_query, count_params).fetchone()
809
- total = int(total_row["total"]) if total_row else 0
810
-
811
- rows = conn.execute(base_query, params).fetchall()
812
-
813
- results = []
814
- for row in rows:
815
- file_id = int(row["file_id"])
816
-
817
- # Get keywords from normalized file_keywords table
818
- keyword_rows = conn.execute(
819
- """
820
- SELECT k.keyword
821
- FROM file_keywords fk
822
- JOIN keywords k ON fk.keyword_id = k.id
823
- WHERE fk.file_id = ?
824
- ORDER BY k.keyword
825
- """,
826
- (file_id,),
827
- ).fetchall()
828
-
829
- keywords = [kw["keyword"] for kw in keyword_rows]
830
-
831
- results.append({
832
- "file_id": file_id,
833
- "file_name": row["file_name"],
834
- "full_path": row["full_path"],
835
- "language": row["language"],
836
- "line_count": int(row["line_count"]) if row["line_count"] else 0,
837
- "summary": row["summary"],
838
- "keywords": keywords,
839
- "purpose": row["purpose"],
840
- "llm_tool": row["llm_tool"],
841
- "generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
842
- })
843
-
844
- return results, total
845
-
846
- # === Subdirectory Links ===
847
-
848
- def register_subdir(
849
- self,
850
- name: str,
851
- index_path: str | Path,
852
- files_count: int = 0,
853
- direct_files: int = 0,
854
- ) -> None:
855
- """Register or update a subdirectory link.
856
-
857
- Args:
858
- name: Subdirectory name
859
- index_path: Path to subdirectory's _index.db
860
- files_count: Total files recursively
861
- direct_files: Deprecated parameter (no longer used)
862
- """
863
- with self._lock:
864
- conn = self._get_connection()
865
- index_path_str = str(Path(index_path).resolve())
866
-
867
- import time
868
- last_updated = time.time()
869
-
870
- # Note: direct_files parameter is deprecated but kept for backward compatibility
871
- conn.execute(
872
- """
873
- INSERT INTO subdirs(name, index_path, files_count, last_updated)
874
- VALUES(?, ?, ?, ?)
875
- ON CONFLICT(name) DO UPDATE SET
876
- index_path=excluded.index_path,
877
- files_count=excluded.files_count,
878
- last_updated=excluded.last_updated
879
- """,
880
- (name, index_path_str, files_count, last_updated),
881
- )
882
- conn.commit()
883
-
884
- def unregister_subdir(self, name: str) -> bool:
885
- """Remove a subdirectory link.
886
-
887
- Args:
888
- name: Subdirectory name
889
-
890
- Returns:
891
- True if removed, False if not found
892
- """
893
- with self._lock:
894
- conn = self._get_connection()
895
- row = conn.execute("SELECT id FROM subdirs WHERE name=?", (name,)).fetchone()
896
- if not row:
897
- return False
898
-
899
- conn.execute("DELETE FROM subdirs WHERE name=?", (name,))
900
- conn.commit()
901
- return True
902
-
903
- def get_subdirs(self) -> List[SubdirLink]:
904
- """Get all subdirectory links.
905
-
906
- Returns:
907
- List of SubdirLink objects
908
- """
909
- with self._lock:
910
- conn = self._get_connection()
911
- rows = conn.execute(
912
- """
913
- SELECT id, name, index_path, files_count, last_updated
914
- FROM subdirs
915
- ORDER BY name
916
- """
917
- ).fetchall()
918
-
919
- return [
920
- SubdirLink(
921
- id=int(row["id"]),
922
- name=row["name"],
923
- index_path=Path(row["index_path"]),
924
- files_count=int(row["files_count"]) if row["files_count"] else 0,
925
- last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
926
- )
927
- for row in rows
928
- ]
929
-
930
- def get_subdir(self, name: str) -> Optional[SubdirLink]:
931
- """Get a specific subdirectory link.
932
-
933
- Args:
934
- name: Subdirectory name
935
-
936
- Returns:
937
- SubdirLink if found, None otherwise
938
- """
939
- with self._lock:
940
- conn = self._get_connection()
941
- row = conn.execute(
942
- """
943
- SELECT id, name, index_path, files_count, last_updated
944
- FROM subdirs WHERE name=?
945
- """,
946
- (name,),
947
- ).fetchone()
948
-
949
- if not row:
950
- return None
951
-
952
- return SubdirLink(
953
- id=int(row["id"]),
954
- name=row["name"],
955
- index_path=Path(row["index_path"]),
956
- files_count=int(row["files_count"]) if row["files_count"] else 0,
957
- last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
958
- )
959
-
960
- def update_subdir_stats(
961
- self, name: str, files_count: int, direct_files: Optional[int] = None
962
- ) -> None:
963
- """Update subdirectory statistics.
964
-
965
- Args:
966
- name: Subdirectory name
967
- files_count: Total files recursively
968
- direct_files: Deprecated parameter (no longer used)
969
- """
970
- with self._lock:
971
- conn = self._get_connection()
972
- import time
973
- last_updated = time.time()
974
-
975
- # Note: direct_files parameter is deprecated but kept for backward compatibility
976
- conn.execute(
977
- """
978
- UPDATE subdirs
979
- SET files_count=?, last_updated=?
980
- WHERE name=?
981
- """,
982
- (files_count, last_updated, name),
983
- )
984
- conn.commit()
985
-
986
- # === Search ===
987
-
988
- @staticmethod
989
- def _enhance_fts_query(query: str) -> str:
990
- """Enhance FTS5 query to support prefix matching for simple queries.
991
-
992
- For simple single-word or multi-word queries without FTS5 operators,
993
- automatically adds prefix wildcard (*) to enable partial matching.
994
-
995
- Examples:
996
- "loadPack" -> "loadPack*"
997
- "load package" -> "load* package*"
998
- "load*" -> "load*" (already has wildcard, unchanged)
999
- "NOT test" -> "NOT test" (has FTS operator, unchanged)
1000
-
1001
- Args:
1002
- query: Original FTS5 query string
1003
-
1004
- Returns:
1005
- Enhanced query string with prefix wildcards for simple queries
1006
- """
1007
- # Don't modify if query already contains FTS5 operators or wildcards
1008
- if any(op in query.upper() for op in [' AND ', ' OR ', ' NOT ', ' NEAR ', '*', '"']):
1009
- return query
1010
-
1011
- # For simple queries, add prefix wildcard to each word
1012
- words = query.split()
1013
- enhanced_words = [f"{word}*" if not word.endswith('*') else word for word in words]
1014
- return ' '.join(enhanced_words)
1015
-
1016
- def _find_match_lines(self, content: str, query: str) -> List[int]:
1017
- """Find line numbers where query terms match.
1018
-
1019
- Args:
1020
- content: File content
1021
- query: Search query (FTS5 format)
1022
-
1023
- Returns:
1024
- List of 1-based line numbers containing matches
1025
- """
1026
- # Extract search terms from FTS query (remove operators)
1027
- terms = re.findall(r'["\']([^"\']+)["\']|(\w+)', query)
1028
- search_terms = [t[0] or t[1] for t in terms if t[0] or t[1]]
1029
- # Filter out FTS operators
1030
- fts_operators = {'AND', 'OR', 'NOT', 'NEAR'}
1031
- search_terms = [t for t in search_terms if t.upper() not in fts_operators]
1032
-
1033
- if not search_terms:
1034
- return [1] # Default to first line
1035
-
1036
- lines = content.split('\n')
1037
- match_lines = []
1038
-
1039
- for i, line in enumerate(lines, 1):
1040
- line_lower = line.lower()
1041
- for term in search_terms:
1042
- # Handle wildcard suffix
1043
- term_clean = term.rstrip('*').lower()
1044
- if term_clean and term_clean in line_lower:
1045
- match_lines.append(i)
1046
- break
1047
-
1048
- return match_lines if match_lines else [1]
1049
-
1050
- def _find_containing_symbol(
1051
- self, conn: sqlite3.Connection, file_id: int, line_num: int
1052
- ) -> Optional[Tuple[int, int, str, str]]:
1053
- """Find the symbol that contains the given line number.
1054
-
1055
- Args:
1056
- conn: Database connection
1057
- file_id: File ID in database
1058
- line_num: 1-based line number
1059
-
1060
- Returns:
1061
- Tuple of (start_line, end_line, symbol_name, symbol_kind) or None
1062
- """
1063
- row = conn.execute(
1064
- """
1065
- SELECT start_line, end_line, name, kind
1066
- FROM symbols
1067
- WHERE file_id = ? AND start_line <= ? AND end_line >= ?
1068
- ORDER BY (end_line - start_line) ASC
1069
- LIMIT 1
1070
- """,
1071
- (file_id, line_num, line_num),
1072
- ).fetchone()
1073
-
1074
- if row:
1075
- return (row["start_line"], row["end_line"], row["name"], row["kind"])
1076
- return None
1077
-
1078
- def _extract_code_block(
1079
- self,
1080
- content: str,
1081
- start_line: int,
1082
- end_line: int,
1083
- match_line: Optional[int] = None,
1084
- context_lines: int = 5,
1085
- ) -> Tuple[str, int, int]:
1086
- """Extract code block from content.
1087
-
1088
- If start_line/end_line are provided (from symbol), use them.
1089
- Otherwise, extract context around match_line.
1090
-
1091
- Args:
1092
- content: Full file content
1093
- start_line: 1-based start line (from symbol or calculated)
1094
- end_line: 1-based end line (from symbol or calculated)
1095
- match_line: 1-based line where match occurred (for context extraction)
1096
- context_lines: Number of lines before/after match when no symbol
1097
-
1098
- Returns:
1099
- Tuple of (code_block, actual_start_line, actual_end_line)
1100
- """
1101
- lines = content.split('\n')
1102
- total_lines = len(lines)
1103
-
1104
- # Clamp to valid range
1105
- start_line = max(1, start_line)
1106
- end_line = min(total_lines, end_line)
1107
-
1108
- # Extract block (convert to 0-based index)
1109
- block_lines = lines[start_line - 1:end_line]
1110
- block_content = '\n'.join(block_lines)
1111
-
1112
- return block_content, start_line, end_line
1113
-
1114
- def _batch_fetch_symbols(
1115
- self, conn: sqlite3.Connection, file_ids: List[int]
1116
- ) -> Dict[int, List[Tuple[int, int, str, str]]]:
1117
- """Batch fetch all symbols for multiple files in a single query.
1118
-
1119
- Args:
1120
- conn: Database connection
1121
- file_ids: List of file IDs to fetch symbols for
1122
-
1123
- Returns:
1124
- Dictionary mapping file_id to list of (start_line, end_line, name, kind) tuples
1125
- """
1126
- if not file_ids:
1127
- return {}
1128
-
1129
- # Build placeholder string for IN clause
1130
- placeholders = ','.join('?' for _ in file_ids)
1131
- rows = conn.execute(
1132
- f"""
1133
- SELECT file_id, start_line, end_line, name, kind
1134
- FROM symbols
1135
- WHERE file_id IN ({placeholders})
1136
- ORDER BY file_id, (end_line - start_line) ASC
1137
- """,
1138
- file_ids,
1139
- ).fetchall()
1140
-
1141
- # Organize symbols by file_id
1142
- symbols_by_file: Dict[int, List[Tuple[int, int, str, str]]] = {fid: [] for fid in file_ids}
1143
- for row in rows:
1144
- symbols_by_file[row["file_id"]].append(
1145
- (row["start_line"], row["end_line"], row["name"], row["kind"])
1146
- )
1147
- return symbols_by_file
1148
-
1149
- def _find_containing_symbol_from_cache(
1150
- self, symbols: List[Tuple[int, int, str, str]], line_num: int
1151
- ) -> Optional[Tuple[int, int, str, str]]:
1152
- """Find the smallest symbol containing the given line number from cached symbols.
1153
-
1154
- Args:
1155
- symbols: List of (start_line, end_line, name, kind) tuples, sorted by size
1156
- line_num: 1-based line number
1157
-
1158
- Returns:
1159
- Tuple of (start_line, end_line, symbol_name, symbol_kind) or None
1160
- """
1161
- for start_line, end_line, name, kind in symbols:
1162
- if start_line <= line_num <= end_line:
1163
- return (start_line, end_line, name, kind)
1164
- return None
1165
-
1166
- def _generate_centered_excerpt(
1167
- self, content: str, match_line: int, start_line: int, end_line: int, max_chars: int = 200
1168
- ) -> str:
1169
- """Generate excerpt centered around the match line.
1170
-
1171
- Args:
1172
- content: Full file content
1173
- match_line: 1-based line where match occurred
1174
- start_line: 1-based start line of the code block
1175
- end_line: 1-based end line of the code block
1176
- max_chars: Maximum characters for excerpt
1177
-
1178
- Returns:
1179
- Excerpt string centered around the match
1180
- """
1181
- lines = content.split('\n')
1182
- total_lines = len(lines)
1183
-
1184
- # Ensure match_line is within bounds
1185
- match_line = max(1, min(match_line, total_lines))
1186
-
1187
- # Calculate context window (2 lines before, 2 lines after the match)
1188
- ctx_start = max(start_line, match_line - 2)
1189
- ctx_end = min(end_line, match_line + 2)
1190
-
1191
- # Extract and join lines
1192
- excerpt_lines = lines[ctx_start - 1:ctx_end]
1193
- excerpt = '\n'.join(excerpt_lines)
1194
-
1195
- # Truncate if too long
1196
- if len(excerpt) > max_chars:
1197
- excerpt = excerpt[:max_chars] + "..."
1198
-
1199
- return excerpt
1200
-
1201
- def _search_internal(
1202
- self,
1203
- query: str,
1204
- fts_table: str,
1205
- limit: int = 20,
1206
- return_full_content: bool = False,
1207
- context_lines: int = 10,
1208
- ) -> List[SearchResult]:
1209
- """Internal unified search implementation for all FTS modes.
1210
-
1211
- Optimizations:
1212
- - Fast path: Direct FTS query with snippet() for location-only results
1213
- - Full content path: Batch fetch symbols to eliminate N+1 queries
1214
- - Centered excerpt generation for better context
1215
-
1216
- Args:
1217
- query: FTS5 query string
1218
- fts_table: FTS table name ('files_fts_exact' or 'files_fts_fuzzy')
1219
- limit: Maximum results to return
1220
- return_full_content: If True, include full code block in content field
1221
- context_lines: Lines of context when no symbol contains the match
1222
-
1223
- Returns:
1224
- List of SearchResult objects
1225
- """
1226
- with self._lock:
1227
- conn = self._get_connection()
1228
-
1229
- # Fast path: location-only results (no content processing)
1230
- if not return_full_content:
1231
- try:
1232
- rows = conn.execute(
1233
- f"""
1234
- SELECT rowid, full_path, bm25({fts_table}) AS rank,
1235
- snippet({fts_table}, 2, '', '', '...', 30) AS excerpt
1236
- FROM {fts_table}
1237
- WHERE {fts_table} MATCH ?
1238
- ORDER BY rank
1239
- LIMIT ?
1240
- """,
1241
- (query, limit),
1242
- ).fetchall()
1243
- except sqlite3.DatabaseError as exc:
1244
- raise StorageError(f"FTS search failed: {exc}") from exc
1245
-
1246
- results: List[SearchResult] = []
1247
- for row in rows:
1248
- rank = float(row["rank"]) if row["rank"] is not None else 0.0
1249
- score = abs(rank) if rank < 0 else 0.0
1250
- results.append(
1251
- SearchResult(
1252
- path=row["full_path"],
1253
- score=score,
1254
- excerpt=row["excerpt"],
1255
- )
1256
- )
1257
- return results
1258
-
1259
- # Full content path with batch optimization
1260
- # Step 1: Get file_ids and ranks (lightweight query)
1261
- try:
1262
- id_rows = conn.execute(
1263
- f"""
1264
- SELECT rowid AS file_id, bm25({fts_table}) AS rank
1265
- FROM {fts_table}
1266
- WHERE {fts_table} MATCH ?
1267
- ORDER BY rank
1268
- LIMIT ?
1269
- """,
1270
- (query, limit),
1271
- ).fetchall()
1272
- except sqlite3.DatabaseError as exc:
1273
- raise StorageError(f"FTS search failed: {exc}") from exc
1274
-
1275
- if not id_rows:
1276
- return []
1277
-
1278
- file_ids = [row["file_id"] for row in id_rows]
1279
- ranks_by_id = {row["file_id"]: row["rank"] for row in id_rows}
1280
-
1281
- # Step 2: Batch fetch all symbols for matched files (eliminates N+1)
1282
- symbols_by_file = self._batch_fetch_symbols(conn, file_ids)
1283
-
1284
- # Step 3: Process each file on-demand (reduces memory)
1285
- results: List[SearchResult] = []
1286
- for file_id in file_ids:
1287
- # Fetch file content on-demand
1288
- file_row = conn.execute(
1289
- "SELECT full_path, content FROM files WHERE id = ?",
1290
- (file_id,),
1291
- ).fetchone()
1292
-
1293
- if not file_row:
1294
- continue
1295
-
1296
- file_path = file_row["full_path"]
1297
- content = file_row["content"] or ""
1298
- rank = ranks_by_id.get(file_id, 0.0)
1299
- score = abs(rank) if rank < 0 else 0.0
1300
-
1301
- # Find matching lines
1302
- match_lines = self._find_match_lines(content, query)
1303
- first_match_line = match_lines[0] if match_lines else 1
1304
-
1305
- # Find symbol from cached symbols (no extra SQL query)
1306
- file_symbols = symbols_by_file.get(file_id, [])
1307
- symbol_info = self._find_containing_symbol_from_cache(file_symbols, first_match_line)
1308
-
1309
- if symbol_info:
1310
- start_line, end_line, symbol_name, symbol_kind = symbol_info
1311
- else:
1312
- # No symbol found, use context around match
1313
- lines = content.split('\n')
1314
- total_lines = len(lines)
1315
- start_line = max(1, first_match_line - context_lines)
1316
- end_line = min(total_lines, first_match_line + context_lines)
1317
- symbol_name = None
1318
- symbol_kind = None
1319
-
1320
- # Extract code block
1321
- block_content, start_line, end_line = self._extract_code_block(
1322
- content, start_line, end_line
1323
- )
1324
-
1325
- # Generate centered excerpt (improved quality)
1326
- excerpt = self._generate_centered_excerpt(
1327
- content, first_match_line, start_line, end_line
1328
- )
1329
-
1330
- results.append(
1331
- SearchResult(
1332
- path=file_path,
1333
- score=score,
1334
- excerpt=excerpt,
1335
- content=block_content,
1336
- start_line=start_line,
1337
- end_line=end_line,
1338
- symbol_name=symbol_name,
1339
- symbol_kind=symbol_kind,
1340
- )
1341
- )
1342
- return results
1343
-
1344
-
1345
- def search_fts(
1346
- self,
1347
- query: str,
1348
- limit: int = 20,
1349
- enhance_query: bool = False,
1350
- return_full_content: bool = False,
1351
- context_lines: int = 10,
1352
- ) -> List[SearchResult]:
1353
- """Full-text search in current directory files.
1354
-
1355
- Uses files_fts_exact (unicode61 tokenizer) for exact token matching.
1356
- For fuzzy/substring search, use search_fts_fuzzy() instead.
1357
-
1358
- Best Practice (from industry analysis of Codanna/Code-Index-MCP):
1359
- - Default: Respects exact user input without modification
1360
- - Users can manually add wildcards (e.g., "loadPack*") for prefix matching
1361
- - Automatic enhancement (enhance_query=True) is NOT recommended as it can
1362
- violate user intent and bring unwanted noise in results
1363
-
1364
- Args:
1365
- query: FTS5 query string
1366
- limit: Maximum results to return
1367
- enhance_query: If True, automatically add prefix wildcards for simple queries.
1368
- Default False to respect exact user input.
1369
- return_full_content: If True, include full code block in content field.
1370
- Default False for fast location-only results.
1371
- context_lines: Lines of context when no symbol contains the match
1372
-
1373
- Returns:
1374
- List of SearchResult objects (location-only by default, with content if requested)
1375
-
1376
- Raises:
1377
- StorageError: If FTS search fails
1378
- """
1379
- final_query = self._enhance_fts_query(query) if enhance_query else query
1380
- return self._search_internal(
1381
- query=final_query,
1382
- fts_table='files_fts_exact',
1383
- limit=limit,
1384
- return_full_content=return_full_content,
1385
- context_lines=context_lines,
1386
- )
1387
-
1388
- def search_fts_exact(
1389
- self,
1390
- query: str,
1391
- limit: int = 20,
1392
- return_full_content: bool = False,
1393
- context_lines: int = 10,
1394
- ) -> List[SearchResult]:
1395
- """Full-text search using exact token matching.
1396
-
1397
- Args:
1398
- query: FTS5 query string
1399
- limit: Maximum results to return
1400
- return_full_content: If True, include full code block in content field.
1401
- Default False for fast location-only results.
1402
- context_lines: Lines of context when no symbol contains the match
1403
-
1404
- Returns:
1405
- List of SearchResult objects (location-only by default, with content if requested)
1406
-
1407
- Raises:
1408
- StorageError: If FTS search fails
1409
- """
1410
- return self._search_internal(
1411
- query=query,
1412
- fts_table='files_fts_exact',
1413
- limit=limit,
1414
- return_full_content=return_full_content,
1415
- context_lines=context_lines,
1416
- )
1417
-
1418
- def search_fts_fuzzy(
1419
- self,
1420
- query: str,
1421
- limit: int = 20,
1422
- return_full_content: bool = False,
1423
- context_lines: int = 10,
1424
- ) -> List[SearchResult]:
1425
- """Full-text search using fuzzy/substring matching.
1426
-
1427
- Args:
1428
- query: FTS5 query string
1429
- limit: Maximum results to return
1430
- return_full_content: If True, include full code block in content field.
1431
- Default False for fast location-only results.
1432
- context_lines: Lines of context when no symbol contains the match
1433
-
1434
- Returns:
1435
- List of SearchResult objects (location-only by default, with content if requested)
1436
-
1437
- Raises:
1438
- StorageError: If FTS search fails
1439
- """
1440
- return self._search_internal(
1441
- query=query,
1442
- fts_table='files_fts_fuzzy',
1443
- limit=limit,
1444
- return_full_content=return_full_content,
1445
- context_lines=context_lines,
1446
- )
1447
-
1448
- def search_files_only(self, query: str, limit: int = 20) -> List[str]:
1449
- """Fast FTS search returning only file paths (no snippet generation).
1450
-
1451
- Optimized for when only file paths are needed, skipping expensive
1452
- snippet() function call.
1453
-
1454
- Args:
1455
- query: FTS5 query string
1456
- limit: Maximum results to return
1457
-
1458
- Returns:
1459
- List of file paths as strings
1460
-
1461
- Raises:
1462
- StorageError: If FTS search fails
1463
- """
1464
- with self._lock:
1465
- conn = self._get_connection()
1466
- try:
1467
- rows = conn.execute(
1468
- """
1469
- SELECT full_path
1470
- FROM files_fts
1471
- WHERE files_fts MATCH ?
1472
- ORDER BY bm25(files_fts)
1473
- LIMIT ?
1474
- """,
1475
- (query, limit),
1476
- ).fetchall()
1477
- except sqlite3.DatabaseError as exc:
1478
- raise StorageError(f"FTS search failed: {exc}") from exc
1479
-
1480
- return [row["full_path"] for row in rows]
1481
-
1482
- def search_symbols(
1483
- self, name: str, kind: Optional[str] = None, limit: int = 50, prefix_mode: bool = True
1484
- ) -> List[Symbol]:
1485
- """Search symbols by name pattern.
1486
-
1487
- Args:
1488
- name: Symbol name pattern
1489
- kind: Optional symbol kind filter
1490
- limit: Maximum results to return
1491
- prefix_mode: If True, use prefix search (faster with index);
1492
- If False, use substring search (slower)
1493
-
1494
- Returns:
1495
- List of Symbol objects
1496
- """
1497
- # Prefix search is much faster as it can use index
1498
- if prefix_mode:
1499
- pattern = f"{name}%"
1500
- else:
1501
- pattern = f"%{name}%"
1502
-
1503
- with self._lock:
1504
- conn = self._get_connection()
1505
- if kind:
1506
- rows = conn.execute(
1507
- """
1508
- SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
1509
- FROM symbols s
1510
- JOIN files f ON s.file_id = f.id
1511
- WHERE s.name LIKE ? AND s.kind=?
1512
- ORDER BY s.name
1513
- LIMIT ?
1514
- """,
1515
- (pattern, kind, limit),
1516
- ).fetchall()
1517
- else:
1518
- rows = conn.execute(
1519
- """
1520
- SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
1521
- FROM symbols s
1522
- JOIN files f ON s.file_id = f.id
1523
- WHERE s.name LIKE ?
1524
- ORDER BY s.name
1525
- LIMIT ?
1526
- """,
1527
- (pattern, limit),
1528
- ).fetchall()
1529
-
1530
- return [
1531
- Symbol(
1532
- name=row["name"],
1533
- kind=row["kind"],
1534
- range=(row["start_line"], row["end_line"]),
1535
- file=row["full_path"],
1536
- )
1537
- for row in rows
1538
- ]
1539
-
1540
- # === Statistics ===
1541
-
1542
- def stats(self) -> Dict[str, Any]:
1543
- """Get current directory statistics.
1544
-
1545
- Returns:
1546
- Dictionary containing:
1547
- - files: Number of files in this directory
1548
- - symbols: Number of symbols
1549
- - subdirs: Number of subdirectories
1550
- - total_files: Total files including subdirectories
1551
- - languages: Dictionary of language counts
1552
- """
1553
- with self._lock:
1554
- conn = self._get_connection()
1555
-
1556
- file_count = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"]
1557
- symbol_count = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"]
1558
- subdir_count = conn.execute("SELECT COUNT(*) AS c FROM subdirs").fetchone()["c"]
1559
-
1560
- total_files_row = conn.execute(
1561
- "SELECT COALESCE(SUM(files_count), 0) AS total FROM subdirs"
1562
- ).fetchone()
1563
- total_files = int(file_count) + int(total_files_row["total"] if total_files_row else 0)
1564
-
1565
- lang_rows = conn.execute(
1566
- "SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC"
1567
- ).fetchall()
1568
- languages = {row["language"]: int(row["c"]) for row in lang_rows}
1569
-
1570
- return {
1571
- "files": int(file_count),
1572
- "symbols": int(symbol_count),
1573
- "subdirs": int(subdir_count),
1574
- "total_files": total_files,
1575
- "languages": languages,
1576
- }
1577
-
1578
- # === Internal Methods ===
1579
-
1580
- def _get_connection(self) -> sqlite3.Connection:
1581
- """Get or create database connection with proper configuration.
1582
-
1583
- Returns:
1584
- sqlite3.Connection with WAL mode and foreign keys enabled
1585
- """
1586
- if self._conn is None:
1587
- self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
1588
- self._conn.row_factory = sqlite3.Row
1589
- self._conn.execute("PRAGMA journal_mode=WAL")
1590
- self._conn.execute("PRAGMA synchronous=NORMAL")
1591
- self._conn.execute("PRAGMA foreign_keys=ON")
1592
- # Memory-mapped I/O for faster reads (30GB limit)
1593
- self._conn.execute("PRAGMA mmap_size=30000000000")
1594
- return self._conn
1595
-
1596
- def _create_schema(self, conn: sqlite3.Connection) -> None:
1597
- """Create database schema.
1598
-
1599
- Args:
1600
- conn: Database connection
1601
-
1602
- Raises:
1603
- StorageError: If schema creation fails
1604
- """
1605
- try:
1606
- # Files table
1607
- conn.execute(
1608
- """
1609
- CREATE TABLE IF NOT EXISTS files (
1610
- id INTEGER PRIMARY KEY,
1611
- name TEXT NOT NULL,
1612
- full_path TEXT UNIQUE NOT NULL,
1613
- language TEXT,
1614
- content TEXT,
1615
- mtime REAL,
1616
- line_count INTEGER
1617
- )
1618
- """
1619
- )
1620
-
1621
- # Subdirectories table (v5: removed direct_files)
1622
- conn.execute(
1623
- """
1624
- CREATE TABLE IF NOT EXISTS subdirs (
1625
- id INTEGER PRIMARY KEY,
1626
- name TEXT NOT NULL UNIQUE,
1627
- index_path TEXT NOT NULL,
1628
- files_count INTEGER DEFAULT 0,
1629
- last_updated REAL
1630
- )
1631
- """
1632
- )
1633
-
1634
- # Symbols table with token metadata
1635
- conn.execute(
1636
- """
1637
- CREATE TABLE IF NOT EXISTS symbols (
1638
- id INTEGER PRIMARY KEY,
1639
- file_id INTEGER REFERENCES files(id) ON DELETE CASCADE,
1640
- name TEXT NOT NULL,
1641
- kind TEXT NOT NULL,
1642
- start_line INTEGER,
1643
- end_line INTEGER
1644
- )
1645
- """
1646
- )
1647
-
1648
- # Dual FTS5 external content tables for exact and fuzzy matching
1649
- # files_fts_exact: unicode61 tokenizer for exact token matching
1650
- # files_fts_fuzzy: trigram tokenizer (or extended unicode61) for substring/fuzzy matching
1651
- from codexlens.storage.sqlite_utils import check_trigram_support
1652
-
1653
- has_trigram = check_trigram_support(conn)
1654
- fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-.'"
1655
-
1656
- # Exact FTS table with unicode61 tokenizer
1657
- # Note: tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW
1658
- conn.execute(
1659
- """
1660
- CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5(
1661
- name, full_path UNINDEXED, content,
1662
- content='files',
1663
- content_rowid='id',
1664
- tokenize="unicode61 tokenchars '_-.'"
1665
- )
1666
- """
1667
- )
1668
-
1669
- # Fuzzy FTS table with trigram or extended unicode61 tokenizer
1670
- conn.execute(
1671
- f"""
1672
- CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_fuzzy USING fts5(
1673
- name, full_path UNINDEXED, content,
1674
- content='files',
1675
- content_rowid='id',
1676
- tokenize="{fuzzy_tokenizer}"
1677
- )
1678
- """
1679
- )
1680
-
1681
- # Semantic metadata table (v5: removed keywords column)
1682
- conn.execute(
1683
- """
1684
- CREATE TABLE IF NOT EXISTS semantic_metadata (
1685
- id INTEGER PRIMARY KEY,
1686
- file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
1687
- summary TEXT,
1688
- purpose TEXT,
1689
- llm_tool TEXT,
1690
- generated_at REAL
1691
- )
1692
- """
1693
- )
1694
-
1695
- # Normalized keywords tables for performance
1696
- conn.execute(
1697
- """
1698
- CREATE TABLE IF NOT EXISTS keywords (
1699
- id INTEGER PRIMARY KEY,
1700
- keyword TEXT NOT NULL UNIQUE
1701
- )
1702
- """
1703
- )
1704
-
1705
- conn.execute(
1706
- """
1707
- CREATE TABLE IF NOT EXISTS file_keywords (
1708
- file_id INTEGER NOT NULL,
1709
- keyword_id INTEGER NOT NULL,
1710
- PRIMARY KEY (file_id, keyword_id),
1711
- FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
1712
- FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
1713
- )
1714
- """
1715
- )
1716
-
1717
- # Code relationships table for graph visualization
1718
- conn.execute(
1719
- """
1720
- CREATE TABLE IF NOT EXISTS code_relationships (
1721
- id INTEGER PRIMARY KEY,
1722
- source_symbol_id INTEGER NOT NULL,
1723
- target_qualified_name TEXT NOT NULL,
1724
- relationship_type TEXT NOT NULL,
1725
- source_line INTEGER NOT NULL,
1726
- target_file TEXT,
1727
- FOREIGN KEY (source_symbol_id) REFERENCES symbols (id) ON DELETE CASCADE
1728
- )
1729
- """
1730
- )
1731
-
1732
- # Indexes (v5: removed idx_symbols_type)
1733
- conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
1734
- conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
1735
- conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
1736
- conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
1737
- conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
1738
- conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
1739
- conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
1740
- conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")
1741
- conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords(keyword_id)")
1742
- conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
1743
- conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
1744
- conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")
1745
-
1746
- except sqlite3.DatabaseError as exc:
1747
- raise StorageError(f"Failed to create schema: {exc}") from exc
1748
-
1749
- def _migrate_v2_add_name_column(self, conn: sqlite3.Connection) -> None:
1750
- """Migration v2: Add 'name' column to files table.
1751
-
1752
- Required for FTS5 external content table.
1753
-
1754
- Args:
1755
- conn: Database connection
1756
- """
1757
- # Check if files table exists and has columns
1758
- cursor = conn.execute("PRAGMA table_info(files)")
1759
- files_columns = {row[1] for row in cursor.fetchall()}
1760
-
1761
- if not files_columns:
1762
- return # No files table yet, will be created fresh
1763
-
1764
- # Skip if 'name' column already exists
1765
- if "name" in files_columns:
1766
- return
1767
-
1768
- # Add 'name' column with default value
1769
- conn.execute("ALTER TABLE files ADD COLUMN name TEXT NOT NULL DEFAULT ''")
1770
-
1771
- # Populate 'name' column from full_path using pathlib for robustness
1772
- rows = conn.execute("SELECT id, full_path FROM files WHERE name = ''").fetchall()
1773
- for row in rows:
1774
- file_id = row[0]
1775
- full_path = row[1]
1776
- # Use pathlib.Path.name for cross-platform compatibility
1777
- name = Path(full_path).name if full_path else ""
1778
- conn.execute("UPDATE files SET name = ? WHERE id = ?", (name, file_id))
1779
-
1780
- def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
1781
- """Create FTS5 external content triggers for dual FTS tables.
1782
-
1783
- Creates synchronized triggers for both files_fts_exact and files_fts_fuzzy tables.
1784
-
1785
- Args:
1786
- conn: Database connection
1787
- """
1788
- # Insert triggers for files_fts_exact
1789
- conn.execute(
1790
- """
1791
- CREATE TRIGGER IF NOT EXISTS files_exact_ai AFTER INSERT ON files BEGIN
1792
- INSERT INTO files_fts_exact(rowid, name, full_path, content)
1793
- VALUES(new.id, new.name, new.full_path, new.content);
1794
- END
1795
- """
1796
- )
1797
-
1798
- # Delete trigger for files_fts_exact
1799
- conn.execute(
1800
- """
1801
- CREATE TRIGGER IF NOT EXISTS files_exact_ad AFTER DELETE ON files BEGIN
1802
- INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
1803
- VALUES('delete', old.id, old.name, old.full_path, old.content);
1804
- END
1805
- """
1806
- )
1807
-
1808
- # Update trigger for files_fts_exact
1809
- conn.execute(
1810
- """
1811
- CREATE TRIGGER IF NOT EXISTS files_exact_au AFTER UPDATE ON files BEGIN
1812
- INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
1813
- VALUES('delete', old.id, old.name, old.full_path, old.content);
1814
- INSERT INTO files_fts_exact(rowid, name, full_path, content)
1815
- VALUES(new.id, new.name, new.full_path, new.content);
1816
- END
1817
- """
1818
- )
1819
-
1820
- # Insert trigger for files_fts_fuzzy
1821
- conn.execute(
1822
- """
1823
- CREATE TRIGGER IF NOT EXISTS files_fuzzy_ai AFTER INSERT ON files BEGIN
1824
- INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
1825
- VALUES(new.id, new.name, new.full_path, new.content);
1826
- END
1827
- """
1828
- )
1829
-
1830
- # Delete trigger for files_fts_fuzzy
1831
- conn.execute(
1832
- """
1833
- CREATE TRIGGER IF NOT EXISTS files_fuzzy_ad AFTER DELETE ON files BEGIN
1834
- INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
1835
- VALUES('delete', old.id, old.name, old.full_path, old.content);
1836
- END
1837
- """
1838
- )
1839
-
1840
- # Update trigger for files_fts_fuzzy
1841
- conn.execute(
1842
- """
1843
- CREATE TRIGGER IF NOT EXISTS files_fuzzy_au AFTER UPDATE ON files BEGIN
1844
- INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
1845
- VALUES('delete', old.id, old.name, old.full_path, old.content);
1846
- INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
1847
- VALUES(new.id, new.name, new.full_path, new.content);
1848
- END
1849
- """
1850
- )
1
+ """Single-directory index storage with hierarchical linking.
2
+
3
+ Each directory maintains its own _index.db with:
4
+ - Files in the current directory
5
+ - Links to subdirectory indexes
6
+ - Full-text search via FTS5
7
+ - Symbol table for code navigation
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import logging
13
+ import re
14
+ import sqlite3
15
+ import threading
16
+ from dataclasses import dataclass
17
+ from pathlib import Path
18
+ from typing import Any, Dict, List, Optional, Tuple
19
+
20
+ from codexlens.config import Config
21
+ from codexlens.entities import SearchResult, Symbol
22
+ from codexlens.errors import StorageError
23
+ from codexlens.storage.global_index import GlobalSymbolIndex
24
+
25
+
26
+ @dataclass
27
+ class SubdirLink:
28
+ """Link to a subdirectory's index database."""
29
+
30
+ id: int
31
+ name: str
32
+ index_path: Path
33
+ files_count: int
34
+ last_updated: float
35
+
36
+
37
+ @dataclass
38
+ class FileEntry:
39
+ """Metadata for an indexed file in current directory."""
40
+
41
+ id: int
42
+ name: str
43
+ full_path: Path
44
+ language: str
45
+ mtime: float
46
+ line_count: int
47
+
48
+
49
+ class DirIndexStore:
50
+ """Single-directory index storage with hierarchical subdirectory linking.
51
+
52
+ Each directory has an independent _index.db containing:
53
+ - Files table: Files in this directory only
54
+ - Subdirs table: Links to child directory indexes
55
+ - Symbols table: Code symbols from files
56
+ - FTS5 index: Full-text search on file content
57
+
58
+ Thread-safe operations with WAL mode enabled.
59
+ """
60
+
61
+ # Schema version for migration tracking
62
+ # Increment this when schema changes require migration
63
+ SCHEMA_VERSION = 5
64
+
65
+ def __init__(
66
+ self,
67
+ db_path: str | Path,
68
+ *,
69
+ config: Config | None = None,
70
+ global_index: GlobalSymbolIndex | None = None,
71
+ ) -> None:
72
+ """Initialize directory index store.
73
+
74
+ Args:
75
+ db_path: Path to _index.db file for this directory
76
+ """
77
+ self.db_path = Path(db_path).resolve()
78
+ self._lock = threading.RLock()
79
+ self._conn: Optional[sqlite3.Connection] = None
80
+ self.logger = logging.getLogger(__name__)
81
+ self._config = config
82
+ self._global_index = global_index
83
+
84
    def initialize(self) -> None:
        """Create database and schema if not exists.

        Safe to call repeatedly: creates parent directories as needed,
        builds the schema on a fresh database (user_version == 0), or
        migrates an older database up to SCHEMA_VERSION.

        Raises:
            StorageError: If the on-disk schema version is newer than this
                code supports (downgrade is refused rather than risking
                corruption).
        """
        with self._lock:
            self.db_path.parent.mkdir(parents=True, exist_ok=True)
            conn = self._get_connection()

            # Check current schema version
            current_version = self._get_schema_version(conn)

            # Fail gracefully if database is from a newer version
            if current_version > self.SCHEMA_VERSION:
                raise StorageError(
                    f"Database schema version {current_version} is newer than "
                    f"supported version {self.SCHEMA_VERSION}. "
                    f"Please update the application or use a compatible database.",
                    db_path=str(self.db_path),
                    operation="initialize",
                    details={
                        "current_version": current_version,
                        "supported_version": self.SCHEMA_VERSION
                    }
                )

            # Create or migrate schema
            if current_version == 0:
                # New database - create schema directly
                self._create_schema(conn)
                self._create_fts_triggers(conn)
                self._set_schema_version(conn, self.SCHEMA_VERSION)
            elif current_version < self.SCHEMA_VERSION:
                # Existing database - apply migrations
                self._apply_migrations(conn, current_version)
                self._set_schema_version(conn, self.SCHEMA_VERSION)

            conn.commit()
119
+
120
+ def _get_schema_version(self, conn: sqlite3.Connection) -> int:
121
+ """Get current schema version from database."""
122
+ try:
123
+ row = conn.execute("PRAGMA user_version").fetchone()
124
+ return row[0] if row else 0
125
+ except Exception:
126
+ return 0
127
+
128
+ def _set_schema_version(self, conn: sqlite3.Connection, version: int) -> None:
129
+ """Set schema version in database."""
130
+ conn.execute(f"PRAGMA user_version = {version}")
131
+
132
    def _apply_migrations(self, conn: sqlite3.Connection, from_version: int) -> None:
        """Apply schema migrations from current version to latest.

        Migrations are cumulative: each guard below fires when the stored
        version predates that migration, so an old database passes through
        every step it is missing, in order. Migration modules are imported
        lazily to avoid paying their import cost on the common
        (already-migrated) path.

        Args:
            conn: Database connection
            from_version: Current schema version
        """
        # Migration v0/v1 -> v2: Add 'name' column to files table
        if from_version < 2:
            self._migrate_v2_add_name_column(conn)

        # Migration v2 -> v4: Add dual FTS tables (exact + fuzzy)
        if from_version < 4:
            from codexlens.storage.migrations.migration_004_dual_fts import upgrade
            upgrade(conn)

        # Migration v4 -> v5: Remove unused/redundant fields
        if from_version < 5:
            from codexlens.storage.migrations.migration_005_cleanup_unused_fields import upgrade
            upgrade(conn)
152
+
153
+ def close(self) -> None:
154
+ """Close database connection."""
155
+ with self._lock:
156
+ if self._conn is not None:
157
+ try:
158
+ self._conn.close()
159
+ except Exception:
160
+ pass
161
+ finally:
162
+ self._conn = None
163
+
164
    def __enter__(self) -> DirIndexStore:
        """Context manager entry: ensure the schema exists, return self."""
        self.initialize()
        return self
168
+
169
    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        """Context manager exit: close the connection (exceptions propagate)."""
        self.close()
172
+
173
+ # === File Operations ===
174
+
175
+ def add_file(
176
+ self,
177
+ name: str,
178
+ full_path: str | Path,
179
+ content: str,
180
+ language: str,
181
+ symbols: Optional[List[Symbol]] = None,
182
+ ) -> int:
183
+ """Add or update a file in the current directory index.
184
+
185
+ Args:
186
+ name: Filename without path
187
+ full_path: Complete source file path
188
+ content: File content for indexing
189
+ language: Programming language identifier
190
+ symbols: List of Symbol objects from the file
191
+
192
+ Returns:
193
+ Database file_id
194
+
195
+ Raises:
196
+ StorageError: If database operations fail
197
+ """
198
+ with self._lock:
199
+ conn = self._get_connection()
200
+ full_path_str = str(Path(full_path).resolve())
201
+ mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None
202
+ line_count = content.count('\n') + 1
203
+
204
+ try:
205
+ conn.execute(
206
+ """
207
+ INSERT INTO files(name, full_path, language, content, mtime, line_count)
208
+ VALUES(?, ?, ?, ?, ?, ?)
209
+ ON CONFLICT(full_path) DO UPDATE SET
210
+ name=excluded.name,
211
+ language=excluded.language,
212
+ content=excluded.content,
213
+ mtime=excluded.mtime,
214
+ line_count=excluded.line_count
215
+ """,
216
+ (name, full_path_str, language, content, mtime, line_count),
217
+ )
218
+
219
+ row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone()
220
+ if not row:
221
+ raise StorageError(f"Failed to retrieve file_id for {full_path_str}")
222
+
223
+ file_id = int(row["id"])
224
+
225
+ # Replace symbols
226
+ conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
227
+ if symbols:
228
+ # Insert symbols without token_count and symbol_type
229
+ symbol_rows = []
230
+ for s in symbols:
231
+ symbol_rows.append(
232
+ (file_id, s.name, s.kind, s.range[0], s.range[1])
233
+ )
234
+
235
+ conn.executemany(
236
+ """
237
+ INSERT INTO symbols(file_id, name, kind, start_line, end_line)
238
+ VALUES(?, ?, ?, ?, ?)
239
+ """,
240
+ symbol_rows,
241
+ )
242
+
243
+ conn.commit()
244
+ self._maybe_update_global_symbols(full_path_str, symbols or [])
245
+ return file_id
246
+
247
+ except sqlite3.DatabaseError as exc:
248
+ conn.rollback()
249
+ raise StorageError(f"Failed to add file {name}: {exc}") from exc
250
+
251
+ def add_files_batch(
252
+ self, files: List[Tuple[str, Path, str, str, Optional[List[Symbol]]]]
253
+ ) -> int:
254
+ """Add multiple files in a single transaction.
255
+
256
+ Args:
257
+ files: List of (name, full_path, content, language, symbols) tuples
258
+
259
+ Returns:
260
+ Number of files added
261
+
262
+ Raises:
263
+ StorageError: If batch operation fails
264
+ """
265
+ with self._lock:
266
+ conn = self._get_connection()
267
+ count = 0
268
+
269
+ try:
270
+ conn.execute("BEGIN")
271
+
272
+ for name, full_path, content, language, symbols in files:
273
+ full_path_str = str(Path(full_path).resolve())
274
+ mtime = Path(full_path_str).stat().st_mtime if Path(full_path_str).exists() else None
275
+ line_count = content.count('\n') + 1
276
+
277
+ conn.execute(
278
+ """
279
+ INSERT INTO files(name, full_path, language, content, mtime, line_count)
280
+ VALUES(?, ?, ?, ?, ?, ?)
281
+ ON CONFLICT(full_path) DO UPDATE SET
282
+ name=excluded.name,
283
+ language=excluded.language,
284
+ content=excluded.content,
285
+ mtime=excluded.mtime,
286
+ line_count=excluded.line_count
287
+ """,
288
+ (name, full_path_str, language, content, mtime, line_count),
289
+ )
290
+
291
+ row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone()
292
+ if not row:
293
+ raise StorageError(f"Failed to retrieve file_id for {full_path_str}")
294
+
295
+ file_id = int(row["id"])
296
+ count += 1
297
+
298
+ conn.execute("DELETE FROM symbols WHERE file_id=?", (file_id,))
299
+ if symbols:
300
+ # Insert symbols
301
+ symbol_rows = []
302
+ for s in symbols:
303
+ symbol_rows.append(
304
+ (file_id, s.name, s.kind, s.range[0], s.range[1])
305
+ )
306
+
307
+ conn.executemany(
308
+ """
309
+ INSERT INTO symbols(file_id, name, kind, start_line, end_line)
310
+ VALUES(?, ?, ?, ?, ?)
311
+ """,
312
+ symbol_rows,
313
+ )
314
+
315
+ conn.commit()
316
+ return count
317
+
318
+ except sqlite3.DatabaseError as exc:
319
+ conn.rollback()
320
+ raise StorageError(f"Batch insert failed: {exc}") from exc
321
+
322
+ def remove_file(self, full_path: str | Path) -> bool:
323
+ """Remove a file from the index.
324
+
325
+ Args:
326
+ full_path: Complete source file path
327
+
328
+ Returns:
329
+ True if file was removed, False if not found
330
+ """
331
+ with self._lock:
332
+ conn = self._get_connection()
333
+ full_path_str = str(Path(full_path).resolve())
334
+
335
+ row = conn.execute("SELECT id FROM files WHERE full_path=?", (full_path_str,)).fetchone()
336
+ if not row:
337
+ return False
338
+
339
+ file_id = int(row["id"])
340
+ conn.execute("DELETE FROM files WHERE id=?", (file_id,))
341
+ conn.commit()
342
+ self._maybe_delete_global_symbols(full_path_str)
343
+ return True
344
+
345
+ def get_file(self, full_path: str | Path) -> Optional[FileEntry]:
346
+ """Get file metadata.
347
+
348
+ Args:
349
+ full_path: Complete source file path
350
+
351
+ Returns:
352
+ FileEntry if found, None otherwise
353
+ """
354
+ with self._lock:
355
+ conn = self._get_connection()
356
+ full_path_str = str(Path(full_path).resolve())
357
+
358
+ row = conn.execute(
359
+ """
360
+ SELECT id, name, full_path, language, mtime, line_count
361
+ FROM files WHERE full_path=?
362
+ """,
363
+ (full_path_str,),
364
+ ).fetchone()
365
+
366
+ if not row:
367
+ return None
368
+
369
+ return FileEntry(
370
+ id=int(row["id"]),
371
+ name=row["name"],
372
+ full_path=Path(row["full_path"]),
373
+ language=row["language"],
374
+ mtime=float(row["mtime"]) if row["mtime"] else 0.0,
375
+ line_count=int(row["line_count"]) if row["line_count"] else 0,
376
+ )
377
+
378
+ def get_file_mtime(self, full_path: str | Path) -> Optional[float]:
379
+ """Get stored modification time for a file.
380
+
381
+ Args:
382
+ full_path: Complete source file path
383
+
384
+ Returns:
385
+ Modification time as float, or None if not found
386
+ """
387
+ with self._lock:
388
+ conn = self._get_connection()
389
+ full_path_str = str(Path(full_path).resolve())
390
+
391
+ row = conn.execute(
392
+ "SELECT mtime FROM files WHERE full_path=?", (full_path_str,)
393
+ ).fetchone()
394
+
395
+ return float(row["mtime"]) if row and row["mtime"] else None
396
+
397
+ def needs_reindex(self, full_path: str | Path) -> bool:
398
+ """Check if a file needs reindexing based on mtime comparison.
399
+
400
+ Uses 1ms tolerance to handle filesystem timestamp precision variations.
401
+
402
+ Args:
403
+ full_path: Complete source file path
404
+
405
+ Returns:
406
+ True if file should be reindexed (new, modified, or missing from index)
407
+ """
408
+ full_path_obj = Path(full_path).resolve()
409
+ if not full_path_obj.exists():
410
+ return False # File doesn't exist, skip indexing
411
+
412
+ # Get current filesystem mtime
413
+ try:
414
+ current_mtime = full_path_obj.stat().st_mtime
415
+ except OSError:
416
+ return False # Can't read file stats, skip
417
+
418
+ # Get stored mtime from database
419
+ stored_mtime = self.get_file_mtime(full_path_obj)
420
+
421
+ # File not in index, needs indexing
422
+ if stored_mtime is None:
423
+ return True
424
+
425
+ # Compare with 1ms tolerance for floating point precision
426
+ MTIME_TOLERANCE = 0.001
427
+ return abs(current_mtime - stored_mtime) > MTIME_TOLERANCE
428
+
429
+ def add_file_incremental(
430
+ self,
431
+ name: str,
432
+ full_path: str | Path,
433
+ content: str,
434
+ language: str,
435
+ symbols: Optional[List[Symbol]] = None,
436
+ ) -> Optional[int]:
437
+ """Add or update a file only if it has changed (incremental indexing).
438
+
439
+ Checks mtime before indexing to skip unchanged files.
440
+
441
+ Args:
442
+ name: Filename without path
443
+ full_path: Complete source file path
444
+ content: File content for indexing
445
+ language: Programming language identifier
446
+ symbols: List of Symbol objects from the file
447
+
448
+ Returns:
449
+ Database file_id if indexed, None if skipped (unchanged)
450
+
451
+ Raises:
452
+ StorageError: If database operations fail
453
+ """
454
+ # Check if reindexing is needed
455
+ if not self.needs_reindex(full_path):
456
+ return None # Skip unchanged file
457
+
458
+ # File changed or new, perform full indexing
459
+ return self.add_file(name, full_path, content, language, symbols)
460
+
461
    def cleanup_deleted_files(self, source_dir: Path) -> int:
        """Remove indexed files that no longer exist in the source directory.

        Scans source_dir recursively and removes database entries (and any
        attached global-index symbols) for files that have disappeared.
        Entries are compared by resolved absolute path, so indexed files
        that live outside source_dir will always count as "deleted" —
        callers are expected to pass the directory this index covers.

        Args:
            source_dir: Source directory to scan

        Returns:
            Number of deleted file entries removed

        Raises:
            StorageError: If cleanup operations fail
        """
        with self._lock:
            conn = self._get_connection()
            source_dir = source_dir.resolve()

            try:
                # Get all indexed file paths
                rows = conn.execute("SELECT full_path FROM files").fetchall()
                indexed_paths = {row["full_path"] for row in rows}

                # Build set of existing files in source directory
                existing_paths = set()
                for file_path in source_dir.rglob("*"):
                    if file_path.is_file():
                        existing_paths.add(str(file_path.resolve()))

                # Find orphaned entries (indexed but no longer exist)
                deleted_paths = indexed_paths - existing_paths

                # Remove orphaned entries
                deleted_count = 0
                for deleted_path in deleted_paths:
                    conn.execute("DELETE FROM files WHERE full_path=?", (deleted_path,))
                    deleted_count += 1
                    self._maybe_delete_global_symbols(deleted_path)

                # Only commit when something was actually deleted.
                if deleted_count > 0:
                    conn.commit()

                return deleted_count

            except Exception as exc:
                conn.rollback()
                raise StorageError(f"Failed to cleanup deleted files: {exc}") from exc
508
+
509
+ def list_files(self) -> List[FileEntry]:
510
+ """List all files in current directory.
511
+
512
+ Returns:
513
+ List of FileEntry objects
514
+ """
515
+ with self._lock:
516
+ conn = self._get_connection()
517
+ rows = conn.execute(
518
+ """
519
+ SELECT id, name, full_path, language, mtime, line_count
520
+ FROM files
521
+ ORDER BY name
522
+ """
523
+ ).fetchall()
524
+
525
+ return [
526
+ FileEntry(
527
+ id=int(row["id"]),
528
+ name=row["name"],
529
+ full_path=Path(row["full_path"]),
530
+ language=row["language"],
531
+ mtime=float(row["mtime"]) if row["mtime"] else 0.0,
532
+ line_count=int(row["line_count"]) if row["line_count"] else 0,
533
+ )
534
+ for row in rows
535
+ ]
536
+
537
+ def file_count(self) -> int:
538
+ """Get number of files in current directory.
539
+
540
+ Returns:
541
+ File count
542
+ """
543
+ with self._lock:
544
+ conn = self._get_connection()
545
+ row = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()
546
+ return int(row["c"]) if row else 0
547
+
548
+ # === Semantic Metadata ===
549
+
550
    def add_semantic_metadata(
        self,
        file_id: int,
        summary: str,
        keywords: List[str],
        purpose: str,
        llm_tool: str
    ) -> None:
        """Add or update semantic metadata for a file.

        Upserts the summary/purpose row keyed on file_id, then rebuilds the
        file's keyword links in the normalized keywords / file_keywords
        tables. Keyword rows themselves are shared across files and are
        never deleted here; only this file's links are replaced. Keywords
        are stripped of surrounding whitespace, and empty ones are dropped.

        Args:
            file_id: File ID from files table
            summary: LLM-generated summary
            keywords: List of keywords
            purpose: Purpose/role of the file
            llm_tool: Tool used to generate metadata (gemini/qwen)
        """
        with self._lock:
            conn = self._get_connection()

            import time

            generated_at = time.time()

            # Write to semantic_metadata table (without keywords column)
            conn.execute(
                """
                INSERT INTO semantic_metadata(file_id, summary, purpose, llm_tool, generated_at)
                VALUES(?, ?, ?, ?, ?)
                ON CONFLICT(file_id) DO UPDATE SET
                    summary=excluded.summary,
                    purpose=excluded.purpose,
                    llm_tool=excluded.llm_tool,
                    generated_at=excluded.generated_at
                """,
                (file_id, summary, purpose, llm_tool, generated_at),
            )

            # Write to normalized keywords tables for optimized search.
            # First, remove existing keyword associations for this file.
            conn.execute("DELETE FROM file_keywords WHERE file_id = ?", (file_id,))

            # Then add new keywords
            for keyword in keywords:
                keyword = keyword.strip()
                if not keyword:
                    continue

                # Insert keyword if it doesn't exist (shared table, hence
                # INSERT OR IGNORE instead of a plain INSERT)
                conn.execute(
                    "INSERT OR IGNORE INTO keywords(keyword) VALUES(?)",
                    (keyword,)
                )

                # Get keyword_id (either pre-existing or just inserted)
                row = conn.execute(
                    "SELECT id FROM keywords WHERE keyword = ?",
                    (keyword,)
                ).fetchone()

                if row:
                    keyword_id = row["id"]
                    # Link file to keyword
                    conn.execute(
                        "INSERT OR IGNORE INTO file_keywords(file_id, keyword_id) VALUES(?, ?)",
                        (file_id, keyword_id)
                    )

            conn.commit()
619
+
620
+ def get_semantic_metadata(self, file_id: int) -> Optional[Dict[str, Any]]:
621
+ """Get semantic metadata for a file.
622
+
623
+ Args:
624
+ file_id: File ID from files table
625
+
626
+ Returns:
627
+ Dict with summary, keywords, purpose, llm_tool, generated_at, or None if not found
628
+ """
629
+ with self._lock:
630
+ conn = self._get_connection()
631
+
632
+ # Get semantic metadata (without keywords column)
633
+ row = conn.execute(
634
+ """
635
+ SELECT summary, purpose, llm_tool, generated_at
636
+ FROM semantic_metadata WHERE file_id=?
637
+ """,
638
+ (file_id,),
639
+ ).fetchone()
640
+
641
+ if not row:
642
+ return None
643
+
644
+ # Get keywords from normalized file_keywords table
645
+ keyword_rows = conn.execute(
646
+ """
647
+ SELECT k.keyword
648
+ FROM file_keywords fk
649
+ JOIN keywords k ON fk.keyword_id = k.id
650
+ WHERE fk.file_id = ?
651
+ ORDER BY k.keyword
652
+ """,
653
+ (file_id,),
654
+ ).fetchall()
655
+
656
+ keywords = [kw["keyword"] for kw in keyword_rows]
657
+
658
+ return {
659
+ "summary": row["summary"],
660
+ "keywords": keywords,
661
+ "purpose": row["purpose"],
662
+ "llm_tool": row["llm_tool"],
663
+ "generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
664
+ }
665
+
666
+ def get_files_without_semantic(self) -> List[FileEntry]:
667
+ """Get all files that don't have semantic metadata.
668
+
669
+ Returns:
670
+ List of FileEntry objects without semantic metadata
671
+ """
672
+ with self._lock:
673
+ conn = self._get_connection()
674
+
675
+ rows = conn.execute(
676
+ """
677
+ SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
678
+ FROM files f
679
+ LEFT JOIN semantic_metadata sm ON f.id = sm.file_id
680
+ WHERE sm.id IS NULL
681
+ ORDER BY f.name
682
+ """
683
+ ).fetchall()
684
+
685
+ return [
686
+ FileEntry(
687
+ id=int(row["id"]),
688
+ name=row["name"],
689
+ full_path=Path(row["full_path"]),
690
+ language=row["language"],
691
+ mtime=float(row["mtime"]) if row["mtime"] else 0.0,
692
+ line_count=int(row["line_count"]) if row["line_count"] else 0,
693
+ )
694
+ for row in rows
695
+ ]
696
+
697
+ def search_semantic_keywords(self, keyword: str, use_normalized: bool = True) -> List[Tuple[FileEntry, List[str]]]:
698
+ """Search files by semantic keywords.
699
+
700
+ Args:
701
+ keyword: Keyword to search for (case-insensitive)
702
+ use_normalized: Use optimized normalized tables (default: True)
703
+
704
+ Returns:
705
+ List of (FileEntry, keywords) tuples where keyword matches
706
+ """
707
+ with self._lock:
708
+ conn = self._get_connection()
709
+
710
+ if use_normalized:
711
+ # Optimized query using normalized tables with indexed lookup
712
+ # Use prefix search (keyword%) for better index utilization
713
+ keyword_pattern = f"{keyword}%"
714
+
715
+ rows = conn.execute(
716
+ """
717
+ SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count,
718
+ GROUP_CONCAT(k.keyword, ',') as keywords
719
+ FROM files f
720
+ JOIN file_keywords fk ON f.id = fk.file_id
721
+ JOIN keywords k ON fk.keyword_id = k.id
722
+ WHERE k.keyword LIKE ? COLLATE NOCASE
723
+ GROUP BY f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
724
+ ORDER BY f.name
725
+ """,
726
+ (keyword_pattern,),
727
+ ).fetchall()
728
+
729
+ results = []
730
+ for row in rows:
731
+ file_entry = FileEntry(
732
+ id=int(row["id"]),
733
+ name=row["name"],
734
+ full_path=Path(row["full_path"]),
735
+ language=row["language"],
736
+ mtime=float(row["mtime"]) if row["mtime"] else 0.0,
737
+ line_count=int(row["line_count"]) if row["line_count"] else 0,
738
+ )
739
+ keywords = row["keywords"].split(',') if row["keywords"] else []
740
+ results.append((file_entry, keywords))
741
+
742
+ return results
743
+
744
+ else:
745
+ # Fallback using normalized tables with contains matching (slower but more flexible)
746
+ keyword_pattern = f"%{keyword}%"
747
+
748
+ rows = conn.execute(
749
+ """
750
+ SELECT f.id, f.name, f.full_path, f.language, f.mtime, f.line_count,
751
+ GROUP_CONCAT(k.keyword, ',') as keywords
752
+ FROM files f
753
+ JOIN file_keywords fk ON f.id = fk.file_id
754
+ JOIN keywords k ON fk.keyword_id = k.id
755
+ WHERE k.keyword LIKE ? COLLATE NOCASE
756
+ GROUP BY f.id, f.name, f.full_path, f.language, f.mtime, f.line_count
757
+ ORDER BY f.name
758
+ """,
759
+ (keyword_pattern,),
760
+ ).fetchall()
761
+
762
+ results = []
763
+ for row in rows:
764
+ file_entry = FileEntry(
765
+ id=int(row["id"]),
766
+ name=row["name"],
767
+ full_path=Path(row["full_path"]),
768
+ language=row["language"],
769
+ mtime=float(row["mtime"]) if row["mtime"] else 0.0,
770
+ line_count=int(row["line_count"]) if row["line_count"] else 0,
771
+ )
772
+ keywords = row["keywords"].split(',') if row["keywords"] else []
773
+ results.append((file_entry, keywords))
774
+
775
+ return results
776
+
777
    def list_semantic_metadata(
        self,
        offset: int = 0,
        limit: int = 50,
        llm_tool: Optional[str] = None,
    ) -> Tuple[List[Dict[str, Any]], int]:
        """List all semantic metadata with file information.

        Results are ordered newest-first by generation time. The page size
        is capped at 100 regardless of the requested limit. Keywords are
        fetched with one follow-up query per returned row (N+1 pattern,
        bounded by the 100-row cap).

        Args:
            offset: Number of records to skip (for pagination)
            limit: Maximum records to return (max 100)
            llm_tool: Optional filter by LLM tool used

        Returns:
            Tuple of (list of metadata dicts, total count). The total is
            the overall matching row count, independent of pagination.
        """
        with self._lock:
            conn = self._get_connection()

            # Query semantic metadata without keywords column
            base_query = """
                SELECT f.id as file_id, f.name as file_name, f.full_path,
                       f.language, f.line_count,
                       sm.summary, sm.purpose,
                       sm.llm_tool, sm.generated_at
                FROM files f
                JOIN semantic_metadata sm ON f.id = sm.file_id
            """
            count_query = """
                SELECT COUNT(*) as total
                FROM files f
                JOIN semantic_metadata sm ON f.id = sm.file_id
            """

            params: List[Any] = []
            if llm_tool:
                base_query += " WHERE sm.llm_tool = ?"
                count_query += " WHERE sm.llm_tool = ?"
                params.append(llm_tool)

            # Page-size cap: never hand back more than 100 rows.
            base_query += " ORDER BY sm.generated_at DESC LIMIT ? OFFSET ?"
            params.extend([min(limit, 100), offset])

            # The count query takes only the optional filter, not the
            # LIMIT/OFFSET parameters.
            count_params = [llm_tool] if llm_tool else []
            total_row = conn.execute(count_query, count_params).fetchone()
            total = int(total_row["total"]) if total_row else 0

            rows = conn.execute(base_query, params).fetchall()

            results = []
            for row in rows:
                file_id = int(row["file_id"])

                # Get keywords from normalized file_keywords table
                keyword_rows = conn.execute(
                    """
                    SELECT k.keyword
                    FROM file_keywords fk
                    JOIN keywords k ON fk.keyword_id = k.id
                    WHERE fk.file_id = ?
                    ORDER BY k.keyword
                    """,
                    (file_id,),
                ).fetchall()

                keywords = [kw["keyword"] for kw in keyword_rows]

                results.append({
                    "file_id": file_id,
                    "file_name": row["file_name"],
                    "full_path": row["full_path"],
                    "language": row["language"],
                    "line_count": int(row["line_count"]) if row["line_count"] else 0,
                    "summary": row["summary"],
                    "keywords": keywords,
                    "purpose": row["purpose"],
                    "llm_tool": row["llm_tool"],
                    "generated_at": float(row["generated_at"]) if row["generated_at"] else 0.0,
                })

            return results, total
858
+
859
+ # === Subdirectory Links ===
860
+
861
+ def register_subdir(
862
+ self,
863
+ name: str,
864
+ index_path: str | Path,
865
+ files_count: int = 0,
866
+ direct_files: int = 0,
867
+ ) -> None:
868
+ """Register or update a subdirectory link.
869
+
870
+ Args:
871
+ name: Subdirectory name
872
+ index_path: Path to subdirectory's _index.db
873
+ files_count: Total files recursively
874
+ direct_files: Deprecated parameter (no longer used)
875
+ """
876
+ with self._lock:
877
+ conn = self._get_connection()
878
+ index_path_str = str(Path(index_path).resolve())
879
+
880
+ import time
881
+ last_updated = time.time()
882
+
883
+ # Note: direct_files parameter is deprecated but kept for backward compatibility
884
+ conn.execute(
885
+ """
886
+ INSERT INTO subdirs(name, index_path, files_count, last_updated)
887
+ VALUES(?, ?, ?, ?)
888
+ ON CONFLICT(name) DO UPDATE SET
889
+ index_path=excluded.index_path,
890
+ files_count=excluded.files_count,
891
+ last_updated=excluded.last_updated
892
+ """,
893
+ (name, index_path_str, files_count, last_updated),
894
+ )
895
+ conn.commit()
896
+
897
+ def unregister_subdir(self, name: str) -> bool:
898
+ """Remove a subdirectory link.
899
+
900
+ Args:
901
+ name: Subdirectory name
902
+
903
+ Returns:
904
+ True if removed, False if not found
905
+ """
906
+ with self._lock:
907
+ conn = self._get_connection()
908
+ row = conn.execute("SELECT id FROM subdirs WHERE name=?", (name,)).fetchone()
909
+ if not row:
910
+ return False
911
+
912
+ conn.execute("DELETE FROM subdirs WHERE name=?", (name,))
913
+ conn.commit()
914
+ return True
915
+
916
+ def get_subdirs(self) -> List[SubdirLink]:
917
+ """Get all subdirectory links.
918
+
919
+ Returns:
920
+ List of SubdirLink objects
921
+ """
922
+ with self._lock:
923
+ conn = self._get_connection()
924
+ rows = conn.execute(
925
+ """
926
+ SELECT id, name, index_path, files_count, last_updated
927
+ FROM subdirs
928
+ ORDER BY name
929
+ """
930
+ ).fetchall()
931
+
932
+ return [
933
+ SubdirLink(
934
+ id=int(row["id"]),
935
+ name=row["name"],
936
+ index_path=Path(row["index_path"]),
937
+ files_count=int(row["files_count"]) if row["files_count"] else 0,
938
+ last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
939
+ )
940
+ for row in rows
941
+ ]
942
+
943
+ def get_subdir(self, name: str) -> Optional[SubdirLink]:
944
+ """Get a specific subdirectory link.
945
+
946
+ Args:
947
+ name: Subdirectory name
948
+
949
+ Returns:
950
+ SubdirLink if found, None otherwise
951
+ """
952
+ with self._lock:
953
+ conn = self._get_connection()
954
+ row = conn.execute(
955
+ """
956
+ SELECT id, name, index_path, files_count, last_updated
957
+ FROM subdirs WHERE name=?
958
+ """,
959
+ (name,),
960
+ ).fetchone()
961
+
962
+ if not row:
963
+ return None
964
+
965
+ return SubdirLink(
966
+ id=int(row["id"]),
967
+ name=row["name"],
968
+ index_path=Path(row["index_path"]),
969
+ files_count=int(row["files_count"]) if row["files_count"] else 0,
970
+ last_updated=float(row["last_updated"]) if row["last_updated"] else 0.0,
971
+ )
972
+
973
+ def update_subdir_stats(
974
+ self, name: str, files_count: int, direct_files: Optional[int] = None
975
+ ) -> None:
976
+ """Update subdirectory statistics.
977
+
978
+ Args:
979
+ name: Subdirectory name
980
+ files_count: Total files recursively
981
+ direct_files: Deprecated parameter (no longer used)
982
+ """
983
+ with self._lock:
984
+ conn = self._get_connection()
985
+ import time
986
+ last_updated = time.time()
987
+
988
+ # Note: direct_files parameter is deprecated but kept for backward compatibility
989
+ conn.execute(
990
+ """
991
+ UPDATE subdirs
992
+ SET files_count=?, last_updated=?
993
+ WHERE name=?
994
+ """,
995
+ (files_count, last_updated, name),
996
+ )
997
+ conn.commit()
998
+
999
+ # === Search ===
1000
+
1001
+ @staticmethod
1002
+ def _enhance_fts_query(query: str) -> str:
1003
+ """Enhance FTS5 query to support prefix matching for simple queries.
1004
+
1005
+ For simple single-word or multi-word queries without FTS5 operators,
1006
+ automatically adds prefix wildcard (*) to enable partial matching.
1007
+
1008
+ Examples:
1009
+ "loadPack" -> "loadPack*"
1010
+ "load package" -> "load* package*"
1011
+ "load*" -> "load*" (already has wildcard, unchanged)
1012
+ "NOT test" -> "NOT test" (has FTS operator, unchanged)
1013
+
1014
+ Args:
1015
+ query: Original FTS5 query string
1016
+
1017
+ Returns:
1018
+ Enhanced query string with prefix wildcards for simple queries
1019
+ """
1020
+ # Don't modify if query already contains FTS5 operators or wildcards
1021
+ if any(op in query.upper() for op in [' AND ', ' OR ', ' NOT ', ' NEAR ', '*', '"']):
1022
+ return query
1023
+
1024
+ # For simple queries, add prefix wildcard to each word
1025
+ words = query.split()
1026
+ enhanced_words = [f"{word}*" if not word.endswith('*') else word for word in words]
1027
+ return ' '.join(enhanced_words)
1028
+
1029
+ def _find_match_lines(self, content: str, query: str) -> List[int]:
1030
+ """Find line numbers where query terms match.
1031
+
1032
+ Args:
1033
+ content: File content
1034
+ query: Search query (FTS5 format)
1035
+
1036
+ Returns:
1037
+ List of 1-based line numbers containing matches
1038
+ """
1039
+ # Extract search terms from FTS query (remove operators)
1040
+ terms = re.findall(r'["\']([^"\']+)["\']|(\w+)', query)
1041
+ search_terms = [t[0] or t[1] for t in terms if t[0] or t[1]]
1042
+ # Filter out FTS operators
1043
+ fts_operators = {'AND', 'OR', 'NOT', 'NEAR'}
1044
+ search_terms = [t for t in search_terms if t.upper() not in fts_operators]
1045
+
1046
+ if not search_terms:
1047
+ return [1] # Default to first line
1048
+
1049
+ lines = content.split('\n')
1050
+ match_lines = []
1051
+
1052
+ for i, line in enumerate(lines, 1):
1053
+ line_lower = line.lower()
1054
+ for term in search_terms:
1055
+ # Handle wildcard suffix
1056
+ term_clean = term.rstrip('*').lower()
1057
+ if term_clean and term_clean in line_lower:
1058
+ match_lines.append(i)
1059
+ break
1060
+
1061
+ return match_lines if match_lines else [1]
1062
+
1063
+ def _find_containing_symbol(
1064
+ self, conn: sqlite3.Connection, file_id: int, line_num: int
1065
+ ) -> Optional[Tuple[int, int, str, str]]:
1066
+ """Find the symbol that contains the given line number.
1067
+
1068
+ Args:
1069
+ conn: Database connection
1070
+ file_id: File ID in database
1071
+ line_num: 1-based line number
1072
+
1073
+ Returns:
1074
+ Tuple of (start_line, end_line, symbol_name, symbol_kind) or None
1075
+ """
1076
+ row = conn.execute(
1077
+ """
1078
+ SELECT start_line, end_line, name, kind
1079
+ FROM symbols
1080
+ WHERE file_id = ? AND start_line <= ? AND end_line >= ?
1081
+ ORDER BY (end_line - start_line) ASC
1082
+ LIMIT 1
1083
+ """,
1084
+ (file_id, line_num, line_num),
1085
+ ).fetchone()
1086
+
1087
+ if row:
1088
+ return (row["start_line"], row["end_line"], row["name"], row["kind"])
1089
+ return None
1090
+
1091
+ def _extract_code_block(
1092
+ self,
1093
+ content: str,
1094
+ start_line: int,
1095
+ end_line: int,
1096
+ match_line: Optional[int] = None,
1097
+ context_lines: int = 5,
1098
+ ) -> Tuple[str, int, int]:
1099
+ """Extract code block from content.
1100
+
1101
+ If start_line/end_line are provided (from symbol), use them.
1102
+ Otherwise, extract context around match_line.
1103
+
1104
+ Args:
1105
+ content: Full file content
1106
+ start_line: 1-based start line (from symbol or calculated)
1107
+ end_line: 1-based end line (from symbol or calculated)
1108
+ match_line: 1-based line where match occurred (for context extraction)
1109
+ context_lines: Number of lines before/after match when no symbol
1110
+
1111
+ Returns:
1112
+ Tuple of (code_block, actual_start_line, actual_end_line)
1113
+ """
1114
+ lines = content.split('\n')
1115
+ total_lines = len(lines)
1116
+
1117
+ # Clamp to valid range
1118
+ start_line = max(1, start_line)
1119
+ end_line = min(total_lines, end_line)
1120
+
1121
+ # Extract block (convert to 0-based index)
1122
+ block_lines = lines[start_line - 1:end_line]
1123
+ block_content = '\n'.join(block_lines)
1124
+
1125
+ return block_content, start_line, end_line
1126
+
1127
+ def _batch_fetch_symbols(
1128
+ self, conn: sqlite3.Connection, file_ids: List[int]
1129
+ ) -> Dict[int, List[Tuple[int, int, str, str]]]:
1130
+ """Batch fetch all symbols for multiple files in a single query.
1131
+
1132
+ Args:
1133
+ conn: Database connection
1134
+ file_ids: List of file IDs to fetch symbols for
1135
+
1136
+ Returns:
1137
+ Dictionary mapping file_id to list of (start_line, end_line, name, kind) tuples
1138
+ """
1139
+ if not file_ids:
1140
+ return {}
1141
+
1142
+ # Build placeholder string for IN clause
1143
+ placeholders = ','.join('?' for _ in file_ids)
1144
+ rows = conn.execute(
1145
+ f"""
1146
+ SELECT file_id, start_line, end_line, name, kind
1147
+ FROM symbols
1148
+ WHERE file_id IN ({placeholders})
1149
+ ORDER BY file_id, (end_line - start_line) ASC
1150
+ """,
1151
+ file_ids,
1152
+ ).fetchall()
1153
+
1154
+ # Organize symbols by file_id
1155
+ symbols_by_file: Dict[int, List[Tuple[int, int, str, str]]] = {fid: [] for fid in file_ids}
1156
+ for row in rows:
1157
+ symbols_by_file[row["file_id"]].append(
1158
+ (row["start_line"], row["end_line"], row["name"], row["kind"])
1159
+ )
1160
+ return symbols_by_file
1161
+
1162
+ def _find_containing_symbol_from_cache(
1163
+ self, symbols: List[Tuple[int, int, str, str]], line_num: int
1164
+ ) -> Optional[Tuple[int, int, str, str]]:
1165
+ """Find the smallest symbol containing the given line number from cached symbols.
1166
+
1167
+ Args:
1168
+ symbols: List of (start_line, end_line, name, kind) tuples, sorted by size
1169
+ line_num: 1-based line number
1170
+
1171
+ Returns:
1172
+ Tuple of (start_line, end_line, symbol_name, symbol_kind) or None
1173
+ """
1174
+ for start_line, end_line, name, kind in symbols:
1175
+ if start_line <= line_num <= end_line:
1176
+ return (start_line, end_line, name, kind)
1177
+ return None
1178
+
1179
+ def _generate_centered_excerpt(
1180
+ self, content: str, match_line: int, start_line: int, end_line: int, max_chars: int = 200
1181
+ ) -> str:
1182
+ """Generate excerpt centered around the match line.
1183
+
1184
+ Args:
1185
+ content: Full file content
1186
+ match_line: 1-based line where match occurred
1187
+ start_line: 1-based start line of the code block
1188
+ end_line: 1-based end line of the code block
1189
+ max_chars: Maximum characters for excerpt
1190
+
1191
+ Returns:
1192
+ Excerpt string centered around the match
1193
+ """
1194
+ lines = content.split('\n')
1195
+ total_lines = len(lines)
1196
+
1197
+ # Ensure match_line is within bounds
1198
+ match_line = max(1, min(match_line, total_lines))
1199
+
1200
+ # Calculate context window (2 lines before, 2 lines after the match)
1201
+ ctx_start = max(start_line, match_line - 2)
1202
+ ctx_end = min(end_line, match_line + 2)
1203
+
1204
+ # Extract and join lines
1205
+ excerpt_lines = lines[ctx_start - 1:ctx_end]
1206
+ excerpt = '\n'.join(excerpt_lines)
1207
+
1208
+ # Truncate if too long
1209
+ if len(excerpt) > max_chars:
1210
+ excerpt = excerpt[:max_chars] + "..."
1211
+
1212
+ return excerpt
1213
+
1214
    def _search_internal(
        self,
        query: str,
        fts_table: str,
        limit: int = 20,
        return_full_content: bool = False,
        context_lines: int = 10,
    ) -> List[SearchResult]:
        """Internal unified search implementation for all FTS modes.

        Optimizations:
        - Fast path: Direct FTS query with snippet() for location-only results
        - Full content path: Batch fetch symbols to eliminate N+1 queries
        - Centered excerpt generation for better context

        Args:
            query: FTS5 query string
            fts_table: FTS table name ('files_fts_exact' or 'files_fts_fuzzy')
            limit: Maximum results to return
            return_full_content: If True, include full code block in content field
            context_lines: Lines of context when no symbol contains the match

        Returns:
            List of SearchResult objects

        Raises:
            StorageError: If the FTS query fails at the SQLite level
        """
        with self._lock:
            conn = self._get_connection()

            # Fast path: location-only results (no content processing)
            if not return_full_content:
                try:
                    # snippet() column index 2 is 'content'; 30 tokens max.
                    rows = conn.execute(
                        f"""
                        SELECT rowid, full_path, bm25({fts_table}) AS rank,
                               snippet({fts_table}, 2, '', '', '...', 30) AS excerpt
                        FROM {fts_table}
                        WHERE {fts_table} MATCH ?
                        ORDER BY rank
                        LIMIT ?
                        """,
                        (query, limit),
                    ).fetchall()
                except sqlite3.DatabaseError as exc:
                    raise StorageError(f"FTS search failed: {exc}") from exc

                results: List[SearchResult] = []
                for row in rows:
                    rank = float(row["rank"]) if row["rank"] is not None else 0.0
                    # bm25() returns negative values for better matches, so a
                    # negative rank maps to a positive score; non-negative -> 0.
                    score = abs(rank) if rank < 0 else 0.0
                    results.append(
                        SearchResult(
                            path=row["full_path"],
                            score=score,
                            excerpt=row["excerpt"],
                        )
                    )
                return results

            # Full content path with batch optimization
            # Step 1: Get file_ids and ranks (lightweight query)
            try:
                id_rows = conn.execute(
                    f"""
                    SELECT rowid AS file_id, bm25({fts_table}) AS rank
                    FROM {fts_table}
                    WHERE {fts_table} MATCH ?
                    ORDER BY rank
                    LIMIT ?
                    """,
                    (query, limit),
                ).fetchall()
            except sqlite3.DatabaseError as exc:
                raise StorageError(f"FTS search failed: {exc}") from exc

            if not id_rows:
                return []

            file_ids = [row["file_id"] for row in id_rows]
            # NOTE(review): unlike the fast path, rank is not None-checked
            # here; bm25() should be non-NULL for matched rows — confirm.
            ranks_by_id = {row["file_id"]: row["rank"] for row in id_rows}

            # Step 2: Batch fetch all symbols for matched files (eliminates N+1)
            symbols_by_file = self._batch_fetch_symbols(conn, file_ids)

            # Step 3: Process each file on-demand (reduces memory)
            results: List[SearchResult] = []
            for file_id in file_ids:
                # Fetch file content on-demand
                file_row = conn.execute(
                    "SELECT full_path, content FROM files WHERE id = ?",
                    (file_id,),
                ).fetchone()

                # Row may have been deleted between queries; skip silently.
                if not file_row:
                    continue

                file_path = file_row["full_path"]
                content = file_row["content"] or ""
                rank = ranks_by_id.get(file_id, 0.0)
                score = abs(rank) if rank < 0 else 0.0

                # Find matching lines (falls back to line 1 when the helper
                # cannot locate the query text in the raw content)
                match_lines = self._find_match_lines(content, query)
                first_match_line = match_lines[0] if match_lines else 1

                # Find symbol from cached symbols (no extra SQL query)
                file_symbols = symbols_by_file.get(file_id, [])
                symbol_info = self._find_containing_symbol_from_cache(file_symbols, first_match_line)

                if symbol_info:
                    start_line, end_line, symbol_name, symbol_kind = symbol_info
                else:
                    # No symbol found, use context_lines around the match
                    lines = content.split('\n')
                    total_lines = len(lines)
                    start_line = max(1, first_match_line - context_lines)
                    end_line = min(total_lines, first_match_line + context_lines)
                    symbol_name = None
                    symbol_kind = None

                # Extract code block (also re-clamps the line range)
                block_content, start_line, end_line = self._extract_code_block(
                    content, start_line, end_line
                )

                # Generate centered excerpt (improved quality over snippet())
                excerpt = self._generate_centered_excerpt(
                    content, first_match_line, start_line, end_line
                )

                results.append(
                    SearchResult(
                        path=file_path,
                        score=score,
                        excerpt=excerpt,
                        content=block_content,
                        start_line=start_line,
                        end_line=end_line,
                        symbol_name=symbol_name,
                        symbol_kind=symbol_kind,
                    )
                )
            return results
1356
+
1357
+
1358
+ def search_fts(
1359
+ self,
1360
+ query: str,
1361
+ limit: int = 20,
1362
+ enhance_query: bool = False,
1363
+ return_full_content: bool = False,
1364
+ context_lines: int = 10,
1365
+ ) -> List[SearchResult]:
1366
+ """Full-text search in current directory files.
1367
+
1368
+ Uses files_fts_exact (unicode61 tokenizer) for exact token matching.
1369
+ For fuzzy/substring search, use search_fts_fuzzy() instead.
1370
+
1371
+ Best Practice (from industry analysis of Codanna/Code-Index-MCP):
1372
+ - Default: Respects exact user input without modification
1373
+ - Users can manually add wildcards (e.g., "loadPack*") for prefix matching
1374
+ - Automatic enhancement (enhance_query=True) is NOT recommended as it can
1375
+ violate user intent and bring unwanted noise in results
1376
+
1377
+ Args:
1378
+ query: FTS5 query string
1379
+ limit: Maximum results to return
1380
+ enhance_query: If True, automatically add prefix wildcards for simple queries.
1381
+ Default False to respect exact user input.
1382
+ return_full_content: If True, include full code block in content field.
1383
+ Default False for fast location-only results.
1384
+ context_lines: Lines of context when no symbol contains the match
1385
+
1386
+ Returns:
1387
+ List of SearchResult objects (location-only by default, with content if requested)
1388
+
1389
+ Raises:
1390
+ StorageError: If FTS search fails
1391
+ """
1392
+ final_query = self._enhance_fts_query(query) if enhance_query else query
1393
+ return self._search_internal(
1394
+ query=final_query,
1395
+ fts_table='files_fts_exact',
1396
+ limit=limit,
1397
+ return_full_content=return_full_content,
1398
+ context_lines=context_lines,
1399
+ )
1400
+
1401
+ def search_fts_exact(
1402
+ self,
1403
+ query: str,
1404
+ limit: int = 20,
1405
+ return_full_content: bool = False,
1406
+ context_lines: int = 10,
1407
+ ) -> List[SearchResult]:
1408
+ """Full-text search using exact token matching.
1409
+
1410
+ Args:
1411
+ query: FTS5 query string
1412
+ limit: Maximum results to return
1413
+ return_full_content: If True, include full code block in content field.
1414
+ Default False for fast location-only results.
1415
+ context_lines: Lines of context when no symbol contains the match
1416
+
1417
+ Returns:
1418
+ List of SearchResult objects (location-only by default, with content if requested)
1419
+
1420
+ Raises:
1421
+ StorageError: If FTS search fails
1422
+ """
1423
+ return self._search_internal(
1424
+ query=query,
1425
+ fts_table='files_fts_exact',
1426
+ limit=limit,
1427
+ return_full_content=return_full_content,
1428
+ context_lines=context_lines,
1429
+ )
1430
+
1431
+ def search_fts_fuzzy(
1432
+ self,
1433
+ query: str,
1434
+ limit: int = 20,
1435
+ return_full_content: bool = False,
1436
+ context_lines: int = 10,
1437
+ ) -> List[SearchResult]:
1438
+ """Full-text search using fuzzy/substring matching.
1439
+
1440
+ Args:
1441
+ query: FTS5 query string
1442
+ limit: Maximum results to return
1443
+ return_full_content: If True, include full code block in content field.
1444
+ Default False for fast location-only results.
1445
+ context_lines: Lines of context when no symbol contains the match
1446
+
1447
+ Returns:
1448
+ List of SearchResult objects (location-only by default, with content if requested)
1449
+
1450
+ Raises:
1451
+ StorageError: If FTS search fails
1452
+ """
1453
+ return self._search_internal(
1454
+ query=query,
1455
+ fts_table='files_fts_fuzzy',
1456
+ limit=limit,
1457
+ return_full_content=return_full_content,
1458
+ context_lines=context_lines,
1459
+ )
1460
+
1461
+ def search_files_only(self, query: str, limit: int = 20) -> List[str]:
1462
+ """Fast FTS search returning only file paths (no snippet generation).
1463
+
1464
+ Optimized for when only file paths are needed, skipping expensive
1465
+ snippet() function call.
1466
+
1467
+ Args:
1468
+ query: FTS5 query string
1469
+ limit: Maximum results to return
1470
+
1471
+ Returns:
1472
+ List of file paths as strings
1473
+
1474
+ Raises:
1475
+ StorageError: If FTS search fails
1476
+ """
1477
+ with self._lock:
1478
+ conn = self._get_connection()
1479
+ try:
1480
+ rows = conn.execute(
1481
+ """
1482
+ SELECT full_path
1483
+ FROM files_fts
1484
+ WHERE files_fts MATCH ?
1485
+ ORDER BY bm25(files_fts)
1486
+ LIMIT ?
1487
+ """,
1488
+ (query, limit),
1489
+ ).fetchall()
1490
+ except sqlite3.DatabaseError as exc:
1491
+ raise StorageError(f"FTS search failed: {exc}") from exc
1492
+
1493
+ return [row["full_path"] for row in rows]
1494
+
1495
+ def search_symbols(
1496
+ self, name: str, kind: Optional[str] = None, limit: int = 50, prefix_mode: bool = True
1497
+ ) -> List[Symbol]:
1498
+ """Search symbols by name pattern.
1499
+
1500
+ Args:
1501
+ name: Symbol name pattern
1502
+ kind: Optional symbol kind filter
1503
+ limit: Maximum results to return
1504
+ prefix_mode: If True, use prefix search (faster with index);
1505
+ If False, use substring search (slower)
1506
+
1507
+ Returns:
1508
+ List of Symbol objects
1509
+ """
1510
+ # Prefix search is much faster as it can use index
1511
+ if prefix_mode:
1512
+ pattern = f"{name}%"
1513
+ else:
1514
+ pattern = f"%{name}%"
1515
+
1516
+ with self._lock:
1517
+ conn = self._get_connection()
1518
+ if kind:
1519
+ rows = conn.execute(
1520
+ """
1521
+ SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
1522
+ FROM symbols s
1523
+ JOIN files f ON s.file_id = f.id
1524
+ WHERE s.name LIKE ? AND s.kind=?
1525
+ ORDER BY s.name
1526
+ LIMIT ?
1527
+ """,
1528
+ (pattern, kind, limit),
1529
+ ).fetchall()
1530
+ else:
1531
+ rows = conn.execute(
1532
+ """
1533
+ SELECT s.name, s.kind, s.start_line, s.end_line, f.full_path
1534
+ FROM symbols s
1535
+ JOIN files f ON s.file_id = f.id
1536
+ WHERE s.name LIKE ?
1537
+ ORDER BY s.name
1538
+ LIMIT ?
1539
+ """,
1540
+ (pattern, limit),
1541
+ ).fetchall()
1542
+
1543
+ return [
1544
+ Symbol(
1545
+ name=row["name"],
1546
+ kind=row["kind"],
1547
+ range=(row["start_line"], row["end_line"]),
1548
+ file=row["full_path"],
1549
+ )
1550
+ for row in rows
1551
+ ]
1552
+
1553
+ # === Statistics ===
1554
+
1555
+ def stats(self) -> Dict[str, Any]:
1556
+ """Get current directory statistics.
1557
+
1558
+ Returns:
1559
+ Dictionary containing:
1560
+ - files: Number of files in this directory
1561
+ - symbols: Number of symbols
1562
+ - subdirs: Number of subdirectories
1563
+ - total_files: Total files including subdirectories
1564
+ - languages: Dictionary of language counts
1565
+ """
1566
+ with self._lock:
1567
+ conn = self._get_connection()
1568
+
1569
+ file_count = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"]
1570
+ symbol_count = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"]
1571
+ subdir_count = conn.execute("SELECT COUNT(*) AS c FROM subdirs").fetchone()["c"]
1572
+
1573
+ total_files_row = conn.execute(
1574
+ "SELECT COALESCE(SUM(files_count), 0) AS total FROM subdirs"
1575
+ ).fetchone()
1576
+ total_files = int(file_count) + int(total_files_row["total"] if total_files_row else 0)
1577
+
1578
+ lang_rows = conn.execute(
1579
+ "SELECT language, COUNT(*) AS c FROM files GROUP BY language ORDER BY c DESC"
1580
+ ).fetchall()
1581
+ languages = {row["language"]: int(row["c"]) for row in lang_rows}
1582
+
1583
+ return {
1584
+ "files": int(file_count),
1585
+ "symbols": int(symbol_count),
1586
+ "subdirs": int(subdir_count),
1587
+ "total_files": total_files,
1588
+ "languages": languages,
1589
+ }
1590
+
1591
+ # === Internal Methods ===
1592
+
1593
+ def _get_connection(self) -> sqlite3.Connection:
1594
+ """Get or create database connection with proper configuration.
1595
+
1596
+ Returns:
1597
+ sqlite3.Connection with WAL mode and foreign keys enabled
1598
+ """
1599
+ if self._conn is None:
1600
+ self._conn = sqlite3.connect(str(self.db_path), check_same_thread=False)
1601
+ self._conn.row_factory = sqlite3.Row
1602
+ self._conn.execute("PRAGMA journal_mode=WAL")
1603
+ self._conn.execute("PRAGMA synchronous=NORMAL")
1604
+ self._conn.execute("PRAGMA foreign_keys=ON")
1605
+ # Memory-mapped I/O for faster reads (30GB limit)
1606
+ self._conn.execute("PRAGMA mmap_size=30000000000")
1607
+ return self._conn
1608
+
1609
+ def _maybe_update_global_symbols(self, file_path: str, symbols: List[Symbol]) -> None:
1610
+ if self._global_index is None:
1611
+ return
1612
+ if self._config is not None and not getattr(self._config, "global_symbol_index_enabled", True):
1613
+ return
1614
+ try:
1615
+ self._global_index.update_file_symbols(
1616
+ file_path=file_path,
1617
+ symbols=symbols,
1618
+ index_path=str(self.db_path),
1619
+ )
1620
+ except Exception as exc:
1621
+ # Global index is an optimization; local directory index remains authoritative.
1622
+ self.logger.debug("Global symbol index update failed for %s: %s", file_path, exc)
1623
+
1624
+ def _maybe_delete_global_symbols(self, file_path: str) -> None:
1625
+ if self._global_index is None:
1626
+ return
1627
+ if self._config is not None and not getattr(self._config, "global_symbol_index_enabled", True):
1628
+ return
1629
+ try:
1630
+ self._global_index.delete_file_symbols(file_path)
1631
+ except Exception as exc:
1632
+ self.logger.debug("Global symbol index delete failed for %s: %s", file_path, exc)
1633
+
1634
    def _create_schema(self, conn: sqlite3.Connection) -> None:
        """Create database schema (tables, dual FTS5 tables, indexes).

        All statements use IF NOT EXISTS, so the method is idempotent and
        safe to run against an existing database. Tables are created
        before their indexes; FTS tables are external-content tables
        backed by ``files`` and kept in sync by triggers created
        elsewhere (see _create_fts_triggers).

        Args:
            conn: Database connection

        Raises:
            StorageError: If schema creation fails
        """
        try:
            # Files table: one row per indexed file, content stored inline
            # so FTS external-content tables can reference it.
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS files (
                    id INTEGER PRIMARY KEY,
                    name TEXT NOT NULL,
                    full_path TEXT UNIQUE NOT NULL,
                    language TEXT,
                    content TEXT,
                    mtime REAL,
                    line_count INTEGER
                )
                """
            )

            # Subdirectories table (v5: removed direct_files)
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS subdirs (
                    id INTEGER PRIMARY KEY,
                    name TEXT NOT NULL UNIQUE,
                    index_path TEXT NOT NULL,
                    files_count INTEGER DEFAULT 0,
                    last_updated REAL
                )
                """
            )

            # Symbols table with token metadata; cascade keeps symbols in
            # sync when their file row is deleted.
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS symbols (
                    id INTEGER PRIMARY KEY,
                    file_id INTEGER REFERENCES files(id) ON DELETE CASCADE,
                    name TEXT NOT NULL,
                    kind TEXT NOT NULL,
                    start_line INTEGER,
                    end_line INTEGER
                )
                """
            )

            # Dual FTS5 external content tables for exact and fuzzy matching
            # files_fts_exact: unicode61 tokenizer for exact token matching
            # files_fts_fuzzy: trigram tokenizer (or extended unicode61) for substring/fuzzy matching
            from codexlens.storage.sqlite_utils import check_trigram_support

            # Trigram tokenizer requires SQLite >= 3.34; fall back to an
            # extended unicode61 tokenizer when unavailable.
            has_trigram = check_trigram_support(conn)
            fuzzy_tokenizer = "trigram" if has_trigram else "unicode61 tokenchars '_-.'"

            # Exact FTS table with unicode61 tokenizer
            # Note: tokenchars includes '.' to properly tokenize qualified names like PortRole.FLOW
            conn.execute(
                """
                CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_exact USING fts5(
                    name, full_path UNINDEXED, content,
                    content='files',
                    content_rowid='id',
                    tokenize="unicode61 tokenchars '_-.'"
                )
                """
            )

            # Fuzzy FTS table with trigram or extended unicode61 tokenizer
            conn.execute(
                f"""
                CREATE VIRTUAL TABLE IF NOT EXISTS files_fts_fuzzy USING fts5(
                    name, full_path UNINDEXED, content,
                    content='files',
                    content_rowid='id',
                    tokenize="{fuzzy_tokenizer}"
                )
                """
            )

            # Semantic metadata table (v5: removed keywords column)
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS semantic_metadata (
                    id INTEGER PRIMARY KEY,
                    file_id INTEGER UNIQUE REFERENCES files(id) ON DELETE CASCADE,
                    summary TEXT,
                    purpose TEXT,
                    llm_tool TEXT,
                    generated_at REAL
                )
                """
            )

            # Normalized keywords tables for performance: keywords are
            # deduplicated and linked to files via a junction table.
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS keywords (
                    id INTEGER PRIMARY KEY,
                    keyword TEXT NOT NULL UNIQUE
                )
                """
            )

            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS file_keywords (
                    file_id INTEGER NOT NULL,
                    keyword_id INTEGER NOT NULL,
                    PRIMARY KEY (file_id, keyword_id),
                    FOREIGN KEY (file_id) REFERENCES files (id) ON DELETE CASCADE,
                    FOREIGN KEY (keyword_id) REFERENCES keywords (id) ON DELETE CASCADE
                )
                """
            )

            # Code relationships table for graph visualization
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS code_relationships (
                    id INTEGER PRIMARY KEY,
                    source_symbol_id INTEGER NOT NULL,
                    target_qualified_name TEXT NOT NULL,
                    relationship_type TEXT NOT NULL,
                    source_line INTEGER NOT NULL,
                    target_file TEXT,
                    FOREIGN KEY (source_symbol_id) REFERENCES symbols (id) ON DELETE CASCADE
                )
                """
            )

            # Indexes (v5: removed idx_symbols_type)
            conn.execute("CREATE INDEX IF NOT EXISTS idx_files_name ON files(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(full_path)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_subdirs_name ON subdirs(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_semantic_file ON semantic_metadata(file_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON keywords(keyword)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_file_id ON file_keywords(file_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_file_keywords_keyword_id ON file_keywords(keyword_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_source ON code_relationships(source_symbol_id)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_target ON code_relationships(target_qualified_name)")
            conn.execute("CREATE INDEX IF NOT EXISTS idx_rel_type ON code_relationships(relationship_type)")

        except sqlite3.DatabaseError as exc:
            raise StorageError(f"Failed to create schema: {exc}") from exc
1786
+
1787
+ def _migrate_v2_add_name_column(self, conn: sqlite3.Connection) -> None:
1788
+ """Migration v2: Add 'name' column to files table.
1789
+
1790
+ Required for FTS5 external content table.
1791
+
1792
+ Args:
1793
+ conn: Database connection
1794
+ """
1795
+ # Check if files table exists and has columns
1796
+ cursor = conn.execute("PRAGMA table_info(files)")
1797
+ files_columns = {row[1] for row in cursor.fetchall()}
1798
+
1799
+ if not files_columns:
1800
+ return # No files table yet, will be created fresh
1801
+
1802
+ # Skip if 'name' column already exists
1803
+ if "name" in files_columns:
1804
+ return
1805
+
1806
+ # Add 'name' column with default value
1807
+ conn.execute("ALTER TABLE files ADD COLUMN name TEXT NOT NULL DEFAULT ''")
1808
+
1809
+ # Populate 'name' column from full_path using pathlib for robustness
1810
+ rows = conn.execute("SELECT id, full_path FROM files WHERE name = ''").fetchall()
1811
+ for row in rows:
1812
+ file_id = row[0]
1813
+ full_path = row[1]
1814
+ # Use pathlib.Path.name for cross-platform compatibility
1815
+ name = Path(full_path).name if full_path else ""
1816
+ conn.execute("UPDATE files SET name = ? WHERE id = ?", (name, file_id))
1817
+
1818
    def _create_fts_triggers(self, conn: sqlite3.Connection) -> None:
        """Create FTS5 external content triggers for the dual FTS tables.

        Creates insert/delete/update triggers for both files_fts_exact and
        files_fts_fuzzy so the FTS indexes stay synchronized with the
        ``files`` table. Note the FTS5 external-content convention: rows
        are removed by INSERTing a special 'delete' command row naming the
        old values, and an UPDATE is expressed as that delete followed by
        a regular insert of the new values.

        Args:
            conn: Database connection
        """
        # Insert trigger for files_fts_exact
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_exact_ai AFTER INSERT ON files BEGIN
              INSERT INTO files_fts_exact(rowid, name, full_path, content)
              VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Delete trigger for files_fts_exact ('delete' command row)
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_exact_ad AFTER DELETE ON files BEGIN
              INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
              VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )

        # Update trigger for files_fts_exact (delete old row, insert new)
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_exact_au AFTER UPDATE ON files BEGIN
              INSERT INTO files_fts_exact(files_fts_exact, rowid, name, full_path, content)
              VALUES('delete', old.id, old.name, old.full_path, old.content);
              INSERT INTO files_fts_exact(rowid, name, full_path, content)
              VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Insert trigger for files_fts_fuzzy
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_fuzzy_ai AFTER INSERT ON files BEGIN
              INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
              VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )

        # Delete trigger for files_fts_fuzzy ('delete' command row)
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_fuzzy_ad AFTER DELETE ON files BEGIN
              INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
              VALUES('delete', old.id, old.name, old.full_path, old.content);
            END
            """
        )

        # Update trigger for files_fts_fuzzy (delete old row, insert new)
        conn.execute(
            """
            CREATE TRIGGER IF NOT EXISTS files_fuzzy_au AFTER UPDATE ON files BEGIN
              INSERT INTO files_fts_fuzzy(files_fts_fuzzy, rowid, name, full_path, content)
              VALUES('delete', old.id, old.name, old.full_path, old.content);
              INSERT INTO files_fts_fuzzy(rowid, name, full_path, content)
              VALUES(new.id, new.name, new.full_path, new.content);
            END
            """
        )