mcp-vector-search 0.9.3__py3-none-any.whl → 0.12.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -0,0 +1,318 @@
+ """Directory index for tracking project structure and file relationships."""
+
+ import json
+ from collections import defaultdict
+ from pathlib import Path
+ from typing import Any
+
+ from loguru import logger
+
+ from .models import Directory
+
+
+ class DirectoryIndex:
+     """Manages directory structure and file-directory relationships."""
+
+     def __init__(self, index_path: Path) -> None:
+         """Initialize directory index.
+
+         Args:
+             index_path: Path to directory index file (JSON)
+         """
+         self.index_path = index_path
+         self.directories: dict[str, Directory] = {}  # path -> Directory
+         self.file_to_directory: dict[str, str] = {}  # file_path -> directory_path
+         self.directory_files: dict[str, list[str]] = defaultdict(
+             list
+         )  # dir_path -> [file_paths]
+
+     def load(self) -> None:
+         """Load directory index from disk."""
+         if not self.index_path.exists():
+             logger.debug("No directory index found, starting fresh")
+             return
+
+         try:
+             with open(self.index_path) as f:
+                 data = json.load(f)
+
+             # Load directories
+             for dir_data in data.get("directories", []):
+                 directory = Directory.from_dict(dir_data)
+                 self.directories[str(directory.path)] = directory
+
+             # Load file mappings
+             self.file_to_directory = data.get("file_to_directory", {})
+
+             # Rebuild directory_files from file_to_directory
+             self.directory_files = defaultdict(list)
+             for file_path, dir_path in self.file_to_directory.items():
+                 self.directory_files[dir_path].append(file_path)
+
+             logger.info(f"Loaded {len(self.directories)} directories from index")
+
+         except Exception as e:
+             logger.error(f"Failed to load directory index: {e}")
+             self.directories = {}
+             self.file_to_directory = {}
+             self.directory_files = defaultdict(list)
+
+     def save(self) -> None:
+         """Save directory index to disk."""
+         try:
+             # Ensure parent directory exists
+             self.index_path.parent.mkdir(parents=True, exist_ok=True)
+
+             data = {
+                 "directories": [d.to_dict() for d in self.directories.values()],
+                 "file_to_directory": self.file_to_directory,
+             }
+
+             with open(self.index_path, "w") as f:
+                 json.dump(data, f, indent=2)
+
+             logger.debug(f"Saved {len(self.directories)} directories to index")
+
+         except Exception as e:
+             logger.error(f"Failed to save directory index: {e}")
+             raise
+
+     def add_directory(self, directory: Directory) -> None:
+         """Add or update a directory in the index.
+
+         Args:
+             directory: Directory to add
+         """
+         dir_path = str(directory.path)
+         self.directories[dir_path] = directory
+
+     def add_file(self, file_path: Path, directory_path: Path) -> None:
+         """Associate a file with its directory.
+
+         Args:
+             file_path: Path to the file
+             directory_path: Path to the directory containing the file
+         """
+         file_path_str = str(file_path)
+         dir_path_str = str(directory_path)
+
+         self.file_to_directory[file_path_str] = dir_path_str
+         if file_path_str not in self.directory_files[dir_path_str]:
+             self.directory_files[dir_path_str].append(file_path_str)
+
+         # Update directory file count
+         if dir_path_str in self.directories:
+             self.directories[dir_path_str].file_count = len(
+                 self.directory_files[dir_path_str]
+             )
+
+     def get_directory(self, directory_path: Path) -> Directory | None:
+         """Get directory by path.
+
+         Args:
+             directory_path: Path to directory
+
+         Returns:
+             Directory object or None if not found
+         """
+         return self.directories.get(str(directory_path))
+
+     def get_files_in_directory(self, directory_path: Path) -> list[str]:
+         """Get all files in a directory.
+
+         Args:
+             directory_path: Path to directory
+
+         Returns:
+             List of file paths in the directory
+         """
+         return self.directory_files.get(str(directory_path), [])
+
+     def get_subdirectories(self, directory_path: Path) -> list[Directory]:
+         """Get all immediate subdirectories.
+
+         Args:
+             directory_path: Path to parent directory
+
+         Returns:
+             List of subdirectory objects
+         """
+         parent_path_str = str(directory_path)
+         subdirs = []
+
+         for _dir_path_str, directory in self.directories.items():
+             if directory.parent_path and str(directory.parent_path) == parent_path_str:
+                 subdirs.append(directory)
+
+         return subdirs
+
+     def get_root_directories(self) -> list[Directory]:
+         """Get all root-level directories (no parent).
+
+         Returns:
+             List of root directory objects
+         """
+         return [d for d in self.directories.values() if d.parent_path is None]
+
+     def delete_directory(self, directory_path: Path) -> None:
+         """Remove directory and its file associations.
+
+         Args:
+             directory_path: Path to directory to remove
+         """
+         dir_path_str = str(directory_path)
+
+         # Remove directory
+         if dir_path_str in self.directories:
+             del self.directories[dir_path_str]
+
+         # Remove file associations
+         if dir_path_str in self.directory_files:
+             for file_path in self.directory_files[dir_path_str]:
+                 if file_path in self.file_to_directory:
+                     del self.file_to_directory[file_path]
+             del self.directory_files[dir_path_str]
+
+     def delete_file(self, file_path: Path) -> None:
+         """Remove file from directory associations.
+
+         Args:
+             file_path: Path to file to remove
+         """
+         file_path_str = str(file_path)
+
+         if file_path_str in self.file_to_directory:
+             dir_path = self.file_to_directory[file_path_str]
+             del self.file_to_directory[file_path_str]
+
+             # Remove from directory_files
+             if dir_path in self.directory_files:
+                 self.directory_files[dir_path] = [
+                     f for f in self.directory_files[dir_path] if f != file_path_str
+                 ]
+
+             # Update directory file count
+             if dir_path in self.directories:
+                 self.directories[dir_path].file_count = len(
+                     self.directory_files[dir_path]
+                 )
+
+     def rebuild_from_files(
+         self,
+         file_paths: list[Path],
+         root_path: Path,
+         chunk_stats: dict[str, dict] | None = None,
+     ) -> None:
+         """Rebuild directory index from list of files with statistics from chunks.
+
+         Args:
+             file_paths: List of file paths to index
+             root_path: Project root path
+             chunk_stats: Optional dict mapping file_path -> {'chunks': count, 'language': str}
+         """
+         self.directories = {}
+         self.file_to_directory = {}
+         self.directory_files = defaultdict(list)
+
+         # Track all unique directories and their statistics
+         dir_set = set()
+         dir_chunks = defaultdict(int)  # directory -> total chunks
+         dir_languages = defaultdict(
+             lambda: defaultdict(int)
+         )  # directory -> {language: count}
+         dir_modified = defaultdict(float)  # directory -> most recent modification time
+
+         for file_path in file_paths:
+             # Get relative path from root
+             try:
+                 rel_path = file_path.relative_to(root_path)
+                 parent_dir = rel_path.parent
+
+                 # Add all parent directories up to root
+                 current = parent_dir
+                 while current != Path("."):
+                     dir_set.add(current)
+
+                     # Accumulate statistics up the directory tree
+                     if chunk_stats and str(file_path) in chunk_stats:
+                         stats = chunk_stats[str(file_path)]
+                         dir_chunks[current] += stats.get("chunks", 0)
+                         if "language" in stats:
+                             dir_languages[current][stats["language"]] += stats.get(
+                                 "chunks", 0
+                             )
+                         # Track most recent modification time
+                         if "modified" in stats:
+                             dir_modified[current] = max(
+                                 dir_modified.get(current, 0), stats["modified"]
+                             )
+
+                     current = current.parent
+
+                 # Associate file with its direct parent
+                 if parent_dir != Path("."):
+                     self.add_file(rel_path, parent_dir)
+
+             except ValueError:
+                 # File not relative to root, skip
+                 logger.warning(f"File {file_path} not under root {root_path}")
+                 continue
+
+         # Create Directory objects for all directories
+         for dir_path in sorted(dir_set):
+             # Determine parent
+             parent_path = dir_path.parent if dir_path.parent != Path(".") else None
+
+             # Check if it's a package
+             is_package = False
+             full_dir_path = root_path / dir_path
+             if (full_dir_path / "__init__.py").exists():
+                 is_package = True
+             elif (full_dir_path / "package.json").exists():
+                 is_package = True
+
+             directory = Directory(
+                 path=dir_path,
+                 name=dir_path.name,
+                 parent_path=parent_path,
+                 depth=len(dir_path.parts),
+                 is_package=is_package,
+                 total_chunks=dir_chunks.get(dir_path, 0),
+                 languages=dict(dir_languages.get(dir_path, {})),
+                 last_modified=dir_modified.get(dir_path),
+             )
+
+             self.add_directory(directory)
+
+         # Update subdirectory counts
+         for directory in self.directories.values():
+             subdirs = self.get_subdirectories(directory.path)
+             directory.subdirectory_count = len(subdirs)
+
+         logger.info(
+             f"Rebuilt directory index with {len(self.directories)} directories, {sum(dir_chunks.values())} total chunks"
+         )
+
+     def get_stats(self) -> dict[str, Any]:
+         """Get directory index statistics.
+
+         Returns:
+             Dictionary with statistics
+         """
+         return {
+             "total_directories": len(self.directories),
+             "total_files": len(self.file_to_directory),
+             "root_directories": len(self.get_root_directories()),
+             "packages": sum(1 for d in self.directories.values() if d.is_package),
+         }
+
+     def reset(self) -> None:
+         """Clear all directory data."""
+         self.directories = {}
+         self.file_to_directory = {}
+         self.directory_files = defaultdict(list)
+
+         if self.index_path.exists():
+             self.index_path.unlink()
+
+         logger.info("Directory index reset")
@@ -15,8 +15,9 @@ from ..parsers.registry import get_parser_registry
  from ..utils.gitignore import create_gitignore_parser
  from ..utils.monorepo import MonorepoDetector
  from .database import VectorDatabase
+ from .directory_index import DirectoryIndex
  from .exceptions import ParsingError
- from .models import CodeChunk
+ from .models import CodeChunk, IndexStats


  class SemanticIndexer:
@@ -29,6 +30,7 @@ class SemanticIndexer:
          file_extensions: list[str],
          max_workers: int | None = None,
          batch_size: int = 10,
+         debug: bool = False,
      ) -> None:
          """Initialize semantic indexer.

@@ -38,12 +40,14 @@ class SemanticIndexer:
              file_extensions: File extensions to index
              max_workers: Maximum number of worker threads for parallel processing
              batch_size: Number of files to process in each batch
+             debug: Enable debug output for hierarchy building
          """
          self.database = database
          self.project_root = project_root
          self.file_extensions = {ext.lower() for ext in file_extensions}
          self.parser_registry = get_parser_registry()
          self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
+         self.debug = debug

          # Safely get event loop for max_workers
          try:
@@ -81,6 +85,13 @@ class SemanticIndexer:
          for sp in subprojects:
              logger.debug(f" - {sp.name} ({sp.relative_path})")

+         # Initialize directory index
+         self.directory_index = DirectoryIndex(
+             project_root / ".mcp-vector-search" / "directory_index.json"
+         )
+         # Load existing directory index
+         self.directory_index.load()
+
      async def index_project(
          self,
          force_reindex: bool = False,
@@ -156,6 +167,39 @@ class SemanticIndexer:

          self._save_index_metadata(metadata)

+         # Rebuild directory index from successfully indexed files
+         try:
+             logger.debug("Rebuilding directory index...")
+             # We don't have chunk counts here, but we have file modification times
+             # Build a simple stats dict with file mod times for recency tracking
+             chunk_stats = {}
+             for file_path in files_to_index:
+                 try:
+                     mtime = os.path.getmtime(file_path)
+                     # For now, just track modification time
+                     # Chunk counts will be aggregated from the database later if needed
+                     chunk_stats[str(file_path)] = {
+                         "modified": mtime,
+                         "chunks": 1,  # Placeholder - real count from chunks
+                     }
+                 except OSError:
+                     pass
+
+             self.directory_index.rebuild_from_files(
+                 files_to_index, self.project_root, chunk_stats=chunk_stats
+             )
+             self.directory_index.save()
+             dir_stats = self.directory_index.get_stats()
+             logger.info(
+                 f"Directory index updated: {dir_stats['total_directories']} directories, "
+                 f"{dir_stats['total_files']} files"
+             )
+         except Exception as e:
+             logger.error(f"Failed to update directory index: {e}")
+             import traceback
+
+             logger.debug(traceback.format_exc())
+
          logger.info(
              f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
          )
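
The `chunk_stats` mapping built in this hunk matches the shape that `rebuild_from_files()` documents: one entry per file, keyed by the stringified path. An illustration of that shape follows; the values are invented, and as the hunk notes, "chunks" is currently a placeholder of 1 per file.

```python
# Illustrative chunk_stats entry for rebuild_from_files(); values are made up.
# All three keys are read via .get()/membership checks in the new module,
# so each is optional.
chunk_stats = {
    "/repo/src/app/main.py": {
        "modified": 1700000000.0,   # os.path.getmtime() timestamp
        "chunks": 1,                # placeholder until real counts are aggregated
        # "language": "python",     # optional; feeds per-directory language tallies
    },
}
```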
@@ -307,6 +351,16 @@ class SemanticIndexer:
          # Build hierarchical relationships between chunks
          chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)

+         # Debug: Check if hierarchy was built
+         methods_with_parents = sum(
+             1
+             for c in chunks_with_hierarchy
+             if c.chunk_type in ("method", "function") and c.parent_chunk_id
+         )
+         logger.debug(
+             f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
+         )
+
          # Add chunks to database
          await self.database.add_chunks(chunks_with_hierarchy)

@@ -396,7 +450,11 @@ class SemanticIndexer:
              # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
              # This is much more efficient than checking every file in ignored directories
              # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
-             dirs[:] = [d for d in dirs if not self._should_ignore_path(root_path / d, is_directory=True)]
+             dirs[:] = [
+                 d
+                 for d in dirs
+                 if not self._should_ignore_path(root_path / d, is_directory=True)
+             ]

              # Check each file in the current directory
              # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
@@ -442,7 +500,9 @@ class SemanticIndexer:

          return self._indexable_files_cache

-     def _should_index_file(self, file_path: Path, skip_file_check: bool = False) -> bool:
+     def _should_index_file(
+         self, file_path: Path, skip_file_check: bool = False
+     ) -> bool:
          """Check if a file should be indexed.

          Args:
@@ -478,7 +538,9 @@ class SemanticIndexer:

          return True

-     def _should_ignore_path(self, file_path: Path, is_directory: bool | None = None) -> bool:
+     def _should_ignore_path(
+         self, file_path: Path, is_directory: bool | None = None
+     ) -> bool:
          """Check if a path should be ignored.

          Args:
@@ -491,7 +553,9 @@ class SemanticIndexer:
          try:
              # First check gitignore rules if available
              # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
-             if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path, is_directory=is_directory):
+             if self.gitignore_parser and self.gitignore_parser.is_ignored(
+                 file_path, is_directory=is_directory
+             ):
                  logger.debug(f"Path ignored by .gitignore: {file_path}")
                  return True

@@ -630,24 +694,34 @@ class SemanticIndexer:
              # If we can't parse versions, be safe and reindex
              return True

-     async def get_indexing_stats(self) -> dict:
+     async def get_indexing_stats(self, db_stats: IndexStats | None = None) -> dict:
          """Get statistics about the indexing process.

+         Args:
+             db_stats: Optional pre-fetched database stats to avoid duplicate queries
+
          Returns:
              Dictionary with indexing statistics
+
+         Note:
+             Uses database statistics only for performance on large projects.
+             Filesystem scanning would timeout on 100K+ file projects.
+             Pass db_stats parameter to avoid calling database.get_stats() twice.
          """
          try:
-             # Get database stats
-             db_stats = await self.database.get_stats()
-
-             # Count indexable files asynchronously without blocking
-             indexable_files = await self._find_indexable_files_async()
+             # Get database stats if not provided (fast, no filesystem scan)
+             if db_stats is None:
+                 db_stats = await self.database.get_stats()

+             # Use database stats for all file counts
+             # This avoids expensive filesystem scans on large projects
              return {
-                 "total_indexable_files": len(indexable_files),
+                 "total_indexable_files": db_stats.total_files,
                  "indexed_files": db_stats.total_files,
+                 "total_files": db_stats.total_files,  # For backward compatibility
                  "total_chunks": db_stats.total_chunks,
                  "languages": db_stats.languages,
+                 "file_types": db_stats.file_types,  # Include file type distribution
                  "file_extensions": list(self.file_extensions),
                  "ignore_patterns": list(self._ignore_patterns),
                  "parser_info": self.parser_registry.get_parser_info(),
@@ -659,6 +733,7 @@ class SemanticIndexer:
                  "error": str(e),
                  "total_indexable_files": 0,
                  "indexed_files": 0,
+                 "total_files": 0,
                  "total_chunks": 0,
              }

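
The new `db_stats` parameter lets a caller that already holds an `IndexStats` object skip the second `database.get_stats()` round trip. A minimal sketch of that fast path, assuming an already-initialized `SemanticIndexer` named `indexer`:

```python
# Sketch of the new db_stats fast path; `indexer` is assumed to be an
# initialized SemanticIndexer instance.
async def show_stats(indexer) -> None:
    # Fetch database stats once, then hand them to get_indexing_stats()
    # so it skips its own database.get_stats() call.
    db_stats = await indexer.database.get_stats()
    stats = await indexer.get_indexing_stats(db_stats=db_stats)
    print(stats["total_files"], stats["total_chunks"])
    # run with: asyncio.run(show_stats(indexer))
```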
@@ -752,9 +827,14 @@ class SemanticIndexer:

          # Save error to error log file
          try:
-             error_log_path = self.project_root / ".mcp-vector-search" / "indexing_errors.log"
+             error_log_path = (
+                 self.project_root
+                 / ".mcp-vector-search"
+                 / "indexing_errors.log"
+             )
              with open(error_log_path, "a", encoding="utf-8") as f:
                  from datetime import datetime
+
                  timestamp = datetime.now().isoformat()
                  f.write(f"[{timestamp}] {error_msg}\n")
          except Exception as log_err:
@@ -787,22 +867,54 @@ class SemanticIndexer:

          # Group chunks by type and name
          module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
-         class_chunks = [c for c in chunks if c.chunk_type in ("class", "interface", "mixin")]
-         function_chunks = [c for c in chunks if c.chunk_type in ("function", "method", "constructor")]
+         class_chunks = [
+             c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
+         ]
+         function_chunks = [
+             c for c in chunks if c.chunk_type in ("function", "method", "constructor")
+         ]
+
+         # DEBUG: Print what we have (if debug enabled)
+         if self.debug:
+             import sys
+
+             print(
+                 f"\n[DEBUG] Building hierarchy: {len(module_chunks)} modules, {len(class_chunks)} classes, {len(function_chunks)} functions",
+                 file=sys.stderr,
+             )
+             if class_chunks:
+                 print(
+                     f"[DEBUG] Class names: {[c.class_name for c in class_chunks[:5]]}",
+                     file=sys.stderr,
+                 )
+             if function_chunks:
+                 print(
+                     f"[DEBUG] First 5 functions with class_name: {[(f.function_name, f.class_name) for f in function_chunks[:5]]}",
+                     file=sys.stderr,
+                 )

          # Build relationships
          for func in function_chunks:
              if func.class_name:
                  # Find parent class
                  parent_class = next(
-                     (c for c in class_chunks if c.class_name == func.class_name),
-                     None
+                     (c for c in class_chunks if c.class_name == func.class_name), None
                  )
                  if parent_class:
                      func.parent_chunk_id = parent_class.chunk_id
                      func.chunk_depth = parent_class.chunk_depth + 1
                      if func.chunk_id not in parent_class.child_chunk_ids:
                          parent_class.child_chunk_ids.append(func.chunk_id)
+                     if self.debug:
+                         import sys
+
+                         print(
+                             f"[DEBUG] ✓ Linked '{func.function_name}' to class '{parent_class.class_name}'",
+                             file=sys.stderr,
+                         )
+                     logger.debug(
+                         f"Linked method '{func.function_name}' (ID: {func.chunk_id[:8]}) to class '{parent_class.class_name}' (ID: {parent_class.chunk_id[:8]})"
+                     )
              else:
                  # Top-level function
                  if not func.chunk_depth:
@@ -828,19 +940,34 @@ class SemanticIndexer:
              if not mod.chunk_depth:
                  mod.chunk_depth = 0

+         # DEBUG: Print summary
+         if self.debug:
+             import sys
+
+             funcs_with_parents = sum(1 for f in function_chunks if f.parent_chunk_id)
+             classes_with_parents = sum(1 for c in class_chunks if c.parent_chunk_id)
+             print(
+                 f"[DEBUG] Hierarchy built: {funcs_with_parents}/{len(function_chunks)} functions linked, {classes_with_parents}/{len(class_chunks)} classes linked\n",
+                 file=sys.stderr,
+             )
+
          return chunks

      def _write_indexing_run_header(self) -> None:
          """Write version and timestamp header to error log at start of indexing run."""
          try:
-             error_log_path = self.project_root / ".mcp-vector-search" / "indexing_errors.log"
+             error_log_path = (
+                 self.project_root / ".mcp-vector-search" / "indexing_errors.log"
+             )
              error_log_path.parent.mkdir(parents=True, exist_ok=True)

              with open(error_log_path, "a", encoding="utf-8") as f:
                  timestamp = datetime.now(UTC).isoformat()
                  separator = "=" * 80
                  f.write(f"\n{separator}\n")
-                 f.write(f"[{timestamp}] Indexing run started - mcp-vector-search v{__version__}\n")
+                 f.write(
+                     f"[{timestamp}] Indexing run started - mcp-vector-search v{__version__}\n"
+                 )
                  f.write(f"{separator}\n")
          except Exception as e:
              logger.debug(f"Failed to write indexing run header: {e}")
@@ -206,6 +206,67 @@ class IndexStats(BaseModel):
          }


+ @dataclass
+ class Directory:
+     """Represents a directory in the project structure."""
+
+     path: Path  # Relative path from project root
+     name: str  # Directory name
+     parent_path: Path | None = None  # Parent directory path (None for root)
+     file_count: int = 0  # Number of files directly in this directory
+     subdirectory_count: int = 0  # Number of subdirectories
+     total_chunks: int = 0  # Total code chunks in this directory (recursive)
+     languages: dict[str, int] | None = None  # Language distribution in this directory
+     depth: int = 0  # Depth from project root (0 = root)
+     is_package: bool = False  # True if contains __init__.py or package.json
+     last_modified: float | None = (
+         None  # Most recent file modification time (unix timestamp)
+     )
+
+     def __post_init__(self) -> None:
+         """Initialize default values and generate directory ID."""
+         if self.languages is None:
+             self.languages = {}
+
+     @property
+     def id(self) -> str:
+         """Generate unique ID for this directory."""
+         import hashlib
+
+         return hashlib.sha256(str(self.path).encode()).hexdigest()[:16]
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert to dictionary for storage."""
+         return {
+             "path": str(self.path),
+             "name": self.name,
+             "parent_path": str(self.parent_path) if self.parent_path else None,
+             "file_count": self.file_count,
+             "subdirectory_count": self.subdirectory_count,
+             "total_chunks": self.total_chunks,
+             "languages": self.languages,
+             "depth": self.depth,
+             "is_package": self.is_package,
+             "last_modified": self.last_modified,
+         }
+
+     @classmethod
+     def from_dict(cls, data: dict[str, Any]) -> "Directory":
+         """Create from dictionary."""
+         return cls(
+             path=Path(data["path"]),
+             name=data["name"],
+             parent_path=Path(data["parent_path"]) if data.get("parent_path") else None,
+             file_count=data.get("file_count", 0),
+             subdirectory_count=data.get("subdirectory_count", 0),
+             total_chunks=data.get("total_chunks", 0),
+             languages=data.get("languages", {}),
+             depth=data.get("depth", 0),
+             is_package=data.get("is_package", False),
+             last_modified=data.get("last_modified"),
+         )
+
+
  class ProjectInfo(BaseModel):
      """Information about a project."""

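
Since `to_dict()` emits only JSON-safe values and `from_dict()` reverses it, the new model round-trips cleanly through the JSON file that `DirectoryIndex.save()` writes. A small sketch, assuming `Directory` is importable from the models module shown in this hunk:

```python
# Round-trip sketch for the new Directory model; the import path is assumed.
import json
from pathlib import Path

from mcp_vector_search.core.models import Directory

d = Directory(path=Path("src/app"), name="app", parent_path=Path("src"), depth=2)
payload = json.dumps(d.to_dict())  # what DirectoryIndex.save() stores per directory
restored = Directory.from_dict(json.loads(payload))
assert restored.path == Path("src/app") and restored.parent_path == Path("src")
```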