mcp-code-indexer 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,375 @@
1
+ """
2
+ File discovery and gitignore integration for the MCP Code Indexer.
3
+
4
+ This module provides functionality to scan project directories for files
5
+ while respecting .gitignore patterns and common ignore patterns. It enables
6
+ efficient discovery of files that need description tracking.
7
+ """
8
+
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import List, Set, Optional, Generator
12
+ import fnmatch
13
+
14
+ try:
15
+ from gitignore_parser import parse_gitignore
16
+ except ImportError:
17
+ parse_gitignore = None
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
# Default ignore patterns applied even when no .gitignore file is present.
# Grouped by category; ordering matches the published behavior.
DEFAULT_IGNORE_PATTERNS = [
    # Version control metadata
    '.git/', '.svn/', '.hg/',

    # Dependency directories and Python bytecode
    'node_modules/', 'venv/', '.venv/', 'env/', '.env/',
    '__pycache__/', '*.pyc', '*.pyo', '*.pyd', '.Python',

    # Build artifacts and compiled outputs
    'build/', 'dist/', 'target/', 'out/', 'bin/', 'obj/',
    '*.o', '*.so', '*.dylib', '*.dll', '*.exe',

    # IDE / editor metadata
    '.vscode/', '.idea/', '.vs/', '*.swp', '*.swo', '*~',
    '.DS_Store', 'Thumbs.db',

    # Test and coverage output
    'coverage/', 'htmlcov/', '.pytest_cache/', '.coverage', '*.coverage',

    # Generated documentation
    '_build/', 'docs/_build/', 'site/',

    # Logs and scratch files
    '*.log', '*.tmp', '*.temp', '*.cache',

    # Packaged archives
    '*.tar.gz', '*.zip', '*.rar', '*.7z',

    # Dependency lock files (generated, not hand-edited)
    'package-lock.json', 'yarn.lock', 'Pipfile.lock', 'poetry.lock',
]
94
+
95
# Extensions of files skipped during code indexing: binary, media, and
# asset formats that carry no readable source text.
IGNORED_EXTENSIONS = {
    ext
    for group in (
        ('.exe', '.dll', '.so', '.dylib', '.bin', '.o', '.obj'),      # binaries
        ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.ico', '.svg'),    # images
        ('.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx'),  # documents
        ('.mp3', '.mp4', '.avi', '.mov', '.wmv', '.flv'),             # media
        ('.zip', '.tar', '.gz', '.rar', '.7z'),                       # archives
        ('.ttf', '.otf', '.woff', '.woff2', '.eot'),                  # fonts
    )
    for ext in group
}
115
+
116
+
117
class FileScanner:
    """
    Handles file discovery with gitignore and pattern-based filtering.

    Provides methods to scan directories while respecting .gitignore files
    and default ignore patterns to identify files suitable for description
    tracking.
    """

    def __init__(self, project_root: Path):
        """
        Initialize file scanner for a project.

        Args:
            project_root: Root directory of the project to scan
        """
        self.project_root = Path(project_root).resolve()
        # Maps directory path (str) -> matcher callable from parse_gitignore.
        self._gitignore_cache: dict = {}
        self._load_gitignore_patterns()

    def _load_gitignore_patterns(self) -> None:
        """Load and cache gitignore patterns from the project.

        Walks UP the directory hierarchy from the project root so that
        .gitignore files in ancestor directories are honored as well.
        NOTE(review): .gitignore files nested *inside* subdirectories of the
        project are not collected here — confirm that is intentional.
        """
        self._gitignore_cache.clear()

        if parse_gitignore is None:
            # Optional dependency missing; default patterns still apply.
            logger.warning("gitignore_parser not available, using default patterns only")
            return

        current_path = self.project_root

        # Stop once we reach the filesystem root (its parent is itself).
        while current_path != current_path.parent:
            gitignore_path = current_path / '.gitignore'

            if gitignore_path.exists():
                try:
                    gitignore_func = parse_gitignore(gitignore_path)
                    self._gitignore_cache[str(current_path)] = gitignore_func
                    logger.debug(f"Loaded .gitignore from {gitignore_path}")
                except Exception as e:
                    # A malformed .gitignore must not abort scanning.
                    logger.warning(f"Failed to parse {gitignore_path}: {e}")

            current_path = current_path.parent

    def _is_ignored_by_gitignore(self, file_path: Path) -> bool:
        """Check if a file is ignored by any cached .gitignore matcher."""
        if not self._gitignore_cache:
            return False

        for base_path, gitignore_func in self._gitignore_cache.items():
            try:
                # gitignore_parser expects absolute paths.
                if gitignore_func(str(file_path.resolve())):
                    return True
            except Exception as e:
                logger.debug(f"Error checking gitignore pattern: {e}")
                continue

        return False

    def _is_ignored_by_default_patterns(self, file_path: Path) -> bool:
        """Check if a file matches any DEFAULT_IGNORE_PATTERNS entry.

        Files that cannot be expressed relative to the project root (i.e.
        they live outside the project) are always treated as ignored.
        """
        try:
            resolved_file = file_path.resolve()
            resolved_root = self.project_root.resolve()
            rel_path = resolved_file.relative_to(resolved_root)
            rel_path_str = str(rel_path)
        except ValueError:
            # Outside the project tree: always ignore.
            return True

        for pattern in DEFAULT_IGNORE_PATTERNS:
            if pattern.endswith('/'):
                # Directory pattern: match any ancestor directory name.
                pattern_no_slash = pattern.rstrip('/')
                for parent in rel_path.parents:
                    if fnmatch.fnmatch(parent.name, pattern_no_slash):
                        return True
                # Also check the file's immediate parent directory.
                if fnmatch.fnmatch(rel_path.parent.name, pattern_no_slash):
                    return True
            else:
                # File pattern: match the relative path or the bare name.
                if fnmatch.fnmatch(rel_path_str, pattern):
                    return True
                if fnmatch.fnmatch(file_path.name, pattern):
                    return True

        return False

    def _is_ignored_by_extension(self, file_path: Path) -> bool:
        """Check if a file's (lowercased) suffix is in IGNORED_EXTENSIONS."""
        return file_path.suffix.lower() in IGNORED_EXTENSIONS

    def should_ignore_file(self, file_path: Path) -> bool:
        """
        Determine if a file should be ignored.

        Args:
            file_path: Path to the file to check

        Returns:
            True if the file should be ignored
        """
        # Non-files (directories, broken symlinks) are never tracked.
        if not file_path.is_file():
            return True

        if self._is_ignored_by_extension(file_path):
            return True

        if self._is_ignored_by_default_patterns(file_path):
            return True

        if self._is_ignored_by_gitignore(file_path):
            return True

        return False

    def scan_directory(self, max_files: Optional[int] = None) -> List[Path]:
        """
        Scan the project directory for trackable files.

        Args:
            max_files: Maximum number of files to return (None for no limit)

        Returns:
            Sorted list of file paths that should be tracked
        """
        files = []

        try:
            for file_path in self._walk_directory():
                if not self.should_ignore_file(file_path):
                    files.append(file_path)

                    if max_files and len(files) >= max_files:
                        logger.info(f"Reached max_files limit of {max_files}")
                        break

        except Exception as e:
            logger.error(f"Error scanning directory {self.project_root}: {e}")

        # Sort files for consistent ordering across runs.
        files.sort()

        logger.info(f"Found {len(files)} trackable files in {self.project_root}")
        return files

    def _walk_directory(self) -> Generator[Path, None, None]:
        """Yield every file under the project root, recursively."""
        try:
            for item in self.project_root.rglob('*'):
                if item.is_file():
                    yield item
        except PermissionError as e:
            logger.warning(f"Permission denied accessing {e.filename}")
        except Exception as e:
            logger.error(f"Error walking directory: {e}")

    def get_relative_path(self, file_path: Path) -> str:
        """
        Get relative path from project root.

        Args:
            file_path: Absolute path to file

        Returns:
            Relative path string from project root; the original path string
            unchanged if the file lies outside the project root.
        """
        try:
            # Resolve both paths to handle symlinks and '..' properly.
            resolved_file = file_path.resolve()
            resolved_root = self.project_root.resolve()
            return str(resolved_file.relative_to(resolved_root))
        except ValueError:
            # File is outside project root; return the path as given.
            return str(file_path)

    def find_missing_files(self, existing_paths: Set[str]) -> List[Path]:
        """
        Find files that exist on disk but aren't in the existing paths set.

        Args:
            existing_paths: Set of relative file paths that already have descriptions

        Returns:
            List of file paths that are missing descriptions
        """
        all_files = self.scan_directory()
        missing_files = [
            file_path
            for file_path in all_files
            if self.get_relative_path(file_path) not in existing_paths
        ]

        logger.info(f"Found {len(missing_files)} files missing descriptions")
        return missing_files

    def is_valid_project_directory(self) -> bool:
        """
        Check if the project root is a valid directory for scanning.

        Returns:
            True if the directory exists, is a directory, and is readable
            by the current process.
        """
        import os  # local import keeps this fix self-contained

        try:
            # os.access checks the *effective* user's permission, unlike the
            # previous raw `st_mode & 0o444` bit test, which ignored
            # ownership entirely and returned an int instead of a bool.
            return (
                self.project_root.exists()
                and self.project_root.is_dir()
                and os.access(self.project_root, os.R_OK)
            )
        except (OSError, PermissionError):
            return False

    def get_project_stats(self) -> dict:
        """
        Get statistics about the project directory.

        Returns:
            Dictionary with keys: total_files, trackable_files,
            ignored_files, largest_file_size, file_extensions.
        """
        stats = {
            'total_files': 0,
            'trackable_files': 0,
            'ignored_files': 0,
            'largest_file_size': 0,
            'file_extensions': {},
        }

        try:
            for file_path in self._walk_directory():
                stats['total_files'] += 1

                # Track the single largest file seen.
                try:
                    file_size = file_path.stat().st_size
                    stats['largest_file_size'] = max(stats['largest_file_size'], file_size)
                except OSError:
                    # File may have vanished between listing and stat.
                    pass

                # Histogram of (lowercased) extensions.
                ext = file_path.suffix.lower()
                stats['file_extensions'][ext] = stats['file_extensions'].get(ext, 0) + 1

                if self.should_ignore_file(file_path):
                    stats['ignored_files'] += 1
                else:
                    stats['trackable_files'] += 1

        except Exception as e:
            logger.error(f"Error getting project stats: {e}")

        return stats
@@ -0,0 +1,183 @@
1
+ """
2
+ Logging configuration for the MCP Code Indexer.
3
+
4
+ This module provides centralized logging setup with structured JSON output,
5
+ proper async handling, and file rotation for production use.
6
+ """
7
+
8
+ import logging
9
+ import logging.handlers
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Optional
13
+
14
+ from .error_handler import StructuredFormatter
15
+
16
+
17
def setup_logging(
    log_level: str = "INFO",
    log_file: Optional[Path] = None,
    enable_file_logging: bool = False,
    max_bytes: int = 10 * 1024 * 1024,  # 10MB
    backup_count: int = 5
) -> logging.Logger:
    """
    Set up comprehensive logging configuration.

    Args:
        log_level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Path to log file (optional)
        enable_file_logging: Whether to enable file logging
        max_bytes: Maximum size of log file before rotation
        backup_count: Number of backup files to keep

    Returns:
        Configured root logger
    """
    numeric_level = getattr(logging, log_level.upper())
    structured_formatter = StructuredFormatter()

    root_logger = logging.getLogger()
    root_logger.setLevel(numeric_level)
    # Start from a clean slate: drop any pre-existing handlers.
    root_logger.handlers.clear()

    # Console output goes to stderr so the MCP protocol traffic on stdout
    # stays clean.
    console_handler = logging.StreamHandler(sys.stderr)
    console_handler.setLevel(numeric_level)
    console_handler.setFormatter(structured_formatter)
    root_logger.addHandler(console_handler)

    if enable_file_logging and log_file:
        try:
            # Make sure the log directory exists before opening the file.
            log_file.parent.mkdir(parents=True, exist_ok=True)

            file_handler = logging.handlers.RotatingFileHandler(
                log_file,
                maxBytes=max_bytes,
                backupCount=backup_count,
                encoding='utf-8',
            )
            # The file captures everything regardless of console verbosity.
            file_handler.setLevel(logging.DEBUG)
            file_handler.setFormatter(structured_formatter)
            root_logger.addHandler(file_handler)
        except (OSError, PermissionError) as e:
            # Fall back to console-only logging rather than failing startup.
            root_logger.warning(f"Failed to set up file logging: {e}")

    # Per-library levels: quiet noisy dependencies, pin our own modules.
    for name, level in (
        ("aiosqlite", logging.WARNING),
        ("tiktoken", logging.WARNING),
        ("mcp", logging.INFO),
        ("src.database", logging.INFO),
        ("src.server", logging.INFO),
    ):
        logging.getLogger(name).setLevel(level)

    return root_logger
95
+
96
+
97
def get_logger(name: str) -> logging.Logger:
    """
    Return the logger registered under *name*.

    Thin convenience wrapper around :func:`logging.getLogger` so callers
    can obtain loggers through this module.

    Args:
        name: Logger name (usually __name__)

    Returns:
        Logger instance
    """
    return logging.getLogger(name)
108
+
109
+
110
def log_performance_metrics(
    logger: logging.Logger,
    operation: str,
    duration: float,
    **metrics
) -> None:
    """
    Log performance metrics in structured format.

    Args:
        logger: Logger instance
        operation: Name of the operation
        duration: Duration in seconds
        **metrics: Additional metrics to log
    """
    # The raw numbers travel under extra["structured_data"] so a structured
    # formatter can emit them as machine-readable output alongside the
    # human-readable message.
    logger.info(
        f"Performance: {operation} completed in {duration:.3f}s",
        extra={
            "structured_data": {
                "performance": {
                    "operation": operation,
                    "duration_seconds": duration,
                    "metrics": metrics,
                }
            }
        },
    )
135
+
136
+
137
def log_tool_usage(
    logger: logging.Logger,
    tool_name: str,
    arguments: dict,
    success: bool,
    duration: Optional[float] = None,
    result_size: Optional[int] = None
) -> None:
    """
    Log MCP tool usage for analytics.

    Args:
        logger: Logger instance
        tool_name: Name of the MCP tool
        arguments: Tool arguments (long string values are truncated)
        success: Whether the operation succeeded
        duration: Operation duration in seconds
        result_size: Size of result data
    """
    # Truncate string arguments longer than 50 chars so records stay compact.
    safe_args = {
        key: f"{value[:50]}..." if isinstance(value, str) and len(value) > 50 else value
        for key, value in arguments.items()
    }

    usage_data = {
        "tool_name": tool_name,
        "arguments": safe_args,
        "success": success,
    }
    if duration is not None:
        usage_data["duration_seconds"] = duration
    if result_size is not None:
        usage_data["result_size"] = result_size

    # Failures are logged at WARNING so they stand out in filtered views.
    outcome = 'SUCCESS' if success else 'FAILED'
    logger.log(
        logging.INFO if success else logging.WARNING,
        f"Tool {tool_name}: {outcome}",
        extra={"structured_data": {"tool_usage": usage_data}},
    )
@@ -0,0 +1,129 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ MCP Code Indexer Package Main Module
4
+
5
+ Entry point for the mcp-code-indexer package when installed via pip.
6
+ """
7
+
8
+ import argparse
9
+ import asyncio
10
+ import logging
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ from . import __version__
15
+ from .logging_config import setup_logging
16
+ from .error_handler import setup_error_handling
17
+
18
+
19
def parse_arguments() -> argparse.Namespace:
    """Parse command line arguments for the mcp-code-indexer CLI."""
    parser = argparse.ArgumentParser(
        description="MCP Code Index Server - Track file descriptions across codebases",
        prog="mcp-code-indexer"
    )

    # --version uses argparse's built-in version action.
    parser.add_argument(
        "--version",
        action="version",
        version=f"mcp-code-indexer {__version__}"
    )

    # (flag, kwargs) specs for the remaining value options; registration
    # order is preserved so help output is unchanged.
    option_specs = [
        ("--token-limit", dict(
            type=int,
            default=32000,
            help="Maximum tokens before recommending search instead of full overview (default: 32000)",
        )),
        ("--db-path", dict(
            type=str,
            default="~/.mcp-code-index/tracker.db",
            help="Path to SQLite database (default: ~/.mcp-code-index/tracker.db)",
        )),
        ("--cache-dir", dict(
            type=str,
            default="~/.mcp-code-index/cache",
            help="Directory for caching token counts (default: ~/.mcp-code-index/cache)",
        )),
        ("--log-level", dict(
            type=str,
            choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
            default="INFO",
            help="Logging level (default: INFO)",
        )),
    ]
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)

    return parser.parse_args()
62
+
63
+
64
async def main() -> None:
    """Entry point: configure logging, prepare storage paths, run the server."""
    args = parse_arguments()

    # Persist structured logs next to the cache data.
    server_log = Path(args.cache_dir).expanduser() / "server.log" if args.cache_dir else None
    app_logger = setup_logging(
        log_level=args.log_level,
        log_file=server_log,
        enable_file_logging=True
    )
    error_handler = setup_error_handling(app_logger)

    # Expand ~ in user-supplied paths and make sure the directories exist.
    db_path = Path(args.db_path).expanduser()
    cache_dir = Path(args.cache_dir).expanduser()
    db_path.parent.mkdir(parents=True, exist_ok=True)
    cache_dir.mkdir(parents=True, exist_ok=True)

    # Record the effective configuration at startup.
    startup_info = {
        "version": __version__,
        "token_limit": args.token_limit,
        "db_path": str(db_path),
        "cache_dir": str(cache_dir),
        "log_level": args.log_level,
    }
    app_logger.info(
        "Starting MCP Code Index Server",
        extra={"structured_data": {"startup": startup_info}}
    )

    try:
        # Imported here so configuration errors surface before server imports.
        from .server.mcp_server import MCPCodeIndexServer

        server = MCPCodeIndexServer(
            token_limit=args.token_limit,
            db_path=db_path,
            cache_dir=cache_dir
        )
        await server.run()

    except Exception as e:
        error_handler.log_error(e, context={"phase": "startup"})
        raise
+
116
+
117
def cli_main():
    """Console script entry point.

    Runs the async ``main`` coroutine and converts failures into process
    exit codes suitable for a CLI: 0 on clean/interrupted shutdown, 1 on
    startup failure.
    """
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # User-initiated shutdown is not an error; keep exit code 0.
        # Messages go to stderr: stdout carries the MCP protocol stream,
        # and writing there could corrupt client communication (the logging
        # setup routes console logs to stderr for the same reason).
        print("\nServer stopped by user", file=sys.stderr)
    except Exception as e:
        print(f"Server failed to start: {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    cli_main()