mcp_code_indexer-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
+ """
+ MCP Code Indexer - Intelligent codebase navigation for AI agents.
+
+ A production-ready Model Context Protocol (MCP) server that provides
+ intelligent codebase navigation through searchable file descriptions,
+ token-aware overviews, and advanced merge capabilities.
+ """
+
+ __version__ = "1.0.0"
+ __author__ = "MCP Code Indexer Contributors"
+ __email__ = ""
+ __license__ = "MIT"
+
+ from .server.mcp_server import MCPCodeIndexServer
+
+ __all__ = ["MCPCodeIndexServer", "__version__"]
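
For orientation, a minimal sketch of how these top-level exports might be consumed. The constructor arguments of MCPCodeIndexServer are not shown in this diff, so only the import surface is illustrated:

import mcp_code_indexer
from mcp_code_indexer import MCPCodeIndexServer

# Version metadata exposed by the package root.
print(mcp_code_indexer.__version__)  # -> "1.0.0"
# MCPCodeIndexServer is re-exported from .server.mcp_server; its
# configuration and startup API live in that module, not shown here.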
@@ -0,0 +1 @@
+ """Database models and operations."""
@@ -0,0 +1,480 @@
+ """
+ Database operations for the MCP Code Indexer.
+
+ This module provides async database operations using aiosqlite with proper
+ connection management, transaction handling, and performance optimizations.
+ """
+
+ import asyncio
+ import json
+ import logging
+ import sqlite3
+ from contextlib import asynccontextmanager
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import List, Optional, Dict, Any, Tuple, AsyncIterator
+
+ import aiosqlite
+
+ from mcp_code_indexer.database.models import (
+     Project, FileDescription, MergeConflict, SearchResult,
+     CodebaseSizeInfo
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class DatabaseManager:
+     """
+     Manages SQLite database operations with async support.
+
+     Provides high-level operations for projects, file descriptions, search,
+     and caching with proper transaction management and error handling.
+     """
+
+     def __init__(self, db_path: Path, pool_size: int = 5):
+         """Initialize database manager with path to SQLite database."""
+         self.db_path = db_path
+         self.pool_size = pool_size
+         self._connection_pool: List[aiosqlite.Connection] = []
+         self._pool_lock: Optional[asyncio.Lock] = None  # Initialized in async context
+
+     async def initialize(self) -> None:
+         """Initialize database schema and configuration."""
+         # Initialize pool lock
+         self._pool_lock = asyncio.Lock()
+
+         # Ensure database directory exists
+         self.db_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Apply migrations in order
+         migrations_dir = Path(__file__).parent.parent.parent / "migrations"
+         migration_files = sorted(migrations_dir.glob("*.sql"))
+
+         async with aiosqlite.connect(self.db_path) as db:
+             # Enable row factory for easier data access
+             db.row_factory = aiosqlite.Row
+
+             # Apply each migration
+             for migration_file in migration_files:
+                 logger.info(f"Applying migration: {migration_file.name}")
+                 with open(migration_file, 'r') as f:
+                     migration_sql = f.read()
+
+                 await db.executescript(migration_sql)
+                 await db.commit()
+
+         logger.info(f"Database initialized at {self.db_path} with {len(migration_files)} migrations")
+
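
initialize() applies every *.sql file under migrations/ in sorted filename order via executescript(). The migration files themselves are not part of this diff; as a hedged sketch, a first migration consistent with the SQL issued elsewhere in this module might look like the following (schema inferred from the queries, NOT the package's actual migrations):

# Hypothetical migrations/001_initial.sql, reconstructed from the queries
# in DatabaseManager; column types and constraints are assumptions.
INITIAL_MIGRATION = """
CREATE TABLE IF NOT EXISTS projects (
    id TEXT PRIMARY KEY,
    name TEXT NOT NULL,
    remote_origin TEXT,
    upstream_origin TEXT,
    aliases TEXT NOT NULL DEFAULT '[]',
    created TIMESTAMP NOT NULL,
    last_accessed TIMESTAMP NOT NULL
);

CREATE TABLE IF NOT EXISTS file_descriptions (
    project_id TEXT NOT NULL,
    branch TEXT NOT NULL,
    file_path TEXT NOT NULL,
    description TEXT NOT NULL,
    file_hash TEXT,
    last_modified TIMESTAMP NOT NULL,
    version INTEGER NOT NULL DEFAULT 1,
    source_project_id TEXT,
    PRIMARY KEY (project_id, branch, file_path)
);

-- External-content FTS5 index; rowids must track file_descriptions for the
-- JOIN in search_file_descriptions (sync triggers omitted from this sketch).
CREATE VIRTUAL TABLE IF NOT EXISTS file_descriptions_fts USING fts5(
    project_id, branch, file_path, description,
    content='file_descriptions'
);

CREATE TABLE IF NOT EXISTS token_cache (
    cache_key TEXT PRIMARY KEY,
    token_count INTEGER NOT NULL,
    expires TIMESTAMP
);
"""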
+     @asynccontextmanager
+     async def get_connection(self) -> AsyncIterator[aiosqlite.Connection]:
+         """Get a database connection from the pool or create a new one."""
+         conn = None
+
+         # Try to get a connection from the pool
+         if self._pool_lock:
+             async with self._pool_lock:
+                 if self._connection_pool:
+                     conn = self._connection_pool.pop()
+
+         # Create a new connection if none is available
+         if conn is None:
+             conn = await aiosqlite.connect(self.db_path)
+             conn.row_factory = aiosqlite.Row
+
+             # Apply performance settings to new connections
+             await conn.execute("PRAGMA busy_timeout = 30000")  # 30 second timeout
+             await conn.execute("PRAGMA synchronous = NORMAL")  # Balanced durability/performance
+             await conn.execute("PRAGMA cache_size = -64000")  # 64MB cache
+             await conn.execute("PRAGMA temp_store = MEMORY")  # Use memory for temp tables
+
+         try:
+             yield conn
+         finally:
+             # Return to the pool if it is not full, otherwise close
+             returned_to_pool = False
+             if self._pool_lock and len(self._connection_pool) < self.pool_size:
+                 async with self._pool_lock:
+                     if len(self._connection_pool) < self.pool_size:
+                         self._connection_pool.append(conn)
+                         returned_to_pool = True
+
+             if not returned_to_pool:
+                 await conn.close()
+
+     async def close_pool(self) -> None:
+         """Close all connections in the pool."""
+         if self._pool_lock:
+             async with self._pool_lock:
+                 for conn in self._connection_pool:
+                     await conn.close()
+                 self._connection_pool.clear()
+
+     # Project operations
+
+     async def create_project(self, project: Project) -> None:
+         """Create a new project record."""
+         async with self.get_connection() as db:
+             await db.execute(
+                 """
+                 INSERT INTO projects (id, name, remote_origin, upstream_origin, aliases, created, last_accessed)
+                 VALUES (?, ?, ?, ?, ?, ?, ?)
+                 """,
+                 (
+                     project.id,
+                     project.name,
+                     project.remote_origin,
+                     project.upstream_origin,
+                     json.dumps(project.aliases),
+                     project.created,
+                     project.last_accessed
+                 )
+             )
+             await db.commit()
+             logger.debug(f"Created project: {project.id}")
+
+     async def get_project(self, project_id: str) -> Optional[Project]:
+         """Get project by ID."""
+         async with self.get_connection() as db:
+             cursor = await db.execute(
+                 "SELECT * FROM projects WHERE id = ?",
+                 (project_id,)
+             )
+             row = await cursor.fetchone()
+
+             if row:
+                 return Project(
+                     id=row['id'],
+                     name=row['name'],
+                     remote_origin=row['remote_origin'],
+                     upstream_origin=row['upstream_origin'],
+                     aliases=json.loads(row['aliases']),
+                     created=datetime.fromisoformat(row['created']),
+                     last_accessed=datetime.fromisoformat(row['last_accessed'])
+                 )
+             return None
+
+     async def find_project_by_origin(self, origin_url: str) -> Optional[Project]:
+         """Find project by remote or upstream origin URL."""
+         async with self.get_connection() as db:
+             cursor = await db.execute(
+                 """
+                 SELECT * FROM projects
+                 WHERE remote_origin = ? OR upstream_origin = ?
+                 LIMIT 1
+                 """,
+                 (origin_url, origin_url)
+             )
+             row = await cursor.fetchone()
+
+             if row:
+                 return Project(
+                     id=row['id'],
+                     name=row['name'],
+                     remote_origin=row['remote_origin'],
+                     upstream_origin=row['upstream_origin'],
+                     aliases=json.loads(row['aliases']),
+                     created=datetime.fromisoformat(row['created']),
+                     last_accessed=datetime.fromisoformat(row['last_accessed'])
+                 )
+             return None
+
+     async def update_project_access_time(self, project_id: str) -> None:
+         """Update the last accessed time for a project."""
+         async with self.get_connection() as db:
+             await db.execute(
+                 "UPDATE projects SET last_accessed = ? WHERE id = ?",
+                 (datetime.utcnow(), project_id)
+             )
+             await db.commit()
+
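
Taken together, the project operations support a simple record lifecycle. A hedged usage sketch follows; the module path mcp_code_indexer.database.database, the database filename, and all IDs/URLs are illustrative assumptions, not taken from this diff:

import asyncio
from pathlib import Path

from mcp_code_indexer.database.database import DatabaseManager  # assumed module path
from mcp_code_indexer.database.models import Project

async def main() -> None:
    # Illustrative location; the server chooses its own database path.
    dm = DatabaseManager(Path("./tracker.db"))
    await dm.initialize()

    project = Project(
        id="proj-abc123",  # illustrative generated ID
        name="my-service",
        remote_origin="https://github.com/example/my-service.git",
    )
    await dm.create_project(project)

    # Lookup works by either remote or upstream origin URL.
    found = await dm.find_project_by_origin("https://github.com/example/my-service.git")
    assert found is not None and found.id == project.id

    await dm.update_project_access_time(project.id)
    await dm.close_pool()

asyncio.run(main())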
+     # File description operations
+
+     async def create_file_description(self, file_desc: FileDescription) -> None:
+         """Create or update a file description."""
+         async with self.get_connection() as db:
+             await db.execute(
+                 """
+                 INSERT OR REPLACE INTO file_descriptions
+                 (project_id, branch, file_path, description, file_hash, last_modified, version, source_project_id)
+                 VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                 """,
+                 (
+                     file_desc.project_id,
+                     file_desc.branch,
+                     file_desc.file_path,
+                     file_desc.description,
+                     file_desc.file_hash,
+                     file_desc.last_modified,
+                     file_desc.version,
+                     file_desc.source_project_id
+                 )
+             )
+             await db.commit()
+             logger.debug(f"Saved file description: {file_desc.file_path}")
+
+     async def get_file_description(
+         self,
+         project_id: str,
+         branch: str,
+         file_path: str
+     ) -> Optional[FileDescription]:
+         """Get file description by project, branch, and path."""
+         async with self.get_connection() as db:
+             cursor = await db.execute(
+                 """
+                 SELECT * FROM file_descriptions
+                 WHERE project_id = ? AND branch = ? AND file_path = ?
+                 """,
+                 (project_id, branch, file_path)
+             )
+             row = await cursor.fetchone()
+
+             if row:
+                 return FileDescription(
+                     project_id=row['project_id'],
+                     branch=row['branch'],
+                     file_path=row['file_path'],
+                     description=row['description'],
+                     file_hash=row['file_hash'],
+                     last_modified=datetime.fromisoformat(row['last_modified']),
+                     version=row['version'],
+                     source_project_id=row['source_project_id']
+                 )
+             return None
+
+     async def get_all_file_descriptions(
+         self,
+         project_id: str,
+         branch: str
+     ) -> List[FileDescription]:
+         """Get all file descriptions for a project and branch."""
+         async with self.get_connection() as db:
+             cursor = await db.execute(
+                 """
+                 SELECT * FROM file_descriptions
+                 WHERE project_id = ? AND branch = ?
+                 ORDER BY file_path
+                 """,
+                 (project_id, branch)
+             )
+             rows = await cursor.fetchall()
+
+             return [
+                 FileDescription(
+                     project_id=row['project_id'],
+                     branch=row['branch'],
+                     file_path=row['file_path'],
+                     description=row['description'],
+                     file_hash=row['file_hash'],
+                     last_modified=datetime.fromisoformat(row['last_modified']),
+                     version=row['version'],
+                     source_project_id=row['source_project_id']
+                 )
+                 for row in rows
+             ]
+
+     async def batch_create_file_descriptions(self, file_descriptions: List[FileDescription]) -> None:
+         """Batch create multiple file descriptions efficiently."""
+         if not file_descriptions:
+             return
+
+         async with self.get_connection() as db:
+             data = [
+                 (
+                     fd.project_id,
+                     fd.branch,
+                     fd.file_path,
+                     fd.description,
+                     fd.file_hash,
+                     fd.last_modified,
+                     fd.version,
+                     fd.source_project_id
+                 )
+                 for fd in file_descriptions
+             ]
+
+             await db.executemany(
+                 """
+                 INSERT OR REPLACE INTO file_descriptions
+                 (project_id, branch, file_path, description, file_hash, last_modified, version, source_project_id)
+                 VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+                 """,
+                 data
+             )
+             await db.commit()
+             logger.debug(f"Batch created {len(file_descriptions)} file descriptions")
+
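
Because both write paths use INSERT OR REPLACE, create_file_description and batch_create_file_descriptions behave as upserts keyed on (project_id, branch, file_path); the batch variant folds all rows into a single executemany() round-trip. A hedged sketch (paths and descriptions are illustrative; dm is a DatabaseManager as in the earlier sketch):

from mcp_code_indexer.database.models import FileDescription

async def seed_descriptions(dm: "DatabaseManager") -> None:
    descriptions = [
        FileDescription(
            project_id="proj-abc123",
            branch="main",
            file_path=path,
            description=text,
        )
        for path, text in [
            ("src/app.py", "Application entry point wiring routes together."),
            ("src/db.py", "Thin async wrapper around the connection pool."),
        ]
    ]
    # One executemany round-trip; re-running simply overwrites the
    # previous descriptions for the same (project_id, branch, file_path).
    await dm.batch_create_file_descriptions(descriptions)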
+     # Search operations
+
+     async def search_file_descriptions(
+         self,
+         project_id: str,
+         branch: str,
+         query: str,
+         max_results: int = 20
+     ) -> List[SearchResult]:
+         """Search file descriptions using FTS5."""
+         async with self.get_connection() as db:
+             cursor = await db.execute(
+                 """
+                 SELECT
+                     fd.project_id,
+                     fd.branch,
+                     fd.file_path,
+                     fd.description,
+                     fts.rank
+                 FROM file_descriptions_fts fts
+                 JOIN file_descriptions fd ON fd.rowid = fts.rowid
+                 WHERE fts MATCH ?
+                     AND fd.project_id = ?
+                     AND fd.branch = ?
+                 ORDER BY fts.rank
+                 LIMIT ?
+                 """,
+                 (query, project_id, branch, max_results)
+             )
+             rows = await cursor.fetchall()
+
+             return [
+                 SearchResult(
+                     project_id=row['project_id'],
+                     branch=row['branch'],
+                     file_path=row['file_path'],
+                     description=row['description'],
+                     relevance_score=row['rank']
+                 )
+                 for row in rows
+             ]
+
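
The query parameter is passed straight to FTS5's MATCH, so it accepts FTS5 query syntax (bare terms are implicitly AND-ed; phrases, OR/NOT, and prefix* also work). FTS5's rank column is a bm25-derived score where more negative means more relevant, which is why the SQL orders by rank ascending. A hedged sketch:

async def find_pool_code(dm: "DatabaseManager") -> None:
    results = await dm.search_file_descriptions(
        project_id="proj-abc123",  # illustrative ID
        branch="main",
        query="connection pool",   # FTS5: matches rows containing both terms
        max_results=10,
    )
    for r in results:
        # relevance_score carries the raw FTS5 rank (a negative bm25 score).
        print(f"{r.relevance_score:8.3f}  {r.file_path}: {r.description[:60]}")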
+     # Token cache operations
+
+     async def get_cached_token_count(self, cache_key: str) -> Optional[int]:
+         """Get cached token count if not expired."""
+         async with self.get_connection() as db:
+             cursor = await db.execute(
+                 """
+                 SELECT token_count FROM token_cache
+                 WHERE cache_key = ? AND (expires IS NULL OR expires > ?)
+                 """,
+                 (cache_key, datetime.utcnow())
+             )
+             row = await cursor.fetchone()
+             return row['token_count'] if row else None
+
+     async def cache_token_count(
+         self,
+         cache_key: str,
+         token_count: int,
+         ttl_hours: int = 24
+     ) -> None:
+         """Cache token count with TTL."""
+         expires = datetime.utcnow() + timedelta(hours=ttl_hours)
+
+         async with self.get_connection() as db:
+             await db.execute(
+                 """
+                 INSERT OR REPLACE INTO token_cache (cache_key, token_count, expires)
+                 VALUES (?, ?, ?)
+                 """,
+                 (cache_key, token_count, expires)
+             )
+             await db.commit()
+
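
The token cache is a plain key/value table: a row with expires = NULL never expires, and reads filter out stale rows rather than deleting them (that is cleanup_expired_cache's job, below). A hedged read-through pattern (the cache-key format and the count_tokens helper are hypothetical):

async def token_count_for_overview(dm: "DatabaseManager", text: str) -> int:
    cache_key = "proj-abc123:main:overview"  # hypothetical key format
    cached = await dm.get_cached_token_count(cache_key)
    if cached is not None:
        return cached
    tokens = count_tokens(text)  # hypothetical tokenizer helper
    await dm.cache_token_count(cache_key, tokens, ttl_hours=24)
    return tokens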
+     async def cleanup_expired_cache(self) -> None:
+         """Remove expired cache entries."""
+         async with self.get_connection() as db:
+             await db.execute(
+                 "DELETE FROM token_cache WHERE expires < ?",
+                 (datetime.utcnow(),)
+             )
+             await db.commit()
+
+     # Utility operations
+
+     async def get_file_count(self, project_id: str, branch: str) -> int:
+         """Get the count of files in a project branch."""
+         async with self.get_connection() as db:
+             cursor = await db.execute(
+                 "SELECT COUNT(*) as count FROM file_descriptions WHERE project_id = ? AND branch = ?",
+                 (project_id, branch)
+             )
+             row = await cursor.fetchone()
+             return row['count'] if row else 0
+
+     # Upstream inheritance operations
+
+     async def inherit_from_upstream(self, project: Project, target_branch: str = "main") -> int:
+         """
+         Inherit file descriptions from the upstream repository.
+
+         Args:
+             project: Target project that should inherit descriptions
+             target_branch: Branch to inherit descriptions into
+
+         Returns:
+             Number of descriptions inherited
+         """
+         if not project.upstream_origin:
+             return 0
+
+         # Find upstream project
+         upstream_project = await self.find_project_by_origin(project.upstream_origin)
+         if not upstream_project:
+             logger.debug(f"No upstream project found for {project.upstream_origin}")
+             return 0
+
+         # Get upstream descriptions
+         upstream_descriptions = await self.get_all_file_descriptions(
+             upstream_project.id, target_branch
+         )
+
+         if not upstream_descriptions:
+             logger.debug(f"No upstream descriptions found in branch {target_branch}")
+             return 0
+
+         # Get existing descriptions to avoid overwriting
+         existing_descriptions = await self.get_all_file_descriptions(
+             project.id, target_branch
+         )
+         existing_paths = {desc.file_path for desc in existing_descriptions}
+
+         # Create new descriptions for files that don't exist locally
+         inherited_descriptions = []
+         for upstream_desc in upstream_descriptions:
+             if upstream_desc.file_path not in existing_paths:
+                 new_desc = FileDescription(
+                     project_id=project.id,
+                     branch=target_branch,
+                     file_path=upstream_desc.file_path,
+                     description=upstream_desc.description,
+                     file_hash=None,  # Don't copy hash as local file may differ
+                     last_modified=datetime.utcnow(),
+                     version=1,
+                     source_project_id=upstream_project.id  # Track inheritance source
+                 )
+                 inherited_descriptions.append(new_desc)
+
+         if inherited_descriptions:
+             await self.batch_create_file_descriptions(inherited_descriptions)
+             logger.info(f"Inherited {len(inherited_descriptions)} descriptions from upstream")
+
+         return len(inherited_descriptions)
+
+     async def check_upstream_inheritance_needed(self, project: Project) -> bool:
+         """
+         Check whether a project needs upstream inheritance.
+
+         Args:
+             project: Project to check
+
+         Returns:
+             True if the project has an upstream but no descriptions yet
+         """
+         if not project.upstream_origin:
+             return False
+
+         # Check if the project has any descriptions
+         file_count = await self.get_file_count(project.id, "main")
+         return file_count == 0
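
A hedged sketch of how the two inheritance helpers compose for a freshly registered fork (IDs are illustrative; the caller that actually drives this flow is not shown in this diff):

async def bootstrap_fork(dm: "DatabaseManager") -> None:
    fork = await dm.get_project("proj-fork456")  # illustrative ID
    if fork and await dm.check_upstream_inheritance_needed(fork):
        # Copies upstream descriptions for files the fork has no entry for;
        # file_hash is left unset so local files can be re-verified later.
        count = await dm.inherit_from_upstream(fork, target_branch="main")
        print(f"Inherited {count} descriptions from {fork.upstream_origin}")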
@@ -0,0 +1,123 @@
+ """
+ Data models for the MCP Code Indexer.
+
+ This module defines Pydantic models for project tracking, file descriptions,
+ and merge conflicts. These models provide validation and serialization for
+ the database operations.
+ """
+
+ from datetime import datetime
+ from typing import List, Optional
+ from pydantic import BaseModel, Field
+
+
+ class Project(BaseModel):
+     """
+     Represents a tracked project/repository.
+
+     Projects are identified by a combination of git remotes and local paths,
+     allowing tracking across forks, renames, and different local copies.
+     """
+     id: str = Field(..., description="Generated unique identifier")
+     name: str = Field(..., description="User-provided project name")
+     remote_origin: Optional[str] = Field(None, description="Git remote origin URL")
+     upstream_origin: Optional[str] = Field(None, description="Upstream repository URL for forks")
+     aliases: List[str] = Field(default_factory=list, description="Alternative identifiers")
+     created: datetime = Field(default_factory=datetime.utcnow, description="Creation timestamp")
+     last_accessed: datetime = Field(default_factory=datetime.utcnow, description="Last access timestamp")
+
+
+ class FileDescription(BaseModel):
+     """
+     Represents a file description within a project branch.
+
+     Stores detailed summaries of file contents including purpose, components,
+     and relationships to enable efficient codebase navigation.
+     """
+     project_id: str = Field(..., description="Reference to project")
+     branch: str = Field(..., description="Git branch name")
+     file_path: str = Field(..., description="Relative path from project root")
+     description: str = Field(..., description="Detailed content description")
+     file_hash: Optional[str] = Field(None, description="SHA-256 of file contents")
+     last_modified: datetime = Field(default_factory=datetime.utcnow, description="Last update timestamp")
+     version: int = Field(default=1, description="For optimistic concurrency control")
+     source_project_id: Optional[str] = Field(None, description="Source project if copied from upstream")
+
+
+ class MergeConflict(BaseModel):
+     """
+     Represents a merge conflict between file descriptions.
+
+     Used during branch merging when the same file has different descriptions
+     in source and target branches.
+     """
+     id: Optional[int] = Field(None, description="Database ID")
+     project_id: str = Field(..., description="Project identifier")
+     file_path: str = Field(..., description="Path to conflicted file")
+     source_branch: str = Field(..., description="Branch being merged from")
+     target_branch: str = Field(..., description="Branch being merged into")
+     source_description: str = Field(..., description="Description from source branch")
+     target_description: str = Field(..., description="Description from target branch")
+     resolution: Optional[str] = Field(None, description="AI-provided resolution")
+     created: datetime = Field(default_factory=datetime.utcnow, description="Creation timestamp")
+
+
+ class CodebaseOverview(BaseModel):
+     """
+     Represents a complete codebase structure with file descriptions.
+
+     Provides a hierarchical view of project files with token count information
+     to help determine whether to use the full overview or a search-based approach.
+     """
+     project_name: str = Field(..., description="Project name")
+     branch: str = Field(..., description="Git branch")
+     total_files: int = Field(..., description="Total number of tracked files")
+     total_tokens: int = Field(..., description="Total token count for all descriptions")
+     is_large: bool = Field(..., description="True if exceeds configured token limit")
+     token_limit: int = Field(..., description="Current token limit setting")
+     structure: 'FolderNode' = Field(..., description="Hierarchical folder structure")
+
+
+ class FolderNode(BaseModel):
+     """
+     Represents a folder in the codebase hierarchy.
+     """
+     name: str = Field(..., description="Folder name")
+     path: str = Field(..., description="Full path from project root")
+     files: List['FileNode'] = Field(default_factory=list, description="Files in this folder")
+     folders: List['FolderNode'] = Field(default_factory=list, description="Subfolders")
+
+
+ class FileNode(BaseModel):
+     """
+     Represents a file in the codebase hierarchy.
+     """
+     name: str = Field(..., description="File name")
+     path: str = Field(..., description="Full path from project root")
+     description: str = Field(..., description="File description")
+
+
+ class SearchResult(BaseModel):
+     """
+     Represents a search result with relevance scoring.
+     """
+     file_path: str = Field(..., description="Path to the matching file")
+     description: str = Field(..., description="File description")
+     relevance_score: float = Field(..., description="Search relevance score")
+     project_id: str = Field(..., description="Project identifier")
+     branch: str = Field(..., description="Git branch")
+
+
+ class CodebaseSizeInfo(BaseModel):
+     """
+     Information about codebase size and token usage.
+     """
+     total_tokens: int = Field(..., description="Total token count")
+     is_large: bool = Field(..., description="Whether codebase exceeds token limit")
+     recommendation: str = Field(..., description="Recommended approach (use_search or use_overview)")
+     token_limit: int = Field(..., description="Configured token limit")
+
+
+ # Resolve string forward references for the recursive models
+ FolderNode.model_rebuild()
+ CodebaseOverview.model_rebuild()
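
FolderNode is self-referential and CodebaseOverview refers to it through a string annotation, so the trailing model_rebuild() calls are what resolve those forward references (Pydantic v2). A hedged sketch of building the recursive structure by hand (all values illustrative):

from mcp_code_indexer.database.models import CodebaseOverview, FileNode, FolderNode

root = FolderNode(
    name="",
    path="",
    files=[FileNode(name="README.md", path="README.md", description="Project overview.")],
    folders=[
        FolderNode(
            name="src",
            path="src",
            files=[FileNode(name="app.py", path="src/app.py", description="Entry point.")],
        )
    ],
)
overview = CodebaseOverview(
    project_name="my-service",
    branch="main",
    total_files=2,
    total_tokens=42,      # illustrative count
    is_large=False,
    token_limit=32000,    # illustrative limit
    structure=root,
)
print(overview.model_dump_json(indent=2))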