mcp-code-indexer 1.2.4__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ intelligent codebase navigation through searchable file descriptions,
6
6
  token-aware overviews, and advanced merge capabilities.
7
7
  """
8
8
 
9
- __version__ = "1.2.4"
9
+ __version__ = "1.3.0"
10
10
  __author__ = "MCP Code Indexer Contributors"
11
11
  __email__ = ""
12
12
  __license__ = "MIT"
@@ -648,10 +648,10 @@ class DatabaseManager:
648
648
  if stop_words_path.exists():
649
649
  with open(stop_words_path, 'r', encoding='utf-8') as f:
650
650
  for line in f:
651
- # Parse lines like "1: word" and extract just the word
652
- parts = line.strip().split(': ', 1)
653
- if len(parts) == 2:
654
- stop_words.add(parts[1].lower())
651
+ # Each line contains just the stop word
652
+ word = line.strip().lower()
653
+ if word: # Skip empty lines
654
+ stop_words.add(word)
655
655
 
656
656
  # Add common programming keywords to stop words
657
657
  programming_keywords = {
@@ -688,7 +688,118 @@ class DatabaseManager:
688
688
  ]
689
689
 
690
690
  return WordFrequencyResult(
691
- top_terms=top_terms,
692
- total_terms_analyzed=len(filtered_words),
693
- total_unique_terms=len(word_counts)
691
+ top_terms=top_terms,
692
+ total_terms_analyzed=len(filtered_words),
693
+ total_unique_terms=len(word_counts)
694
694
  )
695
+
696
async def cleanup_empty_projects(self) -> int:
    """
    Delete every project that has neither file descriptions nor an overview.

    Returns:
        Number of projects removed
    """
    # A project is "empty" when both LEFT JOINs find no matching row.
    find_empty_sql = """
        SELECT p.id, p.name
        FROM projects p
        LEFT JOIN file_descriptions fd ON p.id = fd.project_id
        LEFT JOIN project_overviews po ON p.id = po.project_id
        WHERE fd.project_id IS NULL AND po.project_id IS NULL
    """

    async with self.get_connection() as db:
        cursor = await db.execute(find_empty_sql)
        candidates = await cursor.fetchall()

        # Nothing to prune: leave before issuing any write or commit.
        if not candidates:
            return 0

        removed_count = 0
        for row in candidates:
            empty_id, empty_name = row['id'], row['name']
            await db.execute("DELETE FROM projects WHERE id = ?", (empty_id,))
            removed_count += 1
            logger.info(f"Removed empty project: {empty_name} (ID: {empty_id})")

        # All deletions are committed together once the loop has finished.
        await db.commit()
        return removed_count
731
+
732
async def get_project_map_data(self, project_identifier: str, branch: str = None) -> dict:
    """
    Get all data needed to generate a project map.

    Args:
        project_identifier: Project ID or project name. A 36-character value
            containing '-' is tried as an ID first; names are matched
            case-insensitively.
        branch: Branch name (optional). When omitted, the alphabetically first
            branch that has file descriptions is used, or 'main' if the
            project has none.

    Returns:
        Dictionary with keys 'project', 'branch', 'overview' and 'files',
        or None when no project matches the identifier
    """
    import json

    async with self.get_connection() as db:
        project_row = None

        # Shaped like a UUID: try it as a project ID first.
        if len(project_identifier) == 36 and '-' in project_identifier:
            cursor = await db.execute(
                "SELECT * FROM projects WHERE id = ?",
                (project_identifier,)
            )
            project_row = await cursor.fetchone()

        # Fall back to the name, so a project whose *name* merely looks like
        # a UUID is still found.
        if not project_row:
            cursor = await db.execute(
                "SELECT * FROM projects WHERE LOWER(name) = LOWER(?)",
                (project_identifier,)
            )
            project_row = await cursor.fetchone()

        if not project_row:
            return None

        # Aliases may come back from the database as a JSON-encoded string.
        project_dict = dict(project_row)
        if isinstance(project_dict['aliases'], str):
            project_dict['aliases'] = json.loads(project_dict['aliases'])

        project = Project(**project_dict)

        # If no branch was specified, pick one deterministically.
        if not branch:
            cursor = await db.execute(
                "SELECT DISTINCT branch FROM file_descriptions "
                "WHERE project_id = ? ORDER BY branch LIMIT 1",
                (project.id,)
            )
            branch_row = await cursor.fetchone()
            branch = branch_row['branch'] if branch_row else 'main'

        # Project overview for this project/branch (may not exist)
        cursor = await db.execute(
            "SELECT * FROM project_overviews WHERE project_id = ? AND branch = ?",
            (project.id, branch)
        )
        overview_row = await cursor.fetchone()
        project_overview = ProjectOverview(**dict(overview_row)) if overview_row else None

        # All file descriptions for this project/branch, ordered by path
        cursor = await db.execute(
            """SELECT * FROM file_descriptions
               WHERE project_id = ? AND branch = ?
               ORDER BY file_path""",
            (project.id, branch)
        )
        file_rows = await cursor.fetchall()
        file_descriptions = [FileDescription(**dict(row)) for row in file_rows]

        return {
            'project': project,
            'branch': branch,
            'overview': project_overview,
            'files': file_descriptions
        }
@@ -0,0 +1,542 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Git Hook Handler for MCP Code Indexer
4
+
5
+ Handles automated analysis of git changes and updates file descriptions
6
+ and project overview using OpenRouter API integration.
7
+ """
8
+
9
+ import asyncio
10
+ import json
11
+ import logging
12
+ import os
13
+ import subprocess
14
+ import tempfile
15
+ from pathlib import Path
16
+ from typing import Dict, List, Optional, Tuple, Any
17
+ from urllib.parse import urlparse
18
+
19
+ import aiohttp
20
+ from tenacity import retry, wait_exponential, stop_after_attempt, retry_if_exception_type
21
+
22
+ from .database.database import DatabaseManager
23
+ from .database.models import Project, FileDescription
24
+ from .error_handler import ValidationError
25
+
26
+
27
class GitHookError(Exception):
    """Raised when a step of git hook processing fails."""
30
+
31
+
32
class ThrottlingError(Exception):
    """Signals that the API answered with a rate-limit response."""
35
+
36
+
37
+ class GitHookHandler:
38
+ """
39
+ Handles git hook integration for automated code indexing.
40
+
41
+ This class provides functionality to:
42
+ - Analyze git diffs to identify changed files
43
+ - Use OpenRouter API to update file descriptions
44
+ - Update project overview when structural changes occur
45
+ """
46
+
47
+ # OpenRouter configuration
48
+ OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
49
+ OPENROUTER_MODEL = "anthropic/claude-sonnet-4"
50
+
51
def __init__(self, db_manager: DatabaseManager, cache_dir: Path):
    """
    Set up the handler and read its configuration from the environment.

    Args:
        db_manager: Database manager instance
        cache_dir: Cache directory for temporary files

    Raises:
        GitHookError: If OPENROUTER_API_KEY is unset or empty
    """
    self.db_manager = db_manager
    self.cache_dir = cache_dir
    self.logger = logging.getLogger(__name__)

    # MCP_GITHOOK_MODEL overrides the class-level default model.
    model_name = os.getenv("MCP_GITHOOK_MODEL", self.OPENROUTER_MODEL)
    self.config = dict(
        model=model_name,
        max_diff_size=100000,  # diffs longer than this are skipped
        timeout=30,
        temperature=0.3,  # low temperature for consistent updates
    )

    # The key is stored even when missing, then rejected.
    api_key = os.getenv("OPENROUTER_API_KEY")
    self.api_key = api_key
    if not api_key:
        raise GitHookError("OPENROUTER_API_KEY environment variable is required for git hook mode")
75
+
76
async def run_githook_mode(self) -> None:
    """
    Run in git hook mode - analyze changes and update descriptions.

    This is the main entry point for git hook functionality. It identifies
    the project, reads the latest diff, asks the OpenRouter API for updated
    descriptions and writes them to the database. The run is skipped (with
    an info log) when the diff is empty, exceeds config["max_diff_size"],
    or contains no file headers.

    Raises:
        GitHookError: If any step fails; the original exception is chained
            as the cause.
    """
    try:
        # Get git info from current directory
        project_info = await self._identify_project_from_git()

        git_diff = await self._get_git_diff()

        if not git_diff or len(git_diff) > self.config["max_diff_size"]:
            self.logger.info("Skipping git hook update - diff too large or empty")
            return

        # Parse the diff before touching the database, so a diff without
        # file headers costs no queries.
        changed_files = self._extract_changed_files(git_diff)
        if not changed_files:
            self.logger.info("No changed files detected in git diff")
            return

        # Fetch current state
        current_overview = await self._get_project_overview(project_info)
        current_descriptions = await self._get_all_descriptions(project_info)

        prompt = self._build_githook_prompt(
            git_diff,
            current_overview,
            current_descriptions,
            changed_files
        )

        updates = await self._call_openrouter(prompt)

        await self._apply_updates(project_info, updates)

        self.logger.info(f"Git hook update completed for {len(changed_files)} files")

    except Exception as e:
        self.logger.error(f"Git hook mode failed: {e}")
        # The failure is logged and re-raised as GitHookError; whether the
        # surrounding git operation should fail is left to the caller.
        raise GitHookError(f"Git hook processing failed: {e}") from e
122
+
123
async def _identify_project_from_git(self) -> Dict[str, Any]:
    """
    Identify project information from the git repository in the current directory.

    A missing "origin" or "upstream" remote is not an error: the matching
    value is None, and the project name is then derived by
    _extract_project_name (which falls back to the directory name).

    Returns:
        Dict with keys "projectName", "folderPath", "branch",
        "remoteOrigin" and "upstreamOrigin"

    Raises:
        GitHookError: If the repository cannot be inspected
    """
    try:
        # Current working directory is taken as the project root
        project_root = Path.cwd()

        # Both remotes are optional; git exits non-zero for an unknown remote.
        remote_urls = {}
        for remote_name in ("origin", "upstream"):
            try:
                output = await self._run_git_command(["remote", "get-url", remote_name])
            except subprocess.CalledProcessError:
                output = ""
            remote_urls[remote_name] = output.strip() or None

        remote_origin = remote_urls["origin"]
        upstream_origin = remote_urls["upstream"]

        # Current branch; "main" when git prints nothing
        branch_result = await self._run_git_command(["rev-parse", "--abbrev-ref", "HEAD"])
        branch = branch_result.strip() or "main"

        project_name = self._extract_project_name(remote_origin, project_root)

        return {
            "projectName": project_name,
            "folderPath": str(project_root),
            "branch": branch,
            "remoteOrigin": remote_origin,
            "upstreamOrigin": upstream_origin
        }

    except Exception as e:
        raise GitHookError(f"Failed to identify project from git: {e}") from e
163
+
164
+ def _extract_project_name(self, remote_origin: Optional[str], project_root: Path) -> str:
165
+ """Extract project name from remote URL or directory name."""
166
+ if remote_origin:
167
+ # Parse GitHub/GitLab URL
168
+ if remote_origin.startswith("git@"):
169
+ # SSH format: git@github.com:user/repo.git
170
+ parts = remote_origin.split(":")
171
+ if len(parts) >= 2:
172
+ repo_path = parts[-1].replace(".git", "")
173
+ return repo_path.split("/")[-1]
174
+ else:
175
+ # HTTPS format
176
+ parsed = urlparse(remote_origin)
177
+ if parsed.path:
178
+ repo_path = parsed.path.strip("/").replace(".git", "")
179
+ return repo_path.split("/")[-1]
180
+
181
+ # Fallback to directory name
182
+ return project_root.name
183
+
184
async def _get_git_diff(self) -> str:
    """
    Get git diff for the most recent commit.

    Tries, in order:
      1. HEAD~1..HEAD
      2. the root commit shown as a creation diff (HEAD has no parent)
      3. the staged changes (no commit exists yet)

    Returns:
        Git diff content as string

    Raises:
        GitHookError: If none of the git commands succeeds
    """
    diff_flags = ["--no-color", "--no-ext-diff"]

    try:
        return await self._run_git_command(["diff", *diff_flags, "HEAD~1..HEAD"])
    except subprocess.CalledProcessError:
        pass  # HEAD~1 does not resolve, e.g. on the first commit

    try:
        # --root renders the initial commit as a diff against nothing;
        # --no-commit-id keeps the commit hash out of the output.
        return await self._run_git_command(
            ["diff-tree", "-p", "--root", "--no-commit-id", *diff_flags, "HEAD"]
        )
    except subprocess.CalledProcessError:
        pass  # HEAD itself does not resolve

    try:
        return await self._run_git_command(["diff", *diff_flags, "--cached"])
    except subprocess.CalledProcessError as e:
        raise GitHookError(f"Failed to get git diff: {e}") from e
207
+
208
+ def _extract_changed_files(self, git_diff: str) -> List[str]:
209
+ """
210
+ Extract list of changed files from git diff.
211
+
212
+ Args:
213
+ git_diff: Git diff content
214
+
215
+ Returns:
216
+ List of file paths that changed
217
+ """
218
+ changed_files = []
219
+ lines = git_diff.split('\n')
220
+
221
+ for line in lines:
222
+ if line.startswith('diff --git a/'):
223
+ # Parse file path from diff header
224
+ # Format: diff --git a/path/to/file b/path/to/file
225
+ parts = line.split(' ')
226
+ if len(parts) >= 4:
227
+ file_path = parts[2][2:] # Remove 'a/' prefix
228
+ changed_files.append(file_path)
229
+
230
+ return changed_files
231
+
232
+ async def _get_project_overview(self, project_info: Dict[str, Any]) -> str:
233
+ """Get current project overview from database."""
234
+ try:
235
+ # Try to get existing project
236
+ project = await self.db_manager.get_project(
237
+ project_info["projectName"],
238
+ project_info["folderPath"],
239
+ project_info.get("remoteOrigin"),
240
+ project_info.get("upstreamOrigin")
241
+ )
242
+
243
+ if project:
244
+ overview = await self.db_manager.get_project_overview(
245
+ project.id, project_info["branch"]
246
+ )
247
+ return overview.overview if overview else ""
248
+
249
+ return ""
250
+
251
+ except Exception as e:
252
+ self.logger.warning(f"Failed to get project overview: {e}")
253
+ return ""
254
+
255
+ async def _get_all_descriptions(self, project_info: Dict[str, Any]) -> Dict[str, str]:
256
+ """Get all current file descriptions from database."""
257
+ try:
258
+ # Try to get existing project
259
+ project = await self.db_manager.get_project(
260
+ project_info["projectName"],
261
+ project_info["folderPath"],
262
+ project_info.get("remoteOrigin"),
263
+ project_info.get("upstreamOrigin")
264
+ )
265
+
266
+ if project:
267
+ descriptions = await self.db_manager.get_all_file_descriptions(
268
+ project.id, project_info["branch"]
269
+ )
270
+ return {desc.file_path: desc.description for desc in descriptions}
271
+
272
+ return {}
273
+
274
+ except Exception as e:
275
+ self.logger.warning(f"Failed to get file descriptions: {e}")
276
+ return {}
277
+
278
+ def _build_githook_prompt(
279
+ self,
280
+ git_diff: str,
281
+ overview: str,
282
+ descriptions: Dict[str, str],
283
+ changed_files: List[str]
284
+ ) -> str:
285
+ """
286
+ Build prompt for OpenRouter API to analyze git changes.
287
+
288
+ Args:
289
+ git_diff: Git diff content
290
+ overview: Current project overview
291
+ descriptions: Current file descriptions
292
+ changed_files: List of changed file paths
293
+
294
+ Returns:
295
+ Formatted prompt for the API
296
+ """
297
+ return f"""Analyze this git diff and update the file descriptions and project overview as needed.
298
+
299
+ CURRENT PROJECT OVERVIEW:
300
+ {overview or "No overview available"}
301
+
302
+ CURRENT FILE DESCRIPTIONS:
303
+ {json.dumps(descriptions, indent=2)}
304
+
305
+ GIT DIFF:
306
+ {git_diff}
307
+
308
+ CHANGED FILES:
309
+ {', '.join(changed_files)}
310
+
311
+ INSTRUCTIONS:
312
+
313
+ 1. **File Descriptions**: Update descriptions for any files that have changed significantly. Only include files that need actual description updates.
314
+
315
+ 2. **Project Overview**: Update ONLY if there are major structural changes like:
316
+ - New major features or components
317
+ - Architectural changes (new patterns, frameworks, or approaches)
318
+ - Significant dependency additions
319
+ - New API endpoints or workflows
320
+ - Changes to build/deployment processes
321
+
322
+ Do NOT update overview for minor changes like bug fixes, small refactors, or documentation updates.
323
+
324
+ 3. **Overview Format**: If updating the overview, follow this structure with comprehensive narrative (10-20 pages of text):
325
+
326
+ ````
327
+ ## Directory Structure
328
+ ```
329
+ src/
330
+ ├── api/ # REST API endpoints and middleware
331
+ ├── models/ # Database models and business logic
332
+ ├── services/ # External service integrations
333
+ ├── utils/ # Shared utilities and helpers
334
+ └── tests/ # Test suites
335
+ ```
336
+
337
+ ## Architecture Overview
338
+ [Describe how components interact, data flow, key design decisions]
339
+
340
+ ## Core Components
341
+ ### API Layer
342
+ [Details about API structure, authentication, routing]
343
+
344
+ ### Data Model
345
+ [Key entities, relationships, database design]
346
+
347
+ ## Key Workflows
348
+ 1. User Authentication Flow
349
+ [Step-by-step description]
350
+ 2. Data Processing Pipeline
351
+ [How data moves through the system]
352
+
353
+ [Continue with other sections...]
354
+ ````
355
+
356
+ Return ONLY a JSON object in this exact format:
357
+ {{
358
+ "file_updates": {{
359
+ "path/to/file1.py": "Updated description for file1",
360
+ "path/to/file2.js": "Updated description for file2"
361
+ }},
362
+ "overview_update": "Updated project overview text (or null if no update needed)"
363
+ }}
364
+
365
+ Return ONLY the JSON, no other text."""
366
+
367
@retry(
    wait=wait_exponential(multiplier=1, min=4, max=60),
    stop=stop_after_attempt(5),
    retry=retry_if_exception_type(ThrottlingError)
)
async def _call_openrouter(self, prompt: str) -> Dict[str, Any]:
    """
    Call OpenRouter API to analyze changes.

    Only ThrottlingError triggers the retry policy declared in the decorator
    (up to 5 attempts, exponential wait between 4 and 60 seconds).

    Args:
        prompt: Analysis prompt

    Returns:
        Parsed response with file updates and overview update

    Raises:
        ThrottlingError: If the API answers with HTTP 429
        GitHookError: If the request fails, times out, or the response does
            not have the expected shape
    """
    headers = {
        "Authorization": f"Bearer {self.api_key}",
        "HTTP-Referer": "https://github.com/fluffypony/mcp-code-indexer",
        "X-Title": "MCP Code Indexer Git Hook",
        "Content-Type": "application/json"
    }

    payload = {
        "model": self.config["model"],
        "messages": [
            {
                "role": "system",
                "content": "You are a technical assistant that analyzes code changes and updates file descriptions accurately and concisely."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": self.config["temperature"],
        "max_tokens": 24000,
    }

    timeout = aiohttp.ClientTimeout(total=self.config["timeout"])

    try:
        async with aiohttp.ClientSession(timeout=timeout) as session:
            async with session.post(
                self.OPENROUTER_API_URL,
                headers=headers,
                json=payload
            ) as response:

                if response.status == 429:
                    # Retry-After may be a number of seconds or an HTTP-date;
                    # it is only used in the message, so anything that is not
                    # an integer falls back to 60 instead of crashing.
                    try:
                        retry_after = int(response.headers.get("Retry-After", 60))
                    except (TypeError, ValueError):
                        retry_after = 60
                    raise ThrottlingError(f"Rate limited. Retry after {retry_after}s")

                response.raise_for_status()

                response_data = await response.json()

                if not isinstance(response_data, dict) or "choices" not in response_data:
                    raise GitHookError(f"Invalid API response format: {response_data}")

                # An empty or malformed "choices" list is a format error too,
                # not an unhandled KeyError/IndexError.
                try:
                    content = response_data["choices"][0]["message"]["content"]
                except (KeyError, IndexError, TypeError) as e:
                    raise GitHookError(f"Invalid API response format: {response_data}") from e

                return self._validate_githook_response(content)

    except aiohttp.ClientError as e:
        raise GitHookError(f"OpenRouter API request failed: {e}") from e
    except asyncio.TimeoutError as e:
        raise GitHookError("OpenRouter API request timed out") from e
433
+
434
+ def _validate_githook_response(self, response_text: str) -> Dict[str, Any]:
435
+ """
436
+ Validate and parse JSON response from OpenRouter.
437
+
438
+ Args:
439
+ response_text: Raw response content
440
+
441
+ Returns:
442
+ Validated response data
443
+ """
444
+ try:
445
+ data = json.loads(response_text.strip())
446
+
447
+ # Validate structure
448
+ if "file_updates" not in data:
449
+ raise ValueError("Missing 'file_updates' field")
450
+ if "overview_update" not in data:
451
+ raise ValueError("Missing 'overview_update' field")
452
+
453
+ if not isinstance(data["file_updates"], dict):
454
+ raise ValueError("'file_updates' must be a dictionary")
455
+
456
+ # Validate descriptions
457
+ for path, desc in data["file_updates"].items():
458
+ if not isinstance(desc, str) or not desc.strip():
459
+ raise ValueError(f"Invalid description for {path}")
460
+
461
+ return data
462
+
463
+ except json.JSONDecodeError as e:
464
+ raise GitHookError(f"Invalid JSON response from API: {e}")
465
+ except ValueError as e:
466
+ raise GitHookError(f"Invalid response structure: {e}")
467
+
468
async def _apply_updates(self, project_info: Dict[str, Any], updates: Dict[str, Any]) -> None:
    """
    Write the API's updates to the database.

    Args:
        project_info: Project identification info
        updates: Updates from OpenRouter API ("file_updates" mapping and
            optional "overview_update" text)

    Raises:
        GitHookError: If any database call fails
    """
    # NOTE(review): get_or_create_project, upsert_file_description and
    # upsert_project_overview are not visible here - confirm DatabaseManager
    # provides them with these arguments.
    try:
        project = await self.db_manager.get_or_create_project(
            project_info["projectName"],
            project_info["folderPath"],
            project_info.get("remoteOrigin"),
            project_info.get("upstreamOrigin")
        )

        # One upsert per file the API chose to update
        for file_path, description in updates.get("file_updates", {}).items():
            await self.db_manager.upsert_file_description(
                project_id=project.id,
                branch=project_info["branch"],
                file_path=file_path,
                description=description
            )
            self.logger.info(f"Updated description for {file_path}")

        # A missing, null or whitespace-only overview means "no change"
        new_overview = updates.get("overview_update")
        if new_overview and new_overview.strip():
            await self.db_manager.upsert_project_overview(
                project_id=project.id,
                branch=project_info["branch"],
                overview=new_overview
            )
            self.logger.info("Updated project overview")

    except Exception as e:
        raise GitHookError(f"Failed to apply updates to database: {e}")
508
+
509
async def _run_git_command(self, cmd: List[str]) -> str:
    """
    Run a git command in the current working directory and return its output.

    Args:
        cmd: Git command arguments (without the leading "git")

    Returns:
        Standard output decoded as UTF-8; undecodable bytes are replaced

    Raises:
        subprocess.CalledProcessError: If git exits with a non-zero status
        GitHookError: If the git executable cannot be found
    """
    full_cmd = ["git"] + cmd

    try:
        process = await asyncio.create_subprocess_exec(
            *full_cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            cwd=Path.cwd()
        )

        stdout, stderr = await process.communicate()

        if process.returncode != 0:
            raise subprocess.CalledProcessError(
                process.returncode,
                full_cmd,
                stdout,
                stderr
            )

        # Diffs can contain bytes that are not valid UTF-8 (other encodings,
        # binary snippets); replace them rather than fail the whole run.
        return stdout.decode('utf-8', errors='replace')

    except FileNotFoundError as e:
        raise GitHookError("Git command not found - ensure git is installed and in PATH") from e