mcp-code-indexer 3.3.0__tar.gz → 3.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/PKG-INFO +3 -3
  2. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/README.md +2 -2
  3. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/pyproject.toml +1 -1
  4. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/git_hook_handler.py +375 -53
  5. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/LICENSE +0 -0
  6. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/__init__.py +0 -0
  7. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/__main__.py +0 -0
  8. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/ask_handler.py +0 -0
  9. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/claude_api_handler.py +0 -0
  10. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/cleanup_manager.py +0 -0
  11. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/data/stop_words_english.txt +0 -0
  12. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/database/__init__.py +0 -0
  13. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/database/connection_health.py +0 -0
  14. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/database/database.py +0 -0
  15. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/database/exceptions.py +0 -0
  16. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/database/models.py +0 -0
  17. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/database/retry_executor.py +0 -0
  18. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/deepask_handler.py +0 -0
  19. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/error_handler.py +0 -0
  20. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/file_scanner.py +0 -0
  21. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/logging_config.py +0 -0
  22. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/main.py +0 -0
  23. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/middleware/__init__.py +0 -0
  24. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/middleware/error_middleware.py +0 -0
  25. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/migrations/001_initial.sql +0 -0
  26. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/migrations/002_performance_indexes.sql +0 -0
  27. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/migrations/003_project_overviews.sql +0 -0
  28. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/migrations/004_remove_branch_dependency.sql +0 -0
  29. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/migrations/005_remove_git_remotes.sql +0 -0
  30. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/query_preprocessor.py +0 -0
  31. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/server/__init__.py +0 -0
  32. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/server/mcp_server.py +0 -0
  33. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4 +0 -0
  34. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/token_counter.py +0 -0
  35. {mcp_code_indexer-3.3.0 → mcp_code_indexer-3.4.1}/src/mcp_code_indexer/tools/__init__.py +0 -0
--- mcp_code_indexer-3.3.0/PKG-INFO
+++ mcp_code_indexer-3.4.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: mcp-code-indexer
-Version: 3.3.0
+Version: 3.4.1
 Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
 License: MIT
 Keywords: mcp,model-context-protocol,code-indexer,ai-tools,codebase-navigation,file-descriptions,llm-tools
@@ -40,8 +40,8 @@ Description-Content-Type: text/markdown
 
 # MCP Code Indexer 🚀
 
-[![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?30)](https://badge.fury.io/py/mcp-code-indexer)
-[![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?30)](https://pypi.org/project/mcp-code-indexer/)
+[![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?32)](https://badge.fury.io/py/mcp-code-indexer)
+[![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?32)](https://pypi.org/project/mcp-code-indexer/)
 [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 
 A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
--- mcp_code_indexer-3.3.0/README.md
+++ mcp_code_indexer-3.4.1/README.md
@@ -1,7 +1,7 @@
 # MCP Code Indexer 🚀
 
-[![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?30)](https://badge.fury.io/py/mcp-code-indexer)
-[![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?30)](https://pypi.org/project/mcp-code-indexer/)
+[![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?32)](https://badge.fury.io/py/mcp-code-indexer)
+[![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?32)](https://pypi.org/project/mcp-code-indexer/)
 [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 
 A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "mcp-code-indexer"
7
- version = "3.3.0"
7
+ version = "3.4.1"
8
8
  description = "MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews."
9
9
  authors = ["MCP Code Indexer Contributors"]
10
10
  maintainers = ["MCP Code Indexer Contributors"]
--- mcp_code_indexer-3.3.0/src/mcp_code_indexer/git_hook_handler.py
+++ mcp_code_indexer-3.4.1/src/mcp_code_indexer/git_hook_handler.py
@@ -76,6 +76,7 @@ class GitHookHandler:
         self.config = {
             "model": os.getenv("MCP_GITHOOK_MODEL", self.OPENROUTER_MODEL),
             "max_diff_tokens": 136000,  # Skip if diff larger than this (in tokens)
+            "chunk_token_limit": 100000,  # Target token limit per chunk
             "timeout": 300,  # 5 minutes
             "temperature": 0.3,  # Lower temperature for consistent updates
         }
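Note: the new "chunk_token_limit" works alongside the existing "max_diff_tokens" ceiling rather than replacing it. A rough sketch of how the two knobs interact, using the values from this hunk (the helper function is illustrative only, not part of the package):

    # Illustrative: which analysis path a prompt of a given size ends up on,
    # per the escalation logic added later in this file.
    def expected_path(prompt_tokens: int) -> str:
        MAX_DIFF_TOKENS = 136_000    # single-stage / per-stage ceiling
        CHUNK_TOKEN_LIMIT = 100_000  # target size for each chunked prompt
        if prompt_tokens <= MAX_DIFF_TOKENS:
            return "single-stage"
        # Two-stage is tried next; if either stage's prompt still exceeds
        # MAX_DIFF_TOKENS it raises, and chunked processing takes over,
        # aiming for prompts near CHUNK_TOKEN_LIMIT.
        return "two-stage, escalating to chunked if a stage is still too large"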
@@ -87,6 +88,21 @@ class GitHookHandler:
                 "OPENROUTER_API_KEY environment variable is required for git hook mode"
             )
 
+    def _log_and_print(self, message: str, level: str = "info") -> None:
+        """
+        Log message and also print to stdout for user visibility.
+
+        Args:
+            message: Message to log and print
+            level: Log level (info, warning, error)
+        """
+        # Log to logger
+        getattr(self.logger, level)(message)
+
+        # Also print to stdout with prefix for visibility
+        prefix = "🔍" if level == "info" else "⚠️" if level == "warning" else "❌"
+        print(f"{prefix} {message}")
+
     async def run_githook_mode(
         self,
         commit_hash: Optional[str] = None,
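For context, the new helper mirrors each message to both the configured logger and stdout. Hypothetical calls (the "handler" instance name is assumed) and the stdout they would produce:

    handler._log_and_print("Analyzing diff: 1,234 tokens")
    # logs at INFO and prints: 🔍 Analyzing diff: 1,234 tokens

    handler._log_and_print("Chunk still too large", level="warning")
    # logs at WARNING and prints: ⚠️ Chunk still too large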
@@ -102,21 +118,20 @@ class GitHookHandler:
         This is the main entry point for git hook functionality.
         """
         try:
-            self.logger.info("=== Git Hook Analysis Started ===")
+            self._log_and_print("=== Git Hook Analysis Started ===")
             if commit_hash:
-                self.logger.info(f"Mode: Single commit ({commit_hash})")
+                self._log_and_print(f"Mode: Single commit ({commit_hash})")
             elif commit_range:
-                self.logger.info(
+                self._log_and_print(
                     f"Mode: Commit range ({commit_range[0]}..{commit_range[1]})"
                 )
             else:
-                self.logger.info("Mode: Staged changes")
+                self._log_and_print("Mode: Staged changes")
 
             # Get git info from current directory
             project_info = await self._identify_project_from_git()
-            self.logger.info(
-                f"Project identified: {project_info.get('name', 'Unknown')} "
-                f"at {project_info.get('folderPath', 'Unknown')}"
+            self._log_and_print(
+                f"Project: {project_info.get('name', 'Unknown')}"
             )
 
             # Get git diff and commit message based on mode
@@ -136,25 +151,23 @@ class GitHookHandler:
 
             # Log diff details
             if not git_diff:
-                self.logger.info("Skipping git hook update - no git diff")
+                self._log_and_print("No changes detected, skipping analysis")
                 return
 
             diff_tokens = self.token_counter.count_tokens(git_diff)
-            self.logger.info(f"Git diff: {diff_tokens} tokens")
+            self._log_and_print(f"Analyzing diff: {diff_tokens:,} tokens")
 
             # Fetch current state
-            self.logger.info("Fetching current project state...")
+            self._log_and_print("Fetching current project state...")
             current_overview = await self._get_project_overview(project_info)
             current_descriptions = await self._get_all_descriptions(project_info)
             changed_files = self._extract_changed_files(git_diff)
 
             if not changed_files:
-                self.logger.info("No changed files detected in git diff")
+                self._log_and_print("No files changed, skipping analysis")
                 return
 
-            self.logger.info(
-                f"Found {len(changed_files)} changed files: {', '.join(changed_files)}"
-            )
+            self._log_and_print(f"Found {len(changed_files)} changed files")
             overview_tokens = (
                 self.token_counter.count_tokens(current_overview)
                 if current_overview
@@ -174,13 +187,23 @@ class GitHookHandler:
 
             # Apply updates to database
             await self._apply_updates(project_info, updates)
-
-            self.logger.info(
-                f"Git hook update completed successfully for {len(changed_files)} files"
-            )
+
+            # Count actual updates
+            file_update_count = len(updates.get("file_updates", {}))
+            overview_updated = bool(updates.get("overview_update"))
+
+            if file_update_count > 0 or overview_updated:
+                update_parts = []
+                if file_update_count > 0:
+                    update_parts.append(f"{file_update_count} file descriptions")
+                if overview_updated:
+                    update_parts.append("project overview")
+                self._log_and_print(f"✅ Updated {' and '.join(update_parts)}")
+            else:
+                self._log_and_print("✅ Analysis complete, no updates needed")
 
         except Exception as e:
-            self.logger.error(f"Git hook mode failed: {e}")
+            self._log_and_print(f"Git hook analysis failed: {e}", "error")
             self.logger.error(f"Exception details: {type(e).__name__}: {str(e)}")
             import traceback
 
@@ -197,8 +220,8 @@ class GitHookHandler:
         changed_files: List[str],
     ) -> Dict[str, Any]:
         """
-        Smart staging: Try single-stage first, fall back to two-stage if
-        token limit exceeded.
+        Smart staging: Try single-stage first, fall back to two-stage,
+        then chunked processing if needed.
 
         Args:
             git_diff: Git diff content
@@ -229,35 +252,31 @@ class GitHookHandler:
 
         if prompt_tokens <= token_limit:
             # Use single-stage approach
-            self.logger.info("Using single-stage analysis (within token limit)")
+            self._log_and_print("Using single-stage analysis")
             result = await self._call_openrouter(single_stage_prompt)
-            self.logger.info("Single-stage analysis completed")
             return result
         else:
             # Fall back to two-stage approach
-            self.logger.info(
-                f"Single-stage prompt too large ({prompt_tokens} tokens), "
-                f"falling back to two-stage analysis"
-            )
-
-            # Stage 1: Check if overview needs updating
-            overview_updates = await self._analyze_overview_updates(
-                git_diff, commit_message, current_overview, changed_files
-            )
-
-            # Stage 2: Update file descriptions
-            file_updates = await self._analyze_file_updates(
-                git_diff, commit_message, current_descriptions, changed_files
-            )
-
-            # Combine updates
-            updates = {
-                "file_updates": file_updates.get("file_updates", {}),
-                "overview_update": overview_updates.get("overview_update"),
-            }
+            self._log_and_print("Using two-stage analysis (large diff)")
 
-            self.logger.info("Two-stage analysis completed")
-            return updates
+            # Try two-stage analysis first
+            try:
+                return await self._analyze_with_two_stage(
+                    git_diff, commit_message, current_overview,
+                    current_descriptions, changed_files
+                )
+            except GitHookError as e:
+                if "too large" in str(e).lower():
+                    # Fall back to chunked processing
+                    self._log_and_print(
+                        "Using chunked processing (very large diff)"
+                    )
+                    return await self._analyze_with_chunking(
+                        git_diff, commit_message, current_overview,
+                        current_descriptions, changed_files
+                    )
+                else:
+                    raise
 
     def _build_single_stage_prompt(
         self,
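The hunk above replaces the unconditional two-stage fallback with a three-tier escalation. Condensed to its control flow (same names as the diff; argument lists elided with "..." for brevity):

    # Condensed restatement of the escalation above, not additional behavior.
    if prompt_tokens <= token_limit:
        return await self._call_openrouter(single_stage_prompt)  # tier 1
    try:
        return await self._analyze_with_two_stage(...)           # tier 2
    except GitHookError as e:
        if "too large" in str(e).lower():                        # raised by either stage
            return await self._analyze_with_chunking(...)        # tier 3
        raise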
@@ -571,6 +590,118 @@ Return ONLY a JSON object:
             self.logger.warning(f"Failed to get file descriptions: {e}")
             return {}
 
+    async def _analyze_with_two_stage(
+        self,
+        git_diff: str,
+        commit_message: str,
+        current_overview: str,
+        current_descriptions: Dict[str, str],
+        changed_files: List[str],
+    ) -> Dict[str, Any]:
+        """
+        Two-stage analysis: overview updates first, then file updates.
+
+        Args:
+            git_diff: Git diff content
+            commit_message: Commit message explaining the changes
+            current_overview: Current project overview
+            current_descriptions: Current file descriptions
+            changed_files: List of changed file paths
+
+        Returns:
+            Dict containing file_updates and overview_update
+        """
+        # Stage 1: Check if overview needs updating
+        overview_updates = await self._analyze_overview_updates(
+            git_diff, commit_message, current_overview, changed_files
+        )
+
+        # Stage 2: Update file descriptions
+        file_updates = await self._analyze_file_updates(
+            git_diff, commit_message, current_descriptions, changed_files
+        )
+
+        # Combine updates
+        updates = {
+            "file_updates": file_updates.get("file_updates", {}),
+            "overview_update": overview_updates.get("overview_update"),
+        }
+
+        self.logger.info("Two-stage analysis completed")
+        return updates
+
+    async def _analyze_with_chunking(
+        self,
+        git_diff: str,
+        commit_message: str,
+        current_overview: str,
+        current_descriptions: Dict[str, str],
+        changed_files: List[str],
+    ) -> Dict[str, Any]:
+        """
+        Chunked processing: Break large diffs into manageable chunks.
+
+        Args:
+            git_diff: Git diff content
+            commit_message: Commit message explaining the changes
+            current_overview: Current project overview
+            current_descriptions: Current file descriptions
+            changed_files: List of changed file paths
+
+        Returns:
+            Dict containing file_updates and overview_update
+        """
+        self._log_and_print(
+            f"Starting chunked processing for {len(changed_files)} files"
+        )
+
+        # First, handle overview separately if needed
+        overview_update = None
+        if current_overview:
+            overview_update = await self._analyze_overview_lightweight(
+                commit_message, current_overview, changed_files
+            )
+
+        # Break changed files into chunks and process file descriptions
+        chunk_size = await self._calculate_optimal_chunk_size(
+            git_diff, changed_files
+        )
+
+        self._log_and_print(f"Processing in {chunk_size}-file chunks")
+
+        all_file_updates = {}
+
+        for i in range(0, len(changed_files), chunk_size):
+            chunk_files = changed_files[i:i + chunk_size]
+            chunk_number = (i // chunk_size) + 1
+            total_chunks = (len(changed_files) + chunk_size - 1) // chunk_size
+
+            self._log_and_print(
+                f"Processing chunk {chunk_number}/{total_chunks} "
+                f"({len(chunk_files)} files)"
+            )
+
+            # Extract diff content for this chunk
+            chunk_diff = self._extract_chunk_diff(git_diff, chunk_files)
+
+            # Process this chunk
+            chunk_updates = await self._analyze_file_chunk(
+                chunk_diff, commit_message, current_descriptions, chunk_files
+            )
+
+            # Merge results
+            if chunk_updates and "file_updates" in chunk_updates:
+                all_file_updates.update(chunk_updates["file_updates"])
+
+        self.logger.info(
+            f"Chunked processing completed: updated {len(all_file_updates)} files"
+        )
+
+        return {
+            "file_updates": all_file_updates,
+            "overview_update": overview_update
+        }
+
     async def _analyze_overview_updates(
         self,
         git_diff: str,
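The total_chunks expression in the loop above is integer ceiling division. With hypothetical inputs of 23 changed files and a chunk size of 5, it yields 5 chunks sized 5, 5, 5, 5, and 3:

    changed_files = [f"file_{i}.py" for i in range(23)]  # hypothetical input
    chunk_size = 5
    total_chunks = (len(changed_files) + chunk_size - 1) // chunk_size  # (23 + 4) // 5 == 5
    sizes = [len(changed_files[i:i + chunk_size])
             for i in range(0, len(changed_files), chunk_size)]
    assert total_chunks == 5 and sizes == [5, 5, 5, 5, 3]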
@@ -632,11 +763,9 @@ Return ONLY a JSON object:
         self.logger.info(f"Stage 1 prompt: {prompt_tokens} tokens")
 
         if prompt_tokens > self.config["max_diff_tokens"]:
-            self.logger.warning(
-                f"Stage 1 prompt too large ({prompt_tokens} tokens), "
-                f"skipping overview analysis"
+            raise GitHookError(
+                f"Stage 1 prompt too large ({prompt_tokens} tokens)"
             )
-            return {"overview_update": None}
 
         # Call OpenRouter API
         result = await self._call_openrouter(prompt)
@@ -708,11 +837,9 @@ Return ONLY a JSON object:
         self.logger.info(f"Stage 2 prompt: {prompt_tokens} tokens")
 
         if prompt_tokens > self.config["max_diff_tokens"]:
-            self.logger.warning(
-                f"Stage 2 prompt too large ({prompt_tokens} tokens), "
-                f"skipping file analysis"
+            raise GitHookError(
+                f"Stage 2 prompt too large ({prompt_tokens} tokens)"
             )
-            return {"file_updates": {}}
 
         # Call OpenRouter API
         result = await self._call_openrouter(prompt)
@@ -720,6 +847,201 @@ Return ONLY a JSON object:
 
         return result
 
+    async def _analyze_overview_lightweight(
+        self,
+        commit_message: str,
+        current_overview: str,
+        changed_files: List[str],
+    ) -> Optional[str]:
+        """
+        Lightweight overview analysis without including full diff.
+
+        Args:
+            commit_message: Commit message explaining the changes
+            current_overview: Current project overview
+            changed_files: List of changed file paths
+
+        Returns:
+            Updated overview text or None
+        """
+        self.logger.info("Lightweight overview analysis...")
+
+        prompt = f"""Analyze this commit to determine if project overview needs updating.
+
+COMMIT MESSAGE:
+{commit_message or "No commit message available"}
+
+CURRENT PROJECT OVERVIEW:
+{current_overview or "No overview available"}
+
+CHANGED FILES:
+{', '.join(changed_files)}
+
+INSTRUCTIONS:
+Update project overview ONLY if there are major structural changes like:
+- New major features or components (indicated by commit message or new directories)
+- Architectural changes (new patterns, frameworks, or approaches)
+- Significant dependency additions (Cargo.toml, package.json, pyproject.toml changes)
+- New API endpoints or workflows
+- Changes to build/deployment processes
+
+Do NOT update for: bug fixes, small refactors, documentation updates, version bumps.
+
+Return ONLY a JSON object:
+{{
+    "overview_update": "Updated overview text" or null
+}}"""
+
+        try:
+            result = await self._call_openrouter(prompt)
+            return result.get("overview_update")
+        except Exception as e:
+            self.logger.warning(f"Lightweight overview analysis failed: {e}")
+            return None
+
+    async def _calculate_optimal_chunk_size(
+        self, git_diff: str, changed_files: List[str]
+    ) -> int:
+        """
+        Calculate optimal chunk size based on diff content.
+
+        Args:
+            git_diff: Full git diff content
+            changed_files: List of changed file paths
+
+        Returns:
+            Optimal number of files per chunk
+        """
+        if not changed_files:
+            return 10  # Default chunk size
+
+        # Estimate average diff size per file
+        total_diff_tokens = self.token_counter.count_tokens(git_diff)
+        avg_tokens_per_file = total_diff_tokens / len(changed_files)
+
+        # Target chunk token limit
+        chunk_limit = self.config.get("chunk_token_limit", 100000)
+
+        # Calculate chunk size with buffer for overhead
+        overhead_factor = 0.7  # Reserve 30% for prompt overhead
+        effective_limit = chunk_limit * overhead_factor
+
+        chunk_size = max(1, int(effective_limit / avg_tokens_per_file))
+
+        # Cap at reasonable limits
+        chunk_size = min(chunk_size, 50)  # Max 50 files per chunk
+        chunk_size = max(chunk_size, 5)  # Min 5 files per chunk
+
+        self.logger.info(
+            f"Calculated chunk size: {chunk_size} files "
+            f"(avg {avg_tokens_per_file:.0f} tokens/file, "
+            f"target {chunk_limit} tokens/chunk)"
+        )
+
+        return chunk_size
+
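Worked through with hypothetical numbers: a 600,000-token diff touching 120 files averages 5,000 tokens per file; 70% of the 100,000-token chunk budget leaves 70,000 effective tokens, so the method returns 14 files per chunk, which already sits inside the 5-50 clamp:

    total_diff_tokens = 600_000                           # hypothetical diff size
    num_files = 120                                       # hypothetical file count
    avg_tokens_per_file = total_diff_tokens / num_files   # 5000.0
    effective_limit = 100_000 * 0.7                       # 70000.0
    chunk_size = max(1, int(effective_limit / avg_tokens_per_file))  # 14
    chunk_size = max(min(chunk_size, 50), 5)              # still 14 after clamping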
+ def _extract_chunk_diff(self, git_diff: str, chunk_files: List[str]) -> str:
944
+ """
945
+ Extract diff content for specific files.
946
+
947
+ Args:
948
+ git_diff: Full git diff content
949
+ chunk_files: List of files to include in chunk
950
+
951
+ Returns:
952
+ Filtered diff content for chunk files only
953
+ """
954
+ lines = git_diff.split('\n')
955
+ chunk_lines = []
956
+ current_file = None
957
+ include_section = False
958
+
959
+ for line in lines:
960
+ if line.startswith('diff --git'):
961
+ # Parse file path from diff header
962
+ parts = line.split(' ')
963
+ if len(parts) >= 4:
964
+ file_path = parts[2][2:] # Remove 'a/' prefix
965
+ current_file = file_path
966
+ include_section = file_path in chunk_files
967
+
968
+ if include_section:
969
+ chunk_lines.append(line)
970
+
971
+ return '\n'.join(chunk_lines)
972
+
973
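The filter above keys off unified-diff file headers. For a header line such as the hypothetical one below, split(' ') yields four parts, and parts[2][2:] drops the "a/" prefix to recover the path that is matched against chunk_files:

    line = "diff --git a/src/mcp_code_indexer/main.py b/src/mcp_code_indexer/main.py"
    parts = line.split(' ')   # ['diff', '--git', 'a/src/...', 'b/src/...']
    file_path = parts[2][2:]  # strip the leading 'a/'
    assert file_path == "src/mcp_code_indexer/main.py"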
+    async def _analyze_file_chunk(
+        self,
+        chunk_diff: str,
+        commit_message: str,
+        current_descriptions: Dict[str, str],
+        chunk_files: List[str],
+    ) -> Dict[str, Any]:
+        """
+        Analyze a chunk of files for description updates.
+
+        Args:
+            chunk_diff: Git diff for this chunk only
+            commit_message: Commit message explaining the changes
+            current_descriptions: Current file descriptions
+            chunk_files: List of files in this chunk
+
+        Returns:
+            Dict with file_updates for this chunk
+        """
+        # Only include descriptions for files in this chunk
+        relevant_descriptions = {
+            path: desc
+            for path, desc in current_descriptions.items()
+            if path in chunk_files
+        }
+
+        prompt = f"""Analyze this git commit chunk and update file descriptions.
+
+COMMIT MESSAGE:
+{commit_message or "No commit message available"}
+
+CURRENT FILE DESCRIPTIONS (for chunk files only):
+{json.dumps(relevant_descriptions, indent=2)}
+
+CHUNK FILES:
+{', '.join(chunk_files)}
+
+GIT DIFF (chunk only):
+{chunk_diff}
+
+INSTRUCTIONS:
+Use the COMMIT MESSAGE to understand the intent and context of the changes.
+Update descriptions for files that have changed significantly.
+Only include files that need actual description updates.
+
+Return ONLY a JSON object:
+{{
+    "file_updates": {{
+        "path/to/file1.py": "Updated description for file1",
+        "path/to/file2.js": "Updated description for file2"
+    }}
+}}"""
+
+        # Check token count
+        prompt_tokens = self.token_counter.count_tokens(prompt)
+        self.logger.info(f"Chunk prompt: {prompt_tokens} tokens")
+
+        if prompt_tokens > self.config.get("chunk_token_limit", 100000):
+            self.logger.warning(
+                f"Chunk still too large ({prompt_tokens} tokens), "
+                f"skipping {len(chunk_files)} files"
+            )
+            return {"file_updates": {}}
+
+        # Call OpenRouter API
+        try:
+            result = await self._call_openrouter(prompt)
+            return result
+        except Exception as e:
+            self.logger.error(f"Failed to analyze chunk: {e}")
+            return {"file_updates": {}}
+
     @retry(
         wait=wait_exponential(multiplier=1, min=4, max=60),
         stop=stop_after_attempt(5),