mcp_code_indexer-2.2.1-py3-none-any.whl → mcp_code_indexer-2.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcp_code_indexer/database/database.py CHANGED
@@ -30,6 +30,7 @@ from mcp_code_indexer.database.exceptions import (
 from mcp_code_indexer.database.connection_health import (
     ConnectionHealthMonitor, DatabaseMetricsCollector
 )
+from mcp_code_indexer.query_preprocessor import preprocess_search_query
 
 logger = logging.getLogger(__name__)
 
@@ -848,7 +849,16 @@ class DatabaseManager:
         query: str,
         max_results: int = 20
     ) -> List[SearchResult]:
-        """Search file descriptions using FTS5."""
+        """Search file descriptions using FTS5 with intelligent query preprocessing."""
+        # Preprocess query for optimal FTS5 search
+        preprocessed_query = preprocess_search_query(query)
+
+        if not preprocessed_query:
+            logger.debug(f"Empty query after preprocessing: '{query}'")
+            return []
+
+        logger.debug(f"Search query preprocessing: '{query}' -> '{preprocessed_query}'")
+
         async with self.get_connection() as db:
             cursor = await db.execute(
                 """
@@ -866,7 +876,7 @@ class DatabaseManager:
                 ORDER BY bm25(file_descriptions_fts)
                 LIMIT ?
                 """,
-                (query, project_id, branch, max_results)
+                (preprocessed_query, project_id, branch, max_results)
             )
             rows = await cursor.fetchall()
 
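For context, here is a minimal, self-contained sketch (not part of the diff) of the behaviour this change enables: an order-agnostic, whole-word, multi-word match ranked by bm25(). Only the file_descriptions_fts table name is taken from the diff; the single-column schema and sample rows are illustrative, and it assumes a Python build whose bundled SQLite includes FTS5.

import sqlite3

conn = sqlite3.connect(":memory:")
# Illustrative one-column FTS5 table; the real table has more columns.
conn.execute("CREATE VIRTUAL TABLE file_descriptions_fts USING fts5(description)")
conn.executemany(
    "INSERT INTO file_descriptions_fts VALUES (?)",
    [
        ("Proto definitions for the gRPC search service",),
        ("HTTP handlers, no protobuf here",),
    ],
)
# '"grpc" AND "proto"' is what preprocess_search_query("grpc proto") produces.
rows = conn.execute(
    "SELECT description FROM file_descriptions_fts "
    "WHERE file_descriptions_fts MATCH ? "
    "ORDER BY bm25(file_descriptions_fts)",
    ('"grpc" AND "proto"',),
).fetchall()
print(rows)  # first row only: terms match in any order, and "protobuf" is not "proto"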
mcp_code_indexer/main.py CHANGED
@@ -294,6 +294,7 @@ async def handle_runcommand(args: argparse.Namespace) -> None:
         "update_codebase_overview": server._handle_update_codebase_overview,
         "get_word_frequency": server._handle_get_word_frequency,
         "merge_branch_descriptions": server._handle_merge_branch_descriptions,
+        "search_codebase_overview": server._handle_search_codebase_overview,
     }
 
     if tool_name not in tool_handlers:
mcp_code_indexer/query_preprocessor.py ADDED
@@ -0,0 +1,181 @@
+"""
+Query preprocessing module for intelligent FTS5 search.
+
+This module provides intelligent query preprocessing for SQLite FTS5 full-text search
+to enable multi-word search with case insensitive matching, whole word enforcement,
+and proper handling of FTS5 operators as literal search terms.
+
+Key features:
+- Multi-word queries: "grpc proto" becomes "grpc" AND "proto" for order-agnostic matching
+- FTS5 operator escaping: "AND OR" becomes '"AND" AND "OR"' to treat operators as literals
+- Whole word matching: prevents partial matches by relying on proper tokenization
+- Case insensitive: leverages FTS5 default behavior
+- Special character handling: preserves special characters in quoted terms
+"""
+
+import re
+import logging
+from typing import List, Set
+
+logger = logging.getLogger(__name__)
+
+
+class QueryPreprocessor:
+    """
+    Preprocesses user queries for optimal FTS5 search performance.
+
+    Handles multi-word queries, operator escaping, and special character preservation
+    while maintaining BM25 ranking performance.
+    """
+
+    # FTS5 operators that need to be escaped when used as literal search terms
+    FTS5_OPERATORS: Set[str] = {
+        'AND', 'OR', 'NOT', 'NEAR'
+    }
+
+    def __init__(self):
+        """Initialize the query preprocessor."""
+        pass
+
+    def preprocess_query(self, query: str) -> str:
+        """
+        Preprocess a user query for FTS5 search.
+
+        Args:
+            query: Raw user query string
+
+        Returns:
+            Preprocessed query string optimized for FTS5
+
+        Examples:
+            >>> preprocessor = QueryPreprocessor()
+            >>> preprocessor.preprocess_query("grpc proto")
+            '"grpc" AND "proto"'
+            >>> preprocessor.preprocess_query("error AND handling")
+            '"error" AND "AND" AND "handling"'
+            >>> preprocessor.preprocess_query('config "file system"')
+            '"config" AND "file system"'
+        """
+        if not query or not query.strip():
+            return ""
+
+        # Normalize whitespace
+        query = query.strip()
+
+        # Split into terms while preserving quoted phrases
+        terms = self._split_terms(query)
+
+        if not terms:
+            return ""
+
+        # Process each term: escape operators and add quotes
+        processed_terms = []
+        for term in terms:
+            processed_term = self._process_term(term)
+            if processed_term:  # Skip empty terms
+                processed_terms.append(processed_term)
+
+        if not processed_terms:
+            return ""
+
+        # Join with AND for multi-word matching
+        result = " AND ".join(processed_terms)
+
+        logger.debug(f"Preprocessed query: '{query}' -> '{result}'")
+        return result
+
+    def _split_terms(self, query: str) -> List[str]:
+        """
+        Split query into terms while preserving quoted phrases.
+
+        Args:
+            query: Input query string
+
+        Returns:
+            List of terms and quoted phrases
+
+        Examples:
+            'grpc proto' -> ['grpc', 'proto']
+            'config "file system"' -> ['config', '"file system"']
+            'error AND handling' -> ['error', 'AND', 'handling']
+        """
+        terms = []
+
+        # Regex to match quoted phrases or individual words
+        # This pattern captures:
+        # 1. Double-quoted strings (including the quotes)
+        # 2. Single words (sequences of non-whitespace characters)
+        pattern = r'"[^"]*"|\S+'
+
+        matches = re.findall(pattern, query)
+
+        for match in matches:
+            # Skip empty matches
+            if match.strip():
+                terms.append(match)
+
+        return terms
+
+    def _process_term(self, term: str) -> str:
+        """
+        Process a single term: escape operators and ensure proper quoting.
+
+        Args:
+            term: Single term or quoted phrase
+
+        Returns:
+            Processed term ready for FTS5
+
+        Examples:
+            'grpc' -> '"grpc"'
+            'AND' -> '"AND"'
+            '"file system"' -> '"file system"'
+            'c++' -> '"c++"'
+        """
+        if not term:
+            return ""
+
+        # If already quoted, return as-is (user intentional phrase)
+        if term.startswith('"') and term.endswith('"') and len(term) >= 2:
+            return term
+
+        # Check if term is an FTS5 operator (case-insensitive)
+        if term.upper() in self.FTS5_OPERATORS:
+            # Escape operator by quoting
+            escaped_term = f'"{term}"'
+            logger.debug(f"Escaped FTS5 operator: '{term}' -> '{escaped_term}'")
+            return escaped_term
+
+        # Quote all terms to ensure whole-word matching and handle special characters
+        return f'"{term}"'
+
+    def _escape_quotes_in_term(self, term: str) -> str:
+        """
+        Escape internal quotes in a term for FTS5 compatibility.
+
+        Args:
+            term: Term that may contain quotes
+
+        Returns:
+            Term with escaped quotes
+
+        Examples:
+            'say "hello"' -> 'say ""hello""'
+            "test's file" -> "test's file"
+        """
+        # In FTS5, quotes inside quoted strings are escaped by doubling them
+        return term.replace('"', '""')
+
+
+def preprocess_search_query(query: str) -> str:
+    """
+    Convenience function for preprocessing search queries.
+
+    Args:
+        query: Raw user query
+
+    Returns:
+        Preprocessed query ready for FTS5
+    """
+    preprocessor = QueryPreprocessor()
+    return preprocessor.preprocess_query(query)
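The module's doctest examples translate directly into usage. A quick sketch of the convenience function, with expected outputs copied from the docstrings above:

from mcp_code_indexer.query_preprocessor import preprocess_search_query

print(preprocess_search_query("grpc proto"))            # '"grpc" AND "proto"'
print(preprocess_search_query("error AND handling"))    # '"error" AND "AND" AND "handling"'
print(preprocess_search_query('config "file system"'))  # '"config" AND "file system"'
print(repr(preprocess_search_query("   ")))             # '' (search_file_descriptions treats this as "no results")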
mcp_code_indexer/server/mcp_server.py CHANGED
@@ -478,6 +478,23 @@ src/
                     "properties": {},
                     "additionalProperties": False
                 }
+            ),
+            types.Tool(
+                name="search_codebase_overview",
+                description="Search for a single word in the codebase overview and return 2 sentences before and after where the word is found. Useful for quickly finding specific information in large overviews.",
+                inputSchema={
+                    "type": "object",
+                    "properties": {
+                        "projectName": {"type": "string", "description": "The name of the project"},
+                        "folderPath": {"type": "string", "description": "Absolute path to the project folder on disk"},
+                        "branch": {"type": "string", "description": "Git branch name"},
+                        "remoteOrigin": {"type": "string", "description": "Git remote origin URL if available"},
+                        "upstreamOrigin": {"type": "string", "description": "Upstream repository URL if this is a fork"},
+                        "searchWord": {"type": "string", "description": "Single word to search for in the overview"}
+                    },
+                    "required": ["projectName", "folderPath", "branch", "searchWord"],
+                    "additionalProperties": False
+                }
             )
         ]
 
@@ -503,6 +520,7 @@ src/
             "get_word_frequency": self._handle_get_word_frequency,
             "merge_branch_descriptions": self._handle_merge_branch_descriptions,
             "check_database_health": self._handle_check_database_health,
+            "search_codebase_overview": self._handle_search_codebase_overview,
         }
 
         if name not in tool_handlers:
@@ -889,18 +907,28 @@ src/
         # Use provided token limit or fall back to server default
         token_limit = arguments.get("tokenLimit", self.token_limit)
 
-        # Calculate total tokens
+        # Calculate total tokens for descriptions
         logger.info("Calculating total token count...")
-        total_tokens = self.token_counter.calculate_codebase_tokens(file_descriptions)
+        descriptions_tokens = self.token_counter.calculate_codebase_tokens(file_descriptions)
+
+        # Get overview tokens if available
+        overview = await self.db_manager.get_project_overview(project_id, resolved_branch)
+        overview_tokens = 0
+        if overview and overview.overview:
+            overview_tokens = self.token_counter.count_tokens(overview.overview)
+
+        total_tokens = descriptions_tokens + overview_tokens
         is_large = total_tokens > token_limit
         recommendation = "use_search" if is_large else "use_overview"
 
-        logger.info(f"Codebase analysis complete: {total_tokens} tokens, {len(file_descriptions)} files")
+        logger.info(f"Codebase analysis complete: {total_tokens} tokens total ({descriptions_tokens} descriptions + {overview_tokens} overview), {len(file_descriptions)} files")
         logger.info(f"Size assessment: {'LARGE' if is_large else 'SMALL'} (limit: {token_limit})")
         logger.info(f"Recommendation: {recommendation}")
 
         return {
             "totalTokens": total_tokens,
+            "descriptionsTokens": descriptions_tokens,
+            "overviewTokens": overview_tokens,
             "isLarge": is_large,
             "recommendation": recommendation,
             "tokenLimit": token_limit,
@@ -1205,6 +1233,54 @@ src/
             "totalUniqueTerms": result.total_unique_terms
         }
 
+    async def _handle_search_codebase_overview(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
+        """Handle search_codebase_overview tool calls."""
+        project_id = await self._get_or_create_project_id(arguments)
+        resolved_branch = await self._resolve_branch(project_id, arguments["branch"])
+        search_word = arguments["searchWord"].lower()
+
+        # Get the overview
+        overview = await self.db_manager.get_project_overview(project_id, resolved_branch)
+
+        if not overview or not overview.overview:
+            return {
+                "found": False,
+                "message": "No overview found for this project",
+                "searchWord": arguments["searchWord"]
+            }
+
+        # Split overview into sentences
+        import re
+        sentences = re.split(r'[.!?]+', overview.overview)
+        sentences = [s.strip() for s in sentences if s.strip()]
+
+        # Find matches
+        matches = []
+        for i, sentence in enumerate(sentences):
+            if search_word in sentence.lower():
+                # Get context: 2 sentences before and after
+                start_idx = max(0, i - 2)
+                end_idx = min(len(sentences), i + 3)
+
+                context_sentences = sentences[start_idx:end_idx]
+                context = '. '.join(context_sentences) + '.'
+
+                matches.append({
+                    "matchIndex": i,
+                    "matchSentence": sentence,
+                    "context": context,
+                    "contextStartIndex": start_idx,
+                    "contextEndIndex": end_idx - 1
+                })
+
+        return {
+            "found": len(matches) > 0,
+            "searchWord": arguments["searchWord"],
+            "matches": matches,
+            "totalMatches": len(matches),
+            "totalSentences": len(sentences)
+        }
+
     async def _handle_check_database_health(self, arguments: Dict[str, Any]) -> Dict[str, Any]:
         """
         Handle check_database_health tool calls with comprehensive diagnostics.
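Two quick illustrations of the mcp_server.py changes above. First, the new size check is a straight sum and threshold; with illustrative figures (the 100k limit here is an assumed default, the real value comes from the server's token_limit or the caller's tokenLimit argument):

# Illustrative numbers only; mirrors the logic in the token-calculation hunk.
descriptions_tokens = 95_000   # token_counter.calculate_codebase_tokens(file_descriptions)
overview_tokens = 12_000       # 0 when no overview is stored for the branch
token_limit = 100_000          # assumed default; overridable per call

total_tokens = descriptions_tokens + overview_tokens           # 107000
is_large = total_tokens > token_limit                          # True
recommendation = "use_search" if is_large else "use_overview"  # "use_search"

Second, a hypothetical call showing the new handler's contract; it must run inside an async context, and the argument values and counts are invented for the example:

# Hypothetical invocation; values are made up for illustration.
result = await server._handle_search_codebase_overview({
    "projectName": "demo",
    "folderPath": "/home/user/demo",
    "branch": "main",
    "searchWord": "auth",
})
# If the stored overview's 7th sentence (index 6) mentions "auth":
# {
#     "found": True,
#     "searchWord": "auth",
#     "matches": [{
#         "matchIndex": 6,
#         "matchSentence": "...",   # the sentence containing the word
#         "context": "...",         # up to 2 sentences either side, joined with '. '
#         "contextStartIndex": 4,
#         "contextEndIndex": 8
#     }],
#     "totalMatches": 1,
#     "totalSentences": 42
# }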
mcp_code_indexer-2.4.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcp-code-indexer
-Version: 2.2.1
+Version: 2.4.0
 Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
 Author: MCP Code Indexer Contributors
 Maintainer: MCP Code Indexer Contributors
@@ -59,8 +59,8 @@ Dynamic: requires-python
 
 # MCP Code Indexer 🚀
 
-[![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?16)](https://badge.fury.io/py/mcp-code-indexer)
-[![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?16)](https://pypi.org/project/mcp-code-indexer/)
+[![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?18)](https://badge.fury.io/py/mcp-code-indexer)
+[![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?18)](https://pypi.org/project/mcp-code-indexer/)
 [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 
 A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
mcp_code_indexer-2.4.0.dist-info/RECORD CHANGED
@@ -7,25 +7,26 @@ mcp_code_indexer/error_handler.py,sha256=cNSUFFrGBMLDv4qa78c7495L1wSl_dXCRbzCJOi
 mcp_code_indexer/file_scanner.py,sha256=ctXeZMROgDThEtjzsANTK9TbK-fhTScMBd4iyuleBT4,11734
 mcp_code_indexer/git_hook_handler.py,sha256=k6QpoLI-5D9EvrLQrHWMII2qNu21daRX_jXlk9U6bGI,36976
 mcp_code_indexer/logging_config.py,sha256=tf_U-Zz_axDXRV9s7TfHEeUrBjT1QBWkzPuiyZMffBU,10252
-mcp_code_indexer/main.py,sha256=U-f3AJYdycWhjh-vLryj7aH8DGCs4d3x1yjA852HTxM,31546
+mcp_code_indexer/main.py,sha256=abCHbNFUYjkJcNYsU0EPdZQI-_Gz9cQCH7dYJ5Jp7I8,31627
 mcp_code_indexer/merge_handler.py,sha256=lJR8eVq2qSrF6MW9mR3Fy8UzrNAaQ7RsI2FMNXne3vQ,14692
+mcp_code_indexer/query_preprocessor.py,sha256=uHYy8FO4FTs7MFKsXoueYIafWDKOIirRgdUzwh8upb4,5773
 mcp_code_indexer/token_counter.py,sha256=WrifOkbF99nWWHlRlhCHAB2KN7qr83GOHl7apE-hJcE,8460
 mcp_code_indexer/data/stop_words_english.txt,sha256=7Zdd9ameVgA6tN_zuXROvHXD4hkWeELVywPhb7FJEkw,6343
 mcp_code_indexer/database/__init__.py,sha256=aPq_aaRp0aSwOBIq9GkuMNjmLxA411zg2vhdrAuHm-w,38
 mcp_code_indexer/database/connection_health.py,sha256=s2r9L_KipH5NlemAUDnhBQO90Dn4b_0Ht9UDs7F6QPk,24432
-mcp_code_indexer/database/database.py,sha256=86XL1b49cTeTzkJ1mVbkYPq_QyQrVQOy8w_b1MxZR-E,50856
+mcp_code_indexer/database/database.py,sha256=7B1Pq9CFwIgU0k8ObcgtGxqdnGPZnDhaYuTTdjf7AV0,51334
 mcp_code_indexer/database/exceptions.py,sha256=AgpRA9Z5R-GoWYdQSPeSdYvAXDopFCQkLGN3jD7Ha4E,10215
 mcp_code_indexer/database/models.py,sha256=_vCmJnPXZSiInRzyvs4c7QUWuNNW8qsOoDlGX8J-Gnk,7124
 mcp_code_indexer/database/retry_executor.py,sha256=QUayjkCk8OsckVMYiJ_HBQ9NTUss-H8GQeUIUbbw4_U,13419
 mcp_code_indexer/middleware/__init__.py,sha256=p-mP0pMsfiU2yajCPvokCUxUEkh_lu4XJP1LyyMW2ug,220
 mcp_code_indexer/middleware/error_middleware.py,sha256=5agJTAkkPogfPGnja1V9JtG9RG-BiOALIJYctK3byJQ,11730
 mcp_code_indexer/server/__init__.py,sha256=16xMcuriUOBlawRqWNBk6niwrvtv_JD5xvI36X1Vsmk,41
-mcp_code_indexer/server/mcp_server.py,sha256=Bu0H8HNqZ6S8_BUZndQLIQpq4cDys3Ry9eEKBuIJItQ,70505
+mcp_code_indexer/server/mcp_server.py,sha256=BqRPF2pBQhNbPN-LrNq1IMYJzqDKD-A6BSHwQ_5dgK8,74382
 mcp_code_indexer/tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4,sha256=Ijkht27pm96ZW3_3OFE-7xAPtR0YyTWXoRO8_-hlsqc,1681126
 mcp_code_indexer/tools/__init__.py,sha256=m01mxML2UdD7y5rih_XNhNSCMzQTz7WQ_T1TeOcYlnE,49
-mcp_code_indexer-2.2.1.dist-info/licenses/LICENSE,sha256=JN9dyPPgYwH9C-UjYM7FLNZjQ6BF7kAzpF3_4PwY4rY,1086
-mcp_code_indexer-2.2.1.dist-info/METADATA,sha256=0EGjVvrBPbpQ-webmSYeXEyxWA0RRORrRGx24-DPloQ,20165
-mcp_code_indexer-2.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-mcp_code_indexer-2.2.1.dist-info/entry_points.txt,sha256=8HqWOw1Is7jOP1bvIgaSwouvT9z_Boe-9hd4NzyJOhY,68
-mcp_code_indexer-2.2.1.dist-info/top_level.txt,sha256=yKYCM-gMGt-cnupGfAhnZaoEsROLB6DQ1KFUuyKx4rw,17
-mcp_code_indexer-2.2.1.dist-info/RECORD,,
+mcp_code_indexer-2.4.0.dist-info/licenses/LICENSE,sha256=JN9dyPPgYwH9C-UjYM7FLNZjQ6BF7kAzpF3_4PwY4rY,1086
+mcp_code_indexer-2.4.0.dist-info/METADATA,sha256=6fbCl9tjJUT0x7EF77w-znQumIQw2fVkRes3FcN1Av4,20165
+mcp_code_indexer-2.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+mcp_code_indexer-2.4.0.dist-info/entry_points.txt,sha256=8HqWOw1Is7jOP1bvIgaSwouvT9z_Boe-9hd4NzyJOhY,68
+mcp_code_indexer-2.4.0.dist-info/top_level.txt,sha256=yKYCM-gMGt-cnupGfAhnZaoEsROLB6DQ1KFUuyKx4rw,17
+mcp_code_indexer-2.4.0.dist-info/RECORD,,