mcp-code-indexer 2.2.1__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,6 +30,7 @@ from mcp_code_indexer.database.exceptions import (
30
30
  from mcp_code_indexer.database.connection_health import (
31
31
  ConnectionHealthMonitor, DatabaseMetricsCollector
32
32
  )
33
+ from mcp_code_indexer.query_preprocessor import preprocess_search_query
33
34
 
34
35
  logger = logging.getLogger(__name__)
35
36
 
@@ -848,7 +849,16 @@ class DatabaseManager:
848
849
  query: str,
849
850
  max_results: int = 20
850
851
  ) -> List[SearchResult]:
851
- """Search file descriptions using FTS5."""
852
+ """Search file descriptions using FTS5 with intelligent query preprocessing."""
853
+ # Preprocess query for optimal FTS5 search
854
+ preprocessed_query = preprocess_search_query(query)
855
+
856
+ if not preprocessed_query:
857
+ logger.debug(f"Empty query after preprocessing: '{query}'")
858
+ return []
859
+
860
+ logger.debug(f"Search query preprocessing: '{query}' -> '{preprocessed_query}'")
861
+
852
862
  async with self.get_connection() as db:
853
863
  cursor = await db.execute(
854
864
  """
@@ -866,7 +876,7 @@ class DatabaseManager:
866
876
  ORDER BY bm25(file_descriptions_fts)
867
877
  LIMIT ?
868
878
  """,
869
- (query, project_id, branch, max_results)
879
+ (preprocessed_query, project_id, branch, max_results)
870
880
  )
871
881
  rows = await cursor.fetchall()
872
882
 
@@ -0,0 +1,181 @@
1
+ """
2
+ Query preprocessing module for intelligent FTS5 search.
3
+
4
+ This module provides intelligent query preprocessing for SQLite FTS5 full-text search
5
+ to enable multi-word search with case insensitive matching, whole word enforcement,
6
+ and proper handling of FTS5 operators as literal search terms.
7
+
8
+ Key features:
9
+ - Multi-word queries: "grpc proto" becomes "grpc" AND "proto" for order-agnostic matching
10
+ - FTS5 operator escaping: "AND OR" becomes '"AND" AND "OR"' to treat operators as literals
11
+ - Whole word matching: prevents partial matches by relying on proper tokenization
12
+ - Case insensitive: leverages FTS5 default behavior
13
+ - Special character handling: preserves special characters in quoted terms
14
+ """
15
+
16
+ import re
17
+ import logging
18
+ from typing import List, Set
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class QueryPreprocessor:
    """
    Preprocesses user queries into FTS5 MATCH expressions.

    A raw query is split into whitespace-separated words and double-quoted
    phrases. Every term is wrapped in double quotes so that FTS5 reads it as
    a string literal rather than as query syntax, and the terms are joined
    with AND so that all of them must match, in any order.
    """

    # FTS5 operators that need to be escaped when used as literal search terms
    FTS5_OPERATORS: Set[str] = {
        'AND', 'OR', 'NOT', 'NEAR'
    }

    # Matches either a double-quoted phrase (quotes included) or a run of
    # non-whitespace characters. Compiled once and shared by all instances.
    _TERM_PATTERN = re.compile(r'"[^"]*"|\S+')

    def __init__(self):
        """Initialize the query preprocessor."""
        pass

    def preprocess_query(self, query: str) -> str:
        """
        Preprocess a user query for FTS5 search.

        Args:
            query: Raw user query string

        Returns:
            The processed terms joined with " AND ", or an empty string when
            the query is empty, whitespace-only, or yields no terms.

        Examples:
            >>> preprocessor = QueryPreprocessor()
            >>> preprocessor.preprocess_query("grpc proto")
            '"grpc" AND "proto"'
            >>> preprocessor.preprocess_query("error AND handling")
            '"error" AND "AND" AND "handling"'
            >>> preprocessor.preprocess_query('config "file system"')
            '"config" AND "file system"'
            >>> preprocessor.preprocess_query('foo"bar')
            '"foo""bar"'
        """
        if not query or not query.strip():
            return ""

        # Normalize surrounding whitespace
        query = query.strip()

        # Quote every term, dropping any that come back empty
        processed_terms = [
            processed
            for processed in map(self._process_term, self._split_terms(query))
            if processed
        ]

        if not processed_terms:
            return ""

        # Join with AND for order-agnostic multi-word matching
        result = " AND ".join(processed_terms)

        logger.debug("Preprocessed query: '%s' -> '%s'", query, result)
        return result

    def _split_terms(self, query: str) -> List[str]:
        """
        Split query into terms while preserving quoted phrases.

        Args:
            query: Input query string

        Returns:
            List of terms and quoted phrases (phrases keep their quotes)

        Examples:
            'grpc proto' -> ['grpc', 'proto']
            'config "file system"' -> ['config', '"file system"']
            'error AND handling' -> ['error', 'AND', 'handling']
        """
        # Every match contains at least one non-whitespace character, so no
        # extra filtering for blank matches is required.
        return self._TERM_PATTERN.findall(query)

    def _process_term(self, term: str) -> str:
        """
        Process a single term: escape embedded quotes and wrap it in quotes.

        Args:
            term: Single term or quoted phrase

        Returns:
            The term as a double-quoted FTS5 string, or an empty string when
            the term is empty.

        Examples:
            'grpc' -> '"grpc"'
            'AND' -> '"AND"'
            '"file system"' -> '"file system"'
            'c++' -> '"c++"'
            'foo"bar' -> '"foo""bar"'
        """
        if not term:
            return ""

        # Already a quoted phrase: keep the user's outer quotes and escape
        # only what lies between them.
        if len(term) >= 2 and term.startswith('"') and term.endswith('"'):
            return f'"{self._escape_quotes_in_term(term[1:-1])}"'

        # Bare term: escape embedded quotes first, otherwise a stray '"'
        # (e.g. an unbalanced opening quote) would end the FTS5 string early
        # and make the whole MATCH expression a syntax error.
        quoted_term = f'"{self._escape_quotes_in_term(term)}"'

        # Operators are quoted like any other term; the check exists only to
        # record that an operator keyword was neutralised.
        if term.upper() in self.FTS5_OPERATORS:
            logger.debug("Escaped FTS5 operator: '%s' -> '%s'", term, quoted_term)

        return quoted_term

    def _escape_quotes_in_term(self, term: str) -> str:
        """
        Escape internal quotes in a term for FTS5 compatibility.

        Args:
            term: Term that may contain quotes

        Returns:
            Term with every double quote doubled

        Examples:
            'say "hello"' -> 'say ""hello""'
            "test's file" -> "test's file"
        """
        # In FTS5, quotes inside quoted strings are escaped by doubling them
        return term.replace('"', '""')
168
+
169
+
170
def preprocess_search_query(query: str) -> str:
    """
    Preprocess a raw search query with a freshly created QueryPreprocessor.

    Args:
        query: Raw user query

    Returns:
        Whatever QueryPreprocessor.preprocess_query returns for the query
    """
    return QueryPreprocessor().preprocess_query(query)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mcp-code-indexer
3
- Version: 2.2.1
3
+ Version: 2.3.0
4
4
  Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
5
5
  Author: MCP Code Indexer Contributors
6
6
  Maintainer: MCP Code Indexer Contributors
@@ -59,8 +59,8 @@ Dynamic: requires-python
59
59
 
60
60
  # MCP Code Indexer 🚀
61
61
 
62
- [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?16)](https://badge.fury.io/py/mcp-code-indexer)
63
- [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?16)](https://pypi.org/project/mcp-code-indexer/)
62
+ [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?17)](https://badge.fury.io/py/mcp-code-indexer)
63
+ [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?17)](https://pypi.org/project/mcp-code-indexer/)
64
64
  [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
65
65
 
66
66
  A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -9,11 +9,12 @@ mcp_code_indexer/git_hook_handler.py,sha256=k6QpoLI-5D9EvrLQrHWMII2qNu21daRX_jXl
9
9
  mcp_code_indexer/logging_config.py,sha256=tf_U-Zz_axDXRV9s7TfHEeUrBjT1QBWkzPuiyZMffBU,10252
10
10
  mcp_code_indexer/main.py,sha256=U-f3AJYdycWhjh-vLryj7aH8DGCs4d3x1yjA852HTxM,31546
11
11
  mcp_code_indexer/merge_handler.py,sha256=lJR8eVq2qSrF6MW9mR3Fy8UzrNAaQ7RsI2FMNXne3vQ,14692
12
+ mcp_code_indexer/query_preprocessor.py,sha256=uHYy8FO4FTs7MFKsXoueYIafWDKOIirRgdUzwh8upb4,5773
12
13
  mcp_code_indexer/token_counter.py,sha256=WrifOkbF99nWWHlRlhCHAB2KN7qr83GOHl7apE-hJcE,8460
13
14
  mcp_code_indexer/data/stop_words_english.txt,sha256=7Zdd9ameVgA6tN_zuXROvHXD4hkWeELVywPhb7FJEkw,6343
14
15
  mcp_code_indexer/database/__init__.py,sha256=aPq_aaRp0aSwOBIq9GkuMNjmLxA411zg2vhdrAuHm-w,38
15
16
  mcp_code_indexer/database/connection_health.py,sha256=s2r9L_KipH5NlemAUDnhBQO90Dn4b_0Ht9UDs7F6QPk,24432
16
- mcp_code_indexer/database/database.py,sha256=86XL1b49cTeTzkJ1mVbkYPq_QyQrVQOy8w_b1MxZR-E,50856
17
+ mcp_code_indexer/database/database.py,sha256=7B1Pq9CFwIgU0k8ObcgtGxqdnGPZnDhaYuTTdjf7AV0,51334
17
18
  mcp_code_indexer/database/exceptions.py,sha256=AgpRA9Z5R-GoWYdQSPeSdYvAXDopFCQkLGN3jD7Ha4E,10215
18
19
  mcp_code_indexer/database/models.py,sha256=_vCmJnPXZSiInRzyvs4c7QUWuNNW8qsOoDlGX8J-Gnk,7124
19
20
  mcp_code_indexer/database/retry_executor.py,sha256=QUayjkCk8OsckVMYiJ_HBQ9NTUss-H8GQeUIUbbw4_U,13419
@@ -23,9 +24,9 @@ mcp_code_indexer/server/__init__.py,sha256=16xMcuriUOBlawRqWNBk6niwrvtv_JD5xvI36
23
24
  mcp_code_indexer/server/mcp_server.py,sha256=Bu0H8HNqZ6S8_BUZndQLIQpq4cDys3Ry9eEKBuIJItQ,70505
24
25
  mcp_code_indexer/tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4,sha256=Ijkht27pm96ZW3_3OFE-7xAPtR0YyTWXoRO8_-hlsqc,1681126
25
26
  mcp_code_indexer/tools/__init__.py,sha256=m01mxML2UdD7y5rih_XNhNSCMzQTz7WQ_T1TeOcYlnE,49
26
- mcp_code_indexer-2.2.1.dist-info/licenses/LICENSE,sha256=JN9dyPPgYwH9C-UjYM7FLNZjQ6BF7kAzpF3_4PwY4rY,1086
27
- mcp_code_indexer-2.2.1.dist-info/METADATA,sha256=0EGjVvrBPbpQ-webmSYeXEyxWA0RRORrRGx24-DPloQ,20165
28
- mcp_code_indexer-2.2.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
29
- mcp_code_indexer-2.2.1.dist-info/entry_points.txt,sha256=8HqWOw1Is7jOP1bvIgaSwouvT9z_Boe-9hd4NzyJOhY,68
30
- mcp_code_indexer-2.2.1.dist-info/top_level.txt,sha256=yKYCM-gMGt-cnupGfAhnZaoEsROLB6DQ1KFUuyKx4rw,17
31
- mcp_code_indexer-2.2.1.dist-info/RECORD,,
27
+ mcp_code_indexer-2.3.0.dist-info/licenses/LICENSE,sha256=JN9dyPPgYwH9C-UjYM7FLNZjQ6BF7kAzpF3_4PwY4rY,1086
28
+ mcp_code_indexer-2.3.0.dist-info/METADATA,sha256=lvPoSUWnbJ20uv59kQYBjiCpj9Ea6l439haNAK4WDQU,20165
29
+ mcp_code_indexer-2.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
30
+ mcp_code_indexer-2.3.0.dist-info/entry_points.txt,sha256=8HqWOw1Is7jOP1bvIgaSwouvT9z_Boe-9hd4NzyJOhY,68
31
+ mcp_code_indexer-2.3.0.dist-info/top_level.txt,sha256=yKYCM-gMGt-cnupGfAhnZaoEsROLB6DQ1KFUuyKx4rw,17
32
+ mcp_code_indexer-2.3.0.dist-info/RECORD,,