mcp-code-indexer 4.0.2__tar.gz → 4.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/PKG-INFO +66 -5
  2. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/README.md +62 -2
  3. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/pyproject.toml +11 -2
  4. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/database/models.py +125 -1
  5. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/main.py +60 -0
  6. mcp_code_indexer-4.2.0/src/mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
  7. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/server/mcp_server.py +3 -0
  8. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/__init__.py +36 -0
  9. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
  10. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
  11. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
  12. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
  13. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/config.py +167 -0
  14. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/daemon.py +335 -0
  15. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
  16. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
  17. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
  18. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
  19. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
  20. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
  21. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
  22. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/security/__init__.py +11 -0
  23. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/security/patterns.py +297 -0
  24. mcp_code_indexer-4.2.0/src/mcp_code_indexer/vector_mode/security/redactor.py +368 -0
  25. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/LICENSE +0 -0
  26. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/__init__.py +0 -0
  27. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/__main__.py +0 -0
  28. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/ask_handler.py +0 -0
  29. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/claude_api_handler.py +0 -0
  30. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/cleanup_manager.py +0 -0
  31. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/commands/__init__.py +0 -0
  32. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/commands/makelocal.py +0 -0
  33. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/data/stop_words_english.txt +0 -0
  34. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/database/__init__.py +0 -0
  35. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/database/connection_health.py +0 -0
  36. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/database/database.py +0 -0
  37. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/database/database_factory.py +0 -0
  38. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/database/exceptions.py +0 -0
  39. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/database/path_resolver.py +0 -0
  40. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/database/retry_executor.py +0 -0
  41. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/deepask_handler.py +0 -0
  42. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/error_handler.py +0 -0
  43. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/file_scanner.py +0 -0
  44. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/git_hook_handler.py +0 -0
  45. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/logging_config.py +0 -0
  46. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/middleware/__init__.py +0 -0
  47. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/middleware/auth.py +0 -0
  48. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/middleware/error_middleware.py +0 -0
  49. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/middleware/logging.py +0 -0
  50. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/middleware/security.py +0 -0
  51. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/migrations/001_initial.sql +0 -0
  52. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/migrations/002_performance_indexes.sql +0 -0
  53. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/migrations/003_project_overviews.sql +0 -0
  54. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/migrations/004_remove_branch_dependency.sql +0 -0
  55. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/migrations/005_remove_git_remotes.sql +0 -0
  56. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/query_preprocessor.py +0 -0
  57. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/server/__init__.py +0 -0
  58. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/tiktoken_cache/9b5ad71b2ce5302211f9c61530b329a4922fc6a4 +0 -0
  59. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/token_counter.py +0 -0
  60. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/tools/__init__.py +0 -0
  61. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/transport/__init__.py +0 -0
  62. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/transport/base.py +0 -0
  63. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/transport/http_transport.py +0 -0
  64. {mcp_code_indexer-4.0.2 → mcp_code_indexer-4.2.0}/src/mcp_code_indexer/transport/stdio_transport.py +0 -0
@@ -1,12 +1,12 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: mcp-code-indexer
3
- Version: 4.0.2
3
+ Version: 4.2.0
4
4
  Summary: MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews.
5
5
  License: MIT
6
6
  Keywords: mcp,model-context-protocol,code-indexer,ai-tools,codebase-navigation,file-descriptions,llm-tools
7
7
  Author: MCP Code Indexer Contributors
8
8
  Maintainer: MCP Code Indexer Contributors
9
- Requires-Python: >=3.9,<4.0
9
+ Requires-Python: >=3.10,<4.0
10
10
  Classifier: Development Status :: 5 - Production/Stable
11
11
  Classifier: Environment :: Console
12
12
  Classifier: Framework :: AsyncIO
@@ -14,15 +14,16 @@ Classifier: Intended Audience :: Developers
14
14
  Classifier: License :: OSI Approved :: MIT License
15
15
  Classifier: Operating System :: OS Independent
16
16
  Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.9
18
17
  Classifier: Programming Language :: Python :: 3.10
19
18
  Classifier: Programming Language :: Python :: 3.11
20
19
  Classifier: Programming Language :: Python :: 3.12
21
20
  Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.9
22
22
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Software Development
24
24
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
25
  Classifier: Typing :: Typed
26
+ Provides-Extra: vector
26
27
  Requires-Dist: aiofiles (==23.2.0)
27
28
  Requires-Dist: aiohttp (>=3.8.0)
28
29
  Requires-Dist: aiosqlite (==0.19.0)
@@ -43,8 +44,8 @@ Description-Content-Type: text/markdown
43
44
 
44
45
  # MCP Code Indexer 🚀
45
46
 
46
- [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?45)](https://badge.fury.io/py/mcp-code-indexer)
47
- [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?45)](https://pypi.org/project/mcp-code-indexer/)
47
+ [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?47)](https://badge.fury.io/py/mcp-code-indexer)
48
+ [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?47)](https://pypi.org/project/mcp-code-indexer/)
48
49
  [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
49
50
 
50
51
  A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -197,6 +198,66 @@ The git hook integration provides intelligent automation:
197
198
 
198
199
  **Learn More**: See [Git Hook Setup Guide](docs/git-hook-setup.md) for complete configuration options and troubleshooting.
199
200
 
201
+ ## 🧠 Vector Mode (BETA)
202
+
203
+ 🚀 **NEW Feature**: Semantic code search with vector embeddings! Experience AI-powered code discovery that understands context and meaning, not just keywords.
204
+
205
+ ### 🎯 What is Vector Mode?
206
+
207
+ Vector Mode transforms how you search and understand codebases by using AI embeddings:
208
+
209
+ - **🔍 Semantic Search**: Find code by meaning, not just text matching
210
+ - **⚡ Real-time Indexing**: Automatic embedding generation as code changes
211
+ - **🛡️ Secure by Default**: Comprehensive secret redaction before API calls
212
+ - **🌐 Multi-language**: Python, JavaScript, TypeScript with AST-based chunking
213
+ - **📊 Smart Chunking**: Context-aware code segmentation for optimal embeddings
214
+
215
+ ### 🚀 Quick Start
216
+
217
+ ```bash
218
+ # Install vector mode dependencies
219
+ pip install mcp-code-indexer[vector]
220
+
221
+ # Set required API keys
222
+ export VOYAGE_API_KEY="pa-your-voyage-api-key"
223
+ export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
224
+
225
+ # Optional: Configure region (default: gcp-europe-west3)
226
+ export TURBOPUFFER_REGION="gcp-europe-west3"
227
+
228
+ # Start with vector mode enabled
229
+ mcp-code-indexer --vector
230
+
231
+ # The daemon automatically starts and begins indexing your projects
232
+ ```
233
+
234
+ ### 💡 Key Features
235
+
236
+ - **🔐 Secret Redaction**: 20+ pattern types automatically detected and redacted
237
+ - **🌳 Merkle Trees**: Efficient change detection without full directory scans
238
+ - **🎛️ Circuit Breakers**: Resilient API integration with automatic retry logic
239
+ - **📈 Production Ready**: Built for high-concurrency with comprehensive monitoring
240
+
241
+ ### 🔧 Advanced Configuration
242
+
243
+ ```bash
244
+ # Custom configuration
245
+ mcp-code-indexer --vector --vector-config /path/to/config.yaml
246
+
247
+ # HTTP mode with vector search
248
+ mcp-code-indexer --vector --http --port 8080
249
+ ```
250
+
251
+ ### 🛠️ Architecture
252
+
253
+ Vector Mode adds powerful new MCP tools:
254
+ - `vector_search` - Semantic code search across projects
255
+ - `similarity_search` - Find similar code patterns
256
+ - `dependency_search` - Discover code relationships
257
+ - `vector_status` - Monitor indexing progress
258
+
259
+ **Status**: Currently in BETA - foundations implemented, full pipeline in development.
260
+
200
261
  ## 🔧 Development Setup
201
262
 
202
263
  ### 👨‍💻 For Contributors
@@ -1,7 +1,7 @@
1
1
  # MCP Code Indexer 🚀
2
2
 
3
- [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?45)](https://badge.fury.io/py/mcp-code-indexer)
4
- [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?45)](https://pypi.org/project/mcp-code-indexer/)
3
+ [![PyPI version](https://badge.fury.io/py/mcp-code-indexer.svg?47)](https://badge.fury.io/py/mcp-code-indexer)
4
+ [![Python](https://img.shields.io/pypi/pyversions/mcp-code-indexer.svg?47)](https://pypi.org/project/mcp-code-indexer/)
5
5
  [![License](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
6
6
 
7
7
  A production-ready **Model Context Protocol (MCP) server** that revolutionizes how AI agents navigate and understand codebases. Built for high-concurrency environments with advanced database resilience, the server provides instant access to intelligent descriptions, semantic search, and context-aware recommendations while maintaining 800+ writes/sec throughput.
@@ -154,6 +154,66 @@ The git hook integration provides intelligent automation:
154
154
 
155
155
  **Learn More**: See [Git Hook Setup Guide](docs/git-hook-setup.md) for complete configuration options and troubleshooting.
156
156
 
157
+ ## 🧠 Vector Mode (BETA)
158
+
159
+ 🚀 **NEW Feature**: Semantic code search with vector embeddings! Experience AI-powered code discovery that understands context and meaning, not just keywords.
160
+
161
+ ### 🎯 What is Vector Mode?
162
+
163
+ Vector Mode transforms how you search and understand codebases by using AI embeddings:
164
+
165
+ - **🔍 Semantic Search**: Find code by meaning, not just text matching
166
+ - **⚡ Real-time Indexing**: Automatic embedding generation as code changes
167
+ - **🛡️ Secure by Default**: Comprehensive secret redaction before API calls
168
+ - **🌐 Multi-language**: Python, JavaScript, TypeScript with AST-based chunking
169
+ - **📊 Smart Chunking**: Context-aware code segmentation for optimal embeddings
170
+
171
+ ### 🚀 Quick Start
172
+
173
+ ```bash
174
+ # Install vector mode dependencies
175
+ pip install mcp-code-indexer[vector]
176
+
177
+ # Set required API keys
178
+ export VOYAGE_API_KEY="pa-your-voyage-api-key"
179
+ export TURBOPUFFER_API_KEY="your-turbopuffer-api-key"
180
+
181
+ # Optional: Configure region (default: gcp-europe-west3)
182
+ export TURBOPUFFER_REGION="gcp-europe-west3"
183
+
184
+ # Start with vector mode enabled
185
+ mcp-code-indexer --vector
186
+
187
+ # The daemon automatically starts and begins indexing your projects
188
+ ```
189
+
190
+ ### 💡 Key Features
191
+
192
+ - **🔐 Secret Redaction**: 20+ pattern types automatically detected and redacted
193
+ - **🌳 Merkle Trees**: Efficient change detection without full directory scans
194
+ - **🎛️ Circuit Breakers**: Resilient API integration with automatic retry logic
195
+ - **📈 Production Ready**: Built for high-concurrency with comprehensive monitoring
196
+
197
+ ### 🔧 Advanced Configuration
198
+
199
+ ```bash
200
+ # Custom configuration
201
+ mcp-code-indexer --vector --vector-config /path/to/config.yaml
202
+
203
+ # HTTP mode with vector search
204
+ mcp-code-indexer --vector --http --port 8080
205
+ ```
206
+
207
+ ### 🛠️ Architecture
208
+
209
+ Vector Mode adds powerful new MCP tools:
210
+ - `vector_search` - Semantic code search across projects
211
+ - `similarity_search` - Find similar code patterns
212
+ - `dependency_search` - Discover code relationships
213
+ - `vector_status` - Monitor indexing progress
214
+
215
+ **Status**: Currently in BETA - foundations implemented, full pipeline in development.
216
+
157
217
  ## 🔧 Development Setup
158
218
 
159
219
  ### 👨‍💻 For Contributors
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
4
4
 
5
5
  [tool.poetry]
6
6
  name = "mcp-code-indexer"
7
- version = "4.0.2"
7
+ version = "4.2.0"
8
8
  description = "MCP server that tracks file descriptions across codebases, enabling AI agents to efficiently navigate and understand code through searchable summaries and token-aware overviews."
9
9
  authors = ["MCP Code Indexer Contributors"]
10
10
  maintainers = ["MCP Code Indexer Contributors"]
@@ -43,7 +43,7 @@ classifiers = [
43
43
  packages = [{include = "mcp_code_indexer", from = "src"}]
44
44
 
45
45
  [tool.poetry.dependencies]
46
- python = "^3.9"
46
+ python = "^3.10"
47
47
  tiktoken = ">=0.9.0"
48
48
  mcp = ">=1.9.0"
49
49
  gitignore-parser = "0.1.11"
@@ -59,6 +59,15 @@ fastapi = ">=0.104.0"
59
59
  uvicorn = ">=0.24.0"
60
60
  python-multipart = ">=0.0.6"
61
61
 
62
+ [tool.poetry.extras]
63
+ vector = [
64
+ "voyageai",
65
+ "turbopuffer",
66
+ "tree-sitter",
67
+ "watchdog",
68
+ "pyyaml"
69
+ ]
70
+
62
71
  [tool.poetry.group.dev.dependencies]
63
72
  pytest = ">=8.0.0"
64
73
  pytest-asyncio = ">=0.21.0"
@@ -7,7 +7,8 @@ the database operations.
7
7
  """
8
8
 
9
9
  from datetime import datetime
10
- from typing import List, Optional
10
+ from typing import List, Optional, Dict, Any
11
+ from enum import Enum
11
12
 
12
13
  from pydantic import BaseModel, Field
13
14
 
@@ -185,6 +186,129 @@ class WordFrequencyResult(BaseModel):
185
186
  total_unique_terms: int = Field(..., description="Number of unique terms found")
186
187
 
187
188
 
189
+ # Vector Mode Models
190
+
191
+ class ChunkType(str, Enum):
192
+ """Types of code chunks for semantic analysis."""
193
+ FUNCTION = "function"
194
+ CLASS = "class"
195
+ METHOD = "method"
196
+ IMPORT = "import"
197
+ DOCSTRING = "docstring"
198
+ COMMENT = "comment"
199
+ VARIABLE = "variable"
200
+ INTERFACE = "interface"
201
+ TYPE_DEFINITION = "type_definition"
202
+ MODULE = "module"
203
+ NAMESPACE = "namespace"
204
+ GENERIC = "generic"
205
+
206
+ class NodeType(str, Enum):
207
+ """Types of nodes in Merkle tree."""
208
+ FILE = "file"
209
+ DIRECTORY = "directory"
210
+ PROJECT = "project"
211
+
212
+ class SyncStatus(str, Enum):
213
+ """Vector index synchronization status."""
214
+ PENDING = "pending"
215
+ IN_PROGRESS = "in_progress"
216
+ COMPLETED = "completed"
217
+ FAILED = "failed"
218
+ PAUSED = "paused"
219
+
220
+ class CodeChunk(BaseModel):
221
+ """
222
+ Represents a semantic chunk of code extracted from a file.
223
+
224
+ Used for embedding generation and vector search operations.
225
+ """
226
+
227
+ id: Optional[int] = Field(None, description="Database ID")
228
+ file_id: int = Field(..., description="Reference to FileDescription")
229
+ project_id: str = Field(..., description="Reference to project")
230
+ chunk_type: ChunkType = Field(..., description="Type of code chunk")
231
+ name: Optional[str] = Field(None, description="Name of function/class/etc")
232
+ start_line: int = Field(..., description="Starting line number")
233
+ end_line: int = Field(..., description="Ending line number")
234
+ content_hash: str = Field(..., description="SHA-256 hash of chunk content")
235
+ embedding_id: Optional[str] = Field(None, description="Vector database ID")
236
+ redacted: bool = Field(default=False, description="Whether content was redacted")
237
+ metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
238
+ created: datetime = Field(default_factory=datetime.utcnow, description="Creation timestamp")
239
+ last_modified: datetime = Field(default_factory=datetime.utcnow, description="Last update timestamp")
240
+
241
+ class MerkleNode(BaseModel):
242
+ """
243
+ Represents a node in the Merkle tree for change detection.
244
+
245
+ Used to efficiently detect file system changes without scanning entire directory trees.
246
+ """
247
+
248
+ id: Optional[int] = Field(None, description="Database ID")
249
+ project_id: str = Field(..., description="Reference to project")
250
+ path: str = Field(..., description="File/directory path relative to project root")
251
+ hash: str = Field(..., description="SHA-256 hash of content or children")
252
+ node_type: NodeType = Field(..., description="Type of filesystem node")
253
+ parent_path: Optional[str] = Field(None, description="Path to parent directory")
254
+ children_hash: Optional[str] = Field(None, description="Combined hash of children")
255
+ last_modified: datetime = Field(default_factory=datetime.utcnow, description="Last update timestamp")
256
+
257
+ class IndexMeta(BaseModel):
258
+ """
259
+ Metadata about vector indexing progress and status for a project.
260
+
261
+ Tracks indexing state, statistics, and synchronization status.
262
+ """
263
+
264
+ id: Optional[int] = Field(None, description="Database ID")
265
+ project_id: str = Field(..., description="Reference to project", unique=True)
266
+ total_chunks: int = Field(default=0, description="Total number of chunks")
267
+ indexed_chunks: int = Field(default=0, description="Number of chunks with embeddings")
268
+ total_files: int = Field(default=0, description="Total number of files")
269
+ indexed_files: int = Field(default=0, description="Number of files processed")
270
+ last_sync: Optional[datetime] = Field(None, description="Last successful sync timestamp")
271
+ sync_status: SyncStatus = Field(default=SyncStatus.PENDING, description="Current sync status")
272
+ error_message: Optional[str] = Field(None, description="Last error message")
273
+ queue_depth: int = Field(default=0, description="Number of pending tasks")
274
+ processing_rate: float = Field(default=0.0, description="Files per second processing rate")
275
+ estimated_completion: Optional[datetime] = Field(None, description="Estimated completion time")
276
+ metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
277
+ created: datetime = Field(default_factory=datetime.utcnow, description="Creation timestamp")
278
+ last_modified: datetime = Field(default_factory=datetime.utcnow, description="Last update timestamp")
279
+
280
+ class VectorSearchResult(BaseModel):
281
+ """
282
+ Represents a vector search result with similarity scoring.
283
+ """
284
+
285
+ file_path: str = Field(..., description="Path to the matching file")
286
+ chunk_name: Optional[str] = Field(None, description="Name of the code chunk")
287
+ chunk_type: ChunkType = Field(..., description="Type of code chunk")
288
+ code_snippet: str = Field(..., description="Original code content")
289
+ start_line: int = Field(..., description="Starting line number")
290
+ end_line: int = Field(..., description="Ending line number")
291
+ similarity_score: float = Field(..., description="Cosine similarity score")
292
+ project_id: str = Field(..., description="Project identifier")
293
+ metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
294
+
295
+ class VectorIndexStatus(BaseModel):
296
+ """
297
+ Current status of vector indexing for a project.
298
+ """
299
+
300
+ is_indexing: bool = Field(..., description="Whether indexing is currently active")
301
+ indexed_files: int = Field(..., description="Number of files indexed")
302
+ total_files: int = Field(..., description="Total number of files")
303
+ indexed_chunks: int = Field(..., description="Number of chunks indexed")
304
+ total_chunks: int = Field(..., description="Total number of chunks")
305
+ last_sync: Optional[datetime] = Field(None, description="Last sync timestamp")
306
+ sync_status: SyncStatus = Field(..., description="Current sync status")
307
+ queue_depth: int = Field(..., description="Number of pending tasks")
308
+ processing_rate: float = Field(..., description="Processing rate")
309
+ estimated_completion: Optional[datetime] = Field(None, description="Estimated completion time")
310
+ error_message: Optional[str] = Field(None, description="Last error message")
311
+
188
312
  # Enable forward references for recursive models
189
313
  FolderNode.model_rebuild()
190
314
  CodebaseOverview.model_rebuild()
@@ -151,6 +151,19 @@ def parse_arguments() -> argparse.Namespace:
151
151
  help="Allowed CORS origins for HTTP transport (default: allow all)",
152
152
  )
153
153
 
154
+ # Vector mode options
155
+ parser.add_argument(
156
+ "--vector",
157
+ action="store_true",
158
+ help="Enable vector mode with semantic search capabilities (requires vector extras)",
159
+ )
160
+
161
+ parser.add_argument(
162
+ "--vector-config",
163
+ type=str,
164
+ help="Path to vector mode configuration file",
165
+ )
166
+
154
167
  return parser.parse_args()
155
168
 
156
169
 
@@ -996,6 +1009,52 @@ async def main() -> None:
996
1009
  )
997
1010
 
998
1011
  try:
1012
+ # Handle vector mode initialization
1013
+ vector_daemon_task = None
1014
+ if args.vector:
1015
+ try:
1016
+ from .vector_mode import is_vector_mode_available, check_api_keys
1017
+ from .vector_mode.config import load_vector_config
1018
+ from .vector_mode.daemon import start_vector_daemon
1019
+
1020
+ # Check if vector mode is available
1021
+ if not is_vector_mode_available():
1022
+ logger.error("Vector mode requires additional dependencies. Install with: pip install mcp-code-indexer[vector]")
1023
+ sys.exit(1)
1024
+
1025
+ # Check API keys
1026
+ api_keys = check_api_keys()
1027
+ if not all(api_keys.values()):
1028
+ missing = [k for k, v in api_keys.items() if not v]
1029
+ logger.error(f"Missing API keys for vector mode: {', '.join(missing)}")
1030
+ sys.exit(1)
1031
+
1032
+ # Load vector configuration
1033
+ vector_config_path = Path(args.vector_config).expanduser() if args.vector_config else None
1034
+ vector_config = load_vector_config(vector_config_path)
1035
+
1036
+ logger.info(
1037
+ "Vector mode enabled",
1038
+ extra={
1039
+ "structured_data": {
1040
+ "embedding_model": vector_config.embedding_model,
1041
+ "batch_size": vector_config.batch_size,
1042
+ "daemon_enabled": vector_config.daemon_enabled,
1043
+ }
1044
+ }
1045
+ )
1046
+
1047
+ # Start vector daemon in background
1048
+ if vector_config.daemon_enabled:
1049
+ vector_daemon_task = asyncio.create_task(
1050
+ start_vector_daemon(vector_config_path, db_path, cache_dir)
1051
+ )
1052
+ logger.info("Vector daemon started")
1053
+
1054
+ except Exception as e:
1055
+ logger.error(f"Failed to initialize vector mode: {e}")
1056
+ sys.exit(1)
1057
+
999
1058
  # Import and run the MCP server
1000
1059
  from .server.mcp_server import MCPCodeIndexServer
1001
1060
 
@@ -1028,6 +1087,7 @@ async def main() -> None:
1028
1087
  db_path=db_path,
1029
1088
  cache_dir=cache_dir,
1030
1089
  transport=transport,
1090
+ vector_mode=args.vector,
1031
1091
  )
1032
1092
 
1033
1093
  # Set server instance in transport after server creation
@@ -0,0 +1,189 @@
1
+ -- Migration 006: Add vector mode tables and indexes
2
+ -- This migration adds support for semantic search capabilities with embeddings
3
+ -- Includes code chunks, Merkle tree nodes, and indexing metadata
4
+
5
+ -- Ensure WAL mode is enabled for safe migrations
6
+ PRAGMA journal_mode=WAL;
7
+
8
+ -- Temporarily disable foreign key constraints for migration
9
+ PRAGMA foreign_keys=OFF;
10
+
11
+ -- Start transaction for atomic migration
12
+ BEGIN TRANSACTION;
13
+
14
+ -- Create code_chunks table for storing semantic code chunks
15
+ CREATE TABLE code_chunks (
16
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
17
+ file_id INTEGER NOT NULL,
18
+ project_id TEXT NOT NULL,
19
+ chunk_type TEXT NOT NULL DEFAULT 'generic', -- function, class, method, import, etc.
20
+ name TEXT, -- Name of function/class/etc, can be NULL for generic chunks
21
+ start_line INTEGER NOT NULL,
22
+ end_line INTEGER NOT NULL,
23
+ content_hash TEXT NOT NULL, -- SHA-256 hash of chunk content
24
+ embedding_id TEXT, -- ID in vector database (Turbopuffer)
25
+ redacted BOOLEAN DEFAULT FALSE, -- Whether content was redacted for security
26
+ metadata TEXT DEFAULT '{}', -- JSON metadata about the chunk
27
+ created DATETIME DEFAULT CURRENT_TIMESTAMP,
28
+ last_modified DATETIME DEFAULT CURRENT_TIMESTAMP,
29
+ FOREIGN KEY (file_id) REFERENCES file_descriptions(id) ON DELETE CASCADE,
30
+ FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE
31
+ );
32
+
33
+ -- Create indexes for code_chunks table
34
+ CREATE INDEX idx_code_chunks_file_id ON code_chunks(file_id);
35
+ CREATE INDEX idx_code_chunks_project_id ON code_chunks(project_id);
36
+ CREATE INDEX idx_code_chunks_chunk_type ON code_chunks(chunk_type);
37
+ CREATE INDEX idx_code_chunks_content_hash ON code_chunks(content_hash);
38
+ CREATE INDEX idx_code_chunks_embedding_id ON code_chunks(embedding_id);
39
+ CREATE INDEX idx_code_chunks_last_modified ON code_chunks(last_modified);
40
+ CREATE INDEX idx_code_chunks_redacted ON code_chunks(redacted);
41
+
42
+ -- Create merkle_nodes table for efficient change detection
43
+ CREATE TABLE merkle_nodes (
44
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
45
+ project_id TEXT NOT NULL,
46
+ path TEXT NOT NULL, -- File/directory path relative to project root
47
+ hash TEXT NOT NULL, -- SHA-256 hash of content or children
48
+ node_type TEXT NOT NULL DEFAULT 'file', -- file, directory, project
49
+ parent_path TEXT, -- Path to parent directory, NULL for root
50
+ children_hash TEXT, -- Combined hash of children for directories
51
+ last_modified DATETIME DEFAULT CURRENT_TIMESTAMP,
52
+ UNIQUE(project_id, path),
53
+ FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE
54
+ );
55
+
56
+ -- Create indexes for merkle_nodes table
57
+ CREATE INDEX idx_merkle_nodes_project_id ON merkle_nodes(project_id);
58
+ CREATE INDEX idx_merkle_nodes_path ON merkle_nodes(path);
59
+ CREATE INDEX idx_merkle_nodes_hash ON merkle_nodes(hash);
60
+ CREATE INDEX idx_merkle_nodes_node_type ON merkle_nodes(node_type);
61
+ CREATE INDEX idx_merkle_nodes_parent_path ON merkle_nodes(parent_path);
62
+ CREATE INDEX idx_merkle_nodes_last_modified ON merkle_nodes(last_modified);
63
+
64
+ -- Create index_meta table for tracking vector indexing progress
65
+ CREATE TABLE index_meta (
66
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
67
+ project_id TEXT NOT NULL UNIQUE,
68
+ total_chunks INTEGER DEFAULT 0,
69
+ indexed_chunks INTEGER DEFAULT 0,
70
+ total_files INTEGER DEFAULT 0,
71
+ indexed_files INTEGER DEFAULT 0,
72
+ last_sync DATETIME,
73
+ sync_status TEXT DEFAULT 'pending', -- pending, in_progress, completed, failed, paused
74
+ error_message TEXT,
75
+ queue_depth INTEGER DEFAULT 0,
76
+ processing_rate REAL DEFAULT 0.0, -- Files per second
77
+ estimated_completion DATETIME,
78
+ metadata TEXT DEFAULT '{}', -- JSON metadata
79
+ created DATETIME DEFAULT CURRENT_TIMESTAMP,
80
+ last_modified DATETIME DEFAULT CURRENT_TIMESTAMP,
81
+ FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE
82
+ );
83
+
84
+ -- Create indexes for index_meta table
85
+ CREATE INDEX idx_index_meta_project_id ON index_meta(project_id);
86
+ CREATE INDEX idx_index_meta_sync_status ON index_meta(sync_status);
87
+ CREATE INDEX idx_index_meta_last_sync ON index_meta(last_sync);
88
+ CREATE INDEX idx_index_meta_last_modified ON index_meta(last_modified);
89
+
90
+ -- Add vector_mode column to projects table to track which projects use vector search
91
+ ALTER TABLE projects ADD COLUMN vector_mode BOOLEAN DEFAULT FALSE;
92
+ CREATE INDEX idx_projects_vector_mode ON projects(vector_mode);
93
+
94
+ -- Create triggers to maintain consistency between file_descriptions and code_chunks
95
+ CREATE TRIGGER code_chunks_cleanup_on_file_delete
96
+ AFTER DELETE ON file_descriptions
97
+ BEGIN
98
+ DELETE FROM code_chunks WHERE file_id = OLD.id;
99
+ END;
100
+
101
+ -- Create triggers to update index_meta when chunks are added/removed
102
+ CREATE TRIGGER update_index_meta_on_chunk_insert
103
+ AFTER INSERT ON code_chunks
104
+ BEGIN
105
+ INSERT OR REPLACE INTO index_meta (
106
+ project_id, total_chunks, indexed_chunks, total_files, indexed_files, last_modified
107
+ )
108
+ SELECT
109
+ NEW.project_id,
110
+ COUNT(*) as total_chunks,
111
+ COUNT(embedding_id) as indexed_chunks,
112
+ (SELECT COUNT(DISTINCT file_id) FROM code_chunks WHERE project_id = NEW.project_id) as total_files,
113
+ (SELECT COUNT(DISTINCT file_id) FROM code_chunks WHERE project_id = NEW.project_id AND embedding_id IS NOT NULL) as indexed_files,
114
+ CURRENT_TIMESTAMP
115
+ FROM code_chunks
116
+ WHERE project_id = NEW.project_id;
117
+ END;
118
+
119
+ CREATE TRIGGER update_index_meta_on_chunk_update
120
+ AFTER UPDATE ON code_chunks
121
+ BEGIN
122
+ UPDATE index_meta SET
123
+ indexed_chunks = (
124
+ SELECT COUNT(*) FROM code_chunks
125
+ WHERE project_id = NEW.project_id AND embedding_id IS NOT NULL
126
+ ),
127
+ indexed_files = (
128
+ SELECT COUNT(DISTINCT file_id) FROM code_chunks
129
+ WHERE project_id = NEW.project_id AND embedding_id IS NOT NULL
130
+ ),
131
+ last_modified = CURRENT_TIMESTAMP
132
+ WHERE project_id = NEW.project_id;
133
+ END;
134
+
135
+ CREATE TRIGGER update_index_meta_on_chunk_delete
136
+ AFTER DELETE ON code_chunks
137
+ BEGIN
138
+ UPDATE index_meta SET
139
+ total_chunks = (
140
+ SELECT COUNT(*) FROM code_chunks
141
+ WHERE project_id = OLD.project_id
142
+ ),
143
+ indexed_chunks = (
144
+ SELECT COUNT(*) FROM code_chunks
145
+ WHERE project_id = OLD.project_id AND embedding_id IS NOT NULL
146
+ ),
147
+ total_files = (
148
+ SELECT COUNT(DISTINCT file_id) FROM code_chunks
149
+ WHERE project_id = OLD.project_id
150
+ ),
151
+ indexed_files = (
152
+ SELECT COUNT(DISTINCT file_id) FROM code_chunks
153
+ WHERE project_id = OLD.project_id AND embedding_id IS NOT NULL
154
+ ),
155
+ last_modified = CURRENT_TIMESTAMP
156
+ WHERE project_id = OLD.project_id;
157
+ END;
158
+
159
+ -- Create view for vector search results with file information
160
+ CREATE VIEW vector_search_view AS
161
+ SELECT
162
+ cc.id as chunk_id,
163
+ cc.file_id,
164
+ fd.file_path,
165
+ cc.chunk_type,
166
+ cc.name as chunk_name,
167
+ cc.start_line,
168
+ cc.end_line,
169
+ cc.content_hash,
170
+ cc.embedding_id,
171
+ cc.redacted,
172
+ cc.metadata as chunk_metadata,
173
+ cc.project_id,
174
+ p.name as project_name,
175
+ fd.description as file_description,
176
+ cc.created as chunk_created,
177
+ cc.last_modified as chunk_modified,
178
+ fd.last_modified as file_modified
179
+ FROM code_chunks cc
180
+ JOIN file_descriptions fd ON cc.file_id = fd.id
181
+ JOIN projects p ON cc.project_id = p.id
182
+ WHERE cc.embedding_id IS NOT NULL
183
+ AND fd.to_be_cleaned IS NULL;
184
+
185
+ -- Re-enable foreign key constraints
186
+ PRAGMA foreign_keys=ON;
187
+
188
+ -- Commit the migration
189
+ COMMIT;
@@ -63,6 +63,7 @@ class MCPCodeIndexServer:
63
63
  retry_max_wait: float = 2.0,
64
64
  retry_jitter: float = 0.2,
65
65
  transport: Optional[Any] = None,
66
+ vector_mode: bool = False,
66
67
  ):
67
68
  """
68
69
  Initialize the MCP Code Index Server.
@@ -80,10 +81,12 @@ class MCPCodeIndexServer:
80
81
  retry_max_wait: Maximum wait time between retries in seconds
81
82
  retry_jitter: Maximum jitter to add to retry delays in seconds
82
83
  transport: Optional transport instance (if None, uses default stdio)
84
+ vector_mode: Enable vector search capabilities and tools
83
85
  """
84
86
  self.token_limit = token_limit
85
87
  self.db_path = db_path or Path.home() / ".mcp-code-index" / "tracker.db"
86
88
  self.cache_dir = cache_dir or Path.home() / ".mcp-code-index" / "cache"
89
+ self.vector_mode = vector_mode
87
90
 
88
91
  # Store database configuration
89
92
  self.db_config = {