hanzo-mcp 0.3.8__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hanzo-mcp might be problematic.
- hanzo_mcp/__init__.py +1 -1
- hanzo_mcp/cli.py +118 -170
- hanzo_mcp/cli_enhanced.py +438 -0
- hanzo_mcp/config/__init__.py +19 -0
- hanzo_mcp/config/settings.py +449 -0
- hanzo_mcp/config/tool_config.py +197 -0
- hanzo_mcp/prompts/__init__.py +117 -0
- hanzo_mcp/prompts/compact_conversation.py +77 -0
- hanzo_mcp/prompts/create_release.py +38 -0
- hanzo_mcp/prompts/project_system.py +120 -0
- hanzo_mcp/prompts/project_todo_reminder.py +111 -0
- hanzo_mcp/prompts/utils.py +286 -0
- hanzo_mcp/server.py +117 -99
- hanzo_mcp/tools/__init__.py +121 -33
- hanzo_mcp/tools/agent/__init__.py +8 -11
- hanzo_mcp/tools/agent/agent_tool.py +290 -224
- hanzo_mcp/tools/agent/prompt.py +16 -13
- hanzo_mcp/tools/agent/tool_adapter.py +9 -9
- hanzo_mcp/tools/common/__init__.py +17 -16
- hanzo_mcp/tools/common/base.py +79 -110
- hanzo_mcp/tools/common/batch_tool.py +330 -0
- hanzo_mcp/tools/common/config_tool.py +396 -0
- hanzo_mcp/tools/common/context.py +26 -292
- hanzo_mcp/tools/common/permissions.py +12 -12
- hanzo_mcp/tools/common/thinking_tool.py +153 -0
- hanzo_mcp/tools/common/validation.py +1 -63
- hanzo_mcp/tools/filesystem/__init__.py +97 -57
- hanzo_mcp/tools/filesystem/base.py +32 -24
- hanzo_mcp/tools/filesystem/content_replace.py +114 -107
- hanzo_mcp/tools/filesystem/directory_tree.py +129 -105
- hanzo_mcp/tools/filesystem/edit.py +279 -0
- hanzo_mcp/tools/filesystem/grep.py +458 -0
- hanzo_mcp/tools/filesystem/grep_ast_tool.py +250 -0
- hanzo_mcp/tools/filesystem/multi_edit.py +362 -0
- hanzo_mcp/tools/filesystem/read.py +255 -0
- hanzo_mcp/tools/filesystem/unified_search.py +689 -0
- hanzo_mcp/tools/filesystem/write.py +156 -0
- hanzo_mcp/tools/jupyter/__init__.py +41 -29
- hanzo_mcp/tools/jupyter/base.py +66 -57
- hanzo_mcp/tools/jupyter/{edit_notebook.py → notebook_edit.py} +162 -139
- hanzo_mcp/tools/jupyter/notebook_read.py +152 -0
- hanzo_mcp/tools/shell/__init__.py +29 -20
- hanzo_mcp/tools/shell/base.py +87 -45
- hanzo_mcp/tools/shell/bash_session.py +731 -0
- hanzo_mcp/tools/shell/bash_session_executor.py +295 -0
- hanzo_mcp/tools/shell/command_executor.py +435 -384
- hanzo_mcp/tools/shell/run_command.py +284 -131
- hanzo_mcp/tools/shell/run_command_windows.py +328 -0
- hanzo_mcp/tools/shell/session_manager.py +196 -0
- hanzo_mcp/tools/shell/session_storage.py +325 -0
- hanzo_mcp/tools/todo/__init__.py +66 -0
- hanzo_mcp/tools/todo/base.py +319 -0
- hanzo_mcp/tools/todo/todo_read.py +148 -0
- hanzo_mcp/tools/todo/todo_write.py +378 -0
- hanzo_mcp/tools/vector/__init__.py +99 -0
- hanzo_mcp/tools/vector/ast_analyzer.py +459 -0
- hanzo_mcp/tools/vector/git_ingester.py +482 -0
- hanzo_mcp/tools/vector/infinity_store.py +731 -0
- hanzo_mcp/tools/vector/mock_infinity.py +162 -0
- hanzo_mcp/tools/vector/project_manager.py +361 -0
- hanzo_mcp/tools/vector/vector_index.py +116 -0
- hanzo_mcp/tools/vector/vector_search.py +225 -0
- hanzo_mcp-0.5.1.dist-info/METADATA +276 -0
- hanzo_mcp-0.5.1.dist-info/RECORD +68 -0
- {hanzo_mcp-0.3.8.dist-info → hanzo_mcp-0.5.1.dist-info}/WHEEL +1 -1
- hanzo_mcp/tools/agent/base_provider.py +0 -73
- hanzo_mcp/tools/agent/litellm_provider.py +0 -45
- hanzo_mcp/tools/agent/lmstudio_agent.py +0 -385
- hanzo_mcp/tools/agent/lmstudio_provider.py +0 -219
- hanzo_mcp/tools/agent/provider_registry.py +0 -120
- hanzo_mcp/tools/common/error_handling.py +0 -86
- hanzo_mcp/tools/common/logging_config.py +0 -115
- hanzo_mcp/tools/common/session.py +0 -91
- hanzo_mcp/tools/common/think_tool.py +0 -123
- hanzo_mcp/tools/common/version_tool.py +0 -120
- hanzo_mcp/tools/filesystem/edit_file.py +0 -287
- hanzo_mcp/tools/filesystem/get_file_info.py +0 -170
- hanzo_mcp/tools/filesystem/read_files.py +0 -199
- hanzo_mcp/tools/filesystem/search_content.py +0 -275
- hanzo_mcp/tools/filesystem/write_file.py +0 -162
- hanzo_mcp/tools/jupyter/notebook_operations.py +0 -514
- hanzo_mcp/tools/jupyter/read_notebook.py +0 -165
- hanzo_mcp/tools/project/__init__.py +0 -64
- hanzo_mcp/tools/project/analysis.py +0 -886
- hanzo_mcp/tools/project/base.py +0 -66
- hanzo_mcp/tools/project/project_analyze.py +0 -173
- hanzo_mcp/tools/shell/run_script.py +0 -215
- hanzo_mcp/tools/shell/script_tool.py +0 -244
- hanzo_mcp-0.3.8.dist-info/METADATA +0 -196
- hanzo_mcp-0.3.8.dist-info/RECORD +0 -53
- {hanzo_mcp-0.3.8.dist-info → hanzo_mcp-0.5.1.dist-info}/entry_points.txt +0 -0
- {hanzo_mcp-0.3.8.dist-info → hanzo_mcp-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {hanzo_mcp-0.3.8.dist-info → hanzo_mcp-0.5.1.dist-info}/top_level.txt +0 -0
hanzo_mcp/tools/vector/infinity_store.py
@@ -0,0 +1,731 @@
"""Infinity vector database integration for Hanzo MCP."""

import json
import hashlib
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from dataclasses import dataclass

try:
    import infinity_embedded
    INFINITY_AVAILABLE = True
except ImportError:
    # Use mock implementation when infinity_embedded is not available
    from . import mock_infinity as infinity_embedded
    INFINITY_AVAILABLE = True  # Mock is always available

from .ast_analyzer import ASTAnalyzer, FileAST, Symbol, create_symbol_embedding_text


@dataclass
class Document:
    """Document representation for vector storage."""
    id: str
    content: str
    metadata: Dict[str, Any]
    file_path: Optional[str] = None
    chunk_index: Optional[int] = None


@dataclass
class SearchResult:
    """Search result from vector database."""
    document: Document
    score: float
    distance: float


@dataclass
class SymbolSearchResult:
    """Search result for symbols."""
    symbol: Symbol
    score: float
    context_document: Optional[Document] = None


@dataclass
class UnifiedSearchResult:
    """Unified search result combining text, vector, and symbol search."""
    type: str  # 'document', 'symbol', 'reference'
    content: str
    file_path: str
    line_start: int
    line_end: int
    score: float
    search_type: str  # 'text', 'vector', 'symbol', 'ast'
    metadata: Dict[str, Any]


class InfinityVectorStore:
    """Local vector database using Infinity."""

    def __init__(
        self,
        data_path: Optional[str] = None,
        embedding_model: str = "text-embedding-3-small",
        dimension: int = 1536,  # Default for OpenAI text-embedding-3-small
    ):
        """Initialize the Infinity vector store.

        Args:
            data_path: Path to store vector database (default: ~/.config/hanzo/vector-store)
            embedding_model: Embedding model to use
            dimension: Vector dimension (must match embedding model)
        """
        if not INFINITY_AVAILABLE:
            raise ImportError("infinity_embedded is required for vector store functionality")

        # Set up data path
        if data_path:
            self.data_path = Path(data_path)
        else:
            from hanzo_mcp.config.settings import get_config_dir
            self.data_path = get_config_dir() / "vector-store"

        self.data_path.mkdir(parents=True, exist_ok=True)

        self.embedding_model = embedding_model
        self.dimension = dimension

        # Initialize AST analyzer
        self.ast_analyzer = ASTAnalyzer()

        # Connect to Infinity
        self.infinity = infinity_embedded.connect(str(self.data_path))
        self.db = self.infinity.get_database("hanzo_mcp")

        # Initialize tables
        self._initialize_tables()

    def _initialize_tables(self):
        """Initialize database tables if they don't exist."""
        # Documents table
        try:
            self.documents_table = self.db.get_table("documents")
        except:
            self.documents_table = self.db.create_table(
                "documents",
                {
                    "id": {"type": "varchar"},
                    "content": {"type": "varchar"},
                    "file_path": {"type": "varchar"},
                    "chunk_index": {"type": "integer"},
                    "metadata": {"type": "varchar"},  # JSON string
                    "embedding": {"type": f"vector,{self.dimension},float"},
                }
            )

        # Symbols table for code symbols
        try:
            self.symbols_table = self.db.get_table("symbols")
        except:
            self.symbols_table = self.db.create_table(
                "symbols",
                {
                    "id": {"type": "varchar"},
                    "name": {"type": "varchar"},
                    "type": {"type": "varchar"},  # function, class, variable, etc.
                    "file_path": {"type": "varchar"},
                    "line_start": {"type": "integer"},
                    "line_end": {"type": "integer"},
                    "scope": {"type": "varchar"},
                    "parent": {"type": "varchar"},
                    "signature": {"type": "varchar"},
                    "docstring": {"type": "varchar"},
                    "metadata": {"type": "varchar"},  # JSON string
                    "embedding": {"type": f"vector,{self.dimension},float"},
                }
            )

        # AST table for storing complete file ASTs
        try:
            self.ast_table = self.db.get_table("ast_files")
        except:
            self.ast_table = self.db.create_table(
                "ast_files",
                {
                    "file_path": {"type": "varchar"},
                    "file_hash": {"type": "varchar"},
                    "language": {"type": "varchar"},
                    "ast_data": {"type": "varchar"},  # JSON string of complete AST
                    "last_updated": {"type": "varchar"},  # ISO timestamp
                }
            )

        # References table for cross-file references
        try:
            self.references_table = self.db.get_table("references")
        except:
            self.references_table = self.db.create_table(
                "references",
                {
                    "id": {"type": "varchar"},
                    "source_file": {"type": "varchar"},
                    "target_file": {"type": "varchar"},
                    "symbol_name": {"type": "varchar"},
                    "reference_type": {"type": "varchar"},  # import, call, inheritance, etc.
                    "line_number": {"type": "integer"},
                    "metadata": {"type": "varchar"},  # JSON string
                }
            )
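
For orientation, here is an illustrative construction sketch (an editor's example, not part of the released file); the data_path value is hypothetical, and omitting it falls back to the ~/.config/hanzo/vector-store default resolved via get_config_dir():

from hanzo_mcp.tools.vector.infinity_store import InfinityVectorStore

# Hypothetical location; omit data_path to use the default config directory.
store = InfinityVectorStore(data_path="./demo-vector-store", dimension=1536)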

    def _generate_doc_id(self, content: str, file_path: str = "", chunk_index: int = 0) -> str:
        """Generate a unique document ID."""
        content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
        path_hash = hashlib.sha256(file_path.encode()).hexdigest()[:8]
        return f"doc_{path_hash}_{chunk_index}_{content_hash}"

    def add_document(
        self,
        content: str,
        metadata: Dict[str, Any] = None,
        file_path: Optional[str] = None,
        chunk_index: int = 0,
        embedding: Optional[List[float]] = None,
    ) -> str:
        """Add a document to the vector store.

        Args:
            content: Document content
            metadata: Additional metadata
            file_path: Source file path
            chunk_index: Chunk index if document is part of larger file
            embedding: Pre-computed embedding (if None, will compute)

        Returns:
            Document ID
        """
        doc_id = self._generate_doc_id(content, file_path or "", chunk_index)

        # Generate embedding if not provided
        if embedding is None:
            embedding = self._generate_embedding(content)

        # Prepare metadata
        metadata = metadata or {}
        metadata_json = json.dumps(metadata)

        # Insert document
        self.documents_table.insert([{
            "id": doc_id,
            "content": content,
            "file_path": file_path or "",
            "chunk_index": chunk_index,
            "metadata": metadata_json,
            "embedding": embedding,
        }])

        return doc_id
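
A minimal add_document sketch (illustrative, not from the release), reusing the store instance from the earlier example; the content, metadata, and path values are invented:

doc_id = store.add_document(
    content="def hello():\n    return 'world'",
    metadata={"language": "python"},
    file_path="examples/hello.py",  # hypothetical source path
)
# doc_id follows the "doc_<path-hash>_<chunk-index>_<content-hash>" pattern from _generate_doc_id.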

    def add_file(
        self,
        file_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        metadata: Dict[str, Any] = None,
    ) -> List[str]:
        """Add a file to the vector store by chunking it.

        Args:
            file_path: Path to the file to add
            chunk_size: Maximum characters per chunk
            chunk_overlap: Characters to overlap between chunks
            metadata: Additional metadata for all chunks

        Returns:
            List of document IDs for all chunks
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Read file content
        try:
            content = path.read_text(encoding='utf-8')
        except UnicodeDecodeError:
            # Try with different encoding
            content = path.read_text(encoding='latin-1')

        # Chunk the content
        chunks = self._chunk_text(content, chunk_size, chunk_overlap)

        # Add metadata
        file_metadata = metadata or {}
        file_metadata.update({
            "file_name": path.name,
            "file_extension": path.suffix,
            "file_size": path.stat().st_size,
        })

        # Add each chunk
        doc_ids = []
        for i, chunk in enumerate(chunks):
            chunk_metadata = file_metadata.copy()
            chunk_metadata["chunk_number"] = i
            chunk_metadata["total_chunks"] = len(chunks)

            doc_id = self.add_document(
                content=chunk,
                metadata=chunk_metadata,
                file_path=str(path),
                chunk_index=i,
            )
            doc_ids.append(doc_id)

        return doc_ids

    def add_file_with_ast(
        self,
        file_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        metadata: Dict[str, Any] = None,
    ) -> Tuple[List[str], Optional[FileAST]]:
        """Add a file with full AST analysis and symbol extraction.

        Args:
            file_path: Path to the file to add
            chunk_size: Maximum characters per chunk for content
            chunk_overlap: Characters to overlap between chunks
            metadata: Additional metadata for all chunks

        Returns:
            Tuple of (document IDs for content chunks, FileAST object)
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # First add file content using existing method
        doc_ids = self.add_file(file_path, chunk_size, chunk_overlap, metadata)

        # Analyze AST and symbols
        file_ast = self.ast_analyzer.analyze_file(file_path)
        if not file_ast:
            return doc_ids, None

        # Store complete AST
        self._store_file_ast(file_ast)

        # Store individual symbols with embeddings
        self._store_symbols(file_ast.symbols)

        # Store cross-references
        self._store_references(file_ast)

        return doc_ids, file_ast
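
An illustrative indexing sketch (not from the release); the path is hypothetical and the FileAST fields used are those declared in ast_analyzer.py:

doc_ids, file_ast = store.add_file_with_ast("src/example.py", chunk_size=1000, chunk_overlap=200)
print(f"Indexed {len(doc_ids)} chunks")
if file_ast is not None:
    print(f"Found {len(file_ast.symbols)} symbols and {len(file_ast.dependencies)} dependencies")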

    def _store_file_ast(self, file_ast: FileAST):
        """Store complete file AST information."""
        from datetime import datetime

        # Remove existing AST for this file
        try:
            self.ast_table.delete(f"file_path = '{file_ast.file_path}'")
        except:
            pass

        # Insert new AST
        self.ast_table.insert([{
            "file_path": file_ast.file_path,
            "file_hash": file_ast.file_hash,
            "language": file_ast.language,
            "ast_data": json.dumps(file_ast.to_dict()),
            "last_updated": datetime.now().isoformat(),
        }])

    def _store_symbols(self, symbols: List[Symbol]):
        """Store symbols with vector embeddings."""
        if not symbols:
            return

        # Remove existing symbols for these files
        file_paths = list(set(symbol.file_path for symbol in symbols))
        for file_path in file_paths:
            try:
                self.symbols_table.delete(f"file_path = '{file_path}'")
            except:
                pass

        # Insert new symbols
        symbol_records = []
        for symbol in symbols:
            # Create embedding text for symbol
            embedding_text = create_symbol_embedding_text(symbol)
            embedding = self._generate_embedding(embedding_text)

            # Generate symbol ID
            symbol_id = self._generate_symbol_id(symbol)

            # Prepare metadata
            symbol_metadata = {
                "references": symbol.references,
                "embedding_text": embedding_text,
            }

            symbol_records.append({
                "id": symbol_id,
                "name": symbol.name,
                "type": symbol.type,
                "file_path": symbol.file_path,
                "line_start": symbol.line_start,
                "line_end": symbol.line_end,
                "scope": symbol.scope or "",
                "parent": symbol.parent or "",
                "signature": symbol.signature or "",
                "docstring": symbol.docstring or "",
                "metadata": json.dumps(symbol_metadata),
                "embedding": embedding,
            })

        if symbol_records:
            self.symbols_table.insert(symbol_records)

    def _store_references(self, file_ast: FileAST):
        """Store cross-file references."""
        if not file_ast.dependencies:
            return

        # Remove existing references for this file
        try:
            self.references_table.delete(f"source_file = '{file_ast.file_path}'")
        except:
            pass

        # Insert new references
        reference_records = []
        for i, dependency in enumerate(file_ast.dependencies):
            ref_id = f"{file_ast.file_path}_{dependency}_{i}"
            reference_records.append({
                "id": ref_id,
                "source_file": file_ast.file_path,
                "target_file": dependency,
                "symbol_name": dependency,
                "reference_type": "import",
                "line_number": 0,  # Could be enhanced to track actual line numbers
                "metadata": json.dumps({}),
            })

        if reference_records:
            self.references_table.insert(reference_records)

    def _generate_symbol_id(self, symbol: Symbol) -> str:
        """Generate unique symbol ID."""
        text = f"{symbol.file_path}_{symbol.type}_{symbol.name}_{symbol.line_start}"
        return hashlib.sha256(text.encode()).hexdigest()[:16]

    def search_symbols(
        self,
        query: str,
        symbol_type: Optional[str] = None,
        file_path: Optional[str] = None,
        limit: int = 10,
        score_threshold: float = 0.0,
    ) -> List[SymbolSearchResult]:
        """Search for symbols using vector similarity.

        Args:
            query: Search query
            symbol_type: Filter by symbol type (function, class, variable, etc.)
            file_path: Filter by file path
            limit: Maximum number of results
            score_threshold: Minimum similarity score

        Returns:
            List of symbol search results
        """
        # Generate query embedding
        query_embedding = self._generate_embedding(query)

        # Build search query
        search_query = self.symbols_table.output(["*"]).match_dense(
            "embedding",
            query_embedding,
            "float",
            "ip",  # Inner product
            limit * 2  # Get more results for filtering
        )

        # Apply filters
        if symbol_type:
            search_query = search_query.filter(f"type = '{symbol_type}'")
        if file_path:
            search_query = search_query.filter(f"file_path = '{file_path}'")

        search_results = search_query.to_pl()

        # Convert to SymbolSearchResult objects
        results = []
        for row in search_results.iter_rows(named=True):
            score = row.get("score", 0.0)
            if score >= score_threshold:
                # Parse metadata
                try:
                    metadata = json.loads(row["metadata"])
                except:
                    metadata = {}

                # Create Symbol object
                symbol = Symbol(
                    name=row["name"],
                    type=row["type"],
                    file_path=row["file_path"],
                    line_start=row["line_start"],
                    line_end=row["line_end"],
                    column_start=0,  # Not stored in table
                    column_end=0,  # Not stored in table
                    scope=row["scope"],
                    parent=row["parent"] if row["parent"] else None,
                    docstring=row["docstring"] if row["docstring"] else None,
                    signature=row["signature"] if row["signature"] else None,
                    references=metadata.get("references", []),
                )

                results.append(SymbolSearchResult(
                    symbol=symbol,
                    score=score,
                ))

        return results[:limit]
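
A symbol-search sketch (illustrative, not from the release); the query text and filter values are examples only:

for result in store.search_symbols("load configuration", symbol_type="function", limit=5):
    sym = result.symbol
    print(f"{sym.name} ({sym.type}) at {sym.file_path}:{sym.line_start}-{sym.line_end}, score={result.score:.3f}")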

    def search_ast_nodes(
        self,
        file_path: str,
        node_type: Optional[str] = None,
        node_name: Optional[str] = None,
    ) -> Optional[FileAST]:
        """Search AST nodes within a specific file.

        Args:
            file_path: File to search in
            node_type: Filter by AST node type
            node_name: Filter by node name

        Returns:
            FileAST object if file found, None otherwise
        """
        try:
            results = self.ast_table.output(["*"]).filter(f"file_path = '{file_path}'").to_pl()

            if len(results) == 0:
                return None

            row = next(results.iter_rows(named=True))
            ast_data = json.loads(row["ast_data"])

            # Reconstruct FileAST object
            file_ast = FileAST(
                file_path=ast_data["file_path"],
                file_hash=ast_data["file_hash"],
                language=ast_data["language"],
                symbols=[Symbol(**s) for s in ast_data["symbols"]],
                ast_nodes=[],  # Would need custom deserialization for ASTNode
                imports=ast_data["imports"],
                exports=ast_data["exports"],
                dependencies=ast_data["dependencies"],
            )

            return file_ast

        except Exception as e:
            print(f"Error searching AST nodes: {e}")
            return None

    def get_file_references(self, file_path: str) -> List[Dict[str, Any]]:
        """Get all files that reference the given file.

        Args:
            file_path: File to find references for

        Returns:
            List of reference information
        """
        try:
            results = self.references_table.output(["*"]).filter(f"target_file = '{file_path}'").to_pl()

            references = []
            for row in results.iter_rows(named=True):
                references.append({
                    "source_file": row["source_file"],
                    "symbol_name": row["symbol_name"],
                    "reference_type": row["reference_type"],
                    "line_number": row["line_number"],
                })

            return references

        except Exception as e:
            print(f"Error getting file references: {e}")
            return []

    def search(
        self,
        query: str,
        limit: int = 10,
        score_threshold: float = 0.0,
        filters: Dict[str, Any] = None,
    ) -> List[SearchResult]:
        """Search for similar documents.

        Args:
            query: Search query
            limit: Maximum number of results
            score_threshold: Minimum similarity score
            filters: Metadata filters (not yet implemented)

        Returns:
            List of search results
        """
        # Generate query embedding
        query_embedding = self._generate_embedding(query)

        # Perform vector search
        search_results = self.documents_table.output(["*"]).match_dense(
            "embedding",
            query_embedding,
            "float",
            "ip",  # Inner product (cosine similarity)
            limit
        ).to_pl()

        # Convert to SearchResult objects
        results = []
        for row in search_results.iter_rows(named=True):
            # Parse metadata
            try:
                metadata = json.loads(row["metadata"])
            except:
                metadata = {}

            # Create document
            document = Document(
                id=row["id"],
                content=row["content"],
                metadata=metadata,
                file_path=row["file_path"] if row["file_path"] else None,
                chunk_index=row["chunk_index"],
            )

            # Score is the similarity (higher is better)
            score = row.get("score", 0.0)
            distance = 1.0 - score  # Convert similarity to distance

            if score >= score_threshold:
                results.append(SearchResult(
                    document=document,
                    score=score,
                    distance=distance,
                ))

        return results
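
A document-search sketch (illustrative, not from the release); the query and thresholds are example values:

for result in store.search("vector database connection", limit=3, score_threshold=0.2):
    doc = result.document
    print(f"{doc.file_path} [chunk {doc.chunk_index}] score={result.score:.3f}")
    print(doc.content[:120])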

    def delete_document(self, doc_id: str) -> bool:
        """Delete a document by ID.

        Args:
            doc_id: Document ID to delete

        Returns:
            True if document was deleted
        """
        try:
            self.documents_table.delete(f"id = '{doc_id}'")
            return True
        except:
            return False

    def delete_file(self, file_path: str) -> int:
        """Delete all documents from a specific file.

        Args:
            file_path: File path to delete documents for

        Returns:
            Number of documents deleted
        """
        try:
            # Get count first
            results = self.documents_table.output(["id"]).filter(f"file_path = '{file_path}'").to_pl()
            count = len(results)

            # Delete all documents for this file
            self.documents_table.delete(f"file_path = '{file_path}'")
            return count
        except:
            return 0

    def list_files(self) -> List[Dict[str, Any]]:
        """List all indexed files.

        Returns:
            List of file information
        """
        try:
            results = self.documents_table.output(["file_path", "metadata"]).to_pl()

            files = {}
            for row in results.iter_rows(named=True):
                file_path = row["file_path"]
                if file_path and file_path not in files:
                    try:
                        metadata = json.loads(row["metadata"])
                        files[file_path] = {
                            "file_path": file_path,
                            "file_name": metadata.get("file_name", Path(file_path).name),
                            "file_size": metadata.get("file_size", 0),
                            "total_chunks": metadata.get("total_chunks", 1),
                        }
                    except:
                        files[file_path] = {
                            "file_path": file_path,
                            "file_name": Path(file_path).name,
                        }

            return list(files.values())
        except:
            return []

    def _chunk_text(self, text: str, chunk_size: int, overlap: int) -> List[str]:
        """Split text into overlapping chunks."""
        if len(text) <= chunk_size:
            return [text]

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size

            # Try to break at word boundary
            if end < len(text):
                # Look back for a good break point
                break_point = end
                for i in range(end - 100, start + 100, -1):
                    if i > 0 and text[i] in '\n\r.!?':
                        break_point = i + 1
                        break
                end = break_point

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            start = max(start + chunk_size - overlap, end)

        return chunks
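
A small worked example of the chunking helper (an editor's illustration, not from the release): for input that contains no newline or sentence-ending characters, the look-back loop never finds a break point, so the text is cut into fixed windows of chunk_size characters plus a shorter tail.

chunks = store._chunk_text("x" * 2500, chunk_size=1000, overlap=200)
print([len(c) for c in chunks])  # [1000, 1000, 500]: no break characters, so fixed windows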

    def _generate_embedding(self, text: str) -> List[float]:
        """Generate embedding for text.

        For now, this returns a dummy embedding. In a real implementation,
        you would call an embedding API (OpenAI, Cohere, etc.) or use a local model.
        """
        # This is a placeholder - you would implement actual embedding generation here
        # For now, return a random embedding of the correct dimension
        import random
        return [random.random() for _ in range(self.dimension)]
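
One way to replace that placeholder with real embeddings (a sketch assuming the openai>=1.x Python client and an OPENAI_API_KEY in the environment; not part of the release). text-embedding-3-small returns 1536-dimensional vectors, matching the class default:

from openai import OpenAI

_client = OpenAI()  # reads OPENAI_API_KEY from the environment

def generate_openai_embedding(text: str, model: str = "text-embedding-3-small") -> list[float]:
    """Return a real embedding vector for the given text."""
    response = _client.embeddings.create(model=model, input=text)
    return response.data[0].embedding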

    def close(self):
        """Close the database connection."""
        if hasattr(self, 'infinity'):
            self.infinity.disconnect()