hanzo-mcp 0.6.13__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of hanzo-mcp might be problematic.
- hanzo_mcp/analytics/__init__.py +5 -0
- hanzo_mcp/analytics/posthog_analytics.py +364 -0
- hanzo_mcp/cli.py +3 -3
- hanzo_mcp/cli_enhanced.py +3 -3
- hanzo_mcp/config/settings.py +1 -1
- hanzo_mcp/config/tool_config.py +18 -4
- hanzo_mcp/server.py +34 -1
- hanzo_mcp/tools/__init__.py +65 -2
- hanzo_mcp/tools/agent/__init__.py +84 -3
- hanzo_mcp/tools/agent/agent_tool.py +102 -4
- hanzo_mcp/tools/agent/agent_tool_v2.py +492 -0
- hanzo_mcp/tools/agent/clarification_protocol.py +220 -0
- hanzo_mcp/tools/agent/clarification_tool.py +68 -0
- hanzo_mcp/tools/agent/claude_cli_tool.py +125 -0
- hanzo_mcp/tools/agent/claude_desktop_auth.py +508 -0
- hanzo_mcp/tools/agent/cli_agent_base.py +191 -0
- hanzo_mcp/tools/agent/code_auth.py +436 -0
- hanzo_mcp/tools/agent/code_auth_tool.py +194 -0
- hanzo_mcp/tools/agent/codex_cli_tool.py +123 -0
- hanzo_mcp/tools/agent/critic_tool.py +376 -0
- hanzo_mcp/tools/agent/gemini_cli_tool.py +128 -0
- hanzo_mcp/tools/agent/grok_cli_tool.py +128 -0
- hanzo_mcp/tools/agent/iching_tool.py +380 -0
- hanzo_mcp/tools/agent/network_tool.py +273 -0
- hanzo_mcp/tools/agent/prompt.py +62 -20
- hanzo_mcp/tools/agent/review_tool.py +433 -0
- hanzo_mcp/tools/agent/swarm_tool.py +535 -0
- hanzo_mcp/tools/agent/swarm_tool_v2.py +654 -0
- hanzo_mcp/tools/common/base.py +1 -0
- hanzo_mcp/tools/common/batch_tool.py +102 -10
- hanzo_mcp/tools/common/fastmcp_pagination.py +369 -0
- hanzo_mcp/tools/common/forgiving_edit.py +243 -0
- hanzo_mcp/tools/common/paginated_base.py +230 -0
- hanzo_mcp/tools/common/paginated_response.py +307 -0
- hanzo_mcp/tools/common/pagination.py +226 -0
- hanzo_mcp/tools/common/tool_list.py +3 -0
- hanzo_mcp/tools/common/truncate.py +101 -0
- hanzo_mcp/tools/filesystem/__init__.py +29 -0
- hanzo_mcp/tools/filesystem/ast_multi_edit.py +562 -0
- hanzo_mcp/tools/filesystem/directory_tree_paginated.py +338 -0
- hanzo_mcp/tools/lsp/__init__.py +5 -0
- hanzo_mcp/tools/lsp/lsp_tool.py +512 -0
- hanzo_mcp/tools/memory/__init__.py +76 -0
- hanzo_mcp/tools/memory/knowledge_tools.py +518 -0
- hanzo_mcp/tools/memory/memory_tools.py +456 -0
- hanzo_mcp/tools/search/__init__.py +6 -0
- hanzo_mcp/tools/search/find_tool.py +581 -0
- hanzo_mcp/tools/search/unified_search.py +953 -0
- hanzo_mcp/tools/shell/__init__.py +5 -0
- hanzo_mcp/tools/shell/auto_background.py +203 -0
- hanzo_mcp/tools/shell/base_process.py +53 -27
- hanzo_mcp/tools/shell/bash_tool.py +17 -33
- hanzo_mcp/tools/shell/npx_tool.py +15 -32
- hanzo_mcp/tools/shell/streaming_command.py +594 -0
- hanzo_mcp/tools/shell/uvx_tool.py +15 -32
- hanzo_mcp/types.py +23 -0
- {hanzo_mcp-0.6.13.dist-info → hanzo_mcp-0.7.1.dist-info}/METADATA +229 -71
- {hanzo_mcp-0.6.13.dist-info → hanzo_mcp-0.7.1.dist-info}/RECORD +61 -24
- hanzo_mcp-0.6.13.dist-info/licenses/LICENSE +0 -21
- {hanzo_mcp-0.6.13.dist-info → hanzo_mcp-0.7.1.dist-info}/WHEEL +0 -0
- {hanzo_mcp-0.6.13.dist-info → hanzo_mcp-0.7.1.dist-info}/entry_points.txt +0 -0
- {hanzo_mcp-0.6.13.dist-info → hanzo_mcp-0.7.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,953 @@
"""Primary unified search tool - THE search tool for finding anything in code.

This is your main search interface that intelligently combines all available
search capabilities including text, AST, symbols, memory, and semantic search.
"""

import os
import time
import json
import subprocess
from typing import List, Dict, Any, Optional, Set, Tuple
from pathlib import Path
from dataclasses import dataclass
from collections import defaultdict
import hashlib

from hanzo_mcp.tools.common.base import BaseTool
from hanzo_mcp.tools.common.paginated_response import AutoPaginatedResponse
from hanzo_mcp.tools.common.decorators import with_context_normalization
from hanzo_mcp.types import MCPResourceDocument

# Import memory tools if available
try:
    from hanzo_mcp.tools.memory.memory_tools import KnowledgeRetrieval
    MEMORY_AVAILABLE = True
except ImportError:
    MEMORY_AVAILABLE = False

try:
    import tree_sitter
    TREESITTER_AVAILABLE = True
except ImportError:
    TREESITTER_AVAILABLE = False

try:
    import chromadb
    from sentence_transformers import SentenceTransformer
    VECTOR_SEARCH_AVAILABLE = True
except ImportError:
    VECTOR_SEARCH_AVAILABLE = False


@dataclass
class SearchResult:
    """Unified search result."""
    file_path: str
    line_number: int
    column: int
    match_text: str
    context_before: List[str]
    context_after: List[str]
    match_type: str  # 'text', 'ast', 'vector', 'symbol', 'memory', 'file'
    score: float = 1.0
    node_type: Optional[str] = None
    semantic_context: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        return {
            "file": self.file_path,
            "line": self.line_number,
            "column": self.column,
            "match": self.match_text,
            "type": self.match_type,
            "score": self.score,
            "context": {
                "before": self.context_before,
                "after": self.context_after,
                "node_type": self.node_type,
                "semantic": self.semantic_context
            }
        }

    def __hash__(self):
        """Make result hashable for deduplication."""
        return hash((self.file_path, self.line_number, self.column, self.match_text))


class UnifiedSearch(BaseTool):
    """THE primary search tool - your universal interface for finding anything.

    This is the main search tool you should use for finding:
    - Code patterns and text matches (using ripgrep)
    - AST nodes and code structure (using treesitter)
    - Symbol definitions and references (using ctags/LSP)
    - Files and directories (using find tool)
    - Memory and knowledge base entries
    - Semantic/conceptual matches (using vector search)

    The tool automatically determines the best search strategy based on your query
    and runs multiple search types in parallel for comprehensive results.

    USAGE EXAMPLES:

    1. Find code patterns:
       search("error handling")  # Finds all error handling code
       search("TODO|FIXME")      # Regex search for TODOs
       search("async function")  # Find async functions

    2. Find symbols/definitions:
       search("class UserService")  # Find class definition
       search("handleRequest")      # Find function/method
       search("MAX_RETRIES")        # Find constant

    3. Find files:
       search("test_*.py", search_files=True)  # Find test files
       search("config", search_files=True)     # Find config files

    4. Semantic search:
       search("how authentication works")   # Natural language query
       search("database connection logic")  # Conceptual search

    5. Memory search:
       search("previous discussion about API design")  # Search memories
       search("that bug we fixed last week")           # Search knowledge

    The tool automatically:
    - Detects query intent and chooses appropriate search methods
    - Runs searches in parallel for speed
    - Deduplicates and ranks results by relevance
    - Provides context around matches
    - Paginates results to stay within token limits
    - Respects .gitignore and other exclusions

    PRO TIPS:
    - Use natural language for conceptual searches
    - Use code syntax for exact matches
    - Add search_files=True to also find filenames
    - Results are ranked by relevance and type
    - Use page parameter to get more results
    """

    name = "search"
    description = """THE primary unified search tool for rapid parallel search across all modalities.

Find anything in your codebase using text, AST, symbols, files, memory, and semantic search.
Automatically detects query intent and runs appropriate searches in parallel.
"""

    def __init__(self):
        super().__init__()
        self.ripgrep_available = self._check_ripgrep()
        self.vector_db = None
        self.embedder = None

        if VECTOR_SEARCH_AVAILABLE:
            self._init_vector_search()

    def _check_ripgrep(self) -> bool:
        """Check if ripgrep is available."""
        try:
            subprocess.run(['rg', '--version'], capture_output=True, check=True)
            return True
        except (OSError, subprocess.CalledProcessError):
            return False

    def _init_vector_search(self):
        """Initialize vector search components."""
        try:
            self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
            self.vector_db = chromadb.Client()
            # Create or get collection
            self.collection = self.vector_db.get_or_create_collection(
                name="code_search",
                metadata={"description": "Code semantic search"}
            )
        except Exception as e:
            print(f"Failed to initialize vector search: {e}")
            self.vector_db = None

    def _should_use_vector_search(self, query: str) -> bool:
        """Determine if vector search would be helpful."""
        # Use vector search for natural language queries
        indicators = [
            len(query.split()) > 2,  # Multi-word queries
            not any(c in query for c in ['(', ')', '{', '}', '[', ']']),  # Not code syntax
            ' ' in query,  # Has spaces (natural language)
            not query.startswith('^') and not query.endswith('$'),  # Not regex anchors
        ]
        return sum(indicators) >= 2

    def _should_use_ast_search(self, query: str) -> bool:
        """Determine if AST search would be helpful."""
        # Use AST search for code patterns
        indicators = [
            'class ' in query or 'function ' in query or 'def ' in query,
            'import ' in query or 'from ' in query,
            any(kw in query.lower() for kw in ['method', 'function', 'class', 'interface', 'struct']),
            '::' in query or '->' in query or '.' in query,  # Member access
        ]
        return any(indicators)

    def _should_use_symbol_search(self, query: str) -> bool:
        """Determine if symbol search would be helpful."""
        # Use symbol search for identifiers
        return (
            len(query.split()) <= 2 and  # Short queries
            query.replace('_', '').replace('-', '').isalnum() and  # Looks like identifier
            ' ' not in query.strip()  # Single token
        )
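    # A quick illustration of how the heuristics above route a few queries
    # (worked out from the code; the checks are independent, so several can fire):
    #   "how authentication works" -> 4/4 vector indicators       => vector (+ text)
    #   "handleRequest"            -> single identifier-like token => symbol
    #                                 (2/4 vector indicators, so vector fires too)
    #   "class UserService"        -> contains "class "            => AST (+ vector)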
    async def run(self,
                  pattern: str,
                  path: str = ".",
                  include: Optional[str] = None,
                  exclude: Optional[str] = None,
                  max_results_per_type: int = 20,
                  context_lines: int = 3,
                  search_files: bool = False,
                  search_memory: Optional[bool] = None,
                  enable_text: Optional[bool] = None,
                  enable_ast: Optional[bool] = None,
                  enable_vector: Optional[bool] = None,
                  enable_symbol: Optional[bool] = None,
                  page_size: int = 50,
                  page: int = 1,
                  **kwargs) -> MCPResourceDocument:
        """Execute unified search across all available search modalities.

        Args:
            pattern: Search query (text, regex, natural language, or glob for files)
            path: Directory to search in (default: current directory)
            include: File pattern to include (e.g., "*.py", "*.js")
            exclude: File pattern to exclude (e.g., "*.test.py")
            max_results_per_type: Max results from each search type
            context_lines: Lines of context around text matches
            search_files: Also search for matching filenames
            search_memory: Search in memory/knowledge base (auto-detected if None)
            enable_*: Force enable/disable specific search types (auto if None)
            page_size: Results per page (default: 50)
            page: Page number to retrieve (default: 1)
        """

        # Auto-detect search types based on query
        if search_memory is None:
            # Search memory for natural language queries or specific references
            search_memory = (
                MEMORY_AVAILABLE and
                (self._should_use_vector_search(pattern) or
                 any(word in pattern.lower() for word in ['previous', 'discussion', 'remember', 'last']))
            )

        if enable_text is None:
            enable_text = True  # Always use text search as baseline

        if enable_vector is None:
            enable_vector = self._should_use_vector_search(pattern) and VECTOR_SEARCH_AVAILABLE

        if enable_ast is None:
            enable_ast = self._should_use_ast_search(pattern) and TREESITTER_AVAILABLE

        if enable_symbol is None:
            enable_symbol = self._should_use_symbol_search(pattern)

        # Collect results from all enabled search types
        all_results = []
        search_stats = {
            "query": pattern,
            "path": path,
            "search_types_used": [],
            "total_matches": 0,
            "unique_matches": 0,
            "time_ms": {}
        }

        # 1. Text search (ripgrep) - always fast, do first
        if enable_text:
            start = time.time()
            text_results = await self._text_search(
                pattern, path, include, exclude, max_results_per_type, context_lines
            )
            search_stats["time_ms"]["text"] = int((time.time() - start) * 1000)
            search_stats["search_types_used"].append("text")
            all_results.extend(text_results)

        # 2. AST search - for code structure
        if enable_ast and TREESITTER_AVAILABLE:
            start = time.time()
            ast_results = await self._ast_search(
                pattern, path, include, exclude, max_results_per_type, context_lines
            )
            search_stats["time_ms"]["ast"] = int((time.time() - start) * 1000)
            search_stats["search_types_used"].append("ast")
            all_results.extend(ast_results)

        # 3. Symbol search - for definitions
        if enable_symbol:
            start = time.time()
            symbol_results = await self._symbol_search(
                pattern, path, include, exclude, max_results_per_type
            )
            search_stats["time_ms"]["symbol"] = int((time.time() - start) * 1000)
            search_stats["search_types_used"].append("symbol")
            all_results.extend(symbol_results)

        # 4. Vector search - for semantic similarity
        if enable_vector and self.vector_db:
            start = time.time()
            vector_results = await self._vector_search(
                pattern, path, include, exclude, max_results_per_type, context_lines
            )
            search_stats["time_ms"]["vector"] = int((time.time() - start) * 1000)
            search_stats["search_types_used"].append("vector")
            all_results.extend(vector_results)

        # 5. File search - for finding files by name/pattern
        if search_files:
            start = time.time()
            file_results = await self._file_search(
                pattern, path, include, exclude, max_results_per_type
            )
            search_stats["time_ms"]["files"] = int((time.time() - start) * 1000)
            search_stats["search_types_used"].append("files")
            all_results.extend(file_results)

        # 6. Memory search - for knowledge base and previous discussions
        if search_memory:
            start = time.time()
            memory_results = await self._memory_search(
                pattern, max_results_per_type, context_lines
            )
            search_stats["time_ms"]["memory"] = int((time.time() - start) * 1000)
            search_stats["search_types_used"].append("memory")
            all_results.extend(memory_results)

        # Deduplicate and rank results
        unique_results = self._deduplicate_results(all_results)
        ranked_results = self._rank_results(unique_results, pattern)

        search_stats["total_matches"] = len(all_results)
        search_stats["unique_matches"] = len(ranked_results)

        # Paginate results
        total_results = len(ranked_results)
        start_idx = (page - 1) * page_size
        end_idx = start_idx + page_size
        page_results = ranked_results[start_idx:end_idx]

        # Format results for output
        formatted_results = []
        for result in page_results:
            formatted = result.to_dict()
            # Add match preview with context
            formatted["preview"] = self._format_preview(result)
            formatted_results.append(formatted)

        # Create paginated response
        response_data = {
            "results": formatted_results,
            "statistics": search_stats,
            "pagination": {
                "page": page,
                "page_size": page_size,
                "total_results": total_results,
                "total_pages": (total_results + page_size - 1) // page_size,
                "has_next": end_idx < total_results,
                "has_prev": page > 1
            }
        }

        return MCPResourceDocument(data=response_data)
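    # Example of the pagination arithmetic above: with 120 ranked results and
    # page_size=50, total_pages = (120 + 50 - 1) // 50 = 3; page=3 slices
    # results[100:120], so has_next is False and has_prev is True.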
    async def call(self, **kwargs) -> str:
        """Tool interface for MCP - converts result to JSON string."""
        result = await self.run(**kwargs)
        return result.to_json_string()

    def register(self, mcp_server) -> None:
        """Register tool with MCP server."""
        from mcp.server import FastMCP

        @mcp_server.tool(name=self.name, description=self.description)
        async def search_handler(
            pattern: str,
            path: str = ".",
            include: Optional[str] = None,
            exclude: Optional[str] = None,
            max_results_per_type: int = 20,
            context_lines: int = 2,
            page_size: int = 50,
            page: int = 1,
            enable_text: bool = True,
            enable_ast: bool = True,
            enable_vector: bool = True,
            enable_symbol: bool = True,
            search_files: bool = False,
            search_memory: bool = False,
        ) -> str:
            """Execute unified search."""
            return await self.call(
                pattern=pattern,
                path=path,
                include=include,
                exclude=exclude,
                max_results_per_type=max_results_per_type,
                context_lines=context_lines,
                page_size=page_size,
                page=page,
                enable_text=enable_text,
                enable_ast=enable_ast,
                enable_vector=enable_vector,
                enable_symbol=enable_symbol,
                search_files=search_files,
                search_memory=search_memory,
            )
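    # A minimal direct-call sketch (hypothetical path and query; assumes a
    # plain asyncio entry point rather than an MCP client):
    #
    #     import asyncio
    #
    #     async def demo():
    #         tool = UnifiedSearch()
    #         doc = await tool.run(pattern="class UserService", path="src", include="*.py")
    #         print(doc.to_json_string())
    #
    #     asyncio.run(demo())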
    async def _text_search(self,
                           pattern: str,
                           path: str,
                           include: Optional[str],
                           exclude: Optional[str],
                           max_results: int,
                           context_lines: int) -> List[SearchResult]:
        """Perform text search using ripgrep."""
        results = []

        if not self.ripgrep_available:
            # Fallback to Python implementation
            return await self._python_text_search(pattern, path, include, exclude, max_results, context_lines)

        # Build ripgrep command
        cmd = ['rg', '--json', '--max-count', str(max_results)]

        if context_lines > 0:
            cmd.extend(['-C', str(context_lines)])

        if include:
            cmd.extend(['--glob', include])

        if exclude:
            cmd.extend(['--glob', f'!{exclude}'])

        cmd.extend([pattern, path])

        try:
            proc = subprocess.run(cmd, capture_output=True, text=True)

            for line in proc.stdout.splitlines():
                try:
                    data = json.loads(line)
                    if data.get('type') == 'match':
                        match_data = data['data']

                        result = SearchResult(
                            file_path=match_data['path']['text'],
                            line_number=match_data['line_number'],
                            column=match_data['submatches'][0]['start'],
                            match_text=match_data['lines']['text'].strip(),
                            context_before=[],
                            context_after=[],
                            match_type='text',
                            score=1.0
                        )

                        # Extract context if available
                        if 'context' in data:
                            # Parse context lines
                            pass

                        results.append(result)

                except json.JSONDecodeError:
                    continue

        except subprocess.CalledProcessError:
            pass

        return results
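    # For reference, a call such as _text_search("TODO", ".", "*.py", None, 20, 3)
    # builds roughly: rg --json --max-count 20 -C 3 --glob *.py TODO .
    # (note that ripgrep's --max-count caps matches per file, not in total)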
    async def _ast_search(self,
                          pattern: str,
                          path: str,
                          include: Optional[str],
                          exclude: Optional[str],
                          max_results: int,
                          context_lines: int) -> List[SearchResult]:
        """Perform AST-based search using treesitter."""
        # Try to use grep-ast if available
        try:
            from grep_ast.grep_ast import TreeContext
        except ImportError:
            # grep-ast not installed, skip AST search
            return []

        results = []

        try:

            # Get files to search
            search_path = Path(path or ".")
            files_to_search = []

            if search_path.is_file():
                files_to_search = [search_path]
            else:
                # Find files matching include pattern
                pattern_to_use = include or "*.py"
                for ext in ["*.py", "*.js", "*.ts", "*.go", "*.java", "*.cpp", "*.c"]:
                    if include and include != ext:
                        continue
                    files_to_search.extend(search_path.rglob(ext))
                    if len(files_to_search) >= max_results:
                        break

            # Search each file
            for file_path in files_to_search[:max_results]:
                if not file_path.is_file():
                    continue

                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        code = f.read()

                    # Process with grep-ast
                    tc = TreeContext(
                        str(file_path),
                        code,
                        color=False,
                        verbose=False,
                        line_number=True,
                    )

                    # Find matches
                    matches = tc.grep(pattern, ignore_case=False)

                    for match in matches:
                        # Extract context
                        lines = code.split('\n')
                        line_num = match  # This might need adjustment based on actual return type

                        result = SearchResult(
                            file_path=str(file_path),
                            line_number=line_num,
                            column=0,
                            match_text=lines[line_num - 1] if 0 < line_num <= len(lines) else "",
                            context_before=lines[max(0, line_num - context_lines - 1):line_num - 1],
                            context_after=lines[line_num:min(len(lines), line_num + context_lines)],
                            match_type='ast',
                            score=0.9,
                            node_type='ast_match',
                            semantic_context=None
                        )
                        results.append(result)

                except Exception:
                    # Skip files that can't be parsed
                    continue

        except Exception as e:
            print(f"AST search error: {e}")

        return results

    async def _symbol_search(self,
                             pattern: str,
                             path: str,
                             include: Optional[str],
                             exclude: Optional[str],
                             max_results: int) -> List[SearchResult]:
        """Search for symbol definitions."""
        results = []

        # Use ctags or similar for symbol search
        # For now, use specialized ripgrep patterns
        symbol_patterns = [
            f"^\\s*(def|function|func)\\s+{pattern}",  # Function definitions
            f"^\\s*class\\s+{pattern}",  # Class definitions
            f"^\\s*(const|let|var)\\s+{pattern}",  # Variable declarations
            f"^\\s*type\\s+{pattern}",  # Type definitions
            f"interface\\s+{pattern}",  # Interface definitions
        ]

        for symbol_pattern in symbol_patterns:
            symbol_results = await self._text_search(
                symbol_pattern, path, include, exclude,
                max_results // len(symbol_patterns), 0
            )

            for res in symbol_results:
                res.match_type = 'symbol'
                res.score = 1.1  # Boost symbol definitions
                results.append(res)

        return results
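    # For example, with pattern "UserService" the list above expands to regexes
    # such as ^\s*class\s+UserService, so only definition sites (not call
    # sites) match, and each of the five regexes gets an equal integer share
    # of max_results.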
    async def _vector_search(self,
                             query: str,
                             path: str,
                             include: Optional[str],
                             exclude: Optional[str],
                             max_results: int,
                             context_lines: int) -> List[SearchResult]:
        """Perform semantic vector search."""
        if not self.vector_db or not self.embedder:
            return []

        results = []

        try:
            # Embed the query
            query_embedding = self.embedder.encode(query).tolist()

            # Search in vector database
            search_results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=max_results,
                where={"path": {"$contains": path}} if path != "." else None
            )

            if search_results['ids'][0]:
                for i, doc_id in enumerate(search_results['ids'][0]):
                    metadata = search_results['metadatas'][0][i]

                    result = SearchResult(
                        file_path=metadata['file_path'],
                        line_number=metadata['line_number'],
                        column=0,
                        match_text=search_results['documents'][0][i],
                        context_before=[],
                        context_after=[],
                        match_type='vector',
                        score=1.0 - search_results['distances'][0][i],  # Convert distance to similarity
                        semantic_context=metadata.get('context', '')
                    )
                    results.append(result)

        except Exception as e:
            print(f"Vector search error: {e}")

        return results

    async def _file_search(self,
                           pattern: str,
                           path: str,
                           include: Optional[str],
                           exclude: Optional[str],
                           max_results: int) -> List[SearchResult]:
        """Search for files by name/pattern using find tool."""
        results = []

        try:
            # Import and use find tool
            from hanzo_mcp.tools.search.find_tool import FindTool
            find_tool = FindTool()

            # Call find tool with pattern
            find_result = await find_tool.run(
                pattern=pattern,
                path=path,
                type="file",  # Only files for now
                max_results=max_results,
                regex=False,  # Use glob patterns by default
                fuzzy=False,
                case_sensitive=False
            )

            # Convert find results to SearchResult format
            if find_result.data and "results" in find_result.data:
                for file_match in find_result.data["results"]:
                    result = SearchResult(
                        file_path=file_match["path"],
                        line_number=1,  # File matches don't have line numbers
                        column=0,
                        match_text=file_match["name"],
                        context_before=[],
                        context_after=[],
                        match_type='file',
                        score=1.0,
                        semantic_context=f"File: {file_match['extension']} ({file_match['size']} bytes)"
                    )
                    results.append(result)

        except Exception as e:
            print(f"File search error: {e}")

        return results

    async def _memory_search(self,
                             query: str,
                             max_results: int,
                             context_lines: int) -> List[SearchResult]:
        """Search in memory/knowledge base."""
        results = []

        if not MEMORY_AVAILABLE:
            return results

        try:
            # Create memory retrieval tool
            retrieval_tool = KnowledgeRetrieval()

            # Search memories
            memory_result = await retrieval_tool.run(
                query=query,
                top_k=max_results,
                threshold=0.5  # Minimum relevance threshold
            )

            # Convert memory results to SearchResult format
            if memory_result.data and "results" in memory_result.data:
                for mem in memory_result.data["results"]:
                    # Extract content and metadata
                    content = mem.get("content", "")
                    metadata = mem.get("metadata", {})

                    # Create a virtual file path for memories
                    memory_type = metadata.get("type", "memory")
                    memory_id = metadata.get("id", "unknown")
                    virtual_path = f"memory://{memory_type}/{memory_id}"

                    result = SearchResult(
                        file_path=virtual_path,
                        line_number=1,
                        column=0,
                        match_text=content[:200] + "..." if len(content) > 200 else content,
                        context_before=[],
                        context_after=[],
                        match_type='memory',
                        score=mem.get("score", 0.8),
                        semantic_context=f"Memory type: {memory_type}, Created: {metadata.get('created_at', 'unknown')}"
                    )
                    results.append(result)

        except Exception as e:
            print(f"Memory search error: {e}")

        return results

    def _deduplicate_results(self, results: List[SearchResult]) -> List[SearchResult]:
        """Remove duplicate results across search types."""
        seen = set()
        unique = []

        for result in results:
            key = (result.file_path, result.line_number, result.match_text.strip())
            if key not in seen:
                seen.add(key)
                unique.append(result)
            else:
                # Merge information from duplicate
                for existing in unique:
                    if (existing.file_path, existing.line_number, existing.match_text.strip()) == key:
                        # Update with better context or node type
                        if result.node_type and not existing.node_type:
                            existing.node_type = result.node_type
                        if result.semantic_context and not existing.semantic_context:
                            existing.semantic_context = result.semantic_context
                        # Take best score
                        existing.score = max(existing.score, result.score)
                        break

        return unique

    def _rank_results(self, results: List[SearchResult], query: str) -> List[SearchResult]:
        """Rank results by relevance."""
        # Simple ranking based on:
        # 1. Match type score
        # 2. Exact match bonus
        # 3. File path relevance

        for result in results:
            # Exact match bonus
            if query.lower() in result.match_text.lower():
                result.score *= 1.2

            # Path relevance (prefer non-test, non-vendor files)
            if any(skip in result.file_path for skip in ['test', 'vendor', 'node_modules']):
                result.score *= 0.8

            # Prefer definition files
            if any(pattern in result.file_path for pattern in ['index.', 'main.', 'api.', 'types.']):
                result.score *= 1.1

        # Sort by score descending, then by file path
        results.sort(key=lambda r: (-r.score, r.file_path, r.line_number))

        return results
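    # Worked example of the multipliers above: a symbol hit (base score 1.1)
    # whose line contains the query scores 1.1 * 1.2 = 1.32; the same hit
    # inside a test file drops to 1.32 * 0.8 = 1.056.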
    def _format_preview(self, result: SearchResult) -> str:
        """Format result preview with context."""
        lines = []

        # Add context before
        for line in result.context_before[-2:]:
            lines.append(f"  {line}")

        # Add match line with highlighting
        match_line = result.match_text
        if result.column > 0:
            # Add column indicator
            lines.append(f"> {match_line}")
            lines.append(f"  {' ' * result.column}^")
        else:
            lines.append(f"> {match_line}")

        # Add context after
        for line in result.context_after[:2]:
            lines.append(f"  {line}")

        return '\n'.join(lines)

    async def _python_text_search(self,
                                  pattern: str,
                                  path: str,
                                  include: Optional[str],
                                  exclude: Optional[str],
                                  max_results: int,
                                  context_lines: int) -> List[SearchResult]:
        """Fallback Python text search when ripgrep not available."""
        results = []
        count = 0

        import re
        import glob

        # Compile pattern
        try:
            regex = re.compile(pattern)
        except re.error:
            # Treat as literal string
            regex = re.compile(re.escape(pattern))

        # Find files
        for file_path in Path(path).rglob(include or '*'):
            if count >= max_results:
                break

            if file_path.is_file():
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        lines = f.readlines()

                    for i, line in enumerate(lines):
                        if count >= max_results:
                            break

                        match = regex.search(line)
                        if match:
                            result = SearchResult(
                                file_path=str(file_path),
                                line_number=i + 1,
                                column=match.start(),
                                match_text=line.strip(),
                                context_before=lines[max(0, i-context_lines):i],
                                context_after=lines[i+1:i+1+context_lines],
                                match_type='text',
                                score=1.0
                            )
                            results.append(result)
                            count += 1

                except Exception:
                    continue

        return results


# Index builder for vector search
class CodeIndexer:
    """Build and maintain vector search index."""

    def __init__(self, vector_db, embedder):
        self.vector_db = vector_db
        self.embedder = embedder
        self.collection = vector_db.get_or_create_collection("code_search")

    async def index_directory(self, path: str, file_patterns: List[str] = None):
        """Index a directory for vector search."""
        if file_patterns is None:
            file_patterns = ['*.py', '*.js', '*.ts', '*.go', '*.java', '*.cpp', '*.c']

        documents = []
        metadatas = []
        ids = []

        for pattern in file_patterns:
            for file_path in Path(path).rglob(pattern):
                if file_path.is_file():
                    try:
                        with open(file_path, 'r', encoding='utf-8') as f:
                            content = f.read()

                        # Split into chunks (functions, classes, etc.)
                        chunks = self._split_code_intelligently(content, file_path)

                        for chunk in chunks:
                            doc_id = hashlib.md5(
                                f"{file_path}:{chunk['line']}:{chunk['text'][:50]}".encode()
                            ).hexdigest()

                            documents.append(chunk['text'])
                            metadatas.append({
                                'file_path': str(file_path),
                                'line_number': chunk['line'],
                                'context': chunk.get('context', ''),
                                'type': chunk.get('type', 'code')
                            })
                            ids.append(doc_id)

                    except Exception as e:
                        print(f"Error indexing {file_path}: {e}")

        # Batch embed and store
        if documents:
            embeddings = self.embedder.encode(documents).tolist()
            self.collection.add(
                embeddings=embeddings,
                documents=documents,
                metadatas=metadatas,
                ids=ids
            )
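    # A minimal indexing sketch under the same optional-dependency guards
    # (assumes chromadb and sentence-transformers are installed; "src" is a
    # hypothetical directory):
    #
    #     indexer = CodeIndexer(chromadb.Client(),
    #                           SentenceTransformer('all-MiniLM-L6-v2'))
    #     asyncio.run(indexer.index_directory("src"))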
    def _split_code_intelligently(self, content: str, file_path: Path) -> List[Dict[str, Any]]:
        """Split code into meaningful chunks."""
        # Simple line-based splitting for now
        # TODO: Use AST for better splitting
        chunks = []
        lines = content.split('\n')

        # Group into function-sized chunks
        current_chunk = []
        current_line = 1

        for i, line in enumerate(lines):
            current_chunk.append(line)

            # Split on function/class definitions or every 50 lines
            if (len(current_chunk) >= 50 or
                    any(kw in line for kw in ['def ', 'function ', 'class ', 'interface '])):

                if current_chunk:
                    chunks.append({
                        'text': '\n'.join(current_chunk),
                        'line': current_line,
                        'type': 'code'
                    })
                    current_chunk = []
                    current_line = i + 2

        # Add remaining
        if current_chunk:
            chunks.append({
                'text': '\n'.join(current_chunk),
                'line': current_line,
                'type': 'code'
            })

        return chunks


# Tool registration
def create_unified_search_tool():
    """Factory function to create unified search tool."""
    return UnifiedSearch()
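# Hook-up sketch: a server would typically obtain the tool via the factory and
# register it, e.g. create_unified_search_tool().register(mcp_server), where
# mcp_server is an existing FastMCP instance (an assumption based on the
# @mcp_server.tool decorator used in register() above).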