hanzo-mcp 0.3.8__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of hanzo-mcp might be problematic.

Files changed (93)
  1. hanzo_mcp/__init__.py +1 -1
  2. hanzo_mcp/cli.py +118 -170
  3. hanzo_mcp/cli_enhanced.py +438 -0
  4. hanzo_mcp/config/__init__.py +19 -0
  5. hanzo_mcp/config/settings.py +449 -0
  6. hanzo_mcp/config/tool_config.py +197 -0
  7. hanzo_mcp/prompts/__init__.py +117 -0
  8. hanzo_mcp/prompts/compact_conversation.py +77 -0
  9. hanzo_mcp/prompts/create_release.py +38 -0
  10. hanzo_mcp/prompts/project_system.py +120 -0
  11. hanzo_mcp/prompts/project_todo_reminder.py +111 -0
  12. hanzo_mcp/prompts/utils.py +286 -0
  13. hanzo_mcp/server.py +117 -99
  14. hanzo_mcp/tools/__init__.py +121 -33
  15. hanzo_mcp/tools/agent/__init__.py +8 -11
  16. hanzo_mcp/tools/agent/agent_tool.py +290 -224
  17. hanzo_mcp/tools/agent/prompt.py +16 -13
  18. hanzo_mcp/tools/agent/tool_adapter.py +9 -9
  19. hanzo_mcp/tools/common/__init__.py +17 -16
  20. hanzo_mcp/tools/common/base.py +79 -110
  21. hanzo_mcp/tools/common/batch_tool.py +330 -0
  22. hanzo_mcp/tools/common/config_tool.py +396 -0
  23. hanzo_mcp/tools/common/context.py +26 -292
  24. hanzo_mcp/tools/common/permissions.py +12 -12
  25. hanzo_mcp/tools/common/thinking_tool.py +153 -0
  26. hanzo_mcp/tools/common/validation.py +1 -63
  27. hanzo_mcp/tools/filesystem/__init__.py +97 -57
  28. hanzo_mcp/tools/filesystem/base.py +32 -24
  29. hanzo_mcp/tools/filesystem/content_replace.py +114 -107
  30. hanzo_mcp/tools/filesystem/directory_tree.py +129 -105
  31. hanzo_mcp/tools/filesystem/edit.py +279 -0
  32. hanzo_mcp/tools/filesystem/grep.py +458 -0
  33. hanzo_mcp/tools/filesystem/grep_ast_tool.py +250 -0
  34. hanzo_mcp/tools/filesystem/multi_edit.py +362 -0
  35. hanzo_mcp/tools/filesystem/read.py +255 -0
  36. hanzo_mcp/tools/filesystem/unified_search.py +689 -0
  37. hanzo_mcp/tools/filesystem/write.py +156 -0
  38. hanzo_mcp/tools/jupyter/__init__.py +41 -29
  39. hanzo_mcp/tools/jupyter/base.py +66 -57
  40. hanzo_mcp/tools/jupyter/{edit_notebook.py → notebook_edit.py} +162 -139
  41. hanzo_mcp/tools/jupyter/notebook_read.py +152 -0
  42. hanzo_mcp/tools/shell/__init__.py +29 -20
  43. hanzo_mcp/tools/shell/base.py +87 -45
  44. hanzo_mcp/tools/shell/bash_session.py +731 -0
  45. hanzo_mcp/tools/shell/bash_session_executor.py +295 -0
  46. hanzo_mcp/tools/shell/command_executor.py +435 -384
  47. hanzo_mcp/tools/shell/run_command.py +284 -131
  48. hanzo_mcp/tools/shell/run_command_windows.py +328 -0
  49. hanzo_mcp/tools/shell/session_manager.py +196 -0
  50. hanzo_mcp/tools/shell/session_storage.py +325 -0
  51. hanzo_mcp/tools/todo/__init__.py +66 -0
  52. hanzo_mcp/tools/todo/base.py +319 -0
  53. hanzo_mcp/tools/todo/todo_read.py +148 -0
  54. hanzo_mcp/tools/todo/todo_write.py +378 -0
  55. hanzo_mcp/tools/vector/__init__.py +99 -0
  56. hanzo_mcp/tools/vector/ast_analyzer.py +459 -0
  57. hanzo_mcp/tools/vector/git_ingester.py +482 -0
  58. hanzo_mcp/tools/vector/infinity_store.py +731 -0
  59. hanzo_mcp/tools/vector/mock_infinity.py +162 -0
  60. hanzo_mcp/tools/vector/project_manager.py +361 -0
  61. hanzo_mcp/tools/vector/vector_index.py +116 -0
  62. hanzo_mcp/tools/vector/vector_search.py +225 -0
  63. hanzo_mcp-0.5.1.dist-info/METADATA +276 -0
  64. hanzo_mcp-0.5.1.dist-info/RECORD +68 -0
  65. {hanzo_mcp-0.3.8.dist-info → hanzo_mcp-0.5.1.dist-info}/WHEEL +1 -1
  66. hanzo_mcp/tools/agent/base_provider.py +0 -73
  67. hanzo_mcp/tools/agent/litellm_provider.py +0 -45
  68. hanzo_mcp/tools/agent/lmstudio_agent.py +0 -385
  69. hanzo_mcp/tools/agent/lmstudio_provider.py +0 -219
  70. hanzo_mcp/tools/agent/provider_registry.py +0 -120
  71. hanzo_mcp/tools/common/error_handling.py +0 -86
  72. hanzo_mcp/tools/common/logging_config.py +0 -115
  73. hanzo_mcp/tools/common/session.py +0 -91
  74. hanzo_mcp/tools/common/think_tool.py +0 -123
  75. hanzo_mcp/tools/common/version_tool.py +0 -120
  76. hanzo_mcp/tools/filesystem/edit_file.py +0 -287
  77. hanzo_mcp/tools/filesystem/get_file_info.py +0 -170
  78. hanzo_mcp/tools/filesystem/read_files.py +0 -199
  79. hanzo_mcp/tools/filesystem/search_content.py +0 -275
  80. hanzo_mcp/tools/filesystem/write_file.py +0 -162
  81. hanzo_mcp/tools/jupyter/notebook_operations.py +0 -514
  82. hanzo_mcp/tools/jupyter/read_notebook.py +0 -165
  83. hanzo_mcp/tools/project/__init__.py +0 -64
  84. hanzo_mcp/tools/project/analysis.py +0 -886
  85. hanzo_mcp/tools/project/base.py +0 -66
  86. hanzo_mcp/tools/project/project_analyze.py +0 -173
  87. hanzo_mcp/tools/shell/run_script.py +0 -215
  88. hanzo_mcp/tools/shell/script_tool.py +0 -244
  89. hanzo_mcp-0.3.8.dist-info/METADATA +0 -196
  90. hanzo_mcp-0.3.8.dist-info/RECORD +0 -53
  91. {hanzo_mcp-0.3.8.dist-info → hanzo_mcp-0.5.1.dist-info}/entry_points.txt +0 -0
  92. {hanzo_mcp-0.3.8.dist-info → hanzo_mcp-0.5.1.dist-info}/licenses/LICENSE +0 -0
  93. {hanzo_mcp-0.3.8.dist-info → hanzo_mcp-0.5.1.dist-info}/top_level.txt +0 -0
hanzo_mcp/tools/vector/infinity_store.py (new file)
@@ -0,0 +1,731 @@
"""Infinity vector database integration for Hanzo MCP."""

import json
import hashlib
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from dataclasses import dataclass

try:
    import infinity_embedded
    INFINITY_AVAILABLE = True
except ImportError:
    # Use mock implementation when infinity_embedded is not available
    from . import mock_infinity as infinity_embedded
    INFINITY_AVAILABLE = True  # Mock is always available

from .ast_analyzer import ASTAnalyzer, FileAST, Symbol, create_symbol_embedding_text


@dataclass
class Document:
    """Document representation for vector storage."""
    id: str
    content: str
    metadata: Dict[str, Any]
    file_path: Optional[str] = None
    chunk_index: Optional[int] = None


@dataclass
class SearchResult:
    """Search result from vector database."""
    document: Document
    score: float
    distance: float


@dataclass
class SymbolSearchResult:
    """Search result for symbols."""
    symbol: Symbol
    score: float
    context_document: Optional[Document] = None


@dataclass
class UnifiedSearchResult:
    """Unified search result combining text, vector, and symbol search."""
    type: str  # 'document', 'symbol', 'reference'
    content: str
    file_path: str
    line_start: int
    line_end: int
    score: float
    search_type: str  # 'text', 'vector', 'symbol', 'ast'
    metadata: Dict[str, Any]


class InfinityVectorStore:
    """Local vector database using Infinity."""

    def __init__(
        self,
        data_path: Optional[str] = None,
        embedding_model: str = "text-embedding-3-small",
        dimension: int = 1536,  # Default for OpenAI text-embedding-3-small
    ):
        """Initialize the Infinity vector store.

        Args:
            data_path: Path to store vector database (default: ~/.config/hanzo/vector-store)
            embedding_model: Embedding model to use
            dimension: Vector dimension (must match embedding model)
        """
        if not INFINITY_AVAILABLE:
            raise ImportError("infinity_embedded is required for vector store functionality")

        # Set up data path
        if data_path:
            self.data_path = Path(data_path)
        else:
            from hanzo_mcp.config.settings import get_config_dir
            self.data_path = get_config_dir() / "vector-store"

        self.data_path.mkdir(parents=True, exist_ok=True)

        self.embedding_model = embedding_model
        self.dimension = dimension

        # Initialize AST analyzer
        self.ast_analyzer = ASTAnalyzer()

        # Connect to Infinity
        self.infinity = infinity_embedded.connect(str(self.data_path))
        self.db = self.infinity.get_database("hanzo_mcp")

        # Initialize tables
        self._initialize_tables()

    def _initialize_tables(self):
        """Initialize database tables if they don't exist."""
        # Documents table
        try:
            self.documents_table = self.db.get_table("documents")
        except:
            self.documents_table = self.db.create_table(
                "documents",
                {
                    "id": {"type": "varchar"},
                    "content": {"type": "varchar"},
                    "file_path": {"type": "varchar"},
                    "chunk_index": {"type": "integer"},
                    "metadata": {"type": "varchar"},  # JSON string
                    "embedding": {"type": f"vector,{self.dimension},float"},
                }
            )

        # Symbols table for code symbols
        try:
            self.symbols_table = self.db.get_table("symbols")
        except:
            self.symbols_table = self.db.create_table(
                "symbols",
                {
                    "id": {"type": "varchar"},
                    "name": {"type": "varchar"},
                    "type": {"type": "varchar"},  # function, class, variable, etc.
                    "file_path": {"type": "varchar"},
                    "line_start": {"type": "integer"},
                    "line_end": {"type": "integer"},
                    "scope": {"type": "varchar"},
                    "parent": {"type": "varchar"},
                    "signature": {"type": "varchar"},
                    "docstring": {"type": "varchar"},
                    "metadata": {"type": "varchar"},  # JSON string
                    "embedding": {"type": f"vector,{self.dimension},float"},
                }
            )

        # AST table for storing complete file ASTs
        try:
            self.ast_table = self.db.get_table("ast_files")
        except:
            self.ast_table = self.db.create_table(
                "ast_files",
                {
                    "file_path": {"type": "varchar"},
                    "file_hash": {"type": "varchar"},
                    "language": {"type": "varchar"},
                    "ast_data": {"type": "varchar"},  # JSON string of complete AST
                    "last_updated": {"type": "varchar"},  # ISO timestamp
                }
            )

        # References table for cross-file references
        try:
            self.references_table = self.db.get_table("references")
        except:
            self.references_table = self.db.create_table(
                "references",
                {
                    "id": {"type": "varchar"},
                    "source_file": {"type": "varchar"},
                    "target_file": {"type": "varchar"},
                    "symbol_name": {"type": "varchar"},
                    "reference_type": {"type": "varchar"},  # import, call, inheritance, etc.
                    "line_number": {"type": "integer"},
                    "metadata": {"type": "varchar"},  # JSON string
                }
            )

    def _generate_doc_id(self, content: str, file_path: str = "", chunk_index: int = 0) -> str:
        """Generate a unique document ID."""
        content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
        path_hash = hashlib.sha256(file_path.encode()).hexdigest()[:8]
        return f"doc_{path_hash}_{chunk_index}_{content_hash}"

    def add_document(
        self,
        content: str,
        metadata: Dict[str, Any] = None,
        file_path: Optional[str] = None,
        chunk_index: int = 0,
        embedding: Optional[List[float]] = None,
    ) -> str:
        """Add a document to the vector store.

        Args:
            content: Document content
            metadata: Additional metadata
            file_path: Source file path
            chunk_index: Chunk index if document is part of larger file
            embedding: Pre-computed embedding (if None, will compute)

        Returns:
            Document ID
        """
        doc_id = self._generate_doc_id(content, file_path or "", chunk_index)

        # Generate embedding if not provided
        if embedding is None:
            embedding = self._generate_embedding(content)

        # Prepare metadata
        metadata = metadata or {}
        metadata_json = json.dumps(metadata)

        # Insert document
        self.documents_table.insert([{
            "id": doc_id,
            "content": content,
            "file_path": file_path or "",
            "chunk_index": chunk_index,
            "metadata": metadata_json,
            "embedding": embedding,
        }])

        return doc_id

    def add_file(
        self,
        file_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        metadata: Dict[str, Any] = None,
    ) -> List[str]:
        """Add a file to the vector store by chunking it.

        Args:
            file_path: Path to the file to add
            chunk_size: Maximum characters per chunk
            chunk_overlap: Characters to overlap between chunks
            metadata: Additional metadata for all chunks

        Returns:
            List of document IDs for all chunks
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Read file content
        try:
            content = path.read_text(encoding='utf-8')
        except UnicodeDecodeError:
            # Try with different encoding
            content = path.read_text(encoding='latin-1')

        # Chunk the content
        chunks = self._chunk_text(content, chunk_size, chunk_overlap)

        # Add metadata
        file_metadata = metadata or {}
        file_metadata.update({
            "file_name": path.name,
            "file_extension": path.suffix,
            "file_size": path.stat().st_size,
        })

        # Add each chunk
        doc_ids = []
        for i, chunk in enumerate(chunks):
            chunk_metadata = file_metadata.copy()
            chunk_metadata["chunk_number"] = i
            chunk_metadata["total_chunks"] = len(chunks)

            doc_id = self.add_document(
                content=chunk,
                metadata=chunk_metadata,
                file_path=str(path),
                chunk_index=i,
            )
            doc_ids.append(doc_id)

        return doc_ids

    def add_file_with_ast(
        self,
        file_path: str,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        metadata: Dict[str, Any] = None,
    ) -> Tuple[List[str], Optional[FileAST]]:
        """Add a file with full AST analysis and symbol extraction.

        Args:
            file_path: Path to the file to add
            chunk_size: Maximum characters per chunk for content
            chunk_overlap: Characters to overlap between chunks
            metadata: Additional metadata for all chunks

        Returns:
            Tuple of (document IDs for content chunks, FileAST object)
        """
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # First add file content using existing method
        doc_ids = self.add_file(file_path, chunk_size, chunk_overlap, metadata)

        # Analyze AST and symbols
        file_ast = self.ast_analyzer.analyze_file(file_path)
        if not file_ast:
            return doc_ids, None

        # Store complete AST
        self._store_file_ast(file_ast)

        # Store individual symbols with embeddings
        self._store_symbols(file_ast.symbols)

        # Store cross-references
        self._store_references(file_ast)

        return doc_ids, file_ast

    def _store_file_ast(self, file_ast: FileAST):
        """Store complete file AST information."""
        from datetime import datetime

        # Remove existing AST for this file
        try:
            self.ast_table.delete(f"file_path = '{file_ast.file_path}'")
        except:
            pass

        # Insert new AST
        self.ast_table.insert([{
            "file_path": file_ast.file_path,
            "file_hash": file_ast.file_hash,
            "language": file_ast.language,
            "ast_data": json.dumps(file_ast.to_dict()),
            "last_updated": datetime.now().isoformat(),
        }])

    def _store_symbols(self, symbols: List[Symbol]):
        """Store symbols with vector embeddings."""
        if not symbols:
            return

        # Remove existing symbols for these files
        file_paths = list(set(symbol.file_path for symbol in symbols))
        for file_path in file_paths:
            try:
                self.symbols_table.delete(f"file_path = '{file_path}'")
            except:
                pass

        # Insert new symbols
        symbol_records = []
        for symbol in symbols:
            # Create embedding text for symbol
            embedding_text = create_symbol_embedding_text(symbol)
            embedding = self._generate_embedding(embedding_text)

            # Generate symbol ID
            symbol_id = self._generate_symbol_id(symbol)

            # Prepare metadata
            symbol_metadata = {
                "references": symbol.references,
                "embedding_text": embedding_text,
            }

            symbol_records.append({
                "id": symbol_id,
                "name": symbol.name,
                "type": symbol.type,
                "file_path": symbol.file_path,
                "line_start": symbol.line_start,
                "line_end": symbol.line_end,
                "scope": symbol.scope or "",
                "parent": symbol.parent or "",
                "signature": symbol.signature or "",
                "docstring": symbol.docstring or "",
                "metadata": json.dumps(symbol_metadata),
                "embedding": embedding,
            })

        if symbol_records:
            self.symbols_table.insert(symbol_records)

    def _store_references(self, file_ast: FileAST):
        """Store cross-file references."""
        if not file_ast.dependencies:
            return

        # Remove existing references for this file
        try:
            self.references_table.delete(f"source_file = '{file_ast.file_path}'")
        except:
            pass

        # Insert new references
        reference_records = []
        for i, dependency in enumerate(file_ast.dependencies):
            ref_id = f"{file_ast.file_path}_{dependency}_{i}"
            reference_records.append({
                "id": ref_id,
                "source_file": file_ast.file_path,
                "target_file": dependency,
                "symbol_name": dependency,
                "reference_type": "import",
                "line_number": 0,  # Could be enhanced to track actual line numbers
                "metadata": json.dumps({}),
            })

        if reference_records:
            self.references_table.insert(reference_records)

    def _generate_symbol_id(self, symbol: Symbol) -> str:
        """Generate unique symbol ID."""
        text = f"{symbol.file_path}_{symbol.type}_{symbol.name}_{symbol.line_start}"
        return hashlib.sha256(text.encode()).hexdigest()[:16]

    def search_symbols(
        self,
        query: str,
        symbol_type: Optional[str] = None,
        file_path: Optional[str] = None,
        limit: int = 10,
        score_threshold: float = 0.0,
    ) -> List[SymbolSearchResult]:
        """Search for symbols using vector similarity.

        Args:
            query: Search query
            symbol_type: Filter by symbol type (function, class, variable, etc.)
            file_path: Filter by file path
            limit: Maximum number of results
            score_threshold: Minimum similarity score

        Returns:
            List of symbol search results
        """
        # Generate query embedding
        query_embedding = self._generate_embedding(query)

        # Build search query
        search_query = self.symbols_table.output(["*"]).match_dense(
            "embedding",
            query_embedding,
            "float",
            "ip",  # Inner product
            limit * 2  # Get more results for filtering
        )

        # Apply filters
        if symbol_type:
            search_query = search_query.filter(f"type = '{symbol_type}'")
        if file_path:
            search_query = search_query.filter(f"file_path = '{file_path}'")

        search_results = search_query.to_pl()

        # Convert to SymbolSearchResult objects
        results = []
        for row in search_results.iter_rows(named=True):
            score = row.get("score", 0.0)
            if score >= score_threshold:
                # Parse metadata
                try:
                    metadata = json.loads(row["metadata"])
                except:
                    metadata = {}

                # Create Symbol object
                symbol = Symbol(
                    name=row["name"],
                    type=row["type"],
                    file_path=row["file_path"],
                    line_start=row["line_start"],
                    line_end=row["line_end"],
                    column_start=0,  # Not stored in table
                    column_end=0,  # Not stored in table
                    scope=row["scope"],
                    parent=row["parent"] if row["parent"] else None,
                    docstring=row["docstring"] if row["docstring"] else None,
                    signature=row["signature"] if row["signature"] else None,
                    references=metadata.get("references", []),
                )

                results.append(SymbolSearchResult(
                    symbol=symbol,
                    score=score,
                ))

        return results[:limit]

    def search_ast_nodes(
        self,
        file_path: str,
        node_type: Optional[str] = None,
        node_name: Optional[str] = None,
    ) -> Optional[FileAST]:
        """Search AST nodes within a specific file.

        Args:
            file_path: File to search in
            node_type: Filter by AST node type
            node_name: Filter by node name

        Returns:
            FileAST object if file found, None otherwise
        """
        try:
            results = self.ast_table.output(["*"]).filter(f"file_path = '{file_path}'").to_pl()

            if len(results) == 0:
                return None

            row = next(results.iter_rows(named=True))
            ast_data = json.loads(row["ast_data"])

            # Reconstruct FileAST object
            file_ast = FileAST(
                file_path=ast_data["file_path"],
                file_hash=ast_data["file_hash"],
                language=ast_data["language"],
                symbols=[Symbol(**s) for s in ast_data["symbols"]],
                ast_nodes=[],  # Would need custom deserialization for ASTNode
                imports=ast_data["imports"],
                exports=ast_data["exports"],
                dependencies=ast_data["dependencies"],
            )

            return file_ast

        except Exception as e:
            print(f"Error searching AST nodes: {e}")
            return None

    def get_file_references(self, file_path: str) -> List[Dict[str, Any]]:
        """Get all files that reference the given file.

        Args:
            file_path: File to find references for

        Returns:
            List of reference information
        """
        try:
            results = self.references_table.output(["*"]).filter(f"target_file = '{file_path}'").to_pl()

            references = []
            for row in results.iter_rows(named=True):
                references.append({
                    "source_file": row["source_file"],
                    "symbol_name": row["symbol_name"],
                    "reference_type": row["reference_type"],
                    "line_number": row["line_number"],
                })

            return references

        except Exception as e:
            print(f"Error getting file references: {e}")
            return []

    def search(
        self,
        query: str,
        limit: int = 10,
        score_threshold: float = 0.0,
        filters: Dict[str, Any] = None,
    ) -> List[SearchResult]:
        """Search for similar documents.

        Args:
            query: Search query
            limit: Maximum number of results
            score_threshold: Minimum similarity score
            filters: Metadata filters (not yet implemented)

        Returns:
            List of search results
        """
        # Generate query embedding
        query_embedding = self._generate_embedding(query)

        # Perform vector search
        search_results = self.documents_table.output(["*"]).match_dense(
            "embedding",
            query_embedding,
            "float",
            "ip",  # Inner product (cosine similarity)
            limit
        ).to_pl()

        # Convert to SearchResult objects
        results = []
        for row in search_results.iter_rows(named=True):
            # Parse metadata
            try:
                metadata = json.loads(row["metadata"])
            except:
                metadata = {}

            # Create document
            document = Document(
                id=row["id"],
                content=row["content"],
                metadata=metadata,
                file_path=row["file_path"] if row["file_path"] else None,
                chunk_index=row["chunk_index"],
            )

            # Score is the similarity (higher is better)
            score = row.get("score", 0.0)
            distance = 1.0 - score  # Convert similarity to distance

            if score >= score_threshold:
                results.append(SearchResult(
                    document=document,
                    score=score,
                    distance=distance,
                ))

        return results

    def delete_document(self, doc_id: str) -> bool:
        """Delete a document by ID.

        Args:
            doc_id: Document ID to delete

        Returns:
            True if document was deleted
        """
        try:
            self.documents_table.delete(f"id = '{doc_id}'")
            return True
        except:
            return False

    def delete_file(self, file_path: str) -> int:
        """Delete all documents from a specific file.

        Args:
            file_path: File path to delete documents for

        Returns:
            Number of documents deleted
        """
        try:
            # Get count first
            results = self.documents_table.output(["id"]).filter(f"file_path = '{file_path}'").to_pl()
            count = len(results)

            # Delete all documents for this file
            self.documents_table.delete(f"file_path = '{file_path}'")
            return count
        except:
            return 0

    def list_files(self) -> List[Dict[str, Any]]:
        """List all indexed files.

        Returns:
            List of file information
        """
        try:
            results = self.documents_table.output(["file_path", "metadata"]).to_pl()

            files = {}
            for row in results.iter_rows(named=True):
                file_path = row["file_path"]
                if file_path and file_path not in files:
                    try:
                        metadata = json.loads(row["metadata"])
                        files[file_path] = {
                            "file_path": file_path,
                            "file_name": metadata.get("file_name", Path(file_path).name),
                            "file_size": metadata.get("file_size", 0),
                            "total_chunks": metadata.get("total_chunks", 1),
                        }
                    except:
                        files[file_path] = {
                            "file_path": file_path,
                            "file_name": Path(file_path).name,
                        }

            return list(files.values())
        except:
            return []

    def _chunk_text(self, text: str, chunk_size: int, overlap: int) -> List[str]:
        """Split text into overlapping chunks."""
        if len(text) <= chunk_size:
            return [text]

        chunks = []
        start = 0

        while start < len(text):
            end = start + chunk_size

            # Try to break at word boundary
            if end < len(text):
                # Look back for a good break point
                break_point = end
                for i in range(end - 100, start + 100, -1):
                    if i > 0 and text[i] in '\n\r.!?':
                        break_point = i + 1
                        break
                end = break_point

            chunk = text[start:end].strip()
            if chunk:
                chunks.append(chunk)

            start = max(start + chunk_size - overlap, end)

        return chunks

    def _generate_embedding(self, text: str) -> List[float]:
        """Generate embedding for text.

        For now, this returns a dummy embedding. In a real implementation,
        you would call an embedding API (OpenAI, Cohere, etc.) or use a local model.
        """
        # This is a placeholder - you would implement actual embedding generation here
        # For now, return a random embedding of the correct dimension
        import random
        return [random.random() for _ in range(self.dimension)]

    def close(self):
        """Close the database connection."""
        if hasattr(self, 'infinity'):
            self.infinity.disconnect()
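
For orientation, here is a minimal usage sketch of the InfinityVectorStore API added in this release. Only methods defined in the diff above are used (add_file_with_ast, search, search_symbols, close); the data path, file name, and query strings are invented for illustration. Note that in this version _generate_embedding returns random placeholder vectors, so similarity scores are not meaningful until a real embedding backend is supplied.

from hanzo_mcp.tools.vector.infinity_store import InfinityVectorStore

# Store data under an explicit directory instead of the default config dir (hypothetical path).
store = InfinityVectorStore(data_path="/tmp/hanzo-vector-demo")

# Index a source file: text chunks plus AST symbols and cross-references (hypothetical file).
doc_ids, file_ast = store.add_file_with_ast("example.py")
print(f"indexed {len(doc_ids)} chunks, {len(file_ast.symbols) if file_ast else 0} symbols")

# Document-level similarity search.
for result in store.search("vector database connection", limit=5):
    print(result.score, result.document.file_path)

# Symbol-level search, optionally filtered by symbol type.
for hit in store.search_symbols("initialize tables", symbol_type="function", limit=5):
    print(hit.score, hit.symbol.name, hit.symbol.file_path)

store.close()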
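
The _generate_embedding placeholder above says a real implementation would call an embedding API or a local model. Below is a hedged sketch of one way to do that by subclassing, assuming the openai Python package (v1+) is installed and OPENAI_API_KEY is set; this subclass is illustrative and not part of hanzo-mcp.

from typing import List

from openai import OpenAI

from hanzo_mcp.tools.vector.infinity_store import InfinityVectorStore


class OpenAIEmbeddingStore(InfinityVectorStore):
    """Illustrative variant that computes real OpenAI embeddings (not part of hanzo-mcp)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._client = OpenAI()  # reads OPENAI_API_KEY from the environment

    def _generate_embedding(self, text: str) -> List[float]:
        # text-embedding-3-small returns 1536-dim vectors, matching the default
        # `dimension` used by the documents/symbols tables above.
        response = self._client.embeddings.create(
            model=self.embedding_model,
            input=text,
        )
        return response.data[0].embedding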