hanzo-mcp 0.7.6__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries; it is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hanzo-mcp might be problematic.
- hanzo_mcp/__init__.py +7 -1
- hanzo_mcp/__main__.py +1 -1
- hanzo_mcp/analytics/__init__.py +2 -2
- hanzo_mcp/analytics/posthog_analytics.py +76 -82
- hanzo_mcp/cli.py +31 -36
- hanzo_mcp/cli_enhanced.py +94 -72
- hanzo_mcp/cli_plugin.py +27 -17
- hanzo_mcp/config/__init__.py +2 -2
- hanzo_mcp/config/settings.py +112 -88
- hanzo_mcp/config/tool_config.py +32 -34
- hanzo_mcp/dev_server.py +66 -67
- hanzo_mcp/prompts/__init__.py +94 -12
- hanzo_mcp/prompts/enhanced_prompts.py +809 -0
- hanzo_mcp/prompts/example_custom_prompt.py +6 -5
- hanzo_mcp/prompts/project_todo_reminder.py +0 -1
- hanzo_mcp/prompts/tool_explorer.py +10 -7
- hanzo_mcp/server.py +17 -21
- hanzo_mcp/server_enhanced.py +15 -22
- hanzo_mcp/tools/__init__.py +56 -28
- hanzo_mcp/tools/agent/__init__.py +16 -19
- hanzo_mcp/tools/agent/agent.py +82 -65
- hanzo_mcp/tools/agent/agent_tool.py +152 -122
- hanzo_mcp/tools/agent/agent_tool_v1_deprecated.py +66 -62
- hanzo_mcp/tools/agent/clarification_protocol.py +55 -50
- hanzo_mcp/tools/agent/clarification_tool.py +11 -10
- hanzo_mcp/tools/agent/claude_cli_tool.py +21 -20
- hanzo_mcp/tools/agent/claude_desktop_auth.py +130 -144
- hanzo_mcp/tools/agent/cli_agent_base.py +59 -53
- hanzo_mcp/tools/agent/code_auth.py +102 -107
- hanzo_mcp/tools/agent/code_auth_tool.py +28 -27
- hanzo_mcp/tools/agent/codex_cli_tool.py +20 -19
- hanzo_mcp/tools/agent/critic_tool.py +86 -73
- hanzo_mcp/tools/agent/gemini_cli_tool.py +21 -20
- hanzo_mcp/tools/agent/grok_cli_tool.py +21 -20
- hanzo_mcp/tools/agent/iching_tool.py +404 -139
- hanzo_mcp/tools/agent/network_tool.py +89 -73
- hanzo_mcp/tools/agent/prompt.py +2 -1
- hanzo_mcp/tools/agent/review_tool.py +101 -98
- hanzo_mcp/tools/agent/swarm_alias.py +87 -0
- hanzo_mcp/tools/agent/swarm_tool.py +246 -161
- hanzo_mcp/tools/agent/swarm_tool_v1_deprecated.py +134 -92
- hanzo_mcp/tools/agent/tool_adapter.py +21 -11
- hanzo_mcp/tools/common/__init__.py +1 -1
- hanzo_mcp/tools/common/base.py +3 -5
- hanzo_mcp/tools/common/batch_tool.py +46 -39
- hanzo_mcp/tools/common/config_tool.py +120 -84
- hanzo_mcp/tools/common/context.py +1 -5
- hanzo_mcp/tools/common/context_fix.py +5 -3
- hanzo_mcp/tools/common/critic_tool.py +4 -8
- hanzo_mcp/tools/common/decorators.py +58 -56
- hanzo_mcp/tools/common/enhanced_base.py +29 -32
- hanzo_mcp/tools/common/fastmcp_pagination.py +91 -94
- hanzo_mcp/tools/common/forgiving_edit.py +91 -87
- hanzo_mcp/tools/common/mode.py +15 -17
- hanzo_mcp/tools/common/mode_loader.py +27 -24
- hanzo_mcp/tools/common/paginated_base.py +61 -53
- hanzo_mcp/tools/common/paginated_response.py +72 -79
- hanzo_mcp/tools/common/pagination.py +50 -53
- hanzo_mcp/tools/common/permissions.py +4 -4
- hanzo_mcp/tools/common/personality.py +186 -138
- hanzo_mcp/tools/common/plugin_loader.py +54 -54
- hanzo_mcp/tools/common/stats.py +65 -47
- hanzo_mcp/tools/common/test_helpers.py +31 -0
- hanzo_mcp/tools/common/thinking_tool.py +4 -8
- hanzo_mcp/tools/common/tool_disable.py +17 -12
- hanzo_mcp/tools/common/tool_enable.py +13 -14
- hanzo_mcp/tools/common/tool_list.py +36 -28
- hanzo_mcp/tools/common/truncate.py +23 -23
- hanzo_mcp/tools/config/__init__.py +4 -4
- hanzo_mcp/tools/config/config_tool.py +42 -29
- hanzo_mcp/tools/config/index_config.py +37 -34
- hanzo_mcp/tools/config/mode_tool.py +175 -55
- hanzo_mcp/tools/database/__init__.py +15 -12
- hanzo_mcp/tools/database/database_manager.py +77 -75
- hanzo_mcp/tools/database/graph.py +137 -91
- hanzo_mcp/tools/database/graph_add.py +30 -18
- hanzo_mcp/tools/database/graph_query.py +178 -102
- hanzo_mcp/tools/database/graph_remove.py +33 -28
- hanzo_mcp/tools/database/graph_search.py +97 -75
- hanzo_mcp/tools/database/graph_stats.py +91 -59
- hanzo_mcp/tools/database/sql.py +107 -79
- hanzo_mcp/tools/database/sql_query.py +30 -24
- hanzo_mcp/tools/database/sql_search.py +29 -25
- hanzo_mcp/tools/database/sql_stats.py +47 -35
- hanzo_mcp/tools/editor/neovim_command.py +25 -28
- hanzo_mcp/tools/editor/neovim_edit.py +21 -23
- hanzo_mcp/tools/editor/neovim_session.py +60 -54
- hanzo_mcp/tools/filesystem/__init__.py +31 -30
- hanzo_mcp/tools/filesystem/ast_multi_edit.py +329 -249
- hanzo_mcp/tools/filesystem/ast_tool.py +4 -4
- hanzo_mcp/tools/filesystem/base.py +1 -1
- hanzo_mcp/tools/filesystem/batch_search.py +316 -224
- hanzo_mcp/tools/filesystem/content_replace.py +4 -4
- hanzo_mcp/tools/filesystem/diff.py +71 -59
- hanzo_mcp/tools/filesystem/directory_tree.py +7 -7
- hanzo_mcp/tools/filesystem/directory_tree_paginated.py +49 -37
- hanzo_mcp/tools/filesystem/edit.py +4 -4
- hanzo_mcp/tools/filesystem/find.py +173 -80
- hanzo_mcp/tools/filesystem/find_files.py +73 -52
- hanzo_mcp/tools/filesystem/git_search.py +157 -104
- hanzo_mcp/tools/filesystem/grep.py +8 -8
- hanzo_mcp/tools/filesystem/multi_edit.py +4 -8
- hanzo_mcp/tools/filesystem/read.py +12 -10
- hanzo_mcp/tools/filesystem/rules_tool.py +59 -43
- hanzo_mcp/tools/filesystem/search_tool.py +263 -207
- hanzo_mcp/tools/filesystem/symbols_tool.py +94 -54
- hanzo_mcp/tools/filesystem/tree.py +35 -33
- hanzo_mcp/tools/filesystem/unix_aliases.py +13 -18
- hanzo_mcp/tools/filesystem/watch.py +37 -36
- hanzo_mcp/tools/filesystem/write.py +4 -8
- hanzo_mcp/tools/jupyter/__init__.py +4 -4
- hanzo_mcp/tools/jupyter/base.py +4 -5
- hanzo_mcp/tools/jupyter/jupyter.py +67 -47
- hanzo_mcp/tools/jupyter/notebook_edit.py +4 -4
- hanzo_mcp/tools/jupyter/notebook_read.py +4 -7
- hanzo_mcp/tools/llm/__init__.py +5 -7
- hanzo_mcp/tools/llm/consensus_tool.py +72 -52
- hanzo_mcp/tools/llm/llm_manage.py +101 -60
- hanzo_mcp/tools/llm/llm_tool.py +226 -166
- hanzo_mcp/tools/llm/provider_tools.py +25 -26
- hanzo_mcp/tools/lsp/__init__.py +1 -1
- hanzo_mcp/tools/lsp/lsp_tool.py +228 -143
- hanzo_mcp/tools/mcp/__init__.py +2 -3
- hanzo_mcp/tools/mcp/mcp_add.py +27 -25
- hanzo_mcp/tools/mcp/mcp_remove.py +7 -8
- hanzo_mcp/tools/mcp/mcp_stats.py +23 -22
- hanzo_mcp/tools/mcp/mcp_tool.py +129 -98
- hanzo_mcp/tools/memory/__init__.py +39 -21
- hanzo_mcp/tools/memory/knowledge_tools.py +124 -99
- hanzo_mcp/tools/memory/memory_tools.py +90 -108
- hanzo_mcp/tools/search/__init__.py +7 -2
- hanzo_mcp/tools/search/find_tool.py +297 -212
- hanzo_mcp/tools/search/unified_search.py +366 -314
- hanzo_mcp/tools/shell/__init__.py +8 -7
- hanzo_mcp/tools/shell/auto_background.py +56 -49
- hanzo_mcp/tools/shell/base.py +1 -1
- hanzo_mcp/tools/shell/base_process.py +75 -75
- hanzo_mcp/tools/shell/bash_session.py +2 -2
- hanzo_mcp/tools/shell/bash_session_executor.py +4 -4
- hanzo_mcp/tools/shell/bash_tool.py +24 -31
- hanzo_mcp/tools/shell/command_executor.py +12 -12
- hanzo_mcp/tools/shell/logs.py +43 -33
- hanzo_mcp/tools/shell/npx.py +13 -13
- hanzo_mcp/tools/shell/npx_background.py +24 -21
- hanzo_mcp/tools/shell/npx_tool.py +18 -22
- hanzo_mcp/tools/shell/open.py +19 -21
- hanzo_mcp/tools/shell/pkill.py +31 -26
- hanzo_mcp/tools/shell/process_tool.py +32 -32
- hanzo_mcp/tools/shell/processes.py +57 -58
- hanzo_mcp/tools/shell/run_background.py +24 -25
- hanzo_mcp/tools/shell/run_command.py +5 -5
- hanzo_mcp/tools/shell/run_command_windows.py +5 -5
- hanzo_mcp/tools/shell/session_storage.py +3 -3
- hanzo_mcp/tools/shell/streaming_command.py +141 -126
- hanzo_mcp/tools/shell/uvx.py +24 -25
- hanzo_mcp/tools/shell/uvx_background.py +35 -33
- hanzo_mcp/tools/shell/uvx_tool.py +18 -22
- hanzo_mcp/tools/todo/__init__.py +6 -2
- hanzo_mcp/tools/todo/todo.py +50 -37
- hanzo_mcp/tools/todo/todo_read.py +5 -8
- hanzo_mcp/tools/todo/todo_write.py +5 -7
- hanzo_mcp/tools/vector/__init__.py +40 -28
- hanzo_mcp/tools/vector/ast_analyzer.py +176 -143
- hanzo_mcp/tools/vector/git_ingester.py +170 -179
- hanzo_mcp/tools/vector/index_tool.py +96 -44
- hanzo_mcp/tools/vector/infinity_store.py +283 -228
- hanzo_mcp/tools/vector/mock_infinity.py +39 -40
- hanzo_mcp/tools/vector/project_manager.py +88 -78
- hanzo_mcp/tools/vector/vector.py +59 -42
- hanzo_mcp/tools/vector/vector_index.py +30 -27
- hanzo_mcp/tools/vector/vector_search.py +64 -45
- hanzo_mcp/types.py +6 -4
- {hanzo_mcp-0.7.6.dist-info → hanzo_mcp-0.8.0.dist-info}/METADATA +1 -1
- hanzo_mcp-0.8.0.dist-info/RECORD +185 -0
- hanzo_mcp-0.7.6.dist-info/RECORD +0 -182
- {hanzo_mcp-0.7.6.dist-info → hanzo_mcp-0.8.0.dist-info}/WHEEL +0 -0
- {hanzo_mcp-0.7.6.dist-info → hanzo_mcp-0.8.0.dist-info}/entry_points.txt +0 -0
- {hanzo_mcp-0.7.6.dist-info → hanzo_mcp-0.8.0.dist-info}/top_level.txt +0 -0
hanzo_mcp/tools/vector/infinity_store.py

@@ -2,24 +2,27 @@
 
 import json
 import hashlib
+from typing import Any, Dict, List, Tuple, Optional
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
 from dataclasses import dataclass
 
 try:
     import infinity_embedded
+
     INFINITY_AVAILABLE = True
 except ImportError:
     # Use mock implementation when infinity_embedded is not available
     from . import mock_infinity as infinity_embedded
+
     INFINITY_AVAILABLE = True  # Mock is always available
 
-from .ast_analyzer import
+from .ast_analyzer import Symbol, FileAST, ASTAnalyzer, create_symbol_embedding_text
 
 
 @dataclass
 class Document:
     """Document representation for vector storage."""
+
     id: str
     content: str
     metadata: Dict[str, Any]
@@ -30,6 +33,7 @@ class Document:
 @dataclass
 class SearchResult:
     """Search result from vector database."""
+
     document: Document
     score: float
     distance: float
@@ -38,6 +42,7 @@ class SearchResult:
 @dataclass
 class SymbolSearchResult:
     """Search result for symbols."""
+
     symbol: Symbol
     score: float
     context_document: Optional[Document] = None
@@ -46,6 +51,7 @@ class SymbolSearchResult:
 @dataclass
 class UnifiedSearchResult:
     """Search result combining text, vector, and symbol search."""
+
     type: str  # 'document', 'symbol', 'reference'
     content: str
     file_path: str
@@ -58,7 +64,7 @@ class UnifiedSearchResult:
 
 class InfinityVectorStore:
     """Local vector database using Infinity."""
-
+
     def __init__(
         self,
         data_path: Optional[str] = None,
@@ -66,43 +72,46 @@ class InfinityVectorStore:
         dimension: int = 1536,  # Default for OpenAI text-embedding-3-small
     ):
         """Initialize the Infinity vector store.
-
+
         Args:
             data_path: Path to store vector database (default: ~/.config/hanzo/vector-store)
             embedding_model: Embedding model to use
             dimension: Vector dimension (must match embedding model)
         """
         if not INFINITY_AVAILABLE:
-            raise ImportError(
-
+            raise ImportError(
+                "infinity_embedded is required for vector store functionality"
+            )
+
         # Set up data path
         if data_path:
             self.data_path = Path(data_path)
         else:
             from hanzo_mcp.config.settings import get_config_dir
+
             self.data_path = get_config_dir() / "vector-store"
-
+
         self.data_path.mkdir(parents=True, exist_ok=True)
-
+
         self.embedding_model = embedding_model
         self.dimension = dimension
-
+
         # Initialize AST analyzer
         self.ast_analyzer = ASTAnalyzer()
-
+
         # Connect to Infinity
         self.infinity = infinity_embedded.connect(str(self.data_path))
        self.db = self.infinity.get_database("hanzo_mcp")
-
+
         # Initialize tables
         self._initialize_tables()
-
+
     def _initialize_tables(self):
         """Initialize database tables if they don't exist."""
         # Documents table
         try:
             self.documents_table = self.db.get_table("documents")
-        except:
+        except Exception:
             self.documents_table = self.db.create_table(
                 "documents",
                 {
@@ -112,13 +121,13 @@ class InfinityVectorStore:
                     "chunk_index": {"type": "integer"},
                     "metadata": {"type": "varchar"},  # JSON string
                     "embedding": {"type": f"vector,{self.dimension},float"},
-                }
+                },
             )
-
+
         # Symbols table for code symbols
         try:
             self.symbols_table = self.db.get_table("symbols")
-        except:
+        except Exception:
             self.symbols_table = self.db.create_table(
                 "symbols",
                 {
@@ -134,13 +143,13 @@ class InfinityVectorStore:
                     "docstring": {"type": "varchar"},
                     "metadata": {"type": "varchar"},  # JSON string
                     "embedding": {"type": f"vector,{self.dimension},float"},
-                }
+                },
             )
-
+
         # AST table for storing complete file ASTs
         try:
             self.ast_table = self.db.get_table("ast_files")
-        except:
+        except Exception:
             self.ast_table = self.db.create_table(
                 "ast_files",
                 {
@@ -149,13 +158,13 @@ class InfinityVectorStore:
                     "language": {"type": "varchar"},
                     "ast_data": {"type": "varchar"},  # JSON string of complete AST
                     "last_updated": {"type": "varchar"},  # ISO timestamp
-                }
+                },
             )
-
+
         # References table for cross-file references
         try:
             self.references_table = self.db.get_table("references")
-        except:
+        except Exception:
             self.references_table = self.db.create_table(
                 "references",
                 {
@@ -163,18 +172,22 @@ class InfinityVectorStore:
                     "source_file": {"type": "varchar"},
                     "target_file": {"type": "varchar"},
                     "symbol_name": {"type": "varchar"},
-                    "reference_type": {
+                    "reference_type": {
+                        "type": "varchar"
+                    },  # import, call, inheritance, etc.
                     "line_number": {"type": "integer"},
                     "metadata": {"type": "varchar"},  # JSON string
-                }
+                },
             )
-
-    def _generate_doc_id(
+
+    def _generate_doc_id(
+        self, content: str, file_path: str = "", chunk_index: int = 0
+    ) -> str:
         """Generate a unique document ID."""
         content_hash = hashlib.sha256(content.encode()).hexdigest()[:16]
         path_hash = hashlib.sha256(file_path.encode()).hexdigest()[:8]
         return f"doc_{path_hash}_{chunk_index}_{content_hash}"
-
+
     def add_document(
         self,
         content: str,
@@ -184,39 +197,43 @@ class InfinityVectorStore:
         embedding: Optional[List[float]] = None,
     ) -> str:
         """Add a document to the vector store.
-
+
         Args:
             content: Document content
             metadata: Additional metadata
             file_path: Source file path
             chunk_index: Chunk index if document is part of larger file
             embedding: Pre-computed embedding (if None, will compute)
-
+
         Returns:
             Document ID
         """
         doc_id = self._generate_doc_id(content, file_path or "", chunk_index)
-
+
         # Generate embedding if not provided
         if embedding is None:
             embedding = self._generate_embedding(content)
-
+
         # Prepare metadata
         metadata = metadata or {}
         metadata_json = json.dumps(metadata)
-
+
         # Insert document
-        self.documents_table.insert(
-
-
-
-
-
-
-
-
+        self.documents_table.insert(
+            [
+                {
+                    "id": doc_id,
+                    "content": content,
+                    "file_path": file_path or "",
+                    "chunk_index": chunk_index,
+                    "metadata": metadata_json,
+                    "embedding": embedding,
+                }
+            ]
+        )
+
         return doc_id
-
+
     def add_file(
         self,
         file_path: str,
@@ -225,45 +242,47 @@ class InfinityVectorStore:
         metadata: Dict[str, Any] = None,
     ) -> List[str]:
         """Add a file to the vector store by chunking it.
-
+
         Args:
             file_path: Path to the file to add
             chunk_size: Maximum characters per chunk
             chunk_overlap: Characters to overlap between chunks
             metadata: Additional metadata for all chunks
-
+
         Returns:
             List of document IDs for all chunks
         """
         path = Path(file_path)
         if not path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
-
+
         # Read file content
         try:
-            content = path.read_text(encoding=
+            content = path.read_text(encoding="utf-8")
         except UnicodeDecodeError:
             # Try with different encoding
-            content = path.read_text(encoding=
-
+            content = path.read_text(encoding="latin-1")
+
         # Chunk the content
         chunks = self._chunk_text(content, chunk_size, chunk_overlap)
-
+
         # Add metadata
         file_metadata = metadata or {}
-        file_metadata.update(
-
-
-
-
-
+        file_metadata.update(
+            {
+                "file_name": path.name,
+                "file_extension": path.suffix,
+                "file_size": path.stat().st_size,
+            }
+        )
+
         # Add each chunk
         doc_ids = []
         for i, chunk in enumerate(chunks):
             chunk_metadata = file_metadata.copy()
             chunk_metadata["chunk_number"] = i
             chunk_metadata["total_chunks"] = len(chunks)
-
+
             doc_id = self.add_document(
                 content=chunk,
                 metadata=chunk_metadata,
@@ -271,9 +290,9 @@ class InfinityVectorStore:
                 chunk_index=i,
             )
             doc_ids.append(doc_id)
-
+
         return doc_ids
-
+
     def add_file_with_ast(
         self,
         file_path: str,
@@ -282,138 +301,146 @@ class InfinityVectorStore:
         metadata: Dict[str, Any] = None,
     ) -> Tuple[List[str], Optional[FileAST]]:
         """Add a file with full AST analysis and symbol extraction.
-
+
         Args:
             file_path: Path to the file to add
             chunk_size: Maximum characters per chunk for content
             chunk_overlap: Characters to overlap between chunks
             metadata: Additional metadata for all chunks
-
+
         Returns:
             Tuple of (document IDs for content chunks, FileAST object)
         """
         path = Path(file_path)
         if not path.exists():
             raise FileNotFoundError(f"File not found: {file_path}")
-
+
         # First add file content using existing method
         doc_ids = self.add_file(file_path, chunk_size, chunk_overlap, metadata)
-
+
         # Analyze AST and symbols
         file_ast = self.ast_analyzer.analyze_file(file_path)
         if not file_ast:
             return doc_ids, None
-
+
         # Store complete AST
         self._store_file_ast(file_ast)
-
+
         # Store individual symbols with embeddings
         self._store_symbols(file_ast.symbols)
-
+
         # Store cross-references
         self._store_references(file_ast)
-
+
         return doc_ids, file_ast
-
+
     def _store_file_ast(self, file_ast: FileAST):
         """Store complete file AST information."""
         from datetime import datetime
-
+
         # Remove existing AST for this file
         try:
             self.ast_table.delete(f"file_path = '{file_ast.file_path}'")
-        except:
+        except Exception:
             pass
-
+
         # Insert new AST
-        self.ast_table.insert(
-
-
-
-
-
-
-
+        self.ast_table.insert(
+            [
+                {
+                    "file_path": file_ast.file_path,
+                    "file_hash": file_ast.file_hash,
+                    "language": file_ast.language,
+                    "ast_data": json.dumps(file_ast.to_dict()),
+                    "last_updated": datetime.now().isoformat(),
+                }
+            ]
+        )
+
     def _store_symbols(self, symbols: List[Symbol]):
         """Store symbols with vector embeddings."""
         if not symbols:
             return
-
+
         # Remove existing symbols for these files
         file_paths = list(set(symbol.file_path for symbol in symbols))
         for file_path in file_paths:
             try:
                 self.symbols_table.delete(f"file_path = '{file_path}'")
-            except:
+            except Exception:
                 pass
-
+
         # Insert new symbols
         symbol_records = []
         for symbol in symbols:
             # Create embedding text for symbol
             embedding_text = create_symbol_embedding_text(symbol)
             embedding = self._generate_embedding(embedding_text)
-
+
             # Generate symbol ID
             symbol_id = self._generate_symbol_id(symbol)
-
+
             # Prepare metadata
             symbol_metadata = {
                 "references": symbol.references,
                 "embedding_text": embedding_text,
             }
-
-            symbol_records.append(
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            symbol_records.append(
+                {
+                    "id": symbol_id,
+                    "name": symbol.name,
+                    "type": symbol.type,
+                    "file_path": symbol.file_path,
+                    "line_start": symbol.line_start,
+                    "line_end": symbol.line_end,
+                    "scope": symbol.scope or "",
+                    "parent": symbol.parent or "",
+                    "signature": symbol.signature or "",
+                    "docstring": symbol.docstring or "",
+                    "metadata": json.dumps(symbol_metadata),
+                    "embedding": embedding,
+                }
+            )
+
         if symbol_records:
             self.symbols_table.insert(symbol_records)
-
+
     def _store_references(self, file_ast: FileAST):
         """Store cross-file references."""
         if not file_ast.dependencies:
             return
-
+
         # Remove existing references for this file
         try:
             self.references_table.delete(f"source_file = '{file_ast.file_path}'")
-        except:
+        except Exception:
             pass
-
+
         # Insert new references
         reference_records = []
         for i, dependency in enumerate(file_ast.dependencies):
             ref_id = f"{file_ast.file_path}_{dependency}_{i}"
-            reference_records.append(
-
-
-
-
-
-
-
-
-
+            reference_records.append(
+                {
+                    "id": ref_id,
+                    "source_file": file_ast.file_path,
+                    "target_file": dependency,
+                    "symbol_name": dependency,
+                    "reference_type": "import",
+                    "line_number": 0,  # Could be enhanced to track actual line numbers
+                    "metadata": json.dumps({}),
+                }
+            )
+
         if reference_records:
             self.references_table.insert(reference_records)
-
+
     def _generate_symbol_id(self, symbol: Symbol) -> str:
         """Generate unique symbol ID."""
         text = f"{symbol.file_path}_{symbol.type}_{symbol.name}_{symbol.line_start}"
         return hashlib.sha256(text.encode()).hexdigest()[:16]
-
+
     def search_symbols(
         self,
         query: str,
@@ -423,37 +450,37 @@ class InfinityVectorStore:
         score_threshold: float = 0.0,
     ) -> List[SymbolSearchResult]:
         """Search for symbols using vector similarity.
-
+
         Args:
             query: Search query
             symbol_type: Filter by symbol type (function, class, variable, etc.)
             file_path: Filter by file path
             limit: Maximum number of results
             score_threshold: Minimum similarity score
-
+
         Returns:
             List of symbol search results
         """
         # Generate query embedding
         query_embedding = self._generate_embedding(query)
-
+
         # Build search query
         search_query = self.symbols_table.output(["*"]).match_dense(
-            "embedding",
-            query_embedding,
-            "float",
+            "embedding",
+            query_embedding,
+            "float",
             "ip",  # Inner product
-            limit * 2  # Get more results for filtering
+            limit * 2,  # Get more results for filtering
         )
-
+
         # Apply filters
         if symbol_type:
             search_query = search_query.filter(f"type = '{symbol_type}'")
         if file_path:
             search_query = search_query.filter(f"file_path = '{file_path}'")
-
+
         search_results = search_query.to_pl()
-
+
         # Convert to SymbolSearchResult objects
         results = []
         for row in search_results.iter_rows(named=True):
@@ -462,9 +489,9 @@ class InfinityVectorStore:
             # Parse metadata
             try:
                 metadata = json.loads(row["metadata"])
-            except:
+            except Exception:
                 metadata = {}
-
+
             # Create Symbol object
             symbol = Symbol(
                 name=row["name"],
@@ -473,21 +500,23 @@ class InfinityVectorStore:
                 line_start=row["line_start"],
                 line_end=row["line_end"],
                 column_start=0,  # Not stored in table
-                column_end=0,
+                column_end=0,  # Not stored in table
                 scope=row["scope"],
                 parent=row["parent"] if row["parent"] else None,
                 docstring=row["docstring"] if row["docstring"] else None,
                 signature=row["signature"] if row["signature"] else None,
                 references=metadata.get("references", []),
             )
-
-            results.append(
-
-
-
-
+
+            results.append(
+                SymbolSearchResult(
+                    symbol=symbol,
+                    score=score,
+                )
+            )
+
         return results[:limit]
-
+
     def search_ast_nodes(
         self,
         file_path: str,
@@ -495,24 +524,28 @@ class InfinityVectorStore:
         node_name: Optional[str] = None,
    ) -> Optional[FileAST]:
         """Search AST nodes within a specific file.
-
+
         Args:
             file_path: File to search in
             node_type: Filter by AST node type
             node_name: Filter by node name
-
+
         Returns:
             FileAST object if file found, None otherwise
         """
         try:
-            results =
-
+            results = (
+                self.ast_table.output(["*"])
+                .filter(f"file_path = '{file_path}'")
+                .to_pl()
+            )
+
             if len(results) == 0:
                 return None
-
+
             row = next(results.iter_rows(named=True))
             ast_data = json.loads(row["ast_data"])
-
+
             # Reconstruct FileAST object
             file_ast = FileAST(
                 file_path=ast_data["file_path"],
@@ -524,44 +557,52 @@ class InfinityVectorStore:
                 exports=ast_data["exports"],
                 dependencies=ast_data["dependencies"],
             )
-
+
             return file_ast
-
+
         except Exception as e:
             import logging
+
             logger = logging.getLogger(__name__)
             logger.error(f"Error searching AST nodes: {e}")
             return None
-
+
     def get_file_references(self, file_path: str) -> List[Dict[str, Any]]:
         """Get all files that reference the given file.
-
+
         Args:
             file_path: File to find references for
-
+
         Returns:
             List of reference information
         """
         try:
-            results =
-
+            results = (
+                self.references_table.output(["*"])
+                .filter(f"target_file = '{file_path}'")
+                .to_pl()
+            )
+
             references = []
             for row in results.iter_rows(named=True):
-                references.append(
-
-
-
-
-
-
+                references.append(
+                    {
+                        "source_file": row["source_file"],
+                        "symbol_name": row["symbol_name"],
+                        "reference_type": row["reference_type"],
+                        "line_number": row["line_number"],
+                    }
+                )
+
             return references
-
+
         except Exception as e:
             import logging
+
             logger = logging.getLogger(__name__)
             logger.error(f"Error getting file references: {e}")
             return []
-
+
     def search(
         self,
         query: str,
@@ -570,37 +611,41 @@ class InfinityVectorStore:
         filters: Dict[str, Any] = None,
     ) -> List[SearchResult]:
         """Search for similar documents.
-
+
         Args:
             query: Search query
             limit: Maximum number of results
             score_threshold: Minimum similarity score
             filters: Metadata filters (not yet implemented)
-
+
         Returns:
             List of search results
         """
         # Generate query embedding
         query_embedding = self._generate_embedding(query)
-
+
         # Perform vector search
-        search_results =
-
-
-
-
-
-
-
+        search_results = (
+            self.documents_table.output(["*"])
+            .match_dense(
+                "embedding",
+                query_embedding,
+                "float",
+                "ip",  # Inner product (cosine similarity)
+                limit,
+            )
+            .to_pl()
+        )
+
         # Convert to SearchResult objects
         results = []
         for row in search_results.iter_rows(named=True):
             # Parse metadata
             try:
                 metadata = json.loads(row["metadata"])
-            except:
+            except Exception:
                 metadata = {}
-
+
             # Create document
             document = Document(
                 id=row["id"],
@@ -609,64 +654,70 @@ class InfinityVectorStore:
                 file_path=row["file_path"] if row["file_path"] else None,
                 chunk_index=row["chunk_index"],
             )
-
+
             # Score is the similarity (higher is better)
             score = row.get("score", 0.0)
             distance = 1.0 - score  # Convert similarity to distance
-
+
             if score >= score_threshold:
-                results.append(
-
-
-
-
-
+                results.append(
+                    SearchResult(
+                        document=document,
+                        score=score,
+                        distance=distance,
+                    )
+                )
+
         return results
-
+
     def delete_document(self, doc_id: str) -> bool:
         """Delete a document by ID.
-
+
         Args:
             doc_id: Document ID to delete
-
+
         Returns:
             True if document was deleted
         """
         try:
             self.documents_table.delete(f"id = '{doc_id}'")
             return True
-        except:
+        except Exception:
             return False
-
+
     def delete_file(self, file_path: str) -> int:
         """Delete all documents from a specific file.
-
+
         Args:
             file_path: File path to delete documents for
-
+
         Returns:
             Number of documents deleted
         """
         try:
             # Get count first
-            results =
+            results = (
+                self.documents_table.output(["id"])
+                .filter(f"file_path = '{file_path}'")
+                .to_pl()
+            )
             count = len(results)
-
+
             # Delete all documents for this file
             self.documents_table.delete(f"file_path = '{file_path}'")
             return count
-        except:
+        except Exception:
             return 0
-
+
     def list_files(self) -> List[Dict[str, Any]]:
         """List all indexed files.
-
+
         Returns:
             List of file information
         """
         try:
             results = self.documents_table.output(["file_path", "metadata"]).to_pl()
-
+
             files = {}
             for row in results.iter_rows(named=True):
                 file_path = row["file_path"]
@@ -675,63 +726,66 @@ class InfinityVectorStore:
                     metadata = json.loads(row["metadata"])
                     files[file_path] = {
                         "file_path": file_path,
-                        "file_name": metadata.get(
+                        "file_name": metadata.get(
+                            "file_name", Path(file_path).name
+                        ),
                         "file_size": metadata.get("file_size", 0),
                         "total_chunks": metadata.get("total_chunks", 1),
                     }
-                except:
+                except Exception:
                     files[file_path] = {
                         "file_path": file_path,
                         "file_name": Path(file_path).name,
                     }
-
+
             return list(files.values())
-        except:
+        except Exception:
             return []
-
+
     def _chunk_text(self, text: str, chunk_size: int, overlap: int) -> List[str]:
         """Split text into overlapping chunks."""
         if len(text) <= chunk_size:
             return [text]
-
+
         chunks = []
         start = 0
-
+
         while start < len(text):
             end = start + chunk_size
-
+
             # Try to break at word boundary
             if end < len(text):
                 # Look back for a good break point
                 break_point = end
                 for i in range(end - 100, start + 100, -1):
-                    if i > 0 and text[i] in
+                    if i > 0 and text[i] in "\n\r.!?":
                         break_point = i + 1
                         break
                 end = break_point
-
+
             chunk = text[start:end].strip()
             if chunk:
                 chunks.append(chunk)
-
+
             start = max(start + chunk_size - overlap, end)
-
+
         return chunks
-
+
     def _generate_embedding(self, text: str) -> List[float]:
         """Generate embedding for text.
-
+
         For now, this returns a dummy embedding. In a real implementation,
         you would call an embedding API (OpenAI, Cohere, etc.) or use a local model.
         """
         # This is a placeholder - you would implement actual embedding generation here
         # For now, return a random embedding of the correct dimension
         import random
+
         return [random.random() for _ in range(self.dimension)]
-
+
     async def get_stats(self) -> Dict[str, Any]:
         """Get statistics about the vector store.
-
+
         Returns:
             Dictionary with statistics
         """
@@ -739,30 +793,30 @@ class InfinityVectorStore:
             # Get document count
             doc_count_result = self.documents_table.output(["count(*)"]).to_pl()
             doc_count = doc_count_result.item(0, 0) if len(doc_count_result) > 0 else 0
-
+
             # Get unique file count
             file_result = self.documents_table.output(["file_path"]).to_pl()
             unique_files = set()
             for row in file_result.iter_rows():
                 if row[0]:
                     unique_files.add(row[0])
-
+
             # Get symbol count
             symbol_count = 0
             try:
                 symbol_result = self.symbols_table.output(["count(*)"]).to_pl()
                 symbol_count = symbol_result.item(0, 0) if len(symbol_result) > 0 else 0
-            except:
+            except Exception:
                 pass
-
+
             # Get AST count
             ast_count = 0
             try:
                 ast_result = self.ast_table.output(["count(*)"]).to_pl()
                 ast_count = ast_result.item(0, 0) if len(ast_result) > 0 else 0
-            except:
+            except Exception:
                 pass
-
+
             return {
                 "document_count": doc_count,
                 "vector_count": doc_count,  # Each document has a vector
@@ -779,57 +833,58 @@ class InfinityVectorStore:
                 "document_count": 0,
                 "vector_count": 0,
             }
-
+
     async def clear(self) -> bool:
         """Clear all data from the vector store.
-
+
         Returns:
             True if successful
         """
         try:
             # Delete all records from all tables
             self.documents_table.delete()
-
+
             try:
                 self.symbols_table.delete()
-            except:
+            except Exception:
                 pass
-
+
             try:
                 self.ast_table.delete()
-            except:
+            except Exception:
                 pass
-
+
             try:
                 self.references_table.delete()
-            except:
+            except Exception:
                 pass
-
+
             return True
         except Exception as e:
             import logging
+
             logger = logging.getLogger(__name__)
             logger.error(f"Error clearing vector store: {e}")
             return False
-
+
     async def index_document(
         self,
         content: str,
         metadata: Dict[str, Any] = None,
     ) -> str:
         """Async version of add_document for consistency.
-
+
         Args:
             content: Document content
             metadata: Additional metadata
-
+
         Returns:
             Document ID
         """
         file_path = metadata.get("path") if metadata else None
         return self.add_document(content, metadata, file_path)
-
+
     def close(self):
         """Close the database connection."""
-        if hasattr(self,
-        self.infinity.disconnect()
+        if hasattr(self, "infinity"):
+            self.infinity.disconnect()