vox-code 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vox_code-2.0.0.dist-info/METADATA +258 -0
- vox_code-2.0.0.dist-info/RECORD +88 -0
- vox_code-2.0.0.dist-info/WHEEL +4 -0
- vox_code-2.0.0.dist-info/entry_points.txt +3 -0
- voxcli/__init__.py +3 -0
- voxcli/__main__.py +5 -0
- voxcli/agent/__init__.py +12 -0
- voxcli/agent/agent.py +449 -0
- voxcli/agent/agent_budget.py +133 -0
- voxcli/agent/agent_orchestrator.py +414 -0
- voxcli/agent/plan_execute_agent.py +514 -0
- voxcli/agent/roles.py +80 -0
- voxcli/agent/sub_agent.py +351 -0
- voxcli/catalog.py +477 -0
- voxcli/chat.py +91 -0
- voxcli/cli/__init__.py +4 -0
- voxcli/cli/main.py +452 -0
- voxcli/cli/parser.py +71 -0
- voxcli/config.py +518 -0
- voxcli/gui/__main__.py +3 -0
- voxcli/gui/main.py +22 -0
- voxcli/gui/pet/__init__.py +5 -0
- voxcli/gui/pet/base.py +62 -0
- voxcli/gui/pet/coordinator.py +888 -0
- voxcli/gui/pet/data.py +430 -0
- voxcli/gui/pet/widgets.py +683 -0
- voxcli/gui/pet/windows.py +2298 -0
- voxcli/gui/pet/workers.py +54 -0
- voxcli/gui/pet_app.py +7 -0
- voxcli/hitl/__init__.py +11 -0
- voxcli/hitl/handler.py +11 -0
- voxcli/hitl/policy.py +32 -0
- voxcli/hitl/request.py +13 -0
- voxcli/hitl/result.py +11 -0
- voxcli/hitl/terminal_handler.py +64 -0
- voxcli/hitl/tool_registry.py +64 -0
- voxcli/llm/base.py +93 -0
- voxcli/llm/factory.py +178 -0
- voxcli/llm/ollama_client.py +137 -0
- voxcli/llm/openai_compatible.py +249 -0
- voxcli/memory/base.py +16 -0
- voxcli/memory/budget.py +53 -0
- voxcli/memory/compressor.py +198 -0
- voxcli/memory/entry.py +36 -0
- voxcli/memory/long_term.py +126 -0
- voxcli/memory/manager.py +101 -0
- voxcli/memory/retriever.py +72 -0
- voxcli/memory/short_term.py +84 -0
- voxcli/memory/tokenizer.py +21 -0
- voxcli/plan/__init__.py +5 -0
- voxcli/plan/execution_plan.py +225 -0
- voxcli/plan/planner.py +198 -0
- voxcli/plan/task.py +123 -0
- voxcli/policy/audit_log.py +111 -0
- voxcli/policy/command_guard.py +34 -0
- voxcli/policy/exception.py +5 -0
- voxcli/policy/path_guard.py +32 -0
- voxcli/prompting/__init__.py +7 -0
- voxcli/prompting/presenter.py +154 -0
- voxcli/rag/__init__.py +16 -0
- voxcli/rag/analyzer.py +89 -0
- voxcli/rag/chunk.py +17 -0
- voxcli/rag/chunker.py +137 -0
- voxcli/rag/embedding.py +75 -0
- voxcli/rag/formatter.py +40 -0
- voxcli/rag/index.py +96 -0
- voxcli/rag/relation.py +14 -0
- voxcli/rag/retriever.py +58 -0
- voxcli/rag/store.py +155 -0
- voxcli/rag/tokenizer.py +26 -0
- voxcli/runtime/__init__.py +6 -0
- voxcli/runtime/session_controller.py +386 -0
- voxcli/tool/__init__.py +3 -0
- voxcli/tool/tool_registry.py +433 -0
- voxcli/util/animation.py +219 -0
- voxcli/util/ansi.py +82 -0
- voxcli/util/markdown.py +98 -0
- voxcli/web/__init__.py +17 -0
- voxcli/web/base.py +20 -0
- voxcli/web/extractor.py +77 -0
- voxcli/web/factory.py +38 -0
- voxcli/web/fetch_result.py +27 -0
- voxcli/web/fetcher.py +42 -0
- voxcli/web/network_policy.py +49 -0
- voxcli/web/result.py +23 -0
- voxcli/web/searxng.py +55 -0
- voxcli/web/serpapi.py +53 -0
- voxcli/web/zhipu.py +55 -0
voxcli/rag/analyzer.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""代码分析器 - 分析代码结构和关系"""
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Dict, List, Optional, Set
|
|
8
|
+
|
|
9
|
+
from .relation import CodeRelation
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class CodeAnalyzer:
    """Extract file-level import relations from Python and Java sources.

    Only imports that can be mapped to a file on disk yield a relation;
    anything that cannot be located (standard library, third-party packages,
    unknown names) is ignored.
    """

    # ``import a.b.C;`` / ``import static a.b.C.member;`` at the start of a
    # line. Group 1 is set for static imports, group 2 is the dotted name.
    # Wildcard imports never match because ``*`` is outside ``[\w.]``.
    _JAVA_IMPORT_RE = re.compile(r"^import\s+(static\s+)?([\w.]+);", re.MULTILINE)

    def __init__(self, project_path: str):
        """Remember the project root used to resolve absolute imports."""
        self._project_path = Path(project_path)

    def analyze_file(self, file_path: str) -> "List[CodeRelation]":
        """Return the import relations found in ``file_path``.

        Returns an empty list when the file does not exist, cannot be read as
        UTF-8, or has an extension other than ``.py`` / ``.java``.
        """
        path = Path(file_path)
        if not path.exists():
            return []
        try:
            content = path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            logger.warning("Failed to read %s: %s", file_path, e)
            return []

        ext = path.suffix.lower()
        if ext == ".py":
            return self._analyze_python(file_path, content)
        if ext == ".java":
            return self._analyze_java(file_path, content)
        return []

    def _analyze_python(self, file_path: str, content: str) -> "List[CodeRelation]":
        """Collect relations for every ``import`` / ``from ... import`` statement.

        Source that cannot be parsed yields no relations.
        """
        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError) as e:
            # ValueError: some Python versions reject NUL bytes this way.
            logger.debug("Skipping import analysis for %s: %s", file_path, e)
            return []

        relations = []
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                base = None
                names = [alias.name for alias in node.names]
            elif isinstance(node, ast.ImportFrom):
                base = self._relative_base(file_path, node.level)
                if node.module:
                    names = [f"{node.module}.{alias.name}" for alias in node.names]
                elif base is not None:
                    # ``from . import x``: names live directly in the package.
                    names = [alias.name for alias in node.names]
                else:
                    continue
            else:
                continue

            for name in names:
                target = self._resolve_module(name, base)
                if target:
                    relations.append(CodeRelation(
                        source_file=file_path,
                        target_file=target,
                        relation_type="import",
                    ))
        return relations

    def _analyze_java(self, file_path: str, content: str) -> "List[CodeRelation]":
        """Collect relations for every single-type or static Java import."""
        relations = []
        for match in self._JAVA_IMPORT_RE.finditer(content):
            import_path = match.group(2)
            if match.group(1):
                # A static import names a member; the file belongs to the
                # enclosing type, i.e. everything before the last dot.
                import_path = import_path.rpartition(".")[0]
            target = self._resolve_java_class(import_path)
            if target:
                relations.append(CodeRelation(
                    source_file=file_path,
                    target_file=str(target),
                    relation_type="import",
                ))
        return relations

    @staticmethod
    def _relative_base(file_path: str, level: Optional[int]) -> Optional[Path]:
        """Return the directory a relative import of ``level`` dots starts from.

        Returns ``None`` for absolute imports (``level`` of 0 or ``None``).
        """
        if not level:
            return None
        base = Path(file_path).parent
        for _ in range(level - 1):
            base = base.parent
        return base

    def _resolve_module(self, module_name: str,
                        base: Optional[Path] = None) -> Optional[str]:
        """Map a dotted module (or ``module.symbol``) name to a file path.

        Candidates are tried in this order, first existing file wins:

        1. ``a/b/c.py``           - the name is a module
        2. ``a/b/c/__init__.py``  - the name is a package
        3. ``a/b.py``             - the last component is a symbol of a module
        4. ``a/b/__init__.py``    - the last component is a symbol of a package

        Steps 3 and 4 are skipped for a single-component absolute name, so an
        unresolvable ``import os`` is not attributed to ``<project>/__init__.py``.

        Args:
            module_name: Dotted name, e.g. ``pkg.mod`` or ``pkg.mod.func``.
            base: Directory to resolve from (used for relative imports);
                defaults to the project root.

        Returns:
            The path of the resolved file as a string, or ``None``.
        """
        parts = [part for part in module_name.split(".") if part]
        if not parts:
            return None

        root = self._project_path if base is None else base
        module_path = root.joinpath(*parts)
        parent = module_path.parent

        candidates = [
            parent / f"{module_path.name}.py",
            module_path / "__init__.py",
        ]
        if len(parts) > 1:
            candidates.append(parent.parent / f"{parent.name}.py")
        if len(parts) > 1 or base is not None:
            candidates.append(parent / "__init__.py")

        for candidate in candidates:
            if candidate.is_file():
                return str(candidate)
        return None

    def _resolve_java_class(self, class_name: str) -> Optional[Path]:
        """Find ``a/b/C.java`` for ``a.b.C`` anywhere below the project root.

        Returns the first match produced by the recursive search, or ``None``.
        """
        # Stray leading/trailing dots would produce an absolute or empty glob
        # pattern, which pathlib rejects.
        class_name = class_name.strip(".")
        if not class_name:
            return None
        rel_path = class_name.replace(".", "/") + ".java"
        return next(iter(self._project_path.rglob(rel_path)), None)
|
voxcli/rag/chunk.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""代码块数据模型"""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class CodeChunk:
    """A retrievable slice of a source file, optionally with its embedding."""

    # Identifier of the chunk; the chunker builds it as "<file>:<start>-<end>".
    id: str
    # Path of the file the chunk was cut from.
    file_path: str
    # Raw text of the chunk.
    content: str
    # Language label (e.g. "python", "java", "text").
    language: str
    # First line of the chunk in the file, 1-based.
    start_line: int
    # Last line of the chunk in the file, inclusive.
    end_line: int
    # One of: code, comment, import, class, function.
    chunk_type: str = "code"
    # Free-form extra attributes; each instance gets its own dict.
    metadata: dict = field(default_factory=dict)
    # Embedding vector, or None until one has been computed.
    embedding: Optional[List[float]] = None
|
voxcli/rag/chunker.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""代码分块器 - 将源代码文件分割为可检索的块"""
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import List, Optional
|
|
8
|
+
|
|
9
|
+
from .chunk import CodeChunk
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# Defaults for the line-window fallback chunker: window height and the number
# of lines shared between two consecutive windows.
_LINE_CHUNK_SIZE = 50
_LINE_OVERLAP = 10

# File extension (lower-case, with leading dot) -> language label.
_LANGUAGE_MAP = {
    # General-purpose languages
    ".py": "python",
    ".java": "java",
    ".js": "javascript",
    ".ts": "typescript",
    ".tsx": "typescriptreact",
    ".jsx": "javascriptreact",
    ".go": "go",
    ".rs": "rust",
    ".rb": "ruby",
    ".php": "php",
    ".c": "c",
    ".cpp": "cpp",
    ".h": "c",
    ".hpp": "cpp",
    ".swift": "swift",
    ".kt": "kotlin",
    ".scala": "scala",
    # Documents, data and configuration
    ".md": "markdown",
    ".json": "json",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".xml": "xml",
    ".toml": "toml",
    ".sql": "sql",
    # Shell scripts and plain text
    ".sh": "bash",
    ".zsh": "bash",
    ".bash": "bash",
    ".txt": "text",
}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class CodeChunker:
    """Split source files into chunks suitable for indexing.

    Python files are split per function/class using the AST, Java files per
    type/method using a regular expression plus brace matching. Every other
    file, and any file for which the structural pass produced nothing, is
    split into overlapping fixed-size line windows.
    """

    # Alternative 1 (group 1): a type declaration; its body brace follows
    # somewhere after the match. Alternative 2 (group 2): a method whose
    # opening brace is the LAST character of the match.
    _JAVA_DECL_RE = re.compile(
        r'(?:public|private|protected)\s+(?:static\s+)?(?:class|interface|enum)\s+(\w+)|'
        r'(?:public|private|protected)\s+(?:static\s+)?\w+\s+(\w+)\s*\([^)]*\)\s*(?:throws\s+\w+)?\s*\{'
    )

    def __init__(self, line_chunk_size: int = _LINE_CHUNK_SIZE,
                 line_overlap: int = _LINE_OVERLAP):
        """Store the window size and overlap used by the line-based fallback."""
        self._chunk_size = line_chunk_size
        self._overlap = line_overlap

    def chunk_file(self, file_path: str) -> "List[CodeChunk]":
        """Read ``file_path`` and return its chunks.

        Returns an empty list when the file is missing, cannot be decoded as
        UTF-8, or contains only whitespace.
        """
        path = Path(file_path)
        if not path.exists():
            return []
        try:
            content = path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            logger.warning("Failed to read %s: %s", file_path, e)
            return []

        # A blank file would otherwise become one chunk with empty content.
        if not content.strip():
            return []

        language = _LANGUAGE_MAP.get(path.suffix.lower(), "text")
        if language == "python":
            chunks = self._chunk_python(file_path, content)
        elif language == "java":
            chunks = self._chunk_java(file_path, content)
        else:
            chunks = []

        return chunks or self._chunk_by_lines(file_path, content, language)

    def _chunk_python(self, file_path: str, content: str) -> "List[CodeChunk]":
        """Return one chunk per function, async function and class.

        Nested definitions are reported in addition to their enclosing
        definition. Unparsable source yields an empty list so the caller can
        fall back to line windows.
        """
        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError) as e:
            # ValueError: some Python versions reject NUL bytes this way.
            logger.debug("Cannot parse %s as Python: %s", file_path, e)
            return []

        lines = content.split("\n")  # split once, not once per node
        chunks = []
        for node in ast.walk(tree):
            if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
                continue
            start = node.lineno or 1
            end = getattr(node, "end_lineno", None) or start
            chunk_type = "class" if isinstance(node, ast.ClassDef) else "function"
            chunks.append(self._make_chunk(file_path, lines, start, end, "python", chunk_type))
        return chunks

    def _chunk_java(self, file_path: str, content: str) -> "List[CodeChunk]":
        """Return one chunk per access-modified type and method declaration."""
        lines = content.split("\n")
        chunks = []
        for match in self._JAVA_DECL_RE.finditer(content):
            is_type = bool(match.group(1))
            if is_type:
                brace_pos = content.find("{", match.end())
                if brace_pos == -1:
                    continue
            else:
                # The method alternative already consumed its own '{';
                # searching after the match would pair this method with the
                # next brace in the file.
                brace_pos = match.end() - 1

            start_line = content[:match.start()].count("\n") + 1
            end_line = self._find_matching_brace(content, brace_pos, lines)
            chunk_type = "class" if is_type else "function"
            chunks.append(self._make_chunk(file_path, lines, start_line, end_line, "java", chunk_type))
        return chunks

    def _chunk_by_lines(self, file_path: str, content: str, language: str) -> "List[CodeChunk]":
        """Split ``content`` into overlapping windows of ``line_chunk_size`` lines."""
        lines = content.split("\n")
        # An overlap >= the window size would make the stride zero or
        # negative; always advance by at least one line.
        step = max(1, self._chunk_size - self._overlap)
        chunks = []
        for i in range(0, len(lines), step):
            window = lines[i:i + self._chunk_size]
            if not window:
                break
            start = i + 1
            end = i + len(window)
            chunks.append(self._make_chunk(file_path, lines, start, end, language, "code"))
            if end >= len(lines):
                # Later windows would lie entirely inside this one.
                break
        return chunks

    @staticmethod
    def _make_chunk(file_path: str, lines: List[str], start: int, end: int,
                    language: str, chunk_type: str) -> "CodeChunk":
        """Build the chunk covering 1-based inclusive lines ``start``..``end``."""
        return CodeChunk(
            id=f"{file_path}:{start}-{end}",
            file_path=file_path,
            content="\n".join(lines[start - 1:end]),
            language=language,
            start_line=start,
            end_line=end,
            chunk_type=chunk_type,
        )

    @staticmethod
    def _find_matching_brace(content: str, open_pos: int, lines: List[str]) -> int:
        """Return the 1-based line of the brace closing the one at ``open_pos``.

        When the brace is never closed, the last line of ``content`` is
        returned. ``lines`` is accepted for interface compatibility and is
        not used.

        NOTE(review): braces inside string literals and comments are counted
        like any other brace.
        """
        depth = 1
        pos = open_pos + 1
        while pos < len(content) and depth > 0:
            char = content[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            pos += 1
        return content[:pos].count("\n") + 1
|
voxcli/rag/embedding.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""嵌入客户端 - 使用 Ollama 或 OpenAI 兼容 API 生成文本嵌入"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from ..config import pai_config
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# Fallback embedding model and server address used when the caller (or the
# configuration) does not supply them.
_DEFAULT_MODEL = "nomic-embed-text"
_DEFAULT_BASE_URL = "http://localhost:11434"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class EmbeddingClient:
    """Fetch text embeddings from an Ollama or OpenAI-compatible HTTP API."""

    # Seconds to wait for a single embedding request.
    _TIMEOUT = 30

    def __init__(self, model: str = _DEFAULT_MODEL, base_url: str = _DEFAULT_BASE_URL,
                 api_key: str = ""):
        """Store the model name, server base URL and optional API key."""
        self._model = model
        self._base_url = base_url
        self._api_key = api_key

    @classmethod
    def from_env(cls) -> "EmbeddingClient":
        """Build a client from the configured providers.

        The "ollama" provider is preferred, then "glm"; when neither is
        configured the built-in defaults are used. Missing or empty provider
        values fall back to the defaults as well.
        """
        provider = pai_config.get_provider("ollama")
        if provider:
            return cls(
                base_url=provider.get("base_url") or _DEFAULT_BASE_URL,
                model=provider.get("model") or _DEFAULT_MODEL,
            )
        provider = pai_config.get_provider("glm")
        if provider:
            return cls(
                base_url=provider.get("base_url") or "https://open.bigmodel.cn/api/paas/v4",
                model=provider.get("model") or "embedding-2",
                api_key=provider.get("api_key") or "",
            )
        return cls()

    def embed(self, text: str) -> List[float]:
        """Return the embedding of ``text``.

        The Ollama endpoint is used when the base URL starts with
        ``http://localhost`` or contains ``ollama``; otherwise the
        OpenAI-compatible endpoint is used.

        Raises:
            httpx.HTTPError: on transport failures and non-2xx responses.
        """
        if self._base_url.startswith("http://localhost") or "ollama" in self._base_url:
            return self._embed_ollama(text)
        return self._embed_openai(text)

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in turn, one request per text."""
        return [self.embed(t) for t in texts]

    @property
    def dimension(self) -> int:
        """Return a fixed dimension of 768.

        NOTE(review): the value is hard-coded and not derived from the
        configured model - confirm it matches the model in use.
        """
        return 768

    def _url(self, path: str) -> str:
        """Join ``path`` onto the base URL without doubling the slash."""
        return f"{self._base_url.rstrip('/')}{path}"

    def _embed_ollama(self, text: str) -> List[float]:
        """Call ``/api/embeddings``; return ``[]`` when no vector is present."""
        response = httpx.post(
            self._url("/api/embeddings"),
            json={"model": self._model, "prompt": text},
            timeout=self._TIMEOUT,
        )
        response.raise_for_status()
        data = response.json()
        return data.get("embedding") or []

    def _embed_openai(self, text: str) -> List[float]:
        """Call ``/embeddings``; return ``[]`` when no vector is present."""
        headers = {"Content-Type": "application/json"}
        if self._api_key:
            # Without a key, send no Authorization header at all rather than
            # a malformed "Bearer " value.
            headers["Authorization"] = f"Bearer {self._api_key}"
        response = httpx.post(
            self._url("/embeddings"),
            headers=headers,
            json={"model": self._model, "input": text},
            timeout=self._TIMEOUT,
        )
        response.raise_for_status()
        data = response.json()
        # "data" may be missing, null or an empty list.
        items = data.get("data") or [{}]
        return items[0].get("embedding") or []
|
voxcli/rag/formatter.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""搜索结果格式化器"""
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from .store import SearchResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SearchResultFormatter:
    """Render search hits as text, either for tool output or an LLM prompt."""

    @staticmethod
    def format_for_tool(query: str, results: "List[SearchResult]") -> str:
        """Return a numbered listing of ``results`` for display.

        Each hit shows its type, location, language, score and a code
        snippet capped at 500 characters. An empty ``results`` produces a
        "nothing found" message instead.
        """
        out = [f"🔍 代码检索: {query}\n"]
        if results:
            for index, hit in enumerate(results, 1):
                piece = hit.chunk
                snippet = piece.content.strip()
                if len(snippet) > 500:
                    snippet = snippet[:500] + "..."
                out.extend((
                    f"{index}. [{piece.chunk_type}] {piece.file_path}:{piece.start_line}-{piece.end_line}",
                    f" 语言: {piece.language} | 相关度: {hit.score:.3f}",
                    f" ```{piece.language}",
                    snippet,
                    " ```",
                    "",
                ))
        else:
            out.append("未找到相关结果。")
        return "\n".join(out).strip()

    @staticmethod
    def format_for_llm(query: str, results: "List[SearchResult]") -> str:
        """Return ``results`` as prompt context, or ``""`` when there are none.

        Hits are emitted with their full, untruncated content and separated
        by ``---`` lines.
        """
        sections = [
            f"File: {hit.chunk.file_path}:{hit.chunk.start_line}-{hit.chunk.end_line}\n"
            f"Type: {hit.chunk.chunk_type}\n"
            f"```{hit.chunk.language}\n{hit.chunk.content}\n```"
            for hit in results
        ]
        if not sections:
            return ""
        return f"Related code for query '{query}':\n\n" + "\n---\n".join(sections)
|
voxcli/rag/index.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""代码索引器 - 将项目文件索引到向量存储"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import List, Optional, Set
|
|
8
|
+
|
|
9
|
+
from .chunk import CodeChunk
|
|
10
|
+
from .chunker import CodeChunker
|
|
11
|
+
from .embedding import EmbeddingClient
|
|
12
|
+
from .store import VectorStore
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# Path components that exclude a file from indexing when they appear anywhere
# in its path relative to the project root.
_IGNORE_DIRS = {
    # Version control and IDE state
    ".git", ".svn", ".idea", ".vscode",
    # Build tooling and build output
    ".mvn", ".gradle", "target", "build", "dist",
    # Dependency trees, caches and virtual environments
    "__pycache__", "node_modules", "site-packages",
    ".venv", "venv", "env", ".egg-info", ".tox", ".nox",
}

# Suffixes that are never indexed.
_IGNORE_EXTENSIONS = {
    # Compiled code and binaries
    ".pyc", ".pyo", ".so", ".o", ".class", ".exe", ".dll", ".dylib", ".bin",
    # Archives
    ".jar", ".war", ".zip", ".tar", ".gz", ".7z", ".rar",
    # Images
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
    # Audio and video
    ".mp4", ".mp3", ".wav", ".avi", ".mov",
    # Office documents
    ".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx",
    # Fonts
    ".ttf", ".woff", ".woff2",
    # Housekeeping files
    ".DS_Store", ".gitkeep", ".gitignore",
}

# Suffixes that are candidates for indexing.
_INCLUDE_EXTENSIONS = {
    # Source code
    ".py", ".java", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs", ".rb",
    ".php", ".c", ".cpp", ".h", ".hpp", ".swift", ".kt", ".scala",
    # Scripts and queries
    ".sql", ".sh", ".bash", ".zsh",
    # Documents, data and configuration
    ".md", ".txt", ".json", ".yaml", ".yml", ".xml", ".toml",
    ".cfg", ".conf", ".ini", ".properties",
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class CodeIndex:
    """Chunk, embed and store the indexable files of a project."""

    def __init__(self, project_path: str,
                 vector_store: "Optional[VectorStore]" = None,
                 chunker: "Optional[CodeChunker]" = None,
                 embedding_client: "Optional[EmbeddingClient]" = None):
        """Wire up the collaborators, creating defaults for those not given.

        The default embedding client comes from ``EmbeddingClient.from_env()``
        - the same default ``CodeRetriever`` uses - so that stored vectors and
        query vectors are produced by the same model.
        """
        self._project_path = Path(project_path)
        self._store = vector_store or VectorStore()
        self._chunker = chunker or CodeChunker()
        self._embedding = embedding_client or EmbeddingClient.from_env()

    def index_project(self, max_workers: int = 4) -> int:
        """Index every discoverable file and return the number of chunks stored.

        Chunking and embedding run on a thread pool. All store access happens
        on the calling thread: the default store holds a ``sqlite3``
        connection, which by default may only be used from the thread that
        created it.
        """
        files = self._discover_files()
        if not files:
            logger.info("No indexable files found in %s", self._project_path)
            return 0

        total_chunks = 0
        with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:
            prepared = executor.map(self._prepare_chunks, files)
            for file_path, chunks in zip(files, prepared):
                if chunks is None:
                    continue  # already logged; keep existing entries
                try:
                    total_chunks += self._replace_chunks(file_path, chunks)
                except Exception as e:
                    logger.warning("Indexing file failed: %s", e)

        logger.info("Indexed %d chunks from %d files", total_chunks, len(files))
        return total_chunks

    def _discover_files(self) -> "List[Path]":
        """Return the indexable files below the project root, sorted by path.

        A single traversal is made. Hidden entries and ignored directories
        are pruned rather than walked, and symlinked directories are not
        followed.
        """
        found = []
        pending = [self._project_path]
        while pending:
            directory = pending.pop()
            try:
                entries = list(directory.iterdir())
            except OSError:
                continue  # missing or unreadable directory
            for entry in entries:
                name = entry.name
                if name.startswith(".") or name in _IGNORE_DIRS:
                    continue
                if entry.is_dir():
                    if not entry.is_symlink():
                        pending.append(entry)
                    continue
                suffix = entry.suffix.lower()
                if (suffix in _INCLUDE_EXTENSIONS
                        and suffix not in _IGNORE_EXTENSIONS
                        and entry.is_file()):
                    found.append(entry)
        found.sort()
        return found

    def _prepare_chunks(self, file_path: Path) -> "Optional[List[CodeChunk]]":
        """Chunk and embed one file without touching the store.

        Safe to run on a worker thread. Returns ``None`` when chunking or
        embedding failed; the failure is logged.
        """
        try:
            chunks = self._chunker.chunk_file(str(file_path))
            for chunk in chunks:
                chunk.embedding = self._embedding.embed(chunk.content)
            return chunks
        except Exception as e:
            logger.warning("Failed to index %s: %s", file_path, e)
            return None

    def _replace_chunks(self, file_path: Path, chunks: "List[CodeChunk]") -> int:
        """Swap the stored entries of ``file_path`` for ``chunks``.

        Must run on the thread that owns the store connection.
        """
        self._store.remove_file(str(file_path))  # drop stale entries
        if chunks:
            self._store.store_chunks(chunks)
        return len(chunks)

    def _index_file(self, file_path: Path) -> int:
        """Index one file on the calling thread; return the chunks stored.

        Returns 0 on any failure. Existing entries are only replaced once
        chunking and embedding have succeeded.
        """
        chunks = self._prepare_chunks(file_path)
        if chunks is None:
            return 0
        try:
            return self._replace_chunks(file_path, chunks)
        except Exception as e:
            logger.warning("Failed to index %s: %s", file_path, e)
            return 0
|
voxcli/rag/relation.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""代码关系 - 代码块之间的调用/引用关系"""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import List, Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class CodeRelation:
    """A directed relation from one source file (or chunk) to another."""

    # File in which the relation originates.
    source_file: str
    # File the relation points at.
    target_file: str
    # One of: import, call, extend, implement, reference.
    relation_type: str
    # Optional chunk ids narrowing either end of the relation.
    source_chunk_id: Optional[str] = None
    target_chunk_id: Optional[str] = None
    # Free-form extra attributes; each instance gets its own dict.
    metadata: dict = field(default_factory=dict)
|
voxcli/rag/retriever.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""代码检索器 - 混合检索(语义 + 关键词)"""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import List, Optional
|
|
6
|
+
|
|
7
|
+
from .embedding import EmbeddingClient
|
|
8
|
+
from .store import VectorStore, SearchResult
|
|
9
|
+
from .chunker import CodeChunker
|
|
10
|
+
from .index import CodeIndex
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class IndexStats:
    """Size of the index as reported by the vector store."""

    # Number of stored chunks.
    chunk_count: int
    # Number of distinct files that have at least one stored chunk.
    file_count: int
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class CodeRetriever:
    """Hybrid (semantic + keyword) search over the stored code chunks.

    Usable as a context manager; leaving the ``with`` block closes the store.
    """

    def __init__(self, project_path: str,
                 vector_store: "Optional[VectorStore]" = None,
                 embedding_client: "Optional[EmbeddingClient]" = None):
        """Wire up the store and embedding client, creating defaults if absent."""
        self._project_path = project_path
        self._store = vector_store or VectorStore()
        self._embedding = embedding_client or EmbeddingClient.from_env()

    def hybrid_search(self, query: str, top_k: int = 5) -> "List[SearchResult]":
        """Return up to ``top_k`` hits for ``query``, best score first.

        Semantic and keyword hits are merged by chunk id. A chunk found by
        both searches is kept once, with the higher of its two scores.

        Args:
            query: Free-text search query.
            top_k: Maximum number of hits; values <= 0 yield ``[]``.
        """
        if top_k <= 0:
            return []

        query_embedding = self._embedding.embed(query)
        # Without a query vector every stored chunk would score 0.0, which
        # only adds noise; rely on the keyword search alone in that case.
        semantic_results = (
            self._store.search_by_embedding(query_embedding, top_k)
            if query_embedding else []
        )
        keyword_results = self._store.search_by_keyword(query, top_k)

        best = {}
        for hit in list(semantic_results) + list(keyword_results):
            current = best.get(hit.chunk.id)
            if current is None or hit.score > current.score:
                best[hit.chunk.id] = hit

        merged = sorted(best.values(), key=lambda hit: hit.score, reverse=True)
        return merged[:top_k]

    def get_stats(self) -> IndexStats:
        """Return the chunk and file counts reported by the store."""
        return IndexStats(
            chunk_count=self._store.chunk_count,
            file_count=self._store.file_count,
        )

    def close(self):
        """Close the underlying store."""
        self._store.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
|
voxcli/rag/store.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""向量存储 - SQLite + 余弦相似度检索"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import math
|
|
6
|
+
import sqlite3
|
|
7
|
+
import time
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Dict, List, Optional, Tuple
|
|
11
|
+
|
|
12
|
+
from .chunk import CodeChunk
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class SearchResult:
    """One search hit: a chunk together with its relevance score."""

    # The chunk that matched.
    chunk: CodeChunk
    # Relevance of the chunk for the query; larger means more relevant.
    score: float
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class VectorStore:
    """SQLite-backed chunk store with cosine-similarity and keyword search.

    Similarity is computed in Python over every stored row. The connection is
    opened with sqlite3's default thread check, so an instance must be used
    from the thread that created it.
    """

    _INSERT_SQL = """
        INSERT OR REPLACE INTO chunks
        (id, file_path, content, language, start_line, end_line,
         chunk_type, metadata, embedding, indexed_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    """

    # Column order shared by both searches; positions are relied upon below.
    _SELECT_COLUMNS = (
        "id, file_path, content, language, start_line, end_line, "
        "chunk_type, metadata"
    )

    def __init__(self, db_path: Optional[str] = None):
        """Open (and create if needed) the database.

        Args:
            db_path: Database file; defaults to
                ``~/.vox-code/memory/vector_store.db``, whose directory is
                created on demand.
        """
        if db_path:
            self._db_path = Path(db_path)
        else:
            mem_dir = Path.home() / ".vox-code" / "memory"
            mem_dir.mkdir(parents=True, exist_ok=True)
            self._db_path = mem_dir / "vector_store.db"
        self._conn = sqlite3.connect(str(self._db_path))
        self._conn.execute("PRAGMA journal_mode=WAL")
        self._init_db()

    def _init_db(self):
        """Create the ``chunks`` table and its file-path index if missing."""
        self._conn.execute("""
            CREATE TABLE IF NOT EXISTS chunks (
                id TEXT PRIMARY KEY,
                file_path TEXT NOT NULL,
                content TEXT NOT NULL,
                language TEXT NOT NULL,
                start_line INTEGER NOT NULL,
                end_line INTEGER NOT NULL,
                chunk_type TEXT DEFAULT 'code',
                metadata TEXT DEFAULT '{}',
                embedding TEXT,
                indexed_at REAL NOT NULL
            )
        """)
        self._conn.execute("""
            CREATE INDEX IF NOT EXISTS idx_chunks_file_path ON chunks(file_path)
        """)
        self._conn.commit()

    @staticmethod
    def _insert_params(chunk: "CodeChunk") -> tuple:
        """Return the parameter tuple for ``_INSERT_SQL``.

        Metadata and embedding are stored as JSON text; a missing or empty
        embedding is stored as NULL.
        """
        return (
            chunk.id, chunk.file_path, chunk.content, chunk.language,
            chunk.start_line, chunk.end_line, chunk.chunk_type,
            json.dumps(chunk.metadata),
            json.dumps(chunk.embedding) if chunk.embedding else None,
            time.time(),
        )

    def store_chunk(self, chunk: "CodeChunk"):
        """Insert ``chunk``, replacing any row with the same id, and commit."""
        self._conn.execute(self._INSERT_SQL, self._insert_params(chunk))
        self._conn.commit()

    def store_chunks(self, chunks: "List[CodeChunk]"):
        """Insert all ``chunks`` in a single transaction."""
        rows = [self._insert_params(chunk) for chunk in chunks]
        if not rows:
            return
        # ``with connection`` commits on success and rolls back on error, so
        # a batch is stored completely or not at all.
        with self._conn:
            self._conn.executemany(self._INSERT_SQL, rows)

    @staticmethod
    def _load_metadata(raw: Optional[str]) -> dict:
        """Decode a stored metadata value; anything unusable becomes ``{}``."""
        try:
            value = json.loads(raw) if raw else {}
        except (TypeError, ValueError):
            return {}
        return value if isinstance(value, dict) else {}

    def _to_results(self, scored: List[Tuple[float, tuple]], top_k: int) -> "List[SearchResult]":
        """Turn the ``top_k`` best ``(score, row)`` pairs into results."""
        # The sort is stable, so equal scores keep their scan order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        results = []
        for score, row in scored[:top_k]:
            chunk = CodeChunk(
                id=row[0], file_path=row[1], content=row[2], language=row[3],
                start_line=row[4], end_line=row[5], chunk_type=row[6],
                metadata=self._load_metadata(row[7]),
            )
            results.append(SearchResult(chunk=chunk, score=score))
        return results

    def search_by_embedding(self, query_embedding: List[float], top_k: int = 5) -> "List[SearchResult]":
        """Return the ``top_k`` chunks most similar to ``query_embedding``.

        Rows whose stored embedding is malformed or has a different dimension
        than the query are skipped. An empty query vector or ``top_k <= 0``
        yields ``[]``.
        """
        if top_k <= 0 or not query_embedding:
            return []

        cursor = self._conn.execute(
            f"SELECT {self._SELECT_COLUMNS}, embedding "
            "FROM chunks WHERE embedding IS NOT NULL"
        )
        scored: List[Tuple[float, tuple]] = []
        for row in cursor:
            try:
                stored_emb = json.loads(row[8])
                if not isinstance(stored_emb, list) or len(stored_emb) != len(query_embedding):
                    continue  # produced by a different model; not comparable
                score = self._cosine_similarity(query_embedding, stored_emb)
            except (TypeError, ValueError):
                continue
            # Keep only the columns needed to rebuild the chunk.
            scored.append((score, row[:8]))
        return self._to_results(scored, top_k)

    def search_by_keyword(self, query: str, top_k: int = 5) -> "List[SearchResult]":
        """Return the ``top_k`` chunks matching the most query terms.

        The query is lower-cased and split on whitespace; a term matches when
        it is a substring of the chunk content or file path. The score is the
        fraction of terms that matched. A query without terms, or
        ``top_k <= 0``, yields ``[]``.
        """
        terms = query.lower().split()
        if top_k <= 0 or not terms:
            return []

        cursor = self._conn.execute(f"SELECT {self._SELECT_COLUMNS} FROM chunks")
        scored: List[Tuple[float, tuple]] = []
        for row in cursor:
            content_lower = (row[2] or "").lower()
            path_lower = (row[1] or "").lower()
            match_count = sum(1 for t in terms if t in content_lower or t in path_lower)
            if match_count > 0:
                scored.append((match_count / len(terms), row))
        return self._to_results(scored, top_k)

    def remove_file(self, file_path: str):
        """Delete every chunk stored for ``file_path`` and commit."""
        self._conn.execute("DELETE FROM chunks WHERE file_path = ?", (file_path,))
        self._conn.commit()

    def clear(self):
        """Delete all chunks and commit."""
        self._conn.execute("DELETE FROM chunks")
        self._conn.commit()

    @property
    def chunk_count(self) -> int:
        """Number of stored chunks."""
        cursor = self._conn.execute("SELECT COUNT(*) FROM chunks")
        return cursor.fetchone()[0]

    @property
    def file_count(self) -> int:
        """Number of distinct file paths with at least one stored chunk."""
        cursor = self._conn.execute("SELECT COUNT(DISTINCT file_path) FROM chunks")
        return cursor.fetchone()[0]

    def close(self):
        """Close the database connection."""
        self._conn.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @staticmethod
    def _cosine_similarity(a: List[float], b: List[float]) -> float:
        """Return the cosine similarity of ``a`` and ``b``.

        Returns 0.0 when either vector is empty or all-zero, or when the two
        vectors differ in length (they are then not comparable).
        """
        if not a or not b or len(a) != len(b):
            return 0.0
        dot = sum(x * y for x, y in zip(a, b))
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(y * y for y in b))
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return dot / (norm_a * norm_b)
|