vox-code 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. vox_code-2.0.0.dist-info/METADATA +258 -0
  2. vox_code-2.0.0.dist-info/RECORD +88 -0
  3. vox_code-2.0.0.dist-info/WHEEL +4 -0
  4. vox_code-2.0.0.dist-info/entry_points.txt +3 -0
  5. voxcli/__init__.py +3 -0
  6. voxcli/__main__.py +5 -0
  7. voxcli/agent/__init__.py +12 -0
  8. voxcli/agent/agent.py +449 -0
  9. voxcli/agent/agent_budget.py +133 -0
  10. voxcli/agent/agent_orchestrator.py +414 -0
  11. voxcli/agent/plan_execute_agent.py +514 -0
  12. voxcli/agent/roles.py +80 -0
  13. voxcli/agent/sub_agent.py +351 -0
  14. voxcli/catalog.py +477 -0
  15. voxcli/chat.py +91 -0
  16. voxcli/cli/__init__.py +4 -0
  17. voxcli/cli/main.py +452 -0
  18. voxcli/cli/parser.py +71 -0
  19. voxcli/config.py +518 -0
  20. voxcli/gui/__main__.py +3 -0
  21. voxcli/gui/main.py +22 -0
  22. voxcli/gui/pet/__init__.py +5 -0
  23. voxcli/gui/pet/base.py +62 -0
  24. voxcli/gui/pet/coordinator.py +888 -0
  25. voxcli/gui/pet/data.py +430 -0
  26. voxcli/gui/pet/widgets.py +683 -0
  27. voxcli/gui/pet/windows.py +2298 -0
  28. voxcli/gui/pet/workers.py +54 -0
  29. voxcli/gui/pet_app.py +7 -0
  30. voxcli/hitl/__init__.py +11 -0
  31. voxcli/hitl/handler.py +11 -0
  32. voxcli/hitl/policy.py +32 -0
  33. voxcli/hitl/request.py +13 -0
  34. voxcli/hitl/result.py +11 -0
  35. voxcli/hitl/terminal_handler.py +64 -0
  36. voxcli/hitl/tool_registry.py +64 -0
  37. voxcli/llm/base.py +93 -0
  38. voxcli/llm/factory.py +178 -0
  39. voxcli/llm/ollama_client.py +137 -0
  40. voxcli/llm/openai_compatible.py +249 -0
  41. voxcli/memory/base.py +16 -0
  42. voxcli/memory/budget.py +53 -0
  43. voxcli/memory/compressor.py +198 -0
  44. voxcli/memory/entry.py +36 -0
  45. voxcli/memory/long_term.py +126 -0
  46. voxcli/memory/manager.py +101 -0
  47. voxcli/memory/retriever.py +72 -0
  48. voxcli/memory/short_term.py +84 -0
  49. voxcli/memory/tokenizer.py +21 -0
  50. voxcli/plan/__init__.py +5 -0
  51. voxcli/plan/execution_plan.py +225 -0
  52. voxcli/plan/planner.py +198 -0
  53. voxcli/plan/task.py +123 -0
  54. voxcli/policy/audit_log.py +111 -0
  55. voxcli/policy/command_guard.py +34 -0
  56. voxcli/policy/exception.py +5 -0
  57. voxcli/policy/path_guard.py +32 -0
  58. voxcli/prompting/__init__.py +7 -0
  59. voxcli/prompting/presenter.py +154 -0
  60. voxcli/rag/__init__.py +16 -0
  61. voxcli/rag/analyzer.py +89 -0
  62. voxcli/rag/chunk.py +17 -0
  63. voxcli/rag/chunker.py +137 -0
  64. voxcli/rag/embedding.py +75 -0
  65. voxcli/rag/formatter.py +40 -0
  66. voxcli/rag/index.py +96 -0
  67. voxcli/rag/relation.py +14 -0
  68. voxcli/rag/retriever.py +58 -0
  69. voxcli/rag/store.py +155 -0
  70. voxcli/rag/tokenizer.py +26 -0
  71. voxcli/runtime/__init__.py +6 -0
  72. voxcli/runtime/session_controller.py +386 -0
  73. voxcli/tool/__init__.py +3 -0
  74. voxcli/tool/tool_registry.py +433 -0
  75. voxcli/util/animation.py +219 -0
  76. voxcli/util/ansi.py +82 -0
  77. voxcli/util/markdown.py +98 -0
  78. voxcli/web/__init__.py +17 -0
  79. voxcli/web/base.py +20 -0
  80. voxcli/web/extractor.py +77 -0
  81. voxcli/web/factory.py +38 -0
  82. voxcli/web/fetch_result.py +27 -0
  83. voxcli/web/fetcher.py +42 -0
  84. voxcli/web/network_policy.py +49 -0
  85. voxcli/web/result.py +23 -0
  86. voxcli/web/searxng.py +55 -0
  87. voxcli/web/serpapi.py +53 -0
  88. voxcli/web/zhipu.py +55 -0
voxcli/rag/analyzer.py ADDED
@@ -0,0 +1,89 @@
1
+ """代码分析器 - 分析代码结构和关系"""
2
+
3
+ import ast
4
+ import logging
5
+ import re
6
+ from pathlib import Path
7
+ from typing import Dict, List, Optional, Set
8
+
9
+ from .relation import CodeRelation
10
+
11
logger = logging.getLogger(__name__)


class CodeAnalyzer:
    """Extract file-level ``import`` relations from Python and Java sources.

    Only imports that resolve to a file inside ``project_path`` produce a
    relation; imports of external or standard-library modules are dropped.
    """

    # Captures an optional ``static`` marker (group 1) and the dotted import
    # path (group 2). Wildcard imports (``import a.b.*;``) do not match.
    _JAVA_IMPORT_RE = re.compile(r"^import\s+(static\s+)?([\w.]+);", re.MULTILINE)

    def __init__(self, project_path: str):
        """Remember the project root that absolute imports are resolved against."""
        self._project_path = Path(project_path)

    def analyze_file(self, file_path: str) -> List[CodeRelation]:
        """Return the import relations found in ``file_path``.

        Returns an empty list when the file is missing, unreadable as UTF-8,
        or has a suffix other than ``.py`` / ``.java``.
        """
        path = Path(file_path)
        if not path.exists():
            return []
        try:
            content = path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            logger.warning("Failed to read %s: %s", file_path, e)
            return []

        ext = path.suffix.lower()
        if ext == ".py":
            return self._analyze_python(file_path, content)
        if ext == ".java":
            return self._analyze_java(file_path, content)
        return []

    def _analyze_python(self, file_path: str, content: str) -> List[CodeRelation]:
        """Collect relations for every ``import`` / ``from ... import`` node."""
        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError) as e:
            # Unparseable source simply yields no relations.
            logger.debug("Skipping import analysis of %s: %s", file_path, e)
            return []

        relations: List[CodeRelation] = []
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                targets = [self._resolve_module(alias.name) for alias in node.names]
            elif isinstance(node, ast.ImportFrom):
                targets = self._resolve_from_import(file_path, node)
            else:
                continue
            relations.extend(
                CodeRelation(
                    source_file=file_path,
                    target_file=target,
                    relation_type="import",
                )
                for target in targets
                if target
            )
        return relations

    def _analyze_java(self, file_path: str, content: str) -> List[CodeRelation]:
        """Collect relations for every Java ``import`` statement."""
        relations: List[CodeRelation] = []
        for match in self._JAVA_IMPORT_RE.finditer(content):
            is_static, import_path = match.group(1), match.group(2)
            target = self._resolve_java_class(import_path)
            if target is None and is_static and "." in import_path:
                # ``import static a.b.C.member;`` names a member of C, so the
                # file to look for is a/b/C.java.
                target = self._resolve_java_class(import_path.rpartition(".")[0])
            if target:
                relations.append(CodeRelation(
                    source_file=file_path,
                    target_file=str(target),
                    relation_type="import",
                ))
        return relations

    def _resolve_from_import(self, file_path: str,
                             node: ast.ImportFrom) -> List[Optional[str]]:
        """Resolve each name of a ``from X import a, b`` statement.

        Relative imports (``node.level > 0``) are resolved against the
        directory of the importing file; absolute ones against the project
        root. For every imported name the candidates are tried in this order:
        the name as a submodule file, the name as a subpackage, the package
        ``__init__.py`` of ``X``, and finally ``X`` as a plain module file.
        """
        if node.level:
            base = Path(file_path).parent
            for _ in range(node.level - 1):
                base = base.parent
        else:
            base = self._project_path

        module_rel = node.module.replace(".", "/") if node.module else ""
        package_dir = base / module_rel if module_rel else base

        targets: List[Optional[str]] = []
        for alias in node.names:
            candidates = []
            if alias.name != "*":
                candidates.append(package_dir / f"{alias.name}.py")
                candidates.append(package_dir / alias.name / "__init__.py")
            candidates.append(package_dir / "__init__.py")
            if module_rel:
                candidates.append(base / f"{module_rel}.py")
            targets.append(self._first_existing(candidates))
        return targets

    def _resolve_module(self, module_name: str) -> Optional[str]:
        """Map a dotted module name to a project file, or ``None``.

        Tries ``<root>/a/b.py`` and then the package form
        ``<root>/a/b/__init__.py``. The parent package's ``__init__.py`` is
        deliberately not used as a fallback: it made unrelated imports such
        as ``import os`` resolve to ``<root>/__init__.py``.
        """
        rel = module_name.replace(".", "/")
        return self._first_existing([
            self._project_path / f"{rel}.py",
            self._project_path / rel / "__init__.py",
        ])

    def _resolve_java_class(self, class_name: str) -> Optional[Path]:
        """Return the first ``a/b/C.java`` found anywhere under the project root."""
        rel_path = class_name.replace(".", "/") + ".java"
        # Stop at the first hit instead of materialising every match.
        return next(self._project_path.rglob(rel_path), None)

    @staticmethod
    def _first_existing(candidates: List[Path]) -> Optional[str]:
        """Return the first candidate that is an existing file, as a string."""
        for candidate in candidates:
            if candidate.is_file():
                return str(candidate)
        return None
voxcli/rag/chunk.py ADDED
@@ -0,0 +1,17 @@
1
+ """代码块数据模型"""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Optional
5
+
6
+
7
@dataclass
class CodeChunk:
    """A line-addressed slice of one source file, optionally with an embedding."""

    # Identity and location of the slice.
    id: str
    file_path: str
    content: str
    language: str
    start_line: int
    end_line: int

    # Kind of slice; one of: code, comment, import, class, function.
    chunk_type: str = "code"
    # Free-form extra data; each instance gets its own dict.
    metadata: dict = field(default_factory=dict)
    # Embedding vector, or None while the chunk has not been embedded.
    embedding: Optional[List[float]] = None
voxcli/rag/chunker.py ADDED
@@ -0,0 +1,137 @@
1
+ """代码分块器 - 将源代码文件分割为可检索的块"""
2
+
3
+ import ast
4
+ import logging
5
+ import re
6
+ from pathlib import Path
7
+ from typing import List, Optional
8
+
9
+ from .chunk import CodeChunk
10
+
11
logger = logging.getLogger(__name__)

# Defaults for the line-window fallback: window length and the number of lines
# shared by two consecutive windows.
_LINE_CHUNK_SIZE = 50
_LINE_OVERLAP = 10

# Lower-cased file suffix -> language tag recorded on every chunk.
_LANGUAGE_MAP = {
    ".py": "python", ".java": "java", ".js": "javascript",
    ".ts": "typescript", ".tsx": "typescriptreact", ".jsx": "javascriptreact",
    ".go": "go", ".rs": "rust", ".rb": "ruby", ".php": "php",
    ".c": "c", ".cpp": "cpp", ".h": "c", ".hpp": "cpp",
    ".swift": "swift", ".kt": "kotlin", ".scala": "scala",
    ".md": "markdown", ".json": "json", ".yaml": "yaml", ".yml": "yaml",
    ".xml": "xml", ".toml": "toml", ".sql": "sql", ".sh": "bash",
    ".zsh": "bash", ".bash": "bash", ".txt": "text",
}


class CodeChunker:
    """Split source files into retrievable :class:`CodeChunk` objects.

    Python files are chunked per function/class via ``ast``; Java files per
    type/method via a regex plus brace matching. Everything else, and any file
    for which the structural pass produced nothing, is cut into overlapping
    line windows.
    """

    # Group 1 is set for type declarations, group 2 for methods. The method
    # alternative consumes the opening ``{``; the type alternative stops right
    # after the type name.
    _JAVA_DECL_RE = re.compile(
        r'(?:public|private|protected)\s+(?:static\s+)?(?:class|interface|enum)\s+(\w+)|'
        r'(?:public|private|protected)\s+(?:static\s+)?\w+\s+(\w+)\s*\([^)]*\)\s*'
        r'(?:throws\s+[\w.]+(?:\s*,\s*[\w.]+)*)?\s*\{'
    )

    def __init__(self, line_chunk_size: int = _LINE_CHUNK_SIZE,
                 line_overlap: int = _LINE_OVERLAP):
        """Store the window length and overlap used by the line fallback."""
        self._chunk_size = line_chunk_size
        self._overlap = line_overlap

    def chunk_file(self, file_path: str) -> List[CodeChunk]:
        """Read ``file_path`` as UTF-8 and return its chunks.

        Returns an empty list when the file is missing, unreadable, or blank.
        """
        path = Path(file_path)
        if not path.exists():
            return []
        try:
            content = path.read_text(encoding="utf-8")
        except (OSError, UnicodeError) as e:
            logger.warning("Failed to read %s: %s", file_path, e)
            return []

        language = _LANGUAGE_MAP.get(path.suffix.lower(), "text")
        chunks: List[CodeChunk] = []
        if language == "python":
            chunks = self._chunk_python(file_path, content)
        elif language == "java":
            chunks = self._chunk_java(file_path, content)

        # Structural chunking found nothing (or does not apply): use windows.
        if not chunks:
            chunks = self._chunk_by_lines(file_path, content, language)
        return chunks

    def _chunk_python(self, file_path: str, content: str) -> List[CodeChunk]:
        """Return one chunk per function/class definition, nested ones included.

        Decorator lines are part of the chunk. Unparseable source yields an
        empty list so the caller falls back to line windows.
        """
        try:
            tree = ast.parse(content)
        except (SyntaxError, ValueError) as e:
            logger.debug("Cannot parse %s as Python: %s", file_path, e)
            return []

        lines = content.split("\n")  # split once, not once per node
        definitions = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
        chunks: List[CodeChunk] = []
        for node in ast.walk(tree):
            if not isinstance(node, definitions):
                continue
            # ``node.lineno`` points at the ``def``/``class`` keyword; start at
            # the first decorator so it is not cut off.
            start = min([node.lineno or 1] + [d.lineno for d in node.decorator_list])
            end = node.end_lineno or start
            chunks.append(CodeChunk(
                id=f"{file_path}:{start}-{end}",
                file_path=file_path,
                content="\n".join(lines[start - 1:end]),
                language="python",
                start_line=start,
                end_line=end,
                chunk_type="class" if isinstance(node, ast.ClassDef) else "function",
            ))
        return chunks

    def _chunk_java(self, file_path: str, content: str) -> List[CodeChunk]:
        """Return one chunk per Java type or method matched by the regex."""
        lines = content.split("\n")
        chunks: List[CodeChunk] = []
        for match in self._JAVA_DECL_RE.finditer(content):
            if match.group(1):
                # Type declaration: the body brace follows the header
                # (possibly after extends/implements clauses).
                brace_pos = content.find("{", match.end())
                if brace_pos == -1:
                    continue
                chunk_type = "class"
            else:
                # Method: the regex already consumed the opening brace, so
                # searching from match.end() would find a nested or later one.
                brace_pos = match.end() - 1
                chunk_type = "function"

            start_line = content.count("\n", 0, match.start()) + 1
            end_line = self._find_matching_brace(content, brace_pos, lines)
            chunks.append(CodeChunk(
                id=f"{file_path}:{start_line}-{end_line}",
                file_path=file_path,
                content="\n".join(lines[start_line - 1:end_line]),
                language="java",
                start_line=start_line,
                end_line=end_line,
                chunk_type=chunk_type,
            ))
        return chunks

    def _chunk_by_lines(self, file_path: str, content: str, language: str) -> List[CodeChunk]:
        """Cut ``content`` into overlapping windows of lines.

        Blank content yields no chunks. The window size is clamped to at
        least 1 and the overlap to ``[0, size - 1]`` so the stride is always
        positive. The loop stops at the first window that reaches the end of
        the file, so no window is fully contained in its predecessor.
        """
        if not content.strip():
            return []

        lines = content.split("\n")
        size = max(1, self._chunk_size)
        overlap = min(max(0, self._overlap), size - 1)
        step = size - overlap

        chunks: List[CodeChunk] = []
        for offset in range(0, len(lines), step):
            window = lines[offset:offset + size]
            start = offset + 1
            end = offset + len(window)
            chunks.append(CodeChunk(
                id=f"{file_path}:{start}-{end}",
                file_path=file_path,
                content="\n".join(window),
                language=language,
                start_line=start,
                end_line=end,
                chunk_type="code",
            ))
            if offset + size >= len(lines):
                break
        return chunks

    @staticmethod
    def _find_matching_brace(content: str, open_pos: int, lines: List[str]) -> int:
        """Return the 1-based line of the ``}`` closing the ``{`` at ``open_pos``.

        Braces are counted textually; braces inside string literals or
        comments are not recognised as such. If the brace is never closed the
        last line of ``content`` is returned. ``lines`` is unused and kept
        only for signature compatibility.
        """
        depth = 1
        pos = open_pos + 1
        while pos < len(content) and depth > 0:
            char = content[pos]
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
            pos += 1
        return content.count("\n", 0, pos) + 1
voxcli/rag/embedding.py ADDED
@@ -0,0 +1,75 @@
1
+ """嵌入客户端 - 使用 Ollama 或 OpenAI 兼容 API 生成文本嵌入"""
2
+
3
+ import json
4
+ import logging
5
+ from typing import List, Optional
6
+
7
+ import httpx
8
+
9
+ from ..config import pai_config
10
+
11
logger = logging.getLogger(__name__)

# Used when no model / endpoint is supplied.
_DEFAULT_MODEL = "nomic-embed-text"
_DEFAULT_BASE_URL = "http://localhost:11434"


class EmbeddingClient:
    """Fetch text embeddings over HTTP.

    Two request shapes are supported: ``POST <base>/api/embeddings`` with a
    ``prompt`` field, and ``POST <base>/embeddings`` with an ``input`` field
    and a bearer token. :meth:`embed` picks between them from the base URL.
    """

    def __init__(self, model: str = _DEFAULT_MODEL, base_url: str = _DEFAULT_BASE_URL,
                 api_key: str = ""):
        """Store connection settings.

        A trailing ``/`` on ``base_url`` is stripped so request URLs never
        contain ``//``; a falsy ``base_url`` falls back to the default.
        """
        self._model = model
        self._base_url = (base_url or _DEFAULT_BASE_URL).rstrip("/")
        self._api_key = api_key

    @classmethod
    def from_env(cls) -> "EmbeddingClient":
        """Build a client from ``pai_config``.

        Uses the ``ollama`` provider entry if present, otherwise the ``glm``
        entry, otherwise the built-in defaults.
        """
        provider = pai_config.get_provider("ollama")
        if provider:
            return cls(
                base_url=provider.get("base_url", _DEFAULT_BASE_URL),
                model=provider.get("model", _DEFAULT_MODEL),
            )
        provider = pai_config.get_provider("glm")
        if provider:
            return cls(
                base_url=provider.get("base_url", "https://open.bigmodel.cn/api/paas/v4"),
                model=provider.get("model", "embedding-2"),
                api_key=provider.get("api_key", ""),
            )
        return cls()

    def embed(self, text: str) -> List[float]:
        """Return the embedding of ``text`` (possibly empty if the server sent none).

        Raises whatever ``httpx`` raises on transport errors or non-2xx status.
        """
        # Heuristic routing: a plain-http localhost URL, or any URL containing
        # "ollama", is sent the ``/api/embeddings`` request shape.
        if self._base_url.startswith("http://localhost") or "ollama" in self._base_url:
            return self._embed_ollama(text)
        return self._embed_openai(text)

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """Embed each text with one request per item, preserving order."""
        return [self.embed(t) for t in texts]

    @property
    def dimension(self) -> int:
        """Return 768; the value is fixed and not derived from the model."""
        return 768

    def _embed_ollama(self, text: str) -> List[float]:
        """Request an embedding via ``POST <base>/api/embeddings``."""
        response = httpx.post(
            f"{self._base_url}/api/embeddings",
            json={"model": self._model, "prompt": text},
            timeout=30,
        )
        response.raise_for_status()
        return response.json().get("embedding", [])

    def _embed_openai(self, text: str) -> List[float]:
        """Request an embedding via ``POST <base>/embeddings``."""
        headers = {"Content-Type": "application/json"}
        if self._api_key:
            # Without a key, send no header rather than a bare "Bearer ".
            headers["Authorization"] = f"Bearer {self._api_key}"
        response = httpx.post(
            f"{self._base_url}/embeddings",
            headers=headers,
            json={"model": self._model, "input": text},
            timeout=30,
        )
        response.raise_for_status()
        # A missing, null or empty "data" list yields an empty embedding
        # instead of an IndexError/TypeError.
        items = response.json().get("data") or [{}]
        return items[0].get("embedding", [])
voxcli/rag/formatter.py ADDED
@@ -0,0 +1,40 @@
1
+ """搜索结果格式化器"""
2
+
3
+ from typing import List
4
+
5
+ from .store import SearchResult
6
+
7
+
8
class SearchResultFormatter:
    """Render search hits as text, either for tool output or as LLM context."""

    @staticmethod
    def format_for_tool(query: str, results: List[SearchResult]) -> str:
        """Return a numbered listing of ``results`` under a header naming ``query``.

        Each hit shows its type, location, language and score (3 decimals),
        followed by its stripped content in a fenced block, truncated to 500
        characters plus ``...``. With no hits a fixed "nothing found" line is
        shown instead.
        """
        out = [f"🔍 代码检索: {query}\n"]
        if results:
            for rank, hit in enumerate(results, start=1):
                piece = hit.chunk
                header = (
                    f"{rank}. [{piece.chunk_type}] "
                    f"{piece.file_path}:{piece.start_line}-{piece.end_line}"
                )
                details = f" 语言: {piece.language} | 相关度: {hit.score:.3f}"
                snippet = piece.content.strip()
                if len(snippet) > 500:
                    snippet = snippet[:500] + "..."
                out.extend([
                    header,
                    details,
                    f" ```{piece.language}",
                    snippet,
                    " ```",
                    "",
                ])
        else:
            out.append("未找到相关结果。")
        return "\n".join(out).strip()

    @staticmethod
    def format_for_llm(query: str, results: List[SearchResult]) -> str:
        """Return the untruncated hits as ``---``-separated context blocks.

        Yields an empty string when there are no hits.
        """
        blocks = [
            f"File: {hit.chunk.file_path}:{hit.chunk.start_line}-{hit.chunk.end_line}\n"
            f"Type: {hit.chunk.chunk_type}\n"
            f"```{hit.chunk.language}\n{hit.chunk.content}\n```"
            for hit in results
        ]
        if not blocks:
            return ""
        return f"Related code for query '{query}':\n\n" + "\n---\n".join(blocks)
voxcli/rag/index.py ADDED
@@ -0,0 +1,96 @@
1
+ """代码索引器 - 将项目文件索引到向量存储"""
2
+
3
+ import logging
4
+ import time
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from pathlib import Path
7
+ from typing import List, Optional, Set
8
+
9
+ from .chunk import CodeChunk
10
+ from .chunker import CodeChunker
11
+ from .embedding import EmbeddingClient
12
+ from .store import VectorStore
13
+
14
logger = logging.getLogger(__name__)

# Directory names that are never descended into.
_IGNORE_DIRS = {
    ".git", ".svn", "__pycache__", "node_modules", ".mvn", ".gradle",
    "target", "build", "dist", ".idea", ".vscode", ".venv", "venv",
    "env", ".egg-info", "site-packages", ".tox", ".nox",
}

# Suffixes that are never indexed, even if also listed for inclusion.
_IGNORE_EXTENSIONS = {
    ".pyc", ".pyo", ".so", ".o", ".class", ".jar", ".war", ".zip",
    ".tar", ".gz", ".7z", ".rar", ".exe", ".dll", ".dylib", ".bin",
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
    ".mp4", ".mp3", ".wav", ".avi", ".mov", ".pdf", ".doc", ".docx",
    ".xls", ".xlsx", ".ppt", ".pptx", ".ttf", ".woff", ".woff2",
    ".DS_Store", ".gitkeep", ".gitignore",
}

# Suffixes (lower-case) of files that are indexed.
_INCLUDE_EXTENSIONS = {
    ".py", ".java", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs", ".rb",
    ".php", ".c", ".cpp", ".h", ".hpp", ".swift", ".kt", ".scala",
    ".md", ".json", ".yaml", ".yml", ".xml", ".toml", ".sql", ".sh",
    ".bash", ".zsh", ".txt", ".cfg", ".conf", ".ini", ".properties",
}


class CodeIndex:
    """Chunk, embed and store every indexable file of a project."""

    def __init__(self, project_path: str,
                 vector_store: Optional[VectorStore] = None,
                 chunker: Optional[CodeChunker] = None,
                 embedding_client: Optional[EmbeddingClient] = None):
        """Wire up collaborators, creating defaults for any that are omitted.

        The default embedding client comes from ``EmbeddingClient.from_env()``,
        the same factory ``CodeRetriever`` uses, so stored vectors and query
        vectors are produced by the same model.
        """
        self._project_path = Path(project_path)
        self._store = vector_store or VectorStore()
        self._chunker = chunker or CodeChunker()
        self._embedding = embedding_client or EmbeddingClient.from_env()

    def index_project(self, max_workers: int = 4) -> int:
        """Index all discovered files and return the number of stored chunks.

        Chunking and embedding run on a thread pool. All store access happens
        on the calling thread, because a ``sqlite3`` connection opened with
        default settings refuses use from any thread but its creator. A file
        that fails is logged and skipped.
        """
        files = self._discover_files()
        if not files:
            logger.info("No indexable files found in %s", self._project_path)
            return 0

        total_chunks = 0
        with ThreadPoolExecutor(max_workers=max(1, max_workers)) as executor:
            pending = [(f, executor.submit(self._prepare_chunks, f)) for f in files]
            for file_path, future in pending:
                try:
                    total_chunks += self._replace_file_chunks(file_path, future.result())
                except Exception as e:
                    logger.warning("Failed to index %s: %s", file_path, e)

        logger.info("Indexed %d chunks from %d files", total_chunks, len(files))
        return total_chunks

    def _discover_files(self) -> List[Path]:
        """Return the sorted list of indexable files under the project root.

        The tree is walked once. Ignored directories are pruned rather than
        traversed, symlinked directories are not followed, and suffixes are
        compared case-insensitively.
        """
        found: List[Path] = []
        stack = [self._project_path]
        while stack:
            directory = stack.pop()
            try:
                entries = list(directory.iterdir())
            except OSError as e:
                logger.debug("Cannot list %s: %s", directory, e)
                continue
            for entry in entries:
                if self._is_ignored_name(entry.name):
                    continue
                if entry.is_dir():
                    if not entry.is_symlink():
                        stack.append(entry)
                elif entry.is_file():
                    suffix = entry.suffix.lower()
                    if suffix in _INCLUDE_EXTENSIONS and suffix not in _IGNORE_EXTENSIONS:
                        found.append(entry)
        return sorted(found)

    @staticmethod
    def _is_ignored_name(name: str) -> bool:
        """Tell whether a path component excludes the path from indexing."""
        return (
            name.startswith(".")
            or name in _IGNORE_DIRS
            # Real directories are named "<dist>.egg-info", so match by suffix.
            or name.endswith(".egg-info")
        )

    def _prepare_chunks(self, file_path: Path) -> List[CodeChunk]:
        """Chunk one file and attach an embedding to every chunk (no store access)."""
        chunks = self._chunker.chunk_file(str(file_path))
        for chunk in chunks:
            chunk.embedding = self._embedding.embed(chunk.content)
        return chunks

    def _replace_file_chunks(self, file_path: Path, chunks: List[CodeChunk]) -> int:
        """Swap the stored chunks of ``file_path`` for ``chunks``; return their count."""
        # Remove stale entries
        self._store.remove_file(str(file_path))
        if chunks:
            self._store.store_chunks(chunks)
        return len(chunks)

    def _index_file(self, file_path: Path) -> int:
        """Index a single file on the calling thread; return 0 on any failure."""
        try:
            return self._replace_file_chunks(file_path, self._prepare_chunks(file_path))
        except Exception as e:
            logger.warning("Failed to index %s: %s", file_path, e)
            return 0
voxcli/rag/relation.py ADDED
@@ -0,0 +1,14 @@
1
+ """代码关系 - 代码块之间的调用/引用关系"""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import List, Optional
5
+
6
+
7
@dataclass
class CodeRelation:
    """A directed relation from one source file (or chunk) to another."""

    # Endpoints of the relation, as file paths.
    source_file: str
    target_file: str
    # Kind of relation; one of: import, call, extend, implement, reference.
    relation_type: str
    # Optional chunk-level endpoints; None when the relation is file-level.
    source_chunk_id: Optional[str] = None
    target_chunk_id: Optional[str] = None
    # Free-form extra data; each instance gets its own dict.
    metadata: dict = field(default_factory=dict)
voxcli/rag/retriever.py ADDED
@@ -0,0 +1,58 @@
1
+ """代码检索器 - 混合检索(语义 + 关键词)"""
2
+
3
+ import logging
4
+ from dataclasses import dataclass
5
+ from typing import List, Optional
6
+
7
+ from .embedding import EmbeddingClient
8
+ from .store import VectorStore, SearchResult
9
+ from .chunker import CodeChunker
10
+ from .index import CodeIndex
11
+
12
logger = logging.getLogger(__name__)


@dataclass
class IndexStats:
    """Size summary of an index: how many chunks and how many files."""

    chunk_count: int
    file_count: int
+
20
+
21
+ class CodeRetriever:
22
+ def __init__(self, project_path: str,
23
+ vector_store: Optional[VectorStore] = None,
24
+ embedding_client: Optional[EmbeddingClient] = None):
25
+ self._project_path = project_path
26
+ self._store = vector_store or VectorStore()
27
+ self._embedding = embedding_client or EmbeddingClient.from_env()
28
+
29
+ def hybrid_search(self, query: str, top_k: int = 5) -> List[SearchResult]:
30
+ query_embedding = self._embedding.embed(query)
31
+ semantic_results = self._store.search_by_embedding(query_embedding, top_k)
32
+ keyword_results = self._store.search_by_keyword(query, top_k)
33
+
34
+ seen = set()
35
+ merged = []
36
+
37
+ for r in semantic_results + keyword_results:
38
+ if r.chunk.id not in seen:
39
+ seen.add(r.chunk.id)
40
+ merged.append(r)
41
+
42
+ merged.sort(key=lambda x: x.score, reverse=True)
43
+ return merged[:top_k]
44
+
45
+ def get_stats(self) -> IndexStats:
46
+ return IndexStats(
47
+ chunk_count=self._store.chunk_count,
48
+ file_count=self._store.file_count,
49
+ )
50
+
51
+ def close(self):
52
+ self._store.close()
53
+
54
+ def __enter__(self):
55
+ return self
56
+
57
+ def __exit__(self, *args):
58
+ self.close()
voxcli/rag/store.py ADDED
@@ -0,0 +1,155 @@
1
+ """向量存储 - SQLite + 余弦相似度检索"""
2
+
3
+ import json
4
+ import logging
5
+ import math
6
+ import sqlite3
7
+ import time
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Dict, List, Optional, Tuple
11
+
12
+ from .chunk import CodeChunk
13
+
14
logger = logging.getLogger(__name__)


@dataclass
class SearchResult:
    """A chunk paired with the relevance score a search assigned to it."""

    chunk: CodeChunk
    score: float
+
22
+
23
+ class VectorStore:
24
+ def __init__(self, db_path: Optional[str] = None):
25
+ if db_path:
26
+ self._db_path = Path(db_path)
27
+ else:
28
+ mem_dir = Path.home() / ".vox-code" / "memory"
29
+ mem_dir.mkdir(parents=True, exist_ok=True)
30
+ self._db_path = mem_dir / "vector_store.db"
31
+ self._conn = sqlite3.connect(str(self._db_path))
32
+ self._conn.execute("PRAGMA journal_mode=WAL")
33
+ self._init_db()
34
+
35
+ def _init_db(self):
36
+ self._conn.execute("""
37
+ CREATE TABLE IF NOT EXISTS chunks (
38
+ id TEXT PRIMARY KEY,
39
+ file_path TEXT NOT NULL,
40
+ content TEXT NOT NULL,
41
+ language TEXT NOT NULL,
42
+ start_line INTEGER NOT NULL,
43
+ end_line INTEGER NOT NULL,
44
+ chunk_type TEXT DEFAULT 'code',
45
+ metadata TEXT DEFAULT '{}',
46
+ embedding TEXT,
47
+ indexed_at REAL NOT NULL
48
+ )
49
+ """)
50
+ self._conn.execute("""
51
+ CREATE INDEX IF NOT EXISTS idx_chunks_file_path ON chunks(file_path)
52
+ """)
53
+ self._conn.commit()
54
+
55
+ def store_chunk(self, chunk: CodeChunk):
56
+ self._conn.execute("""
57
+ INSERT OR REPLACE INTO chunks
58
+ (id, file_path, content, language, start_line, end_line,
59
+ chunk_type, metadata, embedding, indexed_at)
60
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
61
+ """, (
62
+ chunk.id, chunk.file_path, chunk.content, chunk.language,
63
+ chunk.start_line, chunk.end_line, chunk.chunk_type,
64
+ json.dumps(chunk.metadata),
65
+ json.dumps(chunk.embedding) if chunk.embedding else None,
66
+ time.time(),
67
+ ))
68
+ self._conn.commit()
69
+
70
+ def store_chunks(self, chunks: List[CodeChunk]):
71
+ for chunk in chunks:
72
+ self.store_chunk(chunk)
73
+
74
+ def search_by_embedding(self, query_embedding: List[float], top_k: int = 5) -> List[SearchResult]:
75
+ cursor = self._conn.execute(
76
+ "SELECT id, file_path, content, language, start_line, end_line, "
77
+ "chunk_type, metadata, embedding FROM chunks WHERE embedding IS NOT NULL"
78
+ )
79
+ scored: List[Tuple[float, dict]] = []
80
+ for row in cursor:
81
+ try:
82
+ stored_emb = json.loads(row[8])
83
+ score = self._cosine_similarity(query_embedding, stored_emb)
84
+ scored.append((score, {
85
+ "id": row[0], "file_path": row[1], "content": row[2],
86
+ "language": row[3], "start_line": row[4], "end_line": row[5],
87
+ "chunk_type": row[6], "metadata": json.loads(row[7]),
88
+ }))
89
+ except (json.JSONDecodeError, IndexError):
90
+ continue
91
+
92
+ scored.sort(key=lambda x: x[0], reverse=True)
93
+ results = []
94
+ for score, data in scored[:top_k]:
95
+ chunk = CodeChunk(**data)
96
+ results.append(SearchResult(chunk=chunk, score=score))
97
+ return results
98
+
99
+ def search_by_keyword(self, query: str, top_k: int = 5) -> List[SearchResult]:
100
+ terms = query.lower().split()
101
+ cursor = self._conn.execute(
102
+ "SELECT id, file_path, content, language, start_line, end_line, "
103
+ "chunk_type, metadata FROM chunks"
104
+ )
105
+ scored: List[Tuple[float, dict]] = []
106
+ for row in cursor:
107
+ content_lower = (row[2] or "").lower()
108
+ path_lower = (row[1] or "").lower()
109
+ match_count = sum(1 for t in terms if t in content_lower or t in path_lower)
110
+ if match_count > 0:
111
+ score = match_count / len(terms)
112
+ scored.append((score, {
113
+ "id": row[0], "file_path": row[1], "content": row[2],
114
+ "language": row[3], "start_line": row[4], "end_line": row[5],
115
+ "chunk_type": row[6], "metadata": json.loads(row[7]),
116
+ }))
117
+
118
+ scored.sort(key=lambda x: x[0], reverse=True)
119
+ results = []
120
+ for score, data in scored[:top_k]:
121
+ chunk = CodeChunk(**data)
122
+ results.append(SearchResult(chunk=chunk, score=score))
123
+ return results
124
+
125
+ def remove_file(self, file_path: str):
126
+ self._conn.execute("DELETE FROM chunks WHERE file_path = ?", (file_path,))
127
+ self._conn.commit()
128
+
129
+ def clear(self):
130
+ self._conn.execute("DELETE FROM chunks")
131
+ self._conn.commit()
132
+
133
+ @property
134
+ def chunk_count(self) -> int:
135
+ cursor = self._conn.execute("SELECT COUNT(*) FROM chunks")
136
+ return cursor.fetchone()[0]
137
+
138
+ @property
139
+ def file_count(self) -> int:
140
+ cursor = self._conn.execute("SELECT COUNT(DISTINCT file_path) FROM chunks")
141
+ return cursor.fetchone()[0]
142
+
143
+ def close(self):
144
+ self._conn.close()
145
+
146
+ @staticmethod
147
+ def _cosine_similarity(a: List[float], b: List[float]) -> float:
148
+ if not a or not b:
149
+ return 0.0
150
+ dot = sum(x * y for x, y in zip(a, b))
151
+ norm_a = math.sqrt(sum(x * x for x in a))
152
+ norm_b = math.sqrt(sum(y * y for y in b))
153
+ if norm_a == 0 or norm_b == 0:
154
+ return 0.0
155
+ return dot / (norm_a * norm_b)