luckyd-code 1.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- luckyd_code/__init__.py +54 -0
- luckyd_code/__main__.py +5 -0
- luckyd_code/_agent_loop.py +551 -0
- luckyd_code/_data_dir.py +73 -0
- luckyd_code/agent.py +38 -0
- luckyd_code/analytics/__init__.py +18 -0
- luckyd_code/analytics/reporter.py +195 -0
- luckyd_code/analytics/scanner.py +443 -0
- luckyd_code/analytics/smells.py +316 -0
- luckyd_code/analytics/trends.py +303 -0
- luckyd_code/api.py +473 -0
- luckyd_code/audit_daemon.py +845 -0
- luckyd_code/autonomous_fixer.py +473 -0
- luckyd_code/background.py +159 -0
- luckyd_code/backup.py +237 -0
- luckyd_code/brain/__init__.py +84 -0
- luckyd_code/brain/assembler.py +100 -0
- luckyd_code/brain/chunker.py +345 -0
- luckyd_code/brain/constants.py +73 -0
- luckyd_code/brain/embedder.py +163 -0
- luckyd_code/brain/graph.py +311 -0
- luckyd_code/brain/indexer.py +316 -0
- luckyd_code/brain/parser.py +140 -0
- luckyd_code/brain/retriever.py +234 -0
- luckyd_code/cli.py +894 -0
- luckyd_code/cli_commands/__init__.py +1 -0
- luckyd_code/cli_commands/audit.py +120 -0
- luckyd_code/cli_commands/background.py +83 -0
- luckyd_code/cli_commands/brain.py +87 -0
- luckyd_code/cli_commands/config.py +75 -0
- luckyd_code/cli_commands/dispatcher.py +695 -0
- luckyd_code/cli_commands/sessions.py +41 -0
- luckyd_code/cli_entry.py +147 -0
- luckyd_code/cli_utils.py +112 -0
- luckyd_code/config.py +205 -0
- luckyd_code/context.py +214 -0
- luckyd_code/cost_tracker.py +209 -0
- luckyd_code/error_reporter.py +508 -0
- luckyd_code/exceptions.py +39 -0
- luckyd_code/export.py +126 -0
- luckyd_code/feedback_analyzer.py +290 -0
- luckyd_code/file_watcher.py +258 -0
- luckyd_code/git/__init__.py +11 -0
- luckyd_code/git/auto_commit.py +157 -0
- luckyd_code/git/tools.py +85 -0
- luckyd_code/hooks.py +236 -0
- luckyd_code/indexer.py +280 -0
- luckyd_code/init.py +39 -0
- luckyd_code/keybindings.py +77 -0
- luckyd_code/log.py +55 -0
- luckyd_code/mcp/__init__.py +6 -0
- luckyd_code/mcp/client.py +184 -0
- luckyd_code/memory/__init__.py +19 -0
- luckyd_code/memory/manager.py +339 -0
- luckyd_code/metrics/__init__.py +5 -0
- luckyd_code/model_registry.py +131 -0
- luckyd_code/orchestrator.py +204 -0
- luckyd_code/permissions/__init__.py +1 -0
- luckyd_code/permissions/manager.py +103 -0
- luckyd_code/planner.py +361 -0
- luckyd_code/plugins.py +91 -0
- luckyd_code/py.typed +0 -0
- luckyd_code/retry.py +57 -0
- luckyd_code/router.py +417 -0
- luckyd_code/sandbox.py +156 -0
- luckyd_code/self_critique.py +2 -0
- luckyd_code/self_improve.py +274 -0
- luckyd_code/sessions.py +114 -0
- luckyd_code/settings.py +72 -0
- luckyd_code/skills/__init__.py +8 -0
- luckyd_code/skills/review.py +22 -0
- luckyd_code/skills/security.py +17 -0
- luckyd_code/tasks/__init__.py +1 -0
- luckyd_code/tasks/manager.py +102 -0
- luckyd_code/templates/icon-192.png +0 -0
- luckyd_code/templates/icon-512.png +0 -0
- luckyd_code/templates/index.html +1965 -0
- luckyd_code/templates/manifest.json +14 -0
- luckyd_code/templates/src/app.js +694 -0
- luckyd_code/templates/src/body.html +767 -0
- luckyd_code/templates/src/cdn.txt +2 -0
- luckyd_code/templates/src/style.css +474 -0
- luckyd_code/templates/sw.js +31 -0
- luckyd_code/templates/test.html +6 -0
- luckyd_code/themes.py +48 -0
- luckyd_code/tools/__init__.py +97 -0
- luckyd_code/tools/agent_tools.py +65 -0
- luckyd_code/tools/bash.py +360 -0
- luckyd_code/tools/brain_tools.py +137 -0
- luckyd_code/tools/browser.py +369 -0
- luckyd_code/tools/datetime_tool.py +34 -0
- luckyd_code/tools/dockerfile_gen.py +212 -0
- luckyd_code/tools/file_ops.py +381 -0
- luckyd_code/tools/game_gen.py +360 -0
- luckyd_code/tools/git_tools.py +130 -0
- luckyd_code/tools/git_worktree.py +63 -0
- luckyd_code/tools/path_validate.py +64 -0
- luckyd_code/tools/project_gen.py +187 -0
- luckyd_code/tools/readme_gen.py +227 -0
- luckyd_code/tools/registry.py +157 -0
- luckyd_code/tools/shell_detect.py +109 -0
- luckyd_code/tools/web.py +89 -0
- luckyd_code/tools/youtube.py +187 -0
- luckyd_code/tools_bridge.py +144 -0
- luckyd_code/undo.py +126 -0
- luckyd_code/update.py +60 -0
- luckyd_code/verify.py +360 -0
- luckyd_code/web_app.py +176 -0
- luckyd_code/web_routes/__init__.py +23 -0
- luckyd_code/web_routes/background.py +73 -0
- luckyd_code/web_routes/brain.py +109 -0
- luckyd_code/web_routes/cost.py +12 -0
- luckyd_code/web_routes/files.py +133 -0
- luckyd_code/web_routes/memories.py +94 -0
- luckyd_code/web_routes/misc.py +67 -0
- luckyd_code/web_routes/project.py +48 -0
- luckyd_code/web_routes/review.py +20 -0
- luckyd_code/web_routes/sessions.py +44 -0
- luckyd_code/web_routes/settings.py +43 -0
- luckyd_code/web_routes/static.py +70 -0
- luckyd_code/web_routes/update.py +19 -0
- luckyd_code/web_routes/ws.py +237 -0
- luckyd_code-1.2.2.dist-info/METADATA +297 -0
- luckyd_code-1.2.2.dist-info/RECORD +127 -0
- luckyd_code-1.2.2.dist-info/WHEEL +4 -0
- luckyd_code-1.2.2.dist-info/entry_points.txt +3 -0
- luckyd_code-1.2.2.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
"""Code chunker — splits source files into overlapping chunks for embedding."""
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from ..log import get_logger
|
|
9
|
+
from .constants import LANGUAGE_MAP, Chunk, should_skip
|
|
10
|
+
|
|
11
|
+
# Regex patterns for non-Python language structure detection.
# Each tuple is (pattern, chunk_type, subtype); group 1 of the pattern
# must capture the definition's name.
STRUCTURE_PATTERNS: dict[str, list[tuple[str, str, str]]] = {
    "javascript": [
        # The optional `async` prefix folds plain and async declarations
        # into one pattern; a separate non-async pattern would match the
        # same text a second time and emit duplicate chunks.
        (r"(?:^|\n)\s*(?:async\s+)?function\s+\*?\s*(\w+)\s*\(", "function", "function"),
        (r"(?:^|\n)\s*class\s+(\w+)", "class", "class"),
        (r"(?:^|\n)\s*(?:export\s+)?(?:default\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\(?.*\)?\s*=>", "function", "arrow_function"),
        (r"(?:^|\n)\s*(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*function", "function", "function_expression"),
    ],
    "typescript": [
        (r"(?:^|\n)\s*(?:async\s+)?function\s+\*?\s*(\w+)\s*\(", "function", "function"),
        (r"(?:^|\n)\s*class\s+(\w+)", "class", "class"),
        (r"(?:^|\n)\s*interface\s+(\w+)", "class", "interface"),
        (r"(?:^|\n)\s*type\s+(\w+)\s*=", "class", "type_alias"),
        (r"(?:^|\n)\s*(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*(?:async\s+)?\(?.*\)?\s*=>", "function", "arrow_function"),
        (r"(?:^|\n)\s*(?:export\s+)?(?:const|let|var)\s+(\w+)\s*=\s*function", "function", "function_expression"),
    ],
    "go": [
        (r"(?:^|\n)\s*func\s+(\w+)", "function", "function"),
        (r"(?:^|\n)\s*type\s+(\w+)\s+struct", "class", "struct"),
        (r"(?:^|\n)\s*type\s+(\w+)\s+interface", "class", "interface"),
    ],
    "rust": [
        (r"(?:^|\n)\s*fn\s+(\w+)", "function", "function"),
        (r"(?:^|\n)\s*struct\s+(\w+)", "class", "struct"),
        (r"(?:^|\n)\s*enum\s+(\w+)", "class", "enum"),
        (r"(?:^|\n)\s*trait\s+(\w+)", "class", "trait"),
        (r"(?:^|\n)\s*impl\s+(\w+)", "class", "impl"),
    ],
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _chunk_python(filepath: Path, content: str) -> list[Chunk]:
    """Split a Python file into structural chunks using the ast module.

    Emits:
      - one "module" chunk covering the module docstring and the leading
        run of imports (when non-empty),
      - one "class" chunk per top-level class,
      - one "method" chunk per direct (sync or async) method of each class,
      - one "function" chunk per top-level (sync or async) function.

    Falls back to blank-line chunking when the source does not parse.

    Args:
        filepath: Path of the file (used verbatim in chunk ids).
        content: Full source text of the file.

    Returns:
        List of Chunk dicts in source order (module header first).
    """
    chunks: list[Chunk] = []
    lines = content.split("\n")
    rel_path = str(filepath)

    try:
        tree = ast.parse(content, filename=str(filepath))
    except SyntaxError:
        # Unparseable source (mid-edit, wrong Python version, ...) —
        # degrade to the generic blank-line chunker.
        return _chunk_by_lines(filepath, content, "python")

    # Checking ast.FunctionDef alone silently skipped every `async def`;
    # include AsyncFunctionDef so async functions/methods are chunked too.
    func_types = (ast.FunctionDef, ast.AsyncFunctionDef)

    def _extract(node: ast.stmt) -> tuple[int, str]:
        """Return (end_line, source text) for *node*, including one line
        of trailing context when the node is not at end-of-file."""
        end = node.end_lineno or node.lineno
        node_lines = lines[node.lineno - 1:end]
        if end < len(lines):
            node_lines.append(lines[end])
        return end, "\n".join(node_lines)

    # --- module header: docstring + leading imports -----------------------
    header_end = 1
    module_doc = ast.get_docstring(tree) or ""
    for node in ast.iter_child_nodes(tree):
        if isinstance(node, (ast.Import, ast.ImportFrom)):
            header_end = node.end_lineno or node.lineno
        elif isinstance(node, ast.Expr) and module_doc:
            # The docstring expression (only treated as header when a
            # module docstring actually exists).
            header_end = node.end_lineno or node.lineno
        else:
            break

    header_content = "\n".join(lines[:header_end])
    if header_content.strip():
        chunks.append(Chunk(
            file_path=rel_path,
            chunk_id=f"{rel_path}:module",
            start_line=1,
            end_line=header_end,
            type="module",
            name=Path(rel_path).name,
            language="python",
            content=header_content,
        ))

    # --- top-level classes and functions ----------------------------------
    for node in ast.iter_child_nodes(tree):
        if isinstance(node, ast.ClassDef):
            end, cls_content = _extract(node)
            chunks.append(Chunk(
                file_path=rel_path,
                chunk_id=f"{rel_path}:class:{node.name}",
                start_line=node.lineno,
                end_line=end,
                type="class",
                name=node.name,
                language="python",
                content=cls_content,
            ))

            # Each direct method also becomes its own finer-grained chunk.
            # NOTE(review): method chunk ids omit the class name, so
            # same-named methods in different classes collide — confirm
            # before changing the id format (stored indexes may depend
            # on it).
            for child in ast.iter_child_nodes(node):
                if isinstance(child, func_types):
                    m_end, method_content = _extract(child)
                    chunks.append(Chunk(
                        file_path=rel_path,
                        chunk_id=f"{rel_path}:method:{child.name}",
                        start_line=child.lineno,
                        end_line=m_end,
                        type="method",
                        name=child.name,
                        language="python",
                        content=method_content,
                    ))

        elif isinstance(node, func_types):
            end, func_content = _extract(node)
            chunks.append(Chunk(
                file_path=rel_path,
                chunk_id=f"{rel_path}:function:{node.name}",
                start_line=node.lineno,
                end_line=end,
                type="function",
                name=node.name,
                language="python",
                content=func_content,
            ))

    return chunks
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _chunk_by_lines(filepath: Path, content: str, language: str) -> list[Chunk]:
    """Fallback chunker: split *content* on blank-line boundaries.

    Used for languages without structural regex patterns and for Python
    files that fail to parse. Emits a "module" chunk for the leading
    header (lines up to the first blank line that still has content
    after it) and a "block" chunk per blank-line-separated paragraph.
    """
    chunks: list[Chunk] = []
    lines = content.split("\n")
    rel_path = str(filepath)

    # Header = everything before the first blank line that is followed by
    # more non-blank content within the next two lines.
    header_lines: list[str] = []
    i = 0
    while i < len(lines):
        if lines[i].strip() == "" and any(l.strip() for l in lines[i + 1:i + 3]):
            break
        header_lines.append(lines[i])
        i += 1
    if header_lines:
        chunks.append(Chunk(
            file_path=rel_path,
            chunk_id=f"{rel_path}:module",
            start_line=1,
            end_line=i,
            type="module",
            name=Path(rel_path).name,
            language=language,
            content="\n".join(header_lines).strip(),
        ))

    # Character offset of the line after the header's terminating blank
    # line. (The previous code used the *line index* `i + 1` directly as
    # a character offset — it is compared against re.finditer() character
    # positions below — so the first block re-included header text.)
    block_start = min(sum(len(line) + 1 for line in lines[:i + 1]), len(content))

    # Each run of blank lines separates two blocks.
    for match in re.finditer(r"\n\s*\n", content):
        block_end = match.start()
        if block_end > block_start:
            block_content = content[block_start:block_end].strip()
            if block_content:
                start_line = content[:block_start].count("\n") + 1
                end_line = content[:block_end].count("\n") + 1
                chunks.append(Chunk(
                    file_path=rel_path,
                    chunk_id=f"{rel_path}:block:{start_line}",
                    start_line=start_line,
                    end_line=end_line,
                    type="block",
                    name="",
                    language=language,
                    content=block_content,
                ))
        block_start = match.end()

    # Trailing block after the last blank-line separator.
    if block_start < len(content):
        last_block = content[block_start:].strip()
        if last_block:
            start_line = content[:block_start].count("\n") + 1
            end_line = content.count("\n") + 1
            chunks.append(Chunk(
                file_path=rel_path,
                chunk_id=f"{rel_path}:block:{start_line}",
                start_line=start_line,
                end_line=end_line,
                type="block",
                name="",
                language=language,
                content=last_block,
            ))

    return chunks
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _chunk_with_regex(filepath: Path, content: str, language: str) -> list[Chunk]:
    """Chunk a non-Python source file using its language's regex patterns.

    Finds definitions via STRUCTURE_PATTERNS[language], emits a "module"
    chunk for any header text before the earliest match, then one chunk
    per matched definition whose extent is the following brace-delimited
    block (found by _find_block_end).
    """
    chunks: list[Chunk] = []
    lines = content.split("\n")
    rel_path = str(filepath)
    patterns = STRUCTURE_PATTERNS.get(language, [])

    # Header ends at the line before the earliest structural match,
    # capped at 20 lines; only each pattern's first match matters here.
    header_end = min(20, len(lines))
    for pattern, _type, _subtype in patterns:
        for m in re.finditer(pattern, content):
            # NOTE(review): when the match begins at the "\n" preceding
            # the definition, that newline is excluded from the count, so
            # line_num is the line of the newline itself — one less than
            # the definition's line. Confirm whether this is intended.
            line_num = content[:m.start()].count("\n") + 1
            header_end = min(header_end, line_num - 1)
            break  # only the first match of this pattern is needed

    header_content = "\n".join(lines[:header_end]).strip()
    if header_content:
        chunks.append(Chunk(
            file_path=rel_path,
            chunk_id=f"{rel_path}:module",
            start_line=1,
            end_line=header_end,
            type="module",
            name=Path(rel_path).name,
            language=language,
            content=header_content,
        ))

    # Collect every (line, name, chunk_type, subtype) match across all
    # patterns for this language.
    finds: list[tuple[int, str, str, str]] = []
    for pattern, chunk_type, subtype in patterns:
        for m in re.finditer(pattern, content):
            line_num = content[:m.start()].count("\n") + 1
            name = m.group(1)
            finds.append((line_num, name, chunk_type, subtype))

    finds.sort(key=lambda x: x[0])  # process definitions in source order

    for idx, (start_line, name, chunk_type, subtype) in enumerate(finds):
        # Convert the 1-based start line to a character offset by walking
        # newline positions.
        byte_pos = 0
        for _ in range(start_line - 1):
            byte_pos = content.index("\n", byte_pos) + 1

        # Extent of the definition = its brace-delimited block; fall back
        # to end-of-file when no block end is found after the match.
        end_byte = _find_block_end(content, byte_pos)
        if end_byte <= byte_pos:
            end_byte = len(content)

        end_line = content[:end_byte].count("\n") + 1

        chunk_content = content[byte_pos:end_byte].rstrip()
        if not chunk_content:
            continue

        if idx + 1 < len(finds):
            # Append up to 200 chars of trailing context for retrieval
            # continuity, but never spill into the next definition.
            next_start = finds[idx + 1][0]
            next_byte = 0
            for _ in range(next_start - 1):
                next_byte = content.index("\n", next_byte) + 1
            overlap_end = min(end_byte + 200, next_byte)
            overlap_content = content[end_byte:overlap_end]
            chunk_content += overlap_content

        chunks.append(Chunk(
            file_path=rel_path,
            chunk_id=f"{rel_path}:{chunk_type}:{name}",
            start_line=start_line,
            end_line=end_line,
            type=chunk_type,
            name=name,
            language=language,
            content=chunk_content,
        ))

    return chunks
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _find_block_end(content: str, start_byte: int) -> int:
|
|
270
|
+
"""Find the end of a brace-delimited block starting at start_byte."""
|
|
271
|
+
# Find the first '{' after start_byte
|
|
272
|
+
brace_start = content.find("{", start_byte)
|
|
273
|
+
if brace_start == -1:
|
|
274
|
+
return len(content)
|
|
275
|
+
|
|
276
|
+
depth = 0
|
|
277
|
+
in_string = False
|
|
278
|
+
string_char = None
|
|
279
|
+
i = brace_start
|
|
280
|
+
|
|
281
|
+
while i < len(content):
|
|
282
|
+
ch = content[i]
|
|
283
|
+
if not in_string:
|
|
284
|
+
if ch == '"' or ch == "'":
|
|
285
|
+
in_string = True
|
|
286
|
+
string_char = ch
|
|
287
|
+
elif ch == "{":
|
|
288
|
+
depth += 1
|
|
289
|
+
elif ch == "}":
|
|
290
|
+
depth -= 1
|
|
291
|
+
if depth == 0:
|
|
292
|
+
return i + 1
|
|
293
|
+
else:
|
|
294
|
+
if ch == "\\":
|
|
295
|
+
i += 1 # skip escaped char
|
|
296
|
+
elif ch == string_char:
|
|
297
|
+
in_string = False
|
|
298
|
+
i += 1
|
|
299
|
+
|
|
300
|
+
return len(content)
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def chunk_file(filepath: Path) -> list[Chunk]:
    """Chunk a single source file, dispatching on its detected language.

    Returns an empty list for unsupported extensions, unreadable files,
    and files with no non-whitespace content.
    """
    language = LANGUAGE_MAP.get(filepath.suffix.lower())
    if not language:
        return []

    try:
        text = filepath.read_text(encoding="utf-8", errors="replace")
    except OSError:
        get_logger().warning("Could not read file %s", filepath, exc_info=True)
        return []

    if not text.strip():
        return []

    # Dispatch: AST chunking for Python, regex structure detection where
    # patterns exist, blank-line paragraphs otherwise.
    if language == "python":
        return _chunk_python(filepath, text)
    if language in STRUCTURE_PATTERNS:
        return _chunk_with_regex(filepath, text, language)
    return _chunk_by_lines(filepath, text, language)
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def chunk_project(project_root: str) -> list[Chunk]:
    """Walk *project_root* and chunk every supported source file.

    Directories flagged by should_skip() are pruned from the walk and
    files whose extension is not in LANGUAGE_MAP are ignored; per-file
    chunking errors are logged so one bad file cannot abort the scan.
    """
    root = Path(project_root).resolve()
    collected: list[Chunk] = []

    for dirpath, dirnames, filenames in os.walk(root):
        # Prune in place so os.walk never descends into skipped dirs.
        dirnames[:] = [d for d in dirnames if not should_skip(d)]

        for filename in filenames:
            if Path(filename).suffix.lower() not in LANGUAGE_MAP:
                continue

            full_path = Path(dirpath) / filename
            try:
                collected.extend(chunk_file(full_path))
            except Exception:
                get_logger().warning("Error chunking %s", full_path, exc_info=True)

    return collected
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Shared constants, types, and helpers for the brain module."""
|
|
2
|
+
|
|
3
|
+
from typing import TypedDict
|
|
4
|
+
|
|
5
|
+
from .._data_dir import data_path
|
|
6
|
+
|
|
7
|
+
# Directories to skip during project scanning.
# NOTE(review): should_skip() below also rejects any name starting with
# ".", which already covers the dot-prefixed entries here — they matter
# only for direct `in SKIP_DIRS` membership tests. Confirm other callers
# before pruning.
SKIP_DIRS = {
    ".git", "__pycache__", "node_modules", ".venv", "venv", "env",
    ".tox", ".eggs", "dist", "build", ".next", ".nuxt",
    "target", "vendor", ".bundle", ".claude", ".deepseek-code", ".vscode", ".idea",
    ".mypy_cache", ".pytest_cache", ".ruff_cache", ".ruff",
    ".svn", ".hg", "egg-info",
}
|
|
15
|
+
|
|
16
|
+
# Supported file extensions and their languages. Keys are lowercase
# suffixes including the dot; values are the language names used by the
# chunkers (only "python" and the STRUCTURE_PATTERNS languages get
# structural chunking — the rest fall back to blank-line blocks).
# NOTE(review): chunk_file/chunk_project lowercase the suffix before
# lookup, so the ".R" key is only reachable via direct lookups — confirm
# whether any caller uses the map without lowercasing.
LANGUAGE_MAP = {
    ".py": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".go": "go",
    ".rs": "rust",
    ".java": "java",
    ".cpp": "cpp",
    ".cc": "cpp",
    ".cxx": "cpp",
    ".c": "c",
    ".h": "c",  # .h headers are treated as C even when used from C++
    ".hpp": "cpp",
    ".cs": "csharp",
    ".rb": "ruby",
    ".php": "php",
    ".swift": "swift",
    ".kt": "kotlin",
    ".kts": "kotlin",
    ".sh": "bash",
    ".bash": "bash",
    ".zsh": "bash",
    ".lua": "lua",
    ".r": "r",
    ".R": "r",
    ".scala": "scala",
    ".ex": "elixir",
    ".exs": "elixir",
    ".md": "markdown",
    ".toml": "toml",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".json": "json",
    ".sql": "sql",
}
|
|
54
|
+
|
|
55
|
+
# Storage directory for index and cache files (under the app data dir).
BRAIN_DIR = data_path("brain")


class Chunk(TypedDict):
    """A chunk of source code ready for embedding and search."""
    file_path: str   # path of the originating file, as passed to the chunker
    chunk_id: str    # unique id, e.g. "<path>:function:<name>" or "<path>:module"
    start_line: int  # 1-based first line of the chunk in the file
    end_line: int    # 1-based last line (inclusive)
    type: str        # module, class, function, method, block
    name: str        # definition name; empty for anonymous "block" chunks
    language: str    # language name from LANGUAGE_MAP
    content: str     # the chunk's source text
    # score is added by the retriever at search time
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def should_skip(name: str) -> bool:
    """Return True for directory names that project scans should ignore:
    hidden dot-directories and anything listed in SKIP_DIRS."""
    if name.startswith("."):
        return True
    return name in SKIP_DIRS
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""Embedder — generates embeddings for code chunks.
|
|
2
|
+
|
|
3
|
+
Supports multiple backends:
|
|
4
|
+
- sentence-transformers (local, no API key)
|
|
5
|
+
- OpenAI text-embedding-3-small (requires API key)
|
|
6
|
+
|
|
7
|
+
All dependencies are optional — embedder returns available=False gracefully.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import os
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
from ..log import get_logger
|
|
14
|
+
|
|
15
|
+
# Module-level singleton
_embedder: Optional["Embedder"] = None


def get_embedder() -> "Embedder":
    """Return the shared Embedder, creating and loading it on first call."""
    global _embedder
    if _embedder is not None:
        return _embedder
    instance = Embedder()
    instance.load()
    _embedder = instance
    return _embedder
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Embedder:
    """Generates embeddings for text/code using available backends.

    Backends are optional: sentence-transformers (local) or the OpenAI
    API. When neither can be loaded, ``available`` stays False and the
    embed methods return None instead of raising.
    """

    def __init__(self):
        self.available = False        # True once a backend loaded successfully
        self.dimension = 0            # embedding vector size (384 local, 1536 OpenAI)
        self.model_name = "none"      # human-readable name of the loaded model
        self._model = None            # sentence-transformers model, if loaded
        self._openai_client = None    # OpenAI client, if loaded

    def load(self, model_type: Optional[str] = None) -> bool:
        """Load the embedding model.

        Args:
            model_type: "local" for sentence-transformers, "openai" for OpenAI API.
                If None, reads from settings or tries local first.

        Returns:
            True if a backend was successfully loaded.
        """
        if model_type is None:
            # Resolve the backend from user settings; any failure (missing
            # settings module, unreadable file) falls back to local.
            try:
                from .. import settings as cfg
                model_type = cfg.load_settings().get("embedding_model", "local")
            except Exception:
                model_type = "local"

        if model_type == "openai":
            return self._load_openai()
        # Any other value (including "local") uses the local backend.
        return self._load_local()

    def _load_local(self) -> bool:
        """Load sentence-transformers for local embeddings.

        Returns True on success; on any failure logs, sets
        ``available = False``, and returns False.
        """
        try:
            import sentence_transformers

            self._model = sentence_transformers.SentenceTransformer(
                "all-MiniLM-L6-v2"
            )
            self.dimension = 384
            self.model_name = "sentence-transformers/all-MiniLM-L6-v2"
            self.available = True
            get_logger().info(
                "Loaded local embedding model: %s (dim=%d)",
                self.model_name, self.dimension,
            )
            return True
        except ImportError:
            # Optional dependency absent — informational, not an error.
            get_logger().info(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            )
        except Exception as exc:
            get_logger().warning("Failed to load sentence-transformers: %s", exc)

        self.available = False
        return False

    def _load_openai(self) -> bool:
        """Load OpenAI embedding API client.

        Requires the OPENAI_API_KEY environment variable. Returns True on
        success; otherwise logs, sets ``available = False``, and returns
        False.
        """
        api_key = os.environ.get("OPENAI_API_KEY")
        if not api_key:
            get_logger().warning(
                "OPENAI_API_KEY not set. Cannot use OpenAI embeddings."
            )
            self.available = False
            return False

        try:
            from openai import OpenAI

            self._openai_client = OpenAI(api_key=api_key)
            self.dimension = 1536
            self.model_name = "text-embedding-3-small"
            self.available = True
            get_logger().info("Loaded OpenAI embedding model: text-embedding-3-small")
            return True
        except ImportError:
            get_logger().warning("openai package not installed.")
        except Exception as exc:
            get_logger().warning("Failed to load OpenAI embeddings: %s", exc)

        self.available = False
        return False

    def embed(self, texts: list[str]) -> Optional[list[list[float]]]:
        """Embed a list of texts into vectors.

        Args:
            texts: List of text strings to embed.

        Returns:
            List of embedding vectors (list of floats), or None if
            unavailable or all inputs are empty/whitespace.

        NOTE(review): empty/whitespace texts are silently dropped, so the
        returned list can be SHORTER than ``texts`` with no indication of
        which entries were skipped — confirm callers only pass non-empty
        texts or do not rely on positional alignment.
        """
        if not self.available or not texts:
            return None

        # Filter out empty texts
        valid_texts = [t for t in texts if t and t.strip()]
        if not valid_texts:
            return None

        # Truncate very long texts (sentence-transformers has 256/512 token limit)
        truncated = [t[:8192] for t in valid_texts]

        try:
            if self._model is not None:
                # sentence-transformers
                import numpy as np

                embeddings = self._model.encode(truncated, show_progress_bar=False)
                return embeddings.tolist() if isinstance(embeddings, np.ndarray) else embeddings  # type: ignore[return-value]
            elif self._openai_client is not None:
                resp = self._openai_client.embeddings.create(
                    model="text-embedding-3-small",
                    input=truncated,
                )
                return [item.embedding for item in resp.data]
        except Exception as exc:
            # Best-effort: embedding failures degrade to None rather than
            # propagating into the indexer.
            get_logger().warning("Embedding failed: %s", exc)

        return None

    def embed_query(self, query: str) -> Optional[list[float]]:
        """Embed a single query string.

        Args:
            query: The search query.

        Returns:
            Single embedding vector, or None if unavailable.
        """
        result = self.embed([query])
        if result:
            return result[0]
        return None
|