code-context-control 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/_hook_utils.py +99 -0
- cli/c3.py +6152 -0
- cli/commands/__init__.py +1 -0
- cli/commands/common.py +312 -0
- cli/commands/parser.py +286 -0
- cli/docs.html +3178 -0
- cli/edits.html +878 -0
- cli/hook_auto_snapshot.py +142 -0
- cli/hook_c3_signal.py +61 -0
- cli/hook_c3read.py +116 -0
- cli/hook_edit_ledger.py +213 -0
- cli/hook_edit_unlock.py +170 -0
- cli/hook_filter.py +130 -0
- cli/hook_ghost_files.py +238 -0
- cli/hook_pretool_enforce.py +334 -0
- cli/hook_read.py +200 -0
- cli/hook_session_stats.py +62 -0
- cli/hook_terse_advisor.py +190 -0
- cli/hub.html +3764 -0
- cli/hub_server.py +1619 -0
- cli/mcp_proxy.py +428 -0
- cli/mcp_server.py +660 -0
- cli/server.py +2985 -0
- cli/tools/__init__.py +4 -0
- cli/tools/_helpers.py +65 -0
- cli/tools/agent.py +1165 -0
- cli/tools/compress.py +215 -0
- cli/tools/delegate.py +1184 -0
- cli/tools/edit.py +313 -0
- cli/tools/edits.py +118 -0
- cli/tools/filter.py +285 -0
- cli/tools/impact.py +163 -0
- cli/tools/memory.py +469 -0
- cli/tools/read.py +224 -0
- cli/tools/search.py +337 -0
- cli/tools/session.py +95 -0
- cli/tools/shell.py +193 -0
- cli/tools/status.py +306 -0
- cli/tools/validate.py +310 -0
- cli/ui/api.js +36 -0
- cli/ui/app.js +207 -0
- cli/ui/components/chat.js +758 -0
- cli/ui/components/dashboard.js +689 -0
- cli/ui/components/edits.js +220 -0
- cli/ui/components/instructions.js +481 -0
- cli/ui/components/memory.js +626 -0
- cli/ui/components/sessions.js +606 -0
- cli/ui/components/settings.js +1404 -0
- cli/ui/components/sidebar.js +156 -0
- cli/ui/icons.js +51 -0
- cli/ui/shared.js +119 -0
- cli/ui/theme.js +22 -0
- cli/ui.html +168 -0
- cli/ui_legacy.html +6797 -0
- cli/ui_nano.html +503 -0
- code_context_control-2.28.0.dist-info/METADATA +248 -0
- code_context_control-2.28.0.dist-info/RECORD +150 -0
- code_context_control-2.28.0.dist-info/WHEEL +5 -0
- code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
- code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
- code_context_control-2.28.0.dist-info/top_level.txt +5 -0
- core/__init__.py +75 -0
- core/config.py +269 -0
- core/ide.py +188 -0
- oracle/__init__.py +1 -0
- oracle/config.py +75 -0
- oracle/oracle.html +3900 -0
- oracle/oracle_server.py +663 -0
- oracle/services/__init__.py +1 -0
- oracle/services/c3_bridge.py +210 -0
- oracle/services/chat_engine.py +1103 -0
- oracle/services/chat_store.py +155 -0
- oracle/services/cross_memory.py +154 -0
- oracle/services/federated_graph.py +463 -0
- oracle/services/health_checker.py +117 -0
- oracle/services/insight_engine.py +307 -0
- oracle/services/memory_reader.py +106 -0
- oracle/services/memory_writer.py +182 -0
- oracle/services/ollama_bridge.py +332 -0
- oracle/services/project_scanner.py +87 -0
- oracle/services/review_agent.py +206 -0
- services/__init__.py +1 -0
- services/activity_log.py +93 -0
- services/agent_base.py +124 -0
- services/agents.py +1529 -0
- services/auto_memory.py +407 -0
- services/bench/__init__.py +6 -0
- services/bench/external/__init__.py +29 -0
- services/bench/external/aider_polyglot.py +405 -0
- services/bench/external/swe_bench.py +485 -0
- services/benchmark_dashboard.py +596 -0
- services/claude_md.py +785 -0
- services/compressor.py +592 -0
- services/context_snapshot.py +356 -0
- services/conversation_store.py +870 -0
- services/doc_index.py +537 -0
- services/e2e_benchmark.py +2884 -0
- services/e2e_evaluator.py +396 -0
- services/e2e_tasks.py +743 -0
- services/edit_ledger.py +459 -0
- services/embedding_index.py +341 -0
- services/error_reporting.py +123 -0
- services/file_memory.py +734 -0
- services/hub_service.py +585 -0
- services/indexer.py +712 -0
- services/memory.py +318 -0
- services/memory_consolidator.py +538 -0
- services/memory_graph.py +382 -0
- services/memory_grounder.py +304 -0
- services/memory_scorer.py +246 -0
- services/metrics.py +86 -0
- services/notifications.py +209 -0
- services/ollama_client.py +201 -0
- services/output_filter.py +488 -0
- services/parser.py +1238 -0
- services/project_manager.py +579 -0
- services/protocol.py +306 -0
- services/proxy_state.py +152 -0
- services/retrieval_broker.py +129 -0
- services/router.py +414 -0
- services/runtime.py +326 -0
- services/session_benchmark.py +1945 -0
- services/session_manager.py +1026 -0
- services/session_preloader.py +251 -0
- services/text_index.py +90 -0
- services/tool_classifier.py +176 -0
- services/transcript_index.py +340 -0
- services/validation_cache.py +155 -0
- services/vector_store.py +299 -0
- services/version_tracker.py +271 -0
- services/watcher.py +192 -0
- tui/__init__.py +0 -0
- tui/backend.py +59 -0
- tui/main.py +145 -0
- tui/screens/__init__.py +1 -0
- tui/screens/benchmark_view.py +109 -0
- tui/screens/claudemd_view.py +46 -0
- tui/screens/compress_view.py +52 -0
- tui/screens/index_view.py +74 -0
- tui/screens/init_view.py +82 -0
- tui/screens/mcp_view.py +73 -0
- tui/screens/optimize_view.py +41 -0
- tui/screens/pipe_view.py +46 -0
- tui/screens/projects_view.py +355 -0
- tui/screens/search_view.py +55 -0
- tui/screens/session_view.py +143 -0
- tui/screens/stats.py +158 -0
- tui/screens/ui_view.py +54 -0
- tui/theme.tcss +335 -0
services/indexer.py
ADDED
|
@@ -0,0 +1,712 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Smart Local Index Service
|
|
3
|
+
|
|
4
|
+
Builds a searchable index of your codebase using TF-IDF and code structure analysis.
|
|
5
|
+
Retrieves only the most relevant code snippets for a given query, dramatically reducing
|
|
6
|
+
the amount of code Claude needs to read.
|
|
7
|
+
"""
|
|
8
|
+
import json
|
|
9
|
+
import math
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
from collections import Counter, OrderedDict
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from core import count_tokens
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CodeIndex:
    """TF-IDF based code search index with structural awareness.

    Files under ``project_path`` are split into chunks (AST sections when the
    project parser yields them, otherwise indentation/brace heuristics or
    fixed-size windows). Every chunk gets TF-IDF term weights, and ``search``
    ranks chunks by those weights combined with name, path, phrase and
    recency heuristics. The index is persisted as ``index.json`` inside
    ``index_dir``.

    Chunk ``line_start`` / ``line_end`` values are 0-indexed and inclusive.
    """

    # Hand-curated query expansions, applied before learned co-occurrences.
    _SYNONYMS = {
        "endpoint": ["route", "handler", "api"],
        "api": ["endpoint", "route", "handler"],
        "helper": ["util", "utils", "common"],
        "registry": ["profile", "profiles", "config"],
        "profile": ["registry", "config", "ide"],
        "compress": ["compression", "compressor"],
        "metrics": ["stats", "summary"],
        "summary": ["metrics", "stats"],
        "delegate": ["ollama", "model"],
        "search": ["index", "retrieval"],
        "file": ["path", "filepath"],
        "path": ["file", "filepath"],
        "mcp": ["server", "tool"],
    }

    # Upper bound on memoized query expansions (oldest entry evicted first).
    _EXPAND_CACHE_MAX = 256

    # Definition detectors used by the heuristic chunkers.
    _PY_DEFINITION = re.compile(r'^(class|(?:async\s+)?def)\s+(\w+)')
    _R_DEFINITION = re.compile(r'^(\w+)\s*<-\s*function')
    _JS_DECLARATION = re.compile(
        r'^(?:export\s+)?(?:default\s+)?(?:async\s+)?'
        r'(?:function|class|const|let|var|interface|type|enum)\s+(\w+)'
    )

    # Chunk types preferred when two results have the same score.
    _STRUCTURAL_TYPES = frozenset({"function", "class", "method", "declaration"})

    def __init__(self, project_path: str, index_dir: str = ".c3/index"):
        """Create an index rooted at *project_path*.

        Args:
            project_path: Root directory of the project to index.
            index_dir: Directory (relative to *project_path*) that holds
                ``index.json``. It is created if missing.
        """
        self.project_path = Path(project_path)
        self.index_dir = self.project_path / index_dir
        self.index_dir.mkdir(parents=True, exist_ok=True)

        # Index data
        self.documents = {}    # doc_id -> {path, full_path, lines, tokens}
        self.chunks = {}       # chunk_id -> {id, doc_id, content, tokens, type, name, line_start, line_end}
        self.idf = {}          # term -> IDF score
        self.chunk_tfidf = {}  # chunk_id -> {term: tfidf_score}
        self.symbols = {}      # lower-cased symbol name -> [chunk_ids]
        # Bounded LRU — an unbounded dict grew indefinitely over long sessions.
        self._search_cache: "OrderedDict" = OrderedDict()
        self._search_cache_max = 128
        # Memoized query expansion + bigrams. Agents repeat the same queries.
        self._expand_cache: dict = {}
        self._cooccurrence = {}  # term -> {term: count} for auto-synonyms
        self._file_mtimes = {}   # doc_id -> mtime for recency bias

        # Config
        self.skip_dirs = {'node_modules', '.git', '__pycache__', '.c3', 'venv',
                          'env', '.venv', 'dist', 'build', '.next', '.cache',
                          'coverage', '.pytest_cache'}
        # NOTE: suffixes are lower-cased before lookup, so the '.R' entry can
        # only match through its lower-case twin '.r'.
        self.code_exts = {
            # Python
            '.py', '.pyi', '.pyx',
            # JavaScript / TypeScript
            '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs',
            # Web
            '.html', '.htm', '.css', '.scss', '.sass', '.less', '.vue', '.svelte',
            # Markdown
            '.md', '.mdx',
            # Data / Config
            '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.env.example',
            '.xml', '.csv',
            # Systems
            '.c', '.h', '.cpp', '.cxx', '.cc', '.hpp', '.hxx',
            '.rs', '.go', '.java', '.kt', '.kts', '.scala',
            '.cs', '.fs', '.vb',
            # Scripting
            '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
            '.rb', '.pl', '.pm', '.lua', '.php',
            '.r', '.R', '.jl',
            # Query / Schema
            '.sql', '.graphql', '.gql', '.prisma',
            # Functional
            '.hs', '.ex', '.exs', '.erl', '.clj', '.cljs', '.elm', '.ml', '.mli',
            # Mobile
            '.swift', '.m', '.mm', '.dart',
            # Docs / Markup
            '.rst', '.tex', '.adoc',
            # DevOps / IaC
            '.tf', '.hcl', '.dockerfile', '.nix',
            # Other
            '.proto', '.thrift', '.zig', '.nim', '.v',
            '.makefile', '.cmake',
        }

    def build_index(self, max_files: int = 500) -> dict:
        """Build the full code index from scratch and persist it.

        Args:
            max_files: Maximum number of files to index; files are visited
                in sorted path order.

        Returns:
            Dict with ``files_indexed``, ``chunks_created``,
            ``unique_symbols`` and ``index_path``.
        """
        self.documents = {}
        self.chunks = {}
        self.symbols = {}
        # Both depend on the previous build: stale mtimes would skew the
        # recency range and stale expansions would carry old co-occurrences.
        self._file_mtimes = {}
        self._expand_cache = {}
        self._search_cache = OrderedDict()

        files_indexed = 0
        chunks_created = 0

        for fpath in sorted(self.project_path.rglob('*')):
            if files_indexed >= max_files:
                break
            # Cheap string checks first; is_file() costs a stat call.
            ext = fpath.suffix.lower()
            if ext not in self.code_exts:
                continue
            # Test project-relative parts only, so a project that itself
            # lives under e.g. ".../build/" is not skipped wholesale.
            rel = fpath.relative_to(self.project_path)
            if any(part in self.skip_dirs for part in rel.parts):
                continue
            if not fpath.is_file():
                continue

            try:
                # Explicit UTF-8: the platform default codec would mangle
                # non-ASCII source on some systems.
                content = fpath.read_text(encoding='utf-8', errors='replace')
            except (OSError, ValueError):
                continue

            doc_id = str(rel)
            self.documents[doc_id] = {
                "path": doc_id,
                "full_path": str(fpath),
                "lines": len(content.splitlines()),
                "tokens": count_tokens(content),
            }

            for chunk in self._chunk_file(content, ext, doc_id):
                chunk_id = chunk["id"]
                if chunk_id in self.chunks:
                    # Same-named definitions (overloads, property
                    # getter/setter pairs, "unnamed" sections) would
                    # otherwise overwrite each other.
                    n = 2
                    while f"{chunk_id}#{n}" in self.chunks:
                        n += 1
                    chunk_id = f"{chunk_id}#{n}"
                    chunk["id"] = chunk_id
                self.chunks[chunk_id] = chunk
                chunks_created += 1

                name = chunk.get("name")
                if name:
                    self.symbols.setdefault(name.lower(), []).append(chunk_id)

            # Track file modification time for recency bias (best effort).
            try:
                self._file_mtimes[doc_id] = os.path.getmtime(str(fpath))
            except OSError:
                pass

            files_indexed += 1

        self._build_tfidf()
        self._build_cooccurrence()
        self._save_index()

        return {
            "files_indexed": files_indexed,
            "chunks_created": chunks_created,
            "unique_symbols": len(self.symbols),
            "index_path": str(self.index_dir)
        }

    def _make_chunk(self, doc_id: str, chunk_lines: list, name, ctype,
                    start: int, end: int, chunk_id: str = None) -> dict:
        """Assemble one chunk record from a run of lines.

        The id defaults to ``<doc_id>::<name>``, or ``<doc_id>::block_<start>``
        for unnamed chunks.
        """
        content = '\n'.join(chunk_lines)
        return {
            "id": chunk_id or f"{doc_id}::{name or f'block_{start}'}",
            "doc_id": doc_id,
            "content": content,
            "tokens": count_tokens(content),
            "type": ctype or "block",
            "name": name,
            "line_start": start,
            "line_end": end,
        }

    def _chunk_file(self, content: str, ext: str, doc_id: str) -> list:
        """Split a file into meaningful chunks (functions, classes, blocks).

        Strategy, in order: AST sections from the project parser, then the
        indentation (Python/R) or brace (JS/TS) heuristics, then fixed-size
        overlapping windows.
        """
        lines = content.split('\n')
        chunks = []

        try:
            # Imported lazily: the parser is optional, and any failure in it
            # simply falls through to the heuristic chunkers below.
            from services.parser import extract_sections_ast
            ast_sections = extract_sections_ast(content, ext)
            if ast_sections:
                ast_chunks = self._chunk_by_ast(ast_sections, lines, doc_id)
                if ast_chunks:
                    return ast_chunks
        except Exception:
            pass

        if ext in ('.py', '.r', '.R'):
            chunks = self._chunk_by_indent(lines, doc_id, ext)
        elif ext in ('.js', '.ts', '.tsx', '.jsx'):
            chunks = self._chunk_by_braces(lines, doc_id, ext)

        # Fallback: fixed-size chunks with overlap
        if not chunks:
            chunks = self._chunk_fixed(lines, doc_id, chunk_size=40, overlap=10)

        return chunks

    def _chunk_by_ast(self, sections: list, lines: list, doc_id: str) -> list:
        """Turn parser sections (and their nested children) into chunks.

        Each section is read for ``name``, ``type``, 1-indexed
        ``line_start`` / ``line_end`` and optional ``children``. Sections of
        type ``"import"`` produce no chunk, but their children are still
        visited. Child names are qualified as ``parent.child``.
        """
        chunks = []

        def process_section(sec, parent_name=""):
            name = sec.get("name", "unnamed")
            full_name = f"{parent_name}.{name}" if parent_name else name
            start = sec["line_start"] - 1  # 0-indexed
            end = sec["line_end"] - 1
            chunk_content = '\n'.join(lines[start:end + 1])

            if sec.get("type") != "import":
                chunks.append({
                    "id": f"{doc_id}::{full_name}",
                    "doc_id": doc_id,
                    "content": chunk_content,
                    "tokens": count_tokens(chunk_content),
                    "type": sec.get("type", "block"),
                    "name": full_name,
                    "line_start": start,
                    "line_end": end,
                })

            for child in sec.get("children", []):
                process_section(child, full_name)

        for sec in sections:
            process_section(sec)

        return chunks

    def _chunk_by_indent(self, lines: list, doc_id: str, ext: str) -> list:
        """Chunk Python/R files by definitions, including class methods.

        A new chunk starts at every ``class`` / ``def`` / ``async def``
        (Python) or ``name <- function`` (R) line. Python functions defined
        inside a class are named ``Class.function``.
        """
        if ext == '.py':
            pattern = self._PY_DEFINITION
        elif ext in ('.r', '.R'):
            pattern = self._R_DEFINITION
        else:
            pattern = None
        is_python = ext == '.py'

        chunks = []
        current_chunk = []
        current_name = None
        current_type = None
        chunk_start = 0
        class_stack = []  # (indent, class_name) for enclosing Python classes

        for i, line in enumerate(lines):
            stripped = line.rstrip()
            lstripped = line.lstrip()
            indent = len(line) - len(lstripped)

            # A non-blank line at or left of a class's indent closes it.
            while class_stack and lstripped and indent <= class_stack[-1][0]:
                class_stack.pop()

            m = pattern.match(lstripped) if pattern else None
            if m:
                if is_python:
                    ctype = 'class' if m.group(1) == 'class' else 'function'
                    name = m.group(2)
                    if ctype == 'function' and class_stack:
                        name = f"{class_stack[-1][1]}.{name}"
                else:
                    ctype = 'function'
                    name = m.group(1)

                if current_chunk:
                    chunks.append(self._make_chunk(
                        doc_id, current_chunk, current_name, current_type,
                        chunk_start, i - 1))
                    current_chunk = []
                    chunk_start = i

                current_name = name
                current_type = ctype
                if is_python and ctype == 'class':
                    class_stack.append((indent, m.group(2)))

            current_chunk.append(stripped)

        if current_chunk:
            chunks.append(self._make_chunk(
                doc_id, current_chunk, current_name, current_type,
                chunk_start, len(lines) - 1))

        return chunks

    def _chunk_by_braces(self, lines: list, doc_id: str, ext: str) -> list:
        """Chunk JS/TS files by top-level declarations.

        A line starts a new chunk when it begins (at column 0) with a
        declaration keyword and the brace depth *before* that line is zero.
        Testing the depth before counting the line's own braces is what lets
        ``function foo() {`` be recognised. Braces inside strings or comments
        are counted too, so this is a heuristic.
        """
        chunks = []
        current_chunk = []
        current_name = None
        current_type = None
        chunk_start = 0
        brace_depth = 0

        for i, line in enumerate(lines):
            stripped = line.rstrip()

            if brace_depth <= 0:
                m = self._JS_DECLARATION.match(stripped)
                if m:
                    if current_chunk:
                        chunks.append(self._make_chunk(
                            doc_id, current_chunk, current_name, current_type,
                            chunk_start, i - 1))
                    current_chunk = []
                    current_name = m.group(1)
                    current_type = "declaration"
                    chunk_start = i
                    brace_depth = 0  # resync after unbalanced closers

            current_chunk.append(stripped)
            brace_depth += stripped.count('{') - stripped.count('}')

        if current_chunk:
            chunks.append(self._make_chunk(
                doc_id, current_chunk, current_name, current_type,
                chunk_start, len(lines) - 1))

        return chunks

    def _chunk_fixed(self, lines: list, doc_id: str,
                     chunk_size: int = 40, overlap: int = 10) -> list:
        """Fixed-size chunking with overlap.

        Windows of *chunk_size* lines advance by ``chunk_size - overlap``
        (at least 1, so ``overlap >= chunk_size`` cannot stall or raise).
        All-blank windows are skipped, and iteration stops once a window
        reaches the end of the file so no fully redundant tail is emitted.
        """
        chunk_size = max(1, chunk_size)
        step = max(1, chunk_size - overlap)
        total = len(lines)
        chunks = []
        for i in range(0, total, step):
            chunk_lines = lines[i:i + chunk_size]
            if any(text.strip() for text in chunk_lines):
                chunks.append(self._make_chunk(
                    doc_id, chunk_lines, None, "block",
                    i, min(i + chunk_size, total) - 1,
                    chunk_id=f"{doc_id}::chunk_{i}"))
            if i + chunk_size >= total:
                break
        return chunks

    def _tokenize(self, text: str) -> list:
        """Split text into lower-case alphabetic tokens of length >= 2.

        camelCase, snake_case and kebab-case identifiers are split into
        their parts first; digits and single letters are dropped.
        """
        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
        text = text.replace('_', ' ').replace('-', ' ')
        return re.findall(r'[a-zA-Z]{2,}', text.lower())

    def _expand_query_tokens(self, query: str) -> tuple:
        """Expand query with synonyms + return phrase bigrams.

        Returns ``(expanded_tokens, bigrams)``. Memoized per query string so
        repeat searches skip re-expansion. Bigrams are returned rather than
        stored on ``self`` so concurrent searches don't share them, and
        fresh list copies are handed out so callers cannot corrupt the
        cached entry.
        """
        cached = self._expand_cache.get(query)
        if cached is not None:
            return list(cached[0]), list(cached[1])

        base_tokens = self._tokenize(query)
        expanded = list(base_tokens)
        seen = set(base_tokens)
        for token in base_tokens:
            # Hard-coded synonyms first, then up to three learned ones.
            candidates = list(self._SYNONYMS.get(token, ()))
            candidates += list(self._cooccurrence.get(token, ()))[:3]
            for related in candidates:
                if related not in seen:
                    expanded.append(related)
                    seen.add(related)

        # Only the user's own adjacent tokens count as phrases.
        bigrams = list(zip(base_tokens, base_tokens[1:]))

        self._expand_cache[query] = (expanded, bigrams)
        if len(self._expand_cache) > self._EXPAND_CACHE_MAX:
            self._expand_cache.pop(next(iter(self._expand_cache)))
        return list(expanded), list(bigrams)

    def _mtime_range(self) -> tuple:
        """Return ``(oldest, newest)`` positive mtime, or ``(0.0, 0.0)`` if none."""
        positive = [m for m in self._file_mtimes.values() if m > 0]
        if not positive:
            return (0.0, 0.0)
        return (min(positive), max(positive))

    def _score_chunk(self, chunk_id: str, query: str, query_tokens: list,
                     query_bigrams: list = None, mtime_range: tuple = None) -> float:
        """Combine TF-IDF relevance with path/name heuristics, bigrams, and recency.

        Args:
            chunk_id: Key into ``self.chunks`` / ``self.chunk_tfidf``.
            query: Raw query string (used for exact-phrase checks).
            query_tokens: Expanded query tokens.
            query_bigrams: Adjacent query-token pairs for phrase boosts.
            mtime_range: Optional ``(oldest, newest)`` mtime, precomputed by
                the caller so it is not recomputed for every chunk.

        Returns:
            A score, or ``0.0`` when no query token has weight in the chunk.
        """
        tfidf = self.chunk_tfidf.get(chunk_id, {})
        score = sum(tfidf.get(qt, 0) for qt in query_tokens)
        if score <= 0:
            return 0.0

        chunk = self.chunks[chunk_id]
        doc_id = chunk["doc_id"]
        path_lower = doc_id.lower()
        name_lower = (chunk.get("name") or "").lower()
        path_parts = [part for part in re.split(r'[/\\._-]+', path_lower) if part]

        chunk_tokens = chunk.get("tokens")
        if not chunk_tokens:
            chunk_tokens = count_tokens(chunk["content"])
            chunk["tokens"] = chunk_tokens

        for qt in query_tokens:
            if name_lower:
                if qt == name_lower:
                    score *= 3.2
                elif qt in name_lower:
                    score *= 1.7

            if qt in path_parts:
                score += 1.5
            elif qt in path_lower:
                score += 0.75

        # Per-chunk caches (keys prefixed with "_" are stripped on save).
        content_lower = chunk.get("_content_lower")
        if content_lower is None:
            content_lower = (chunk.get("content") or "").lower()
            chunk["_content_lower"] = content_lower

        query_lower = query.lower()
        if query_lower and query_lower in content_lower:
            score *= 2.0

        query_joined = query_lower.replace(" ", "_")
        if len(query_joined) > 3 and query_joined in content_lower:
            score *= 1.8

        if any(part in query_lower for part in path_parts[:2]):
            score += 0.4

        if query_bigrams:
            pairs = chunk.get("_content_bigrams")
            if pairs is None:
                # Tokenize the original-case content: lower-casing first
                # would defeat camelCase splitting and hide phrase matches
                # inside identifiers such as getUser.
                toks = self._tokenize(chunk.get("content") or "")
                pairs = set(zip(toks, toks[1:]))
                chunk["_content_bigrams"] = pairs
            for bigram in query_bigrams:
                if tuple(bigram) in pairs:
                    score *= 1.5

        # Recency bias: 0.9x for the oldest indexed file up to 1.1x for the
        # newest. Normalised over the observed range because a raw ratio of
        # epoch timestamps is ~1.0 for every file.
        mtime = self._file_mtimes.get(doc_id, 0)
        if mtime > 0:
            oldest, newest = mtime_range if mtime_range is not None else self._mtime_range()
            if newest > 0:
                span = newest - oldest
                age_ratio = (mtime - oldest) / span if span > 0 else 1.0
                age_ratio = min(1.0, max(0.0, age_ratio))
                score *= (0.9 + 0.2 * age_ratio)

        score += min(len(path_parts), 6) * 0.02
        # Chunks above 450 tokens are progressively down-weighted.
        size_penalty = 1.0 + max(0.0, chunk_tokens - 450) / 1200.0
        return score / size_penalty

    def _build_tfidf(self):
        """Build TF-IDF scores for all chunks.

        Term frequency is augmented (``0.5 + 0.5 * tf / max_tf``); symbol
        names count three times. IDF uses the always-positive form
        ``log(1 + (N - df + 0.5) / (df + 0.5))`` so that small indexes and
        common terms never yield zero or negative weights, which would make
        ``search`` drop genuine matches.
        """
        # Reset first so an empty rebuild cannot leave stale weights behind.
        self.idf = {}
        self.chunk_tfidf = {}
        total = len(self.chunks)
        if total == 0:
            return

        df = Counter()
        chunk_tf = {}
        for chunk_id, chunk in self.chunks.items():
            tokens = self._tokenize(chunk["content"])
            tokens += self._tokenize(chunk["doc_id"])  # file path tokens
            if chunk.get("name"):
                tokens += self._tokenize(chunk["name"]) * 3  # boost symbol names

            tf = Counter(tokens)
            chunk_tf[chunk_id] = tf
            df.update(tf.keys())

        self.idf = {
            term: math.log(1.0 + (total - freq + 0.5) / (freq + 0.5))
            for term, freq in df.items()
        }

        for chunk_id, tf in chunk_tf.items():
            max_tf = max(tf.values(), default=1)
            self.chunk_tfidf[chunk_id] = {
                term: (0.5 + 0.5 * (freq / max_tf)) * self.idf.get(term, 0)
                for term, freq in tf.items()
            }

    def _build_cooccurrence(self):
        """Build lightweight co-occurrence map from indexed chunks for auto-synonyms.

        For every term, keeps the (at most) five terms that share a chunk
        with it most often, provided they co-occur at least three times.
        Tokens are visited in sorted order so ties resolve the same way on
        every run.
        """
        counts = {}
        for chunk in self.chunks.values():
            tokens = sorted(set(self._tokenize(chunk["content"])))
            for term in tokens:
                counter = counts.get(term)
                if counter is None:
                    counter = counts[term] = Counter()
                counter.update(tokens)

        pruned = {}
        for term, counter in counts.items():
            counter.pop(term, None)  # a term is not its own synonym
            top = [(other, c) for other, c in counter.most_common(5) if c >= 3]
            if top:
                pruned[term] = dict(top)
        self._cooccurrence = pruned

    def search(self, query: str, top_k: int = 5, max_tokens: int = 4000,
               include_content: bool = True) -> list:
        """Search the index and return most relevant chunks.

        Set include_content=False to get metadata only (saves ~70% tokens).

        Args:
            query: Free-text query.
            top_k: Maximum number of results.
            max_tokens: Total token budget across returned chunks.
            include_content: Whether each result carries the chunk text.

        Returns:
            List of result dicts (``chunk_id``, ``file``, ``name``, ``type``,
            ``lines``, ``tokens``, ``file_tokens``, ``score`` and optionally
            ``content``), best first. Empty when nothing matches or no index
            is available.
        """
        if not self.chunks:
            self._load_index()
            if not self.chunks:
                return []

        cache_key = (query, int(top_k), int(max_tokens), bool(include_content))
        cached = self._search_cache.get(cache_key)
        if cached is not None:
            self._search_cache.move_to_end(cache_key)
            return [dict(item) for item in cached]

        query_tokens, query_bigrams = self._expand_query_tokens(query)
        if not query_tokens:
            return []

        # Computed once per search rather than once per chunk.
        mtime_range = self._mtime_range()

        scores = {}
        for chunk_id in self.chunk_tfidf:
            if chunk_id not in self.chunks:
                continue  # tolerate an inconsistent on-disk index
            score = self._score_chunk(chunk_id, query, query_tokens,
                                      query_bigrams, mtime_range=mtime_range)
            if score > 0:
                scores[chunk_id] = score

        # Sort by score, then prefer named/structural chunks and shorter paths on ties.
        ranked = sorted(
            scores.items(),
            key=lambda item: (
                item[1],
                1 if self.chunks[item[0]].get("name") else 0,
                1 if self.chunks[item[0]].get("type") in self._STRUCTURAL_TYPES else 0,
                -len(self.chunks[item[0]]["doc_id"]),
            ),
            reverse=True,
        )

        results = []
        token_budget = max_tokens
        seen_docs = set()
        # Once this many results exist, stop taking more from the same file.
        per_doc_gate = max(2, top_k // 2)

        for chunk_id, score in ranked[:top_k * 4]:
            chunk = self.chunks[chunk_id]
            chunk_tokens = chunk.get("tokens") or count_tokens(chunk["content"])

            if chunk_tokens > token_budget:
                continue
            if chunk["doc_id"] in seen_docs and len(results) >= per_doc_gate:
                continue

            doc = self.documents.get(chunk["doc_id"], {})
            result = {
                "chunk_id": chunk_id,
                "file": chunk["doc_id"],
                "name": chunk.get("name"),
                "type": chunk["type"],
                "lines": f"{chunk['line_start']}-{chunk['line_end']}",
                "tokens": chunk_tokens,
                "file_tokens": doc.get("tokens", chunk_tokens),
                "score": round(score, 3),
            }
            if include_content:
                result["content"] = chunk["content"]
            results.append(result)

            token_budget -= chunk_tokens
            seen_docs.add(chunk["doc_id"])

            if len(results) >= top_k or token_budget <= 0:
                break

        self._search_cache[cache_key] = [dict(item) for item in results]
        if len(self._search_cache) > self._search_cache_max:
            self._search_cache.popitem(last=False)
        return results

    def get_context(self, query: str, top_k: int = 5, max_tokens: int = 4000) -> str:
        """Get a formatted context string ready to pass to Claude.

        Returns a Markdown document with one fenced section per hit, or a
        fixed "no results" sentence when the search finds nothing.
        """
        results = self.search(query, top_k, max_tokens)
        if not results:
            return "No relevant code found in index."

        sections = []
        total_tokens = 0
        for r in results:
            section = f"## {r['file']} (L{r['lines']})"
            if r['name']:
                section += f" — {r['name']}"
            section += f"\n```\n{r['content']}\n```"
            sections.append(section)
            total_tokens += r['tokens']

        header = f"# Relevant Code Context ({total_tokens} tokens, {len(results)} chunks)\n"
        return header + '\n\n'.join(sections)

    def _save_index(self):
        """Save index to disk.

        Per-chunk in-memory caches (keys prefixed with ``_``) are stripped.
        Co-occurrence synonyms and file mtimes are persisted so a loaded
        index ranks the same way as a freshly built one. The file is written
        to a temporary name and renamed into place so an interrupted write
        cannot leave a truncated ``index.json``.

        Raises:
            OSError: If the index directory or file cannot be written.
        """
        data = {
            "documents": self.documents,
            "chunks": {
                cid: {k: v for k, v in chunk.items() if not k.startswith("_")}
                for cid, chunk in self.chunks.items()
            },
            "symbols": self.symbols,
            "idf": self.idf,
            "chunk_tfidf": self.chunk_tfidf,
            "cooccurrence": self._cooccurrence,
            "file_mtimes": self._file_mtimes,
        }
        self.index_dir.mkdir(parents=True, exist_ok=True)
        index_file = self.index_dir / "index.json"
        tmp_file = self.index_dir / "index.json.tmp"
        with open(tmp_file, 'w', encoding='utf-8') as f:
            json.dump(data, f)
        os.replace(tmp_file, index_file)

    def _load_index(self) -> bool:
        """Load index from disk.

        Returns:
            ``True`` when ``index.json`` was read and applied; ``False`` when
            it is missing, unreadable or malformed (in which case the
            in-memory state is left untouched).
        """
        index_file = self.index_dir / "index.json"
        try:
            with open(index_file, encoding='utf-8') as f:
                data = json.load(f)
            documents = data["documents"]
            chunks = data["chunks"]
        except (OSError, ValueError, KeyError, TypeError):
            return False
        if not isinstance(documents, dict) or not isinstance(chunks, dict):
            return False

        # Older index files may lack per-chunk token counts; backfill them.
        mutated = False
        try:
            for chunk in chunks.values():
                if "tokens" not in chunk:
                    chunk["tokens"] = count_tokens(chunk.get("content", ""))
                    mutated = True
        except (AttributeError, TypeError):
            return False

        cooccurrence = data.get("cooccurrence", {})
        file_mtimes = data.get("file_mtimes", {})

        self.documents = documents
        self.chunks = chunks
        self.symbols = data.get("symbols", {})
        self.idf = data.get("idf", {})
        self.chunk_tfidf = data.get("chunk_tfidf", {})
        self._cooccurrence = cooccurrence if isinstance(cooccurrence, dict) else {}
        self._file_mtimes = {
            doc_id: mtime
            for doc_id, mtime in (file_mtimes.items() if isinstance(file_mtimes, dict) else ())
            if isinstance(mtime, (int, float)) and not isinstance(mtime, bool)
        }
        # Cached results and expansions belong to the previous index state.
        self._search_cache = OrderedDict()
        self._expand_cache = {}

        if mutated:
            try:
                self._save_index()
            except OSError:
                # Read-only index dir: the in-memory index is still valid.
                pass
        return True

    def get_stats(self) -> dict:
        """Get index statistics.

        Loads the index from disk first when nothing is in memory.

        Returns:
            Dict with ``files_indexed``, ``total_chunks``,
            ``total_tokens_in_codebase``, ``unique_symbols`` and
            ``index_size_kb`` (0 when ``index.json`` does not exist).
        """
        if not self.documents:
            self._load_index()

        try:
            size_kb = round((self.index_dir / "index.json").stat().st_size / 1024, 1)
        except OSError:
            size_kb = 0

        total_tokens = sum(d.get("tokens", 0) for d in self.documents.values())
        return {
            "files_indexed": len(self.documents),
            "total_chunks": len(self.chunks),
            "total_tokens_in_codebase": total_tokens,
            "unique_symbols": len(self.symbols),
            "index_size_kb": size_kb
        }
|