code-context-control 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. cli/__init__.py +1 -0
  2. cli/_hook_utils.py +99 -0
  3. cli/c3.py +6152 -0
  4. cli/commands/__init__.py +1 -0
  5. cli/commands/common.py +312 -0
  6. cli/commands/parser.py +286 -0
  7. cli/docs.html +3178 -0
  8. cli/edits.html +878 -0
  9. cli/hook_auto_snapshot.py +142 -0
  10. cli/hook_c3_signal.py +61 -0
  11. cli/hook_c3read.py +116 -0
  12. cli/hook_edit_ledger.py +213 -0
  13. cli/hook_edit_unlock.py +170 -0
  14. cli/hook_filter.py +130 -0
  15. cli/hook_ghost_files.py +238 -0
  16. cli/hook_pretool_enforce.py +334 -0
  17. cli/hook_read.py +200 -0
  18. cli/hook_session_stats.py +62 -0
  19. cli/hook_terse_advisor.py +190 -0
  20. cli/hub.html +3764 -0
  21. cli/hub_server.py +1619 -0
  22. cli/mcp_proxy.py +428 -0
  23. cli/mcp_server.py +660 -0
  24. cli/server.py +2985 -0
  25. cli/tools/__init__.py +4 -0
  26. cli/tools/_helpers.py +65 -0
  27. cli/tools/agent.py +1165 -0
  28. cli/tools/compress.py +215 -0
  29. cli/tools/delegate.py +1184 -0
  30. cli/tools/edit.py +313 -0
  31. cli/tools/edits.py +118 -0
  32. cli/tools/filter.py +285 -0
  33. cli/tools/impact.py +163 -0
  34. cli/tools/memory.py +469 -0
  35. cli/tools/read.py +224 -0
  36. cli/tools/search.py +337 -0
  37. cli/tools/session.py +95 -0
  38. cli/tools/shell.py +193 -0
  39. cli/tools/status.py +306 -0
  40. cli/tools/validate.py +310 -0
  41. cli/ui/api.js +36 -0
  42. cli/ui/app.js +207 -0
  43. cli/ui/components/chat.js +758 -0
  44. cli/ui/components/dashboard.js +689 -0
  45. cli/ui/components/edits.js +220 -0
  46. cli/ui/components/instructions.js +481 -0
  47. cli/ui/components/memory.js +626 -0
  48. cli/ui/components/sessions.js +606 -0
  49. cli/ui/components/settings.js +1404 -0
  50. cli/ui/components/sidebar.js +156 -0
  51. cli/ui/icons.js +51 -0
  52. cli/ui/shared.js +119 -0
  53. cli/ui/theme.js +22 -0
  54. cli/ui.html +168 -0
  55. cli/ui_legacy.html +6797 -0
  56. cli/ui_nano.html +503 -0
  57. code_context_control-2.28.0.dist-info/METADATA +248 -0
  58. code_context_control-2.28.0.dist-info/RECORD +150 -0
  59. code_context_control-2.28.0.dist-info/WHEEL +5 -0
  60. code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
  61. code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
  62. code_context_control-2.28.0.dist-info/top_level.txt +5 -0
  63. core/__init__.py +75 -0
  64. core/config.py +269 -0
  65. core/ide.py +188 -0
  66. oracle/__init__.py +1 -0
  67. oracle/config.py +75 -0
  68. oracle/oracle.html +3900 -0
  69. oracle/oracle_server.py +663 -0
  70. oracle/services/__init__.py +1 -0
  71. oracle/services/c3_bridge.py +210 -0
  72. oracle/services/chat_engine.py +1103 -0
  73. oracle/services/chat_store.py +155 -0
  74. oracle/services/cross_memory.py +154 -0
  75. oracle/services/federated_graph.py +463 -0
  76. oracle/services/health_checker.py +117 -0
  77. oracle/services/insight_engine.py +307 -0
  78. oracle/services/memory_reader.py +106 -0
  79. oracle/services/memory_writer.py +182 -0
  80. oracle/services/ollama_bridge.py +332 -0
  81. oracle/services/project_scanner.py +87 -0
  82. oracle/services/review_agent.py +206 -0
  83. services/__init__.py +1 -0
  84. services/activity_log.py +93 -0
  85. services/agent_base.py +124 -0
  86. services/agents.py +1529 -0
  87. services/auto_memory.py +407 -0
  88. services/bench/__init__.py +6 -0
  89. services/bench/external/__init__.py +29 -0
  90. services/bench/external/aider_polyglot.py +405 -0
  91. services/bench/external/swe_bench.py +485 -0
  92. services/benchmark_dashboard.py +596 -0
  93. services/claude_md.py +785 -0
  94. services/compressor.py +592 -0
  95. services/context_snapshot.py +356 -0
  96. services/conversation_store.py +870 -0
  97. services/doc_index.py +537 -0
  98. services/e2e_benchmark.py +2884 -0
  99. services/e2e_evaluator.py +396 -0
  100. services/e2e_tasks.py +743 -0
  101. services/edit_ledger.py +459 -0
  102. services/embedding_index.py +341 -0
  103. services/error_reporting.py +123 -0
  104. services/file_memory.py +734 -0
  105. services/hub_service.py +585 -0
  106. services/indexer.py +712 -0
  107. services/memory.py +318 -0
  108. services/memory_consolidator.py +538 -0
  109. services/memory_graph.py +382 -0
  110. services/memory_grounder.py +304 -0
  111. services/memory_scorer.py +246 -0
  112. services/metrics.py +86 -0
  113. services/notifications.py +209 -0
  114. services/ollama_client.py +201 -0
  115. services/output_filter.py +488 -0
  116. services/parser.py +1238 -0
  117. services/project_manager.py +579 -0
  118. services/protocol.py +306 -0
  119. services/proxy_state.py +152 -0
  120. services/retrieval_broker.py +129 -0
  121. services/router.py +414 -0
  122. services/runtime.py +326 -0
  123. services/session_benchmark.py +1945 -0
  124. services/session_manager.py +1026 -0
  125. services/session_preloader.py +251 -0
  126. services/text_index.py +90 -0
  127. services/tool_classifier.py +176 -0
  128. services/transcript_index.py +340 -0
  129. services/validation_cache.py +155 -0
  130. services/vector_store.py +299 -0
  131. services/version_tracker.py +271 -0
  132. services/watcher.py +192 -0
  133. tui/__init__.py +0 -0
  134. tui/backend.py +59 -0
  135. tui/main.py +145 -0
  136. tui/screens/__init__.py +1 -0
  137. tui/screens/benchmark_view.py +109 -0
  138. tui/screens/claudemd_view.py +46 -0
  139. tui/screens/compress_view.py +52 -0
  140. tui/screens/index_view.py +74 -0
  141. tui/screens/init_view.py +82 -0
  142. tui/screens/mcp_view.py +73 -0
  143. tui/screens/optimize_view.py +41 -0
  144. tui/screens/pipe_view.py +46 -0
  145. tui/screens/projects_view.py +355 -0
  146. tui/screens/search_view.py +55 -0
  147. tui/screens/session_view.py +143 -0
  148. tui/screens/stats.py +158 -0
  149. tui/screens/ui_view.py +54 -0
  150. tui/theme.tcss +335 -0
services/indexer.py ADDED
@@ -0,0 +1,712 @@
1
+ """
2
+ Smart Local Index Service
3
+
4
+ Builds a searchable index of your codebase using TF-IDF and code structure analysis.
5
+ Retrieves only the most relevant code snippets for a given query, dramatically reducing
6
+ the amount of code Claude needs to read.
7
+ """
8
+ import json
9
+ import math
10
+ import os
11
+ import re
12
+ from collections import Counter, OrderedDict
13
+ from pathlib import Path
14
+
15
+ from core import count_tokens
16
+
17
+
18
+ class CodeIndex:
19
+ """TF-IDF based code search index with structural awareness."""
20
+
21
+ def __init__(self, project_path: str, index_dir: str = ".c3/index"):
22
+ self.project_path = Path(project_path)
23
+ self.index_dir = self.project_path / index_dir
24
+ self.index_dir.mkdir(parents=True, exist_ok=True)
25
+
26
+ # Index data
27
+ self.documents = {} # doc_id -> {path, content, chunks}
28
+ self.chunks = {} # chunk_id -> {doc_id, content, type, name, line_start, line_end}
29
+ self.idf = {} # term -> IDF score
30
+ self.chunk_tfidf = {} # chunk_id -> {term: tfidf_score}
31
+ self.symbols = {} # symbol_name -> [chunk_ids]
32
+ # Bounded LRU — an unbounded dict grew indefinitely over long sessions.
33
+ self._search_cache: "OrderedDict" = OrderedDict()
34
+ self._search_cache_max = 128
35
+ # Memoized query expansion + bigrams. Agents repeat the same queries.
36
+ self._expand_cache: dict = {}
37
+ self._cooccurrence = {} # term -> {term: count} for auto-synonyms
38
+ self._file_mtimes = {} # doc_id -> mtime for recency bias
39
+
40
+ # Config
41
+ self.skip_dirs = {'node_modules', '.git', '__pycache__', '.c3', 'venv',
42
+ 'env', '.venv', 'dist', 'build', '.next', '.cache',
43
+ 'coverage', '.pytest_cache'}
44
+ self.code_exts = {
45
+ # Python
46
+ '.py', '.pyi', '.pyx',
47
+ # JavaScript / TypeScript
48
+ '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs',
49
+ # Web
50
+ '.html', '.htm', '.css', '.scss', '.sass', '.less', '.vue', '.svelte',
51
+ # Markdown
52
+ '.md', '.mdx',
53
+ # Data / Config
54
+ '.json', '.yaml', '.yml', '.toml', '.ini', '.cfg', '.env.example',
55
+ '.xml', '.csv',
56
+ # Systems
57
+ '.c', '.h', '.cpp', '.cxx', '.cc', '.hpp', '.hxx',
58
+ '.rs', '.go', '.java', '.kt', '.kts', '.scala',
59
+ '.cs', '.fs', '.vb',
60
+ # Scripting
61
+ '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
62
+ '.rb', '.pl', '.pm', '.lua', '.php',
63
+ '.r', '.R', '.jl',
64
+ # Query / Schema
65
+ '.sql', '.graphql', '.gql', '.prisma',
66
+ # Functional
67
+ '.hs', '.ex', '.exs', '.erl', '.clj', '.cljs', '.elm', '.ml', '.mli',
68
+ # Mobile
69
+ '.swift', '.m', '.mm', '.dart',
70
+ # Docs / Markup
71
+ '.md', '.mdx', '.rst', '.tex', '.adoc',
72
+ # DevOps / IaC
73
+ '.tf', '.hcl', '.dockerfile', '.nix',
74
+ # Other
75
+ '.proto', '.thrift', '.zig', '.nim', '.v',
76
+ '.makefile', '.cmake',
77
+ }
78
+
79
def build_index(self, max_files: int = 500) -> dict:
    """Build the full code index from scratch and persist it.

    Walks the project in sorted path order, chunks every eligible file,
    then rebuilds the TF-IDF weights and co-occurrence map and saves the
    index to disk.

    Args:
        max_files: Upper bound on the number of files indexed; the walk
            stops once this many files have been processed.

    Returns:
        A dict with ``files_indexed``, ``chunks_created``,
        ``unique_symbols`` and ``index_path``.
    """
    self.documents = {}
    self.chunks = {}
    self.symbols = {}
    self._search_cache: OrderedDict = OrderedDict()
    # Per-build derived state must be dropped as well: stale mtimes of
    # files that no longer exist would skew the recency bias, and memoized
    # query expansions embed the previous build's co-occurrence synonyms.
    self._file_mtimes = {}
    self._expand_cache = {}

    files_indexed = 0
    chunks_created = 0

    for fpath in sorted(self.project_path.rglob('*')):
        if files_indexed >= max_files:
            break
        if not fpath.is_file():
            continue
        suffix = fpath.suffix.lower()
        if suffix not in self.code_exts:
            continue

        # Test skip directories against the path *relative to the project
        # root*. Checking the absolute path would exclude every file when
        # the project itself lives under a directory named e.g. "build".
        rel = fpath.relative_to(self.project_path)
        if any(part in self.skip_dirs for part in rel.parts):
            continue

        try:
            # Explicit UTF-8 keeps indexing independent of the platform's
            # locale encoding; undecodable bytes are replaced, not fatal.
            content = fpath.read_text(encoding='utf-8', errors='replace')
        except OSError:
            continue

        rel_path = str(rel)
        doc_id = rel_path

        # Create document entry
        self.documents[doc_id] = {
            "path": rel_path,
            "full_path": str(fpath),
            "lines": len(content.splitlines()),
            "tokens": count_tokens(content),
        }

        # Chunk the file and register each chunk (and its symbol, if named)
        for chunk in self._chunk_file(content, suffix, doc_id):
            self.chunks[chunk["id"]] = chunk
            chunks_created += 1

            if chunk.get("name"):
                sym = chunk["name"].lower()
                self.symbols.setdefault(sym, []).append(chunk["id"])

        # Track file modification time for recency bias (best effort)
        try:
            self._file_mtimes[doc_id] = os.path.getmtime(str(fpath))
        except OSError:
            pass

        files_indexed += 1

    # Build TF-IDF and co-occurrence synonyms
    self._build_tfidf()
    self._build_cooccurrence()

    # Save index
    self._save_index()

    return {
        "files_indexed": files_indexed,
        "chunks_created": chunks_created,
        "unique_symbols": len(self.symbols),
        "index_path": str(self.index_dir),
    }
149
+
150
def _chunk_file(self, content: str, ext: str, doc_id: str) -> list:
    """Split one file into chunks, trying the most structured strategy first.

    Order of attempts: parser-provided sections, then a language-specific
    heuristic (indent-based or brace-based), then fixed-size windows.

    Args:
        content: Full text of the file.
        ext: File suffix as passed by the caller (e.g. ``'.py'``).
        doc_id: Identifier of the owning document; used in chunk ids.

    Returns:
        A list of chunk dicts.
    """
    source_lines = content.split('\n')

    # Strategy 1: sections from the project's parser. Any failure here
    # (including the import itself) silently falls through to heuristics.
    try:
        from services.parser import extract_sections_ast
        sections = extract_sections_ast(content, ext)
        if sections:
            from_sections = self._chunk_by_ast(sections, source_lines, doc_id)
            if from_sections:
                return from_sections
    except Exception:
        pass

    # Strategy 2: language-specific structural heuristics.
    if ext in ('.py', '.r', '.R'):
        structural = self._chunk_by_indent(source_lines, doc_id, ext)
    elif ext in ('.js', '.ts', '.tsx', '.jsx'):
        structural = self._chunk_by_braces(source_lines, doc_id, ext)
    else:
        structural = []

    if structural:
        return structural

    # Strategy 3: fixed-size chunks with overlap.
    return self._chunk_fixed(source_lines, doc_id, chunk_size=40, overlap=10)
176
+
177
def _chunk_by_ast(self, sections: list, lines: list, doc_id: str) -> list:
    """Turn a tree of parsed sections into a flat list of chunks.

    Sections are visited parent-first, children in their listed order.
    Child names are qualified with the parent's name (``Parent.child``).
    Sections whose type is ``"import"`` produce no chunk, but their
    children are still visited.

    Args:
        sections: Section dicts with 1-based ``line_start``/``line_end``
            and optional ``name``, ``type`` and ``children``.
        lines: The file's lines.
        doc_id: Identifier of the owning document; used in chunk ids.

    Returns:
        Chunk dicts with 0-based ``line_start``/``line_end``.
    """
    from core import count_tokens

    collected = []
    # Explicit stack instead of recursion; entries are pushed reversed so
    # they pop in the original (pre-order) sequence.
    pending = [(section, "") for section in reversed(sections)]

    while pending:
        node, parent_label = pending.pop()

        label = node.get("name", "unnamed")
        qualified = f"{parent_label}.{label}" if parent_label else label
        first = node["line_start"] - 1  # 0-indexed
        last = node["line_end"] - 1
        text = '\n'.join(lines[first:last + 1])

        if node.get("type") != "import":
            collected.append({
                "id": f"{doc_id}::{qualified}",
                "doc_id": doc_id,
                "content": text,
                "tokens": count_tokens(text),
                "type": node.get("type", "block"),
                "name": qualified,
                "line_start": first,
                "line_end": last,
            })

        children = node.get("children", [])
        pending.extend((child, qualified) for child in reversed(children))

    return collected
207
+
208
def _chunk_by_indent(self, lines: list, doc_id: str, ext: str) -> list:
    """Chunk Python/R source at definition lines, tracking enclosing classes.

    A new chunk starts at every line that opens a definition; everything up
    to the next definition belongs to it. For Python, functions found while
    a class is open are named ``Class.function``.

    Args:
        lines: The file's lines.
        doc_id: Identifier of the owning document; used in chunk ids.
        ext: ``'.py'`` selects Python rules; ``'.r'``/``'.R'`` select R rules.

    Returns:
        Chunk dicts with 0-based inclusive ``line_start``/``line_end``.
    """
    python_rules = ext == '.py'
    r_rules = ext in ('.r', '.R')

    collected = []
    buffer = []
    label = None
    kind = None
    first_line = 0
    open_classes = []  # stack of (indent, class_name)

    def emit(last_line: int):
        # Reads label/kind/first_line as they are at call time.
        nonlocal buffer
        if buffer:
            text = '\n'.join(buffer)
            collected.append({
                "id": f"{doc_id}::{label or f'block_{first_line}'}",
                "doc_id": doc_id,
                "content": text,
                "tokens": count_tokens(text),
                "type": kind or "block",
                "name": label,
                "line_start": first_line,
                "line_end": last_line,
            })
            buffer = []

    for idx, raw in enumerate(lines):
        body = raw.lstrip()
        depth = len(raw) - len(body)

        # A non-blank line at or left of a class's indent closes that class.
        if body:
            while open_classes and depth <= open_classes[-1][0]:
                open_classes.pop()

        # Detect definitions
        found_kind = None
        found_name = None
        hit = None
        if python_rules:
            hit = re.match(r'^(class|(?:async\s+)?def)\s+(\w+)', body)
            if hit:
                found_kind = 'class' if hit.group(1) == 'class' else 'function'
                found_name = hit.group(2)
                if found_kind == 'function' and open_classes:
                    found_name = f"{open_classes[-1][1]}.{found_name}"
        elif r_rules:
            hit = re.match(r'^(\w+)\s*<-\s*function', body)
            if hit:
                found_kind = 'function'
                found_name = hit.group(1)

        starts_definition = found_kind is not None

        # Close the running chunk before the definition line joins a new one.
        if starts_definition and buffer:
            emit(idx - 1)
            first_line = idx

        buffer.append(raw.rstrip())

        if starts_definition:
            label, kind = found_name, found_kind
            if python_rules and found_kind == 'class':
                open_classes.append((depth, hit.group(2)))

    # Save last chunk
    emit(len(lines) - 1)

    return collected
277
+
278
def _chunk_by_braces(self, lines: list, doc_id: str, ext: str) -> list:
    """Chunk JS/TS files by top-level declarations.

    A new chunk starts at each line that (a) is reached while the running
    brace depth is zero or below and (b) begins with a declaration keyword
    (optionally preceded by ``export``/``default``/``async``).

    The depth is tested *before* the current line's braces are counted, so
    a declaration that opens its body on the same line
    (``function foo() {``) is recognised as a boundary. Brace counting is
    purely textual: braces inside strings, comments or template literals
    are counted too, and the depth is reset at every detected declaration
    to recover from such drift.

    Args:
        lines: The file's lines.
        doc_id: Identifier of the owning document; used in chunk ids.
        ext: File suffix (not used by the heuristic itself).

    Returns:
        Chunk dicts with 0-based inclusive ``line_start``/``line_end``.
        Text before the first declaration becomes an unnamed ``block``
        chunk unless it is entirely blank.
    """
    declaration = re.compile(
        r'^(?:export\s+)?(?:default\s+)?(?:async\s+)?'
        r'(?:function|class|const|let|var|interface|type|enum)\s+(\w+)'
    )

    def make_chunk(body, name, ctype, start, end):
        content = '\n'.join(body)
        return {
            "id": f"{doc_id}::{name or f'block_{start}'}",
            "doc_id": doc_id,
            "content": content,
            "tokens": count_tokens(content),
            "type": ctype or "block",
            "name": name,
            "line_start": start,
            "line_end": end,
        }

    chunks = []
    current_chunk = []
    current_name = None
    current_type = None
    chunk_start = 0
    brace_depth = 0

    for i, line in enumerate(lines):
        stripped = line.rstrip()

        # Only look for declarations while outside any brace-delimited body.
        m = declaration.match(stripped) if brace_depth <= 0 else None
        if m:
            # Flush what was accumulated so far; skip a blank-only preamble.
            if any(prev.strip() for prev in current_chunk):
                chunks.append(make_chunk(
                    current_chunk, current_name, current_type,
                    chunk_start, i - 1,
                ))
            current_chunk = []
            current_name = m.group(1)
            current_type = "declaration"
            chunk_start = i
            brace_depth = 0  # re-synchronise at every declaration

        current_chunk.append(stripped)
        brace_depth += stripped.count('{') - stripped.count('}')

    if current_chunk:
        chunks.append(make_chunk(
            current_chunk, current_name, current_type,
            chunk_start, len(lines) - 1,
        ))

    return chunks
335
+
336
def _chunk_fixed(self, lines: list, doc_id: str,
                 chunk_size: int = 40, overlap: int = 10) -> list:
    """Fixed-size chunking with overlap.

    Windows of ``chunk_size`` lines are taken every
    ``chunk_size - overlap`` lines. Windows containing only blank lines are
    skipped. Iteration stops once a window reaches the end of the file, so
    no trailing chunk is emitted that is wholly contained in the previous
    one.

    Args:
        lines: The file's lines.
        doc_id: Identifier of the owning document; used in chunk ids.
        chunk_size: Lines per window (values below 1 are treated as 1).
        overlap: Lines shared between consecutive windows. If it is not
            smaller than ``chunk_size`` the window advances one line at a
            time instead of raising or producing nothing.

    Returns:
        Chunk dicts with 0-based inclusive ``line_start``/``line_end``.
    """
    chunk_size = max(1, chunk_size)
    # A zero step makes range() raise; a negative step yields no windows.
    step = max(1, chunk_size - overlap)
    total = len(lines)

    chunks = []
    for i in range(0, total, step):
        chunk_lines = lines[i:i + chunk_size]
        if any(l.strip() for l in chunk_lines):
            content = '\n'.join(chunk_lines)
            chunks.append({
                "id": f"{doc_id}::chunk_{i}",
                "doc_id": doc_id,
                "content": content,
                "tokens": count_tokens(content),
                "type": "block",
                "name": None,
                "line_start": i,
                "line_end": i + len(chunk_lines) - 1,
            })
        # This window already covers the tail; later windows would only
        # repeat a suffix of it.
        if i + chunk_size >= total:
            break
    return chunks
355
+
356
+ def _tokenize(self, text: str) -> list:
357
+ """Simple tokenization for TF-IDF."""
358
+ # Split camelCase and snake_case
359
+ text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
360
+ text = text.replace('_', ' ').replace('-', ' ')
361
+ tokens = re.findall(r'[a-zA-Z]{2,}', text.lower())
362
+ return tokens
363
+
364
+ def _expand_query_tokens(self, query: str) -> tuple:
365
+ """Expand query with synonyms + return phrase bigrams.
366
+
367
+ Returns (expanded_tokens, bigrams). Memoized per query string —
368
+ repeat searches (agentic flows) skip re-expansion.
369
+ Bigrams are returned (not stored on self) so concurrent searches
370
+ don't race on shared state.
371
+ """
372
+ cached = self._expand_cache.get(query)
373
+ if cached is not None:
374
+ return cached
375
+
376
+ base_tokens = self._tokenize(query)
377
+ if not base_tokens:
378
+ self._expand_cache[query] = ([], [])
379
+ if len(self._expand_cache) > 256:
380
+ self._expand_cache.pop(next(iter(self._expand_cache)))
381
+ return [], []
382
+
383
+ synonyms = {
384
+ "endpoint": ["route", "handler", "api"],
385
+ "api": ["endpoint", "route", "handler"],
386
+ "helper": ["util", "utils", "common"],
387
+ "registry": ["profile", "profiles", "config"],
388
+ "profile": ["registry", "config", "ide"],
389
+ "compress": ["compression", "compressor"],
390
+ "metrics": ["stats", "summary"],
391
+ "summary": ["metrics", "stats"],
392
+ "delegate": ["ollama", "model"],
393
+ "search": ["index", "retrieval"],
394
+ "file": ["path", "filepath"],
395
+ "path": ["file", "filepath"],
396
+ "mcp": ["server", "tool"],
397
+ }
398
+
399
+ expanded = list(base_tokens)
400
+ seen = set(base_tokens)
401
+ for token in base_tokens:
402
+ # Hardcoded synonyms
403
+ for related in synonyms.get(token, []):
404
+ if related not in seen:
405
+ expanded.append(related)
406
+ seen.add(related)
407
+ # Co-occurrence synonyms (learned from index)
408
+ if token in self._cooccurrence:
409
+ for related in list(self._cooccurrence[token].keys())[:3]:
410
+ if related not in seen:
411
+ expanded.append(related)
412
+ seen.add(related)
413
+
414
+ # Bigrams for phrase matching — returned to caller, never stored on self
415
+ # (thread-safety: concurrent searches would race on a shared attribute).
416
+ bigrams = [
417
+ (base_tokens[i], base_tokens[i + 1])
418
+ for i in range(len(base_tokens) - 1)
419
+ ]
420
+ self._expand_cache[query] = (expanded, bigrams)
421
+ if len(self._expand_cache) > 256:
422
+ self._expand_cache.pop(next(iter(self._expand_cache)))
423
+ return expanded, bigrams
424
+
425
+ def _score_chunk(self, chunk_id: str, query: str, query_tokens: list,
426
+ query_bigrams: list = None) -> float:
427
+ """Combine TF-IDF relevance with path/name heuristics, bigrams, and recency."""
428
+ tfidf = self.chunk_tfidf.get(chunk_id, {})
429
+ chunk = self.chunks[chunk_id]
430
+ doc_id = chunk["doc_id"]
431
+ path_lower = doc_id.lower()
432
+ name_lower = (chunk.get("name") or "").lower()
433
+ path_parts = [part for part in re.split(r'[/\\._-]+', path_lower) if part]
434
+ chunk_tokens = chunk.get("tokens")
435
+ if not chunk_tokens:
436
+ chunk_tokens = count_tokens(chunk["content"])
437
+ chunk["tokens"] = chunk_tokens
438
+
439
+ score = sum(tfidf.get(qt, 0) for qt in query_tokens)
440
+ if score <= 0:
441
+ return 0.0
442
+
443
+ for qt in query_tokens:
444
+ if chunk.get("name"):
445
+ if qt == name_lower:
446
+ score *= 3.2
447
+ elif qt in name_lower:
448
+ score *= 1.7
449
+
450
+ if qt in path_parts:
451
+ score += 1.5
452
+ elif qt in path_lower:
453
+ score += 0.75
454
+
455
+ # Cache lowercased content + tokens per-chunk. Before this cache,
456
+ # the tokenizer ran for every chunk on every search — dominant hot-path cost.
457
+ content_lower = chunk.get("_content_lower")
458
+ if content_lower is None:
459
+ content_lower = (chunk.get("content") or "").lower()
460
+ chunk["_content_lower"] = content_lower
461
+
462
+ query_lower = query.lower()
463
+ if query_lower and query_lower in content_lower:
464
+ score *= 2.0
465
+
466
+ query_joined = query_lower.replace(" ", "_")
467
+ if len(query_joined) > 3 and query_joined in content_lower:
468
+ score *= 1.8
469
+
470
+ if any(part in query_lower for part in path_parts[:2]):
471
+ score += 0.4
472
+
473
+ # Bigram scoring — bigrams are passed in (not read from self) so the
474
+ # search path is thread-safe. Chunk tokens are cached after first use.
475
+ if query_bigrams:
476
+ chunk_content_tokens = chunk.get("_content_tokens")
477
+ if chunk_content_tokens is None:
478
+ chunk_content_tokens = self._tokenize(content_lower)
479
+ chunk["_content_tokens"] = chunk_content_tokens
480
+ for t1, t2 in query_bigrams:
481
+ for j in range(len(chunk_content_tokens) - 1):
482
+ if chunk_content_tokens[j] == t1 and chunk_content_tokens[j + 1] == t2:
483
+ score *= 1.5
484
+ break
485
+
486
+ # Recency bias — recently modified files get a small boost (1d)
487
+ mtime = self._file_mtimes.get(doc_id, 0)
488
+ if mtime > 0 and self._file_mtimes:
489
+ max_mtime = max(self._file_mtimes.values())
490
+ if max_mtime > 0:
491
+ age_ratio = mtime / max_mtime # 1.0 for newest, lower for older
492
+ score *= (0.9 + 0.2 * age_ratio) # up to 1.1x for newest files
493
+
494
+ score += min(len(path_parts), 6) * 0.02
495
+ size_penalty = 1.0 + max(0.0, chunk_tokens - 450) / 1200.0
496
+ return score / size_penalty
497
+
498
+ def _build_tfidf(self):
499
+ """Build TF-IDF scores for all chunks."""
500
+ N = len(self.chunks)
501
+ if N == 0:
502
+ return
503
+
504
+ # Document frequency
505
+ df = Counter()
506
+ chunk_tf = {}
507
+
508
+ for chunk_id, chunk in self.chunks.items():
509
+ tokens = self._tokenize(chunk["content"])
510
+ # Include file path tokens
511
+ tokens += self._tokenize(chunk["doc_id"])
512
+ if chunk.get("name"):
513
+ tokens += self._tokenize(chunk["name"]) * 3 # Boost symbol names
514
+
515
+ tf = Counter(tokens)
516
+ chunk_tf[chunk_id] = tf
517
+ for term in set(tokens):
518
+ df[term] += 1
519
+
520
+ # IDF
521
+ self.idf = {term: math.log(N / (1 + freq)) for term, freq in df.items()}
522
+
523
+ # TF-IDF per chunk
524
+ self.chunk_tfidf = {}
525
+ for chunk_id, tf in chunk_tf.items():
526
+ self.chunk_tfidf[chunk_id] = {}
527
+ max_tf = max(tf.values()) if tf else 1
528
+ for term, freq in tf.items():
529
+ normalized_tf = 0.5 + 0.5 * (freq / max_tf)
530
+ self.chunk_tfidf[chunk_id][term] = normalized_tf * self.idf.get(term, 0)
531
+
532
+ def _build_cooccurrence(self):
533
+ """Build lightweight co-occurrence map from indexed chunks for auto-synonyms."""
534
+ self._cooccurrence = {}
535
+ for chunk in self.chunks.values():
536
+ tokens = set(self._tokenize(chunk["content"]))
537
+ for t in tokens:
538
+ if t not in self._cooccurrence:
539
+ self._cooccurrence[t] = Counter()
540
+ for t2 in tokens:
541
+ if t != t2:
542
+ self._cooccurrence[t][t2] += 1
543
+ # Prune: keep only top-5 co-occurring terms per token (minimum 3 co-occurrences)
544
+ pruned = {}
545
+ for term, counts in self._cooccurrence.items():
546
+ top = [(t, c) for t, c in counts.most_common(5) if c >= 3]
547
+ if top:
548
+ pruned[term] = dict(top)
549
+ self._cooccurrence = pruned
550
+
551
+ def search(self, query: str, top_k: int = 5, max_tokens: int = 4000,
552
+ include_content: bool = True) -> list:
553
+ """Search the index and return most relevant chunks.
554
+
555
+ Set include_content=False to get metadata only (saves ~70% tokens).
556
+ """
557
+ if not self.chunks:
558
+ self._load_index()
559
+ if not self.chunks:
560
+ return []
561
+
562
+ cache_key = (query, int(top_k), int(max_tokens), bool(include_content))
563
+ cached = self._search_cache.get(cache_key)
564
+ if cached is not None:
565
+ self._search_cache.move_to_end(cache_key)
566
+ return [dict(item) for item in cached]
567
+
568
+ query_tokens, query_bigrams = self._expand_query_tokens(query)
569
+ if not query_tokens:
570
+ return []
571
+
572
+ # Score each chunk
573
+ scores = {}
574
+ for chunk_id in self.chunk_tfidf:
575
+ score = self._score_chunk(chunk_id, query, query_tokens, query_bigrams)
576
+ if score > 0:
577
+ scores[chunk_id] = score
578
+
579
+ # Sort by score, then prefer named/structural chunks and shorter paths on ties.
580
+ ranked = sorted(
581
+ scores.items(),
582
+ key=lambda item: (
583
+ item[1],
584
+ 1 if self.chunks[item[0]].get("name") else 0,
585
+ 1 if self.chunks[item[0]].get("type") in {"function", "class", "method", "declaration"} else 0,
586
+ -len(self.chunks[item[0]]["doc_id"]),
587
+ ),
588
+ reverse=True,
589
+ )
590
+
591
+ # Collect results up to token budget
592
+ results = []
593
+ token_budget = max_tokens
594
+ seen_docs = set()
595
+
596
+ for chunk_id, score in ranked[:top_k * 4]:
597
+ chunk = self.chunks[chunk_id]
598
+ chunk_tokens = chunk.get("tokens") or count_tokens(chunk["content"])
599
+
600
+ if chunk_tokens > token_budget:
601
+ continue
602
+
603
+ if chunk["doc_id"] in seen_docs and len(results) >= max(2, top_k // 2):
604
+ continue
605
+
606
+ doc = self.documents.get(chunk["doc_id"], {})
607
+ result = {
608
+ "chunk_id": chunk_id,
609
+ "file": chunk["doc_id"],
610
+ "name": chunk.get("name"),
611
+ "type": chunk["type"],
612
+ "lines": f"{chunk['line_start']}-{chunk['line_end']}",
613
+ "tokens": chunk_tokens,
614
+ "file_tokens": doc.get("tokens", chunk_tokens),
615
+ "score": round(score, 3),
616
+ }
617
+ if include_content:
618
+ result["content"] = chunk["content"]
619
+
620
+ results.append(result)
621
+
622
+ token_budget -= chunk_tokens
623
+ seen_docs.add(chunk["doc_id"])
624
+
625
+ if len(results) >= top_k or token_budget <= 0:
626
+ break
627
+
628
+ self._search_cache[cache_key] = [dict(item) for item in results]
629
+ if len(self._search_cache) > self._search_cache_max:
630
+ self._search_cache.popitem(last=False)
631
+ return results
632
+
633
+ def get_context(self, query: str, top_k: int = 5, max_tokens: int = 4000) -> str:
634
+ """Get a formatted context string ready to pass to Claude."""
635
+ results = self.search(query, top_k, max_tokens)
636
+
637
+ if not results:
638
+ return "No relevant code found in index."
639
+
640
+ sections = []
641
+ total_tokens = 0
642
+
643
+ for r in results:
644
+ section = f"## {r['file']} (L{r['lines']})"
645
+ if r['name']:
646
+ section += f" — {r['name']}"
647
+ section += f"\n```\n{r['content']}\n```"
648
+ sections.append(section)
649
+ total_tokens += r['tokens']
650
+
651
+ header = f"# Relevant Code Context ({total_tokens} tokens, {len(results)} chunks)\n"
652
+ return header + '\n\n'.join(sections)
653
+
654
+ def _save_index(self):
655
+ """Save index to disk."""
656
+ # Strip in-memory caches (fields prefixed with _) before persisting —
657
+ # they're regenerated on demand and would bloat index.json.
658
+ data = {
659
+ "documents": self.documents,
660
+ "chunks": {
661
+ k: {kk: vv for kk, vv in v.items() if not kk.startswith("_")}
662
+ for k, v in self.chunks.items()
663
+ },
664
+ "symbols": self.symbols,
665
+ "idf": self.idf,
666
+ "chunk_tfidf": self.chunk_tfidf,
667
+ }
668
+ index_file = self.index_dir / "index.json"
669
+ with open(index_file, 'w', encoding='utf-8') as f:
670
+ json.dump(data, f)
671
+
672
+ def _load_index(self) -> bool:
673
+ """Load index from disk."""
674
+ index_file = self.index_dir / "index.json"
675
+ if not index_file.exists():
676
+ return False
677
+
678
+ try:
679
+ with open(index_file, encoding='utf-8') as f:
680
+ data = json.load(f)
681
+ self.documents = data["documents"]
682
+ self.chunks = data["chunks"]
683
+ self.symbols = data.get("symbols", {})
684
+ self.idf = data.get("idf", {})
685
+ self.chunk_tfidf = data.get("chunk_tfidf", {})
686
+ mutated = False
687
+ for chunk in self.chunks.values():
688
+ if "tokens" not in chunk:
689
+ chunk["tokens"] = count_tokens(chunk.get("content", ""))
690
+ mutated = True
691
+ self._search_cache = OrderedDict()
692
+ if mutated:
693
+ self._save_index()
694
+ return True
695
+ except Exception:
696
+ return False
697
+
698
+ def get_stats(self) -> dict:
699
+ """Get index statistics."""
700
+ if not self.documents:
701
+ self._load_index()
702
+
703
+ total_tokens = sum(d.get("tokens", 0) for d in self.documents.values())
704
+ return {
705
+ "files_indexed": len(self.documents),
706
+ "total_chunks": len(self.chunks),
707
+ "total_tokens_in_codebase": total_tokens,
708
+ "unique_symbols": len(self.symbols),
709
+ "index_size_kb": round(
710
+ (self.index_dir / "index.json").stat().st_size / 1024, 1
711
+ ) if (self.index_dir / "index.json").exists() else 0
712
+ }