code-context-control 2.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +1 -0
- cli/_hook_utils.py +99 -0
- cli/c3.py +6152 -0
- cli/commands/__init__.py +1 -0
- cli/commands/common.py +312 -0
- cli/commands/parser.py +286 -0
- cli/docs.html +3178 -0
- cli/edits.html +878 -0
- cli/hook_auto_snapshot.py +142 -0
- cli/hook_c3_signal.py +61 -0
- cli/hook_c3read.py +116 -0
- cli/hook_edit_ledger.py +213 -0
- cli/hook_edit_unlock.py +170 -0
- cli/hook_filter.py +130 -0
- cli/hook_ghost_files.py +238 -0
- cli/hook_pretool_enforce.py +334 -0
- cli/hook_read.py +200 -0
- cli/hook_session_stats.py +62 -0
- cli/hook_terse_advisor.py +190 -0
- cli/hub.html +3764 -0
- cli/hub_server.py +1619 -0
- cli/mcp_proxy.py +428 -0
- cli/mcp_server.py +660 -0
- cli/server.py +2985 -0
- cli/tools/__init__.py +4 -0
- cli/tools/_helpers.py +65 -0
- cli/tools/agent.py +1165 -0
- cli/tools/compress.py +215 -0
- cli/tools/delegate.py +1184 -0
- cli/tools/edit.py +313 -0
- cli/tools/edits.py +118 -0
- cli/tools/filter.py +285 -0
- cli/tools/impact.py +163 -0
- cli/tools/memory.py +469 -0
- cli/tools/read.py +224 -0
- cli/tools/search.py +337 -0
- cli/tools/session.py +95 -0
- cli/tools/shell.py +193 -0
- cli/tools/status.py +306 -0
- cli/tools/validate.py +310 -0
- cli/ui/api.js +36 -0
- cli/ui/app.js +207 -0
- cli/ui/components/chat.js +758 -0
- cli/ui/components/dashboard.js +689 -0
- cli/ui/components/edits.js +220 -0
- cli/ui/components/instructions.js +481 -0
- cli/ui/components/memory.js +626 -0
- cli/ui/components/sessions.js +606 -0
- cli/ui/components/settings.js +1404 -0
- cli/ui/components/sidebar.js +156 -0
- cli/ui/icons.js +51 -0
- cli/ui/shared.js +119 -0
- cli/ui/theme.js +22 -0
- cli/ui.html +168 -0
- cli/ui_legacy.html +6797 -0
- cli/ui_nano.html +503 -0
- code_context_control-2.28.0.dist-info/METADATA +248 -0
- code_context_control-2.28.0.dist-info/RECORD +150 -0
- code_context_control-2.28.0.dist-info/WHEEL +5 -0
- code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
- code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
- code_context_control-2.28.0.dist-info/top_level.txt +5 -0
- core/__init__.py +75 -0
- core/config.py +269 -0
- core/ide.py +188 -0
- oracle/__init__.py +1 -0
- oracle/config.py +75 -0
- oracle/oracle.html +3900 -0
- oracle/oracle_server.py +663 -0
- oracle/services/__init__.py +1 -0
- oracle/services/c3_bridge.py +210 -0
- oracle/services/chat_engine.py +1103 -0
- oracle/services/chat_store.py +155 -0
- oracle/services/cross_memory.py +154 -0
- oracle/services/federated_graph.py +463 -0
- oracle/services/health_checker.py +117 -0
- oracle/services/insight_engine.py +307 -0
- oracle/services/memory_reader.py +106 -0
- oracle/services/memory_writer.py +182 -0
- oracle/services/ollama_bridge.py +332 -0
- oracle/services/project_scanner.py +87 -0
- oracle/services/review_agent.py +206 -0
- services/__init__.py +1 -0
- services/activity_log.py +93 -0
- services/agent_base.py +124 -0
- services/agents.py +1529 -0
- services/auto_memory.py +407 -0
- services/bench/__init__.py +6 -0
- services/bench/external/__init__.py +29 -0
- services/bench/external/aider_polyglot.py +405 -0
- services/bench/external/swe_bench.py +485 -0
- services/benchmark_dashboard.py +596 -0
- services/claude_md.py +785 -0
- services/compressor.py +592 -0
- services/context_snapshot.py +356 -0
- services/conversation_store.py +870 -0
- services/doc_index.py +537 -0
- services/e2e_benchmark.py +2884 -0
- services/e2e_evaluator.py +396 -0
- services/e2e_tasks.py +743 -0
- services/edit_ledger.py +459 -0
- services/embedding_index.py +341 -0
- services/error_reporting.py +123 -0
- services/file_memory.py +734 -0
- services/hub_service.py +585 -0
- services/indexer.py +712 -0
- services/memory.py +318 -0
- services/memory_consolidator.py +538 -0
- services/memory_graph.py +382 -0
- services/memory_grounder.py +304 -0
- services/memory_scorer.py +246 -0
- services/metrics.py +86 -0
- services/notifications.py +209 -0
- services/ollama_client.py +201 -0
- services/output_filter.py +488 -0
- services/parser.py +1238 -0
- services/project_manager.py +579 -0
- services/protocol.py +306 -0
- services/proxy_state.py +152 -0
- services/retrieval_broker.py +129 -0
- services/router.py +414 -0
- services/runtime.py +326 -0
- services/session_benchmark.py +1945 -0
- services/session_manager.py +1026 -0
- services/session_preloader.py +251 -0
- services/text_index.py +90 -0
- services/tool_classifier.py +176 -0
- services/transcript_index.py +340 -0
- services/validation_cache.py +155 -0
- services/vector_store.py +299 -0
- services/version_tracker.py +271 -0
- services/watcher.py +192 -0
- tui/__init__.py +0 -0
- tui/backend.py +59 -0
- tui/main.py +145 -0
- tui/screens/__init__.py +1 -0
- tui/screens/benchmark_view.py +109 -0
- tui/screens/claudemd_view.py +46 -0
- tui/screens/compress_view.py +52 -0
- tui/screens/index_view.py +74 -0
- tui/screens/init_view.py +82 -0
- tui/screens/mcp_view.py +73 -0
- tui/screens/optimize_view.py +41 -0
- tui/screens/pipe_view.py +46 -0
- tui/screens/projects_view.py +355 -0
- tui/screens/search_view.py +55 -0
- tui/screens/session_view.py +143 -0
- tui/screens/stats.py +158 -0
- tui/screens/ui_view.py +54 -0
- tui/theme.tcss +335 -0
services/doc_index.py
ADDED
|
@@ -0,0 +1,537 @@
|
|
|
1
|
+
"""DocIndex — Document-aware chunking layer for project docs, configs, and docstrings.
|
|
2
|
+
|
|
3
|
+
Chunks markdown by heading boundaries, extracts inline docstrings from code,
|
|
4
|
+
and indexes config files as whole-file chunks. Each chunk carries a priority
|
|
5
|
+
score based on its source type and file importance.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import hashlib
|
|
9
|
+
import json
|
|
10
|
+
import logging
|
|
11
|
+
import re
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Optional
|
|
14
|
+
|
|
15
|
+
from core import count_tokens
|
|
16
|
+
|
|
17
|
+
log = logging.getLogger(__name__)

# Per-file priority multipliers, keyed by file name. Insertion order is
# significant: the lookup in _make_doc_chunk stops at the first match.
_PRIORITY_FILES = {
    "CLAUDE.md": 2.0,
    "AGENTS.md": 1.8,
    "README.md": 1.5,
    "CONTRIBUTING.md": 1.3,
    "ARCHITECTURE.md": 1.5,
    "CHANGELOG.md": 0.8,
}

# Base priority for each kind of chunk source.
_SOURCE_PRIORITY = {
    "markdown": 1.2,
    "docstring": 1.0,
    "config": 0.8,
}

# Token budget for a single chunk; larger chunks are split or truncated.
_CHUNK_MAX_TOKENS = 400

# Config file names that are indexed when present at the project root.
_CONFIG_PATTERNS = [
    ".mcp.json",
    "pyproject.toml",
    "package.json",
    "tsconfig.json",
    "Makefile",
    "Dockerfile",
    "docker-compose.yml",
    "docker-compose.yaml",
    ".env.example",
]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class DocIndex:
|
|
48
|
+
"""Document-aware index for project docs, configs, and docstrings."""
|
|
49
|
+
|
|
50
|
+
def __init__(self, project_path: str, index_dir: str = ".c3/doc_index"):
    """Bind the index to a project and restore any persisted state.

    Args:
        project_path: Root directory of the project to index.
        index_dir: Storage directory for the index, relative to
            ``project_path``; created (with parents) if missing.
    """
    root = Path(project_path)
    storage = root / index_dir
    storage.mkdir(parents=True, exist_ok=True)

    self.project_path = root
    self.index_dir = storage

    # In-memory state; replaced below when saved copies can be read.
    self.chunks: dict = {}  # chunk_id -> chunk dict
    self._file_hashes: dict = {}  # rel_path -> content hash

    # Locations of the two persisted JSON documents.
    self._hash_file = storage / "file_hashes.json"
    self._index_file = storage / "index.json"

    for restore in (self._load_hashes, self._load_index):
        restore()
|
|
62
|
+
|
|
63
|
+
# --- Persistence ---
|
|
64
|
+
|
|
65
|
+
def _load_hashes(self):
    """Restore the per-file content hashes saved by a previous build.

    Leaves ``self._file_hashes`` untouched when no hash file exists. A file
    that cannot be read, is not valid JSON, or does not hold a JSON object
    resets the map to ``{}`` so the next build re-indexes everything.
    """
    if not self._hash_file.exists():
        return
    try:
        loaded = json.loads(self._hash_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Best-effort cache: JSON and Unicode decode errors are ValueErrors.
        loaded = None
    # Valid JSON of the wrong shape (e.g. a list) would break dict-based
    # bookkeeping later, so treat it like a missing cache.
    self._file_hashes = loaded if isinstance(loaded, dict) else {}
|
|
71
|
+
|
|
72
|
+
def _save_hashes(self):
    """Persist the rel_path -> content-hash map as UTF-8 encoded JSON."""
    serialized = json.dumps(self._file_hashes)
    self._hash_file.write_text(serialized, encoding="utf-8")
|
|
74
|
+
|
|
75
|
+
def _load_index(self):
    """Restore the chunk table saved by a previous build.

    Leaves ``self.chunks`` untouched when no index file exists. A file that
    cannot be read, is not valid JSON, or whose ``"chunks"`` entry is not a
    JSON object resets the table to ``{}``.
    """
    if not self._index_file.exists():
        return
    try:
        data = json.loads(self._index_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Best-effort cache: JSON and Unicode decode errors are ValueErrors.
        data = None
    chunks = data.get("chunks", {}) if isinstance(data, dict) else {}
    # Guard the shape as well: the rest of the class calls .items()/.values()
    # on self.chunks and indexes it by chunk id.
    self.chunks = chunks if isinstance(chunks, dict) else {}
|
|
82
|
+
|
|
83
|
+
def _save_index(self):
    """Persist the chunk table as UTF-8 JSON under the ``"chunks"`` key."""
    payload = {"chunks": self.chunks}
    # default=str stringifies any value json cannot encode natively.
    serialized = json.dumps(payload, default=str)
    self._index_file.write_text(serialized, encoding="utf-8")
|
|
88
|
+
|
|
89
|
+
@staticmethod
def _content_hash(content: str) -> str:
    """Return the first 16 hex digits of the SHA-256 of ``content`` (UTF-8)."""
    digest = hashlib.sha256()
    digest.update(content.encode("utf-8"))
    return digest.hexdigest()[:16]
|
|
92
|
+
|
|
93
|
+
# --- Build ---
|
|
94
|
+
|
|
95
|
+
def build(self, force: bool = False) -> dict:
    """Build or incrementally update the doc index.

    Re-chunks every discovered file whose content hash changed (or every
    file when ``force`` is true), drops chunks of files that are no longer
    discovered, then saves the hashes and the chunk table.

    Args:
        force: Re-chunk files even when their content hash is unchanged.

    Returns:
        Counters with keys ``docs_indexed``, ``chunks_created`` and
        ``skipped``.
    """
    stats = {"docs_indexed": 0, "chunks_created": 0, "skipped": 0}

    files_to_index = self._discover_files()
    old_hashes = dict(self._file_hashes)
    new_hashes = {}

    for rel_path, fpath in files_to_index:
        try:
            # Decode explicitly as UTF-8 (matching the persistence code) so
            # hashes and chunk text do not vary with the platform's locale
            # encoding; undecodable bytes are replaced rather than raising.
            content = fpath.read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Unreadable files stay out of new_hashes, so any chunks they
            # had are dropped by the deleted-files pass below.
            continue

        h = self._content_hash(content)
        new_hashes[rel_path] = h

        if not force and old_hashes.get(rel_path) == h:
            stats["skipped"] += 1
            continue

        # Remove old chunks for this file before re-chunking it.
        self._remove_file_chunks(rel_path)

        # Chunk based on file type.
        ext = fpath.suffix.lower()
        name = fpath.name

        if ext in (".md", ".mdx", ".rst", ".adoc"):
            new_chunks = self._chunk_markdown(content, rel_path, name)
        elif ext in (".py", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs", ".java"):
            new_chunks = self._chunk_docstrings(content, rel_path, ext)
        elif name in _CONFIG_PATTERNS or ext in (".toml", ".yaml", ".yml", ".json", ".ini"):
            new_chunks = self._chunk_config(content, rel_path, name)
        else:
            continue

        for chunk in new_chunks:
            self.chunks[chunk["id"]] = chunk

        stats["docs_indexed"] += 1
        stats["chunks_created"] += len(new_chunks)

    # Remove chunks for files that were tracked before but not seen now.
    for rel_path in set(old_hashes) - set(new_hashes):
        self._remove_file_chunks(rel_path)

    self._file_hashes = new_hashes
    self._save_hashes()
    self._save_index()

    log.info("DocIndex built: %s", stats)
    return stats
|
|
149
|
+
|
|
150
|
+
def _remove_file_chunks(self, doc_id: str):
    """Delete, in place, every chunk whose ``doc_id`` equals ``doc_id``."""
    stale_ids = []
    # Collect first: the dict cannot be resized while it is being iterated.
    for chunk_id, chunk in self.chunks.items():
        if chunk.get("doc_id") == doc_id:
            stale_ids.append(chunk_id)
    for chunk_id in stale_ids:
        self.chunks.pop(chunk_id)
|
|
154
|
+
|
|
155
|
+
def _discover_files(self) -> list[tuple[str, Path]]:
    """Find all doc, config, and code files to index.

    Returns:
        ``(rel_path, path)`` pairs, where ``rel_path`` is the path relative
        to the project root as a string and ``path`` is the ``Path`` to read.
    """
    skip_dirs = {
        "node_modules", ".git", "__pycache__", ".c3", "venv",
        "env", ".venv", "dist", "build", ".next", ".cache",
        "coverage", ".pytest_cache",
    }
    root = self.project_path
    files = []

    # Markdown-style docs anywhere under the project root.
    for glob_pattern in ("*.md", "*.mdx", "*.rst", "*.adoc"):
        for fpath in root.rglob(glob_pattern):
            rel = fpath.relative_to(root)
            # Test only the components below the project root: checking the
            # absolute path would skip every doc when the project itself
            # lives under a directory named e.g. "build" or "env".
            if skip_dirs.intersection(rel.parts):
                continue
            files.append((str(rel), fpath))

    # Config files at the project root.
    for pattern in _CONFIG_PATTERNS:
        fpath = root / pattern
        if fpath.is_file():
            files.append((pattern, fpath))

    # Code files for docstring extraction: the root plus a fixed set of
    # well-known source directories, non-recursively.
    code_exts = {".py", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs", ".java"}
    code_dirs = [root]
    for subdir in ("cli", "core", "services", "src", "lib", "app", "pkg"):
        candidate = root / subdir
        if candidate.is_dir():
            code_dirs.append(candidate)

    seen = set()
    for code_dir in code_dirs:
        for fpath in code_dir.glob("*"):
            if not fpath.is_file() or fpath.suffix.lower() not in code_exts:
                continue
            rel = str(fpath.relative_to(root))
            if rel not in seen:
                seen.add(rel)
                files.append((rel, fpath))

    return files
|
|
200
|
+
|
|
201
|
+
# --- Markdown chunking ---
|
|
202
|
+
|
|
203
|
+
def _chunk_markdown(self, content: str, doc_id: str, filename: str) -> list:
    """Split markdown into one chunk per heading-delimited section.

    Text before the first heading forms a section titled ``filename``.
    ATX headings (``#`` .. ``######``) start a new section, except inside
    fenced code blocks, where ``#`` lines are usually code comments.
    Sections over ``_CHUNK_MAX_TOKENS`` are passed to ``_split_chunk``.

    Args:
        content: Full text of the document.
        doc_id: Identifier of the document (its project-relative path).
        filename: Bare file name, used as the root of each heading path.

    Returns:
        List of chunk dicts built by ``_make_doc_chunk``. Chunk ids have the
        form ``"{doc_id}::{heading}"`` (with ``#2``, ``#3``, ... appended for
        repeated headings) so equal headings never overwrite each other.
    """
    lines = content.split("\n")
    heading_re = re.compile(r"^(#{1,6})\s+(.+)")
    fence_re = re.compile(r"^\s{0,3}(`{3,}|~{3,})")

    sections = []
    id_uses = {}  # base chunk id -> number of sections that used it so far

    def flush(heading, body_lines, path, first, last):
        """Emit the pending section as a chunk unless it is blank."""
        text = "\n".join(body_lines).strip()
        if not text:
            return
        base_id = f"{doc_id}::{heading}"
        uses = id_uses.get(base_id, 0) + 1
        id_uses[base_id] = uses
        chunk_id = base_id if uses == 1 else f"{base_id}#{uses}"
        sections.append(self._make_doc_chunk(
            doc_id, chunk_id, text,
            path[:], first, last,
            "markdown", filename,
        ))

    current_heading = filename
    current_lines = []
    current_start = 0
    heading_path = [filename]
    open_fence = None  # (fence char, run length) while inside a code fence

    for i, line in enumerate(lines):
        fence = fence_re.match(line)
        if fence:
            marker = fence.group(1)
            if open_fence is None:
                open_fence = (marker[0], len(marker))
            elif (marker[0] == open_fence[0]
                    and len(marker) >= open_fence[1]
                    and not line.strip()[len(marker):].strip()):
                # A closing fence repeats the opening character, is at
                # least as long, and carries no info string.
                open_fence = None
            current_lines.append(line)
            continue

        heading_match = None if open_fence else heading_re.match(line)
        if heading_match:
            flush(current_heading, current_lines, heading_path, current_start, i - 1)
            level = len(heading_match.group(1))
            heading_text = heading_match.group(2).strip()
            current_heading = f"{heading_match.group(1)} {heading_text}"
            # Heading path is the filename plus the current heading;
            # level-1 headings are not added to the path.
            heading_path = heading_path[:1]
            if level > 1:
                heading_path.append(heading_text)
            current_lines = [line]
            current_start = i
        else:
            current_lines.append(line)

    flush(current_heading, current_lines, heading_path, current_start, len(lines) - 1)

    # Split oversized chunks.
    result = []
    for chunk in sections:
        if chunk["tokens"] > _CHUNK_MAX_TOKENS:
            result.extend(self._split_chunk(chunk))
        else:
            result.append(chunk)

    return result
|
|
256
|
+
|
|
257
|
+
def _split_chunk(self, chunk: dict) -> list:
    """Break an oversized chunk into pieces of up to ``_CHUNK_MAX_TOKENS``.

    Lines are packed greedily; a line is never split, so a single line over
    the budget still yields one oversized piece. Every piece is a shallow
    copy of ``chunk`` with its own ``content`` and ``tokens``. The first
    piece keeps the original id; later ones get ``"{id}::{index}"``.
    """
    pieces = []
    buffer = []
    budget_used = 0

    for row in chunk["content"].split("\n"):
        cost = count_tokens(row)
        if buffer and budget_used + cost > _CHUNK_MAX_TOKENS:
            pieces.append("\n".join(buffer))
            buffer, budget_used = [row], cost
        else:
            buffer.append(row)
            budget_used += cost

    if buffer:
        pieces.append("\n".join(buffer))

    split = []
    for position, raw in enumerate(pieces):
        body = raw.strip()
        if not body:
            continue
        piece = dict(chunk)
        piece["id"] = chunk["id"] if position == 0 else f"{chunk['id']}::{position}"
        piece["content"] = body
        piece["tokens"] = count_tokens(body)
        split.append(piece)

    return split
|
|
289
|
+
|
|
290
|
+
# --- Docstring chunking ---
|
|
291
|
+
|
|
292
|
+
def _chunk_docstrings(self, content: str, doc_id: str, ext: str) -> list:
    """Dispatch docstring extraction by file extension.

    ``.py`` goes to the Python extractor and ``.js``/``.ts``/``.tsx``/``.jsx``
    to the JSDoc extractor; any other extension yields an empty list.
    """
    if ext == ".py":
        extractor = self._extract_python_docstrings
    elif ext in (".js", ".ts", ".tsx", ".jsx"):
        extractor = self._extract_jsdoc_comments
    else:
        return []
    return list(extractor(content, doc_id))
|
|
302
|
+
|
|
303
|
+
def _extract_python_docstrings(self, content: str, doc_id: str) -> list:
    """Extract Python module and class/function docstrings as chunks.

    Args:
        content: Source text of the Python file.
        doc_id: Identifier of the file (its project-relative path).

    Returns:
        Chunk dicts built by ``_make_doc_chunk``. The module docstring gets
        id ``"{doc_id}::module"`` and symbols get ``"{doc_id}::{name}"``; when
        that id is already taken in this file (e.g. ``__init__`` in two
        classes) the docstring's start line is appended to keep ids unique.
    """
    chunks = []
    lines = content.split("\n")
    used_ids = set()

    # Module docstring: a triple-quoted string near the top of the file.
    module_doc = self._find_python_docstring(lines, 0)
    if module_doc:
        # The finder skips lines ending in ":" (meant for def/class
        # headers). If it skipped one here, the string it found belongs to
        # that block, not to the module.
        preceded_by_block = any(
            ln.strip().endswith(":") for ln in lines[:module_doc["start"]]
        )
        if not preceded_by_block:
            module_id = f"{doc_id}::module"
            used_ids.add(module_id)
            chunks.append(self._make_doc_chunk(
                doc_id, module_id, module_doc["text"],
                [doc_id], module_doc["start"], module_doc["end"],
                "docstring", doc_id,
            ))

    # Class and function docstrings.
    pattern = re.compile(r"^\s*(class|def|async\s+def)\s+(\w+)")
    for i, line in enumerate(lines):
        m = pattern.match(line)
        if not m:
            continue
        name = m.group(2)
        # Look for the docstring just after the def/class line.
        doc = self._find_python_docstring(lines, i + 1)
        if not (doc and doc["text"]):
            continue
        chunk_id = f"{doc_id}::{name}"
        if chunk_id in used_ids:
            # Same symbol name seen earlier in this file: disambiguate so
            # the later chunk does not overwrite the earlier one.
            chunk_id = f"{chunk_id}:{doc['start']}"
        used_ids.add(chunk_id)
        chunks.append(self._make_doc_chunk(
            doc_id, chunk_id, doc["text"],
            [doc_id, name], doc["start"], doc["end"],
            "docstring", doc_id,
        ))

    return chunks
|
|
334
|
+
|
|
335
|
+
def _find_python_docstring(self, lines: list, start: int) -> Optional[dict]:
    """Find a triple-quoted docstring starting near ``start``.

    Scans at most five lines from ``start``, skipping blank lines,
    decorators, comments, and lines ending in ``:``; any other line before a
    triple quote means there is no docstring here.

    Args:
        lines: The file's lines.
        start: Index of the first line to examine.

    Returns:
        ``{"text", "start", "end"}`` with 0-based inclusive line indices, or
        ``None`` when no docstring is found, it is empty, it is a multi-line
        docstring shorter than 10 characters, or it is never closed.
    """
    opener = None
    for idx in range(start, min(start + 5, len(lines))):
        stripped = lines[idx].strip()
        if stripped.startswith(('"""', "'''")):
            opener = idx
            break
        if not stripped or stripped.startswith(("@", "#")) or stripped.endswith(":"):
            continue
        return None
    if opener is None:
        return None

    first = lines[opener].strip()
    quote = first[:3]
    after_open = first[3:]

    # Single-line docstring: the closing quotes are on the opening line.
    if quote in after_open:
        # Keep only what lies between the delimiters, so trailing text
        # after the closing quotes (e.g. a comment) is not included.
        text = after_open.split(quote, 1)[0].strip()
        if text:
            return {"text": text, "start": opener, "end": opener}
        return None

    # Multi-line docstring: collect lines until the closing quotes.
    doc_lines = [after_open]
    for idx in range(opener + 1, len(lines)):
        before, closed, _ = lines[idx].partition(quote)
        if closed:
            doc_lines.append(before.strip())
            text = "\n".join(doc_lines).strip()
            if len(text) < 10:  # skip trivial docstrings
                return None
            return {"text": text, "start": opener, "end": idx}
        doc_lines.append(lines[idx].strip())

    # Never closed: do not report the rest of the file as a docstring.
    return None
|
|
381
|
+
|
|
382
|
+
def _extract_jsdoc_comments(self, content: str, doc_id: str) -> list:
    """Turn each ``/** ... */`` comment of a JS/TS file into a chunk.

    Leading ``*`` characters and spaces are stripped from every comment
    line and blank lines are dropped; comments whose cleaned text is under
    20 characters are ignored. Chunk ids are ``"{doc_id}::jsdoc:{line}"``
    with the 0-based line on which the comment starts.
    """
    found = []

    for comment in re.finditer(r"/\*\*(.*?)\*/", content, re.DOTALL):
        raw_rows = comment.group(1).strip().split("\n")
        cleaned_rows = (row.strip().lstrip("* ").strip() for row in raw_rows)
        text = "\n".join(row for row in cleaned_rows if row)

        if len(text) < 20:  # skip trivial comments
            continue

        # A 0-based line index is the number of newlines before the offset.
        first_line = content.count("\n", 0, comment.start())
        last_line = content.count("\n", 0, comment.end())

        found.append(self._make_doc_chunk(
            doc_id, f"{doc_id}::jsdoc:{first_line}",
            text, [doc_id], first_line, last_line,
            "docstring", doc_id,
        ))

    return found
|
|
410
|
+
|
|
411
|
+
# --- Config chunking ---
|
|
412
|
+
|
|
413
|
+
def _chunk_config(self, content: str, doc_id: str, filename: str) -> list:
    """Index a config file as a single whole-file chunk.

    Files under 5 tokens are ignored. Files over ``_CHUNK_MAX_TOKENS`` are
    truncated to the leading lines that fit the budget.

    Args:
        content: Full text of the config file.
        doc_id: Identifier of the file (its project-relative path).
        filename: Bare file name, used as the heading path.

    Returns:
        A one-element list with the chunk, or ``[]`` when the file is too
        small or nothing usable fits within the token budget.
    """
    if count_tokens(content) < 5:
        return []

    # For large configs, only keep the first _CHUNK_MAX_TOKENS worth.
    if count_tokens(content) > _CHUNK_MAX_TOKENS:
        kept = []
        running = 0
        for line in content.split("\n"):
            line_tokens = count_tokens(line)
            if running + line_tokens > _CHUNK_MAX_TOKENS:
                break
            kept.append(line)
            running += line_tokens
        content = "\n".join(kept)
        if not content.strip():
            # The very first line already exceeds the budget (e.g. minified
            # JSON on one line): skip rather than emit an empty chunk.
            return []

    return [self._make_doc_chunk(
        doc_id, f"{doc_id}::config", content,
        [filename], 0, content.count("\n"),
        "config", filename,
    )]
|
|
438
|
+
|
|
439
|
+
# --- Helpers ---
|
|
440
|
+
|
|
441
|
+
def _make_doc_chunk(self, doc_id: str, chunk_id: str, content: str,
                    heading_path: list, line_start: int, line_end: int,
                    source_type: str, filename: str) -> dict:
    """Create a standardized chunk dict with priority scoring.

    Priority is the base value for ``source_type`` (1.0 when unknown) times
    a file boost: the ``_PRIORITY_FILES`` multiplier when the file's base
    name is listed there, raised to at least 1.2 for files under a
    top-level ``docs`` directory.

    Args:
        doc_id: Identifier of the source document.
        chunk_id: Identifier stored under ``"id"``.
        content: Chunk text; its token count is stored under ``"tokens"``.
        heading_path: Breadcrumb list stored as given.
        line_start: First line of the chunk, stored as given.
        line_end: Last line of the chunk, stored as given.
        source_type: ``"markdown"``, ``"docstring"`` or ``"config"``.
        filename: Name of the source file (may be a full ``doc_id``).

    Returns:
        The chunk dict.
    """
    base_priority = _SOURCE_PRIORITY.get(source_type, 1.0)

    def basename(path):
        """Return the last path component for either separator style."""
        return path.replace("\\", "/").rsplit("/", 1)[-1]

    # Match on the base name only: a plain suffix test would also boost
    # unrelated files such as "OLD_README.md".
    file_boost = 1.0
    for candidate in (basename(filename), basename(doc_id)):
        if candidate in _PRIORITY_FILES:
            file_boost = _PRIORITY_FILES[candidate]
            break

    # Docs in a docs/ directory get a small boost.
    if doc_id.startswith(("docs/", "docs\\")):
        file_boost = max(file_boost, 1.2)

    priority = round(base_priority * file_boost, 2)

    return {
        "id": chunk_id,
        "doc_id": doc_id,
        "content": content,
        "tokens": count_tokens(content),
        "kind": "doc",
        "heading_path": heading_path,
        "source_type": source_type,
        "priority": priority,
        "line_start": line_start,
        "line_end": line_end,
    }
|
|
471
|
+
|
|
472
|
+
# --- Search ---
|
|
473
|
+
|
|
474
|
+
def search(self, query: str, top_k: int = 5) -> list:
    """Rank doc chunks against ``query`` by keyword overlap.

    The score is the Jaccard similarity between the lower-cased word sets
    of the query and the chunk content, plus 0.3 times the fraction of
    query words found in the chunk's heading path, all multiplied by the
    chunk's priority. Chunks sharing no word with the query are left out.

    Args:
        query: Free-text query.
        top_k: Maximum number of results.

    Returns:
        Copies of the best-scoring chunks, highest first, each with an added
        ``"score"`` rounded to four decimals.
    """
    if not self.chunks:
        return []

    word_re = re.compile(r"\w+")
    wanted = set(word_re.findall(query.lower()))
    if not wanted:
        return []

    ranked = []
    for chunk_id, chunk in self.chunks.items():
        words = set(word_re.findall(chunk["content"].lower()))
        shared = wanted & words
        if not shared:  # also covers chunks with no words at all
            continue

        relevance = len(shared) / len(wanted | words)

        heading_words = set(
            word_re.findall(" ".join(chunk.get("heading_path", [])).lower())
        )
        heading_hits = wanted & heading_words
        if heading_hits:
            relevance += 0.3 * len(heading_hits) / len(wanted)

        relevance *= chunk.get("priority", 1.0)
        ranked.append((chunk_id, relevance))

    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return [
        {**self.chunks[chunk_id], "score": round(relevance, 4)}
        for chunk_id, relevance in ranked[:top_k]
    ]
|
|
519
|
+
|
|
520
|
+
def search_semantic(self, query: str, embedding_index, top_k: int = 5) -> list:
    """Search docs for ``query``; currently delegates to keyword search.

    ``embedding_index`` is accepted but not used here: results come
    entirely from ``search``.
    """
    keyword_hits = self.search(query, top_k=top_k)
    return keyword_hits
|
|
526
|
+
|
|
527
|
+
def get_stats(self) -> dict:
    """Summarize the index.

    Returns:
        ``total_chunks``, ``files_tracked`` and ``by_source_type``, a count
        of chunks per ``source_type`` (``"unknown"`` when a chunk has none).
    """
    counts = {}
    for chunk in self.chunks.values():
        label = chunk.get("source_type", "unknown")
        counts.setdefault(label, 0)
        counts[label] += 1

    summary = {
        "total_chunks": len(self.chunks),
        "files_tracked": len(self._file_hashes),
        "by_source_type": counts,
    }
    return summary
|