code-context-control 2.28.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. cli/__init__.py +1 -0
  2. cli/_hook_utils.py +99 -0
  3. cli/c3.py +6152 -0
  4. cli/commands/__init__.py +1 -0
  5. cli/commands/common.py +312 -0
  6. cli/commands/parser.py +286 -0
  7. cli/docs.html +3178 -0
  8. cli/edits.html +878 -0
  9. cli/hook_auto_snapshot.py +142 -0
  10. cli/hook_c3_signal.py +61 -0
  11. cli/hook_c3read.py +116 -0
  12. cli/hook_edit_ledger.py +213 -0
  13. cli/hook_edit_unlock.py +170 -0
  14. cli/hook_filter.py +130 -0
  15. cli/hook_ghost_files.py +238 -0
  16. cli/hook_pretool_enforce.py +334 -0
  17. cli/hook_read.py +200 -0
  18. cli/hook_session_stats.py +62 -0
  19. cli/hook_terse_advisor.py +190 -0
  20. cli/hub.html +3764 -0
  21. cli/hub_server.py +1619 -0
  22. cli/mcp_proxy.py +428 -0
  23. cli/mcp_server.py +660 -0
  24. cli/server.py +2985 -0
  25. cli/tools/__init__.py +4 -0
  26. cli/tools/_helpers.py +65 -0
  27. cli/tools/agent.py +1165 -0
  28. cli/tools/compress.py +215 -0
  29. cli/tools/delegate.py +1184 -0
  30. cli/tools/edit.py +313 -0
  31. cli/tools/edits.py +118 -0
  32. cli/tools/filter.py +285 -0
  33. cli/tools/impact.py +163 -0
  34. cli/tools/memory.py +469 -0
  35. cli/tools/read.py +224 -0
  36. cli/tools/search.py +337 -0
  37. cli/tools/session.py +95 -0
  38. cli/tools/shell.py +193 -0
  39. cli/tools/status.py +306 -0
  40. cli/tools/validate.py +310 -0
  41. cli/ui/api.js +36 -0
  42. cli/ui/app.js +207 -0
  43. cli/ui/components/chat.js +758 -0
  44. cli/ui/components/dashboard.js +689 -0
  45. cli/ui/components/edits.js +220 -0
  46. cli/ui/components/instructions.js +481 -0
  47. cli/ui/components/memory.js +626 -0
  48. cli/ui/components/sessions.js +606 -0
  49. cli/ui/components/settings.js +1404 -0
  50. cli/ui/components/sidebar.js +156 -0
  51. cli/ui/icons.js +51 -0
  52. cli/ui/shared.js +119 -0
  53. cli/ui/theme.js +22 -0
  54. cli/ui.html +168 -0
  55. cli/ui_legacy.html +6797 -0
  56. cli/ui_nano.html +503 -0
  57. code_context_control-2.28.0.dist-info/METADATA +248 -0
  58. code_context_control-2.28.0.dist-info/RECORD +150 -0
  59. code_context_control-2.28.0.dist-info/WHEEL +5 -0
  60. code_context_control-2.28.0.dist-info/entry_points.txt +4 -0
  61. code_context_control-2.28.0.dist-info/licenses/LICENSE +201 -0
  62. code_context_control-2.28.0.dist-info/top_level.txt +5 -0
  63. core/__init__.py +75 -0
  64. core/config.py +269 -0
  65. core/ide.py +188 -0
  66. oracle/__init__.py +1 -0
  67. oracle/config.py +75 -0
  68. oracle/oracle.html +3900 -0
  69. oracle/oracle_server.py +663 -0
  70. oracle/services/__init__.py +1 -0
  71. oracle/services/c3_bridge.py +210 -0
  72. oracle/services/chat_engine.py +1103 -0
  73. oracle/services/chat_store.py +155 -0
  74. oracle/services/cross_memory.py +154 -0
  75. oracle/services/federated_graph.py +463 -0
  76. oracle/services/health_checker.py +117 -0
  77. oracle/services/insight_engine.py +307 -0
  78. oracle/services/memory_reader.py +106 -0
  79. oracle/services/memory_writer.py +182 -0
  80. oracle/services/ollama_bridge.py +332 -0
  81. oracle/services/project_scanner.py +87 -0
  82. oracle/services/review_agent.py +206 -0
  83. services/__init__.py +1 -0
  84. services/activity_log.py +93 -0
  85. services/agent_base.py +124 -0
  86. services/agents.py +1529 -0
  87. services/auto_memory.py +407 -0
  88. services/bench/__init__.py +6 -0
  89. services/bench/external/__init__.py +29 -0
  90. services/bench/external/aider_polyglot.py +405 -0
  91. services/bench/external/swe_bench.py +485 -0
  92. services/benchmark_dashboard.py +596 -0
  93. services/claude_md.py +785 -0
  94. services/compressor.py +592 -0
  95. services/context_snapshot.py +356 -0
  96. services/conversation_store.py +870 -0
  97. services/doc_index.py +537 -0
  98. services/e2e_benchmark.py +2884 -0
  99. services/e2e_evaluator.py +396 -0
  100. services/e2e_tasks.py +743 -0
  101. services/edit_ledger.py +459 -0
  102. services/embedding_index.py +341 -0
  103. services/error_reporting.py +123 -0
  104. services/file_memory.py +734 -0
  105. services/hub_service.py +585 -0
  106. services/indexer.py +712 -0
  107. services/memory.py +318 -0
  108. services/memory_consolidator.py +538 -0
  109. services/memory_graph.py +382 -0
  110. services/memory_grounder.py +304 -0
  111. services/memory_scorer.py +246 -0
  112. services/metrics.py +86 -0
  113. services/notifications.py +209 -0
  114. services/ollama_client.py +201 -0
  115. services/output_filter.py +488 -0
  116. services/parser.py +1238 -0
  117. services/project_manager.py +579 -0
  118. services/protocol.py +306 -0
  119. services/proxy_state.py +152 -0
  120. services/retrieval_broker.py +129 -0
  121. services/router.py +414 -0
  122. services/runtime.py +326 -0
  123. services/session_benchmark.py +1945 -0
  124. services/session_manager.py +1026 -0
  125. services/session_preloader.py +251 -0
  126. services/text_index.py +90 -0
  127. services/tool_classifier.py +176 -0
  128. services/transcript_index.py +340 -0
  129. services/validation_cache.py +155 -0
  130. services/vector_store.py +299 -0
  131. services/version_tracker.py +271 -0
  132. services/watcher.py +192 -0
  133. tui/__init__.py +0 -0
  134. tui/backend.py +59 -0
  135. tui/main.py +145 -0
  136. tui/screens/__init__.py +1 -0
  137. tui/screens/benchmark_view.py +109 -0
  138. tui/screens/claudemd_view.py +46 -0
  139. tui/screens/compress_view.py +52 -0
  140. tui/screens/index_view.py +74 -0
  141. tui/screens/init_view.py +82 -0
  142. tui/screens/mcp_view.py +73 -0
  143. tui/screens/optimize_view.py +41 -0
  144. tui/screens/pipe_view.py +46 -0
  145. tui/screens/projects_view.py +355 -0
  146. tui/screens/search_view.py +55 -0
  147. tui/screens/session_view.py +143 -0
  148. tui/screens/stats.py +158 -0
  149. tui/screens/ui_view.py +54 -0
  150. tui/theme.tcss +335 -0
services/doc_index.py ADDED
@@ -0,0 +1,537 @@
1
+ """DocIndex — Document-aware chunking layer for project docs, configs, and docstrings.
2
+
3
+ Chunks markdown by heading boundaries, extracts inline docstrings from code,
4
+ and indexes config files as whole-file chunks. Each chunk carries a priority
5
+ score based on its source type and file importance.
6
+ """
7
+
8
+ import hashlib
9
+ import json
10
+ import logging
11
+ import re
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ from core import count_tokens
16
+
17
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Per-file priority multipliers, keyed by exact file name. Values above 1.0
# boost a file's chunks in search ranking; values below 1.0 demote them.
_PRIORITY_FILES = {
    "CLAUDE.md": 2.0,
    "AGENTS.md": 1.8,
    "README.md": 1.5,
    "CONTRIBUTING.md": 1.3,
    "ARCHITECTURE.md": 1.5,
    "CHANGELOG.md": 0.8,
}

# Base priority for each kind of chunk source.
_SOURCE_PRIORITY = dict(
    markdown=1.2,
    docstring=1.0,
    config=0.8,
)

# A chunk larger than this many tokens is split (docs) or truncated (configs).
_CHUNK_MAX_TOKENS = 400

# Config files, looked up by name at the project root, that get indexed.
_CONFIG_PATTERNS = [
    ".mcp.json",
    "pyproject.toml",
    "package.json",
    "tsconfig.json",
    "Makefile",
    "Dockerfile",
    "docker-compose.yml",
    "docker-compose.yaml",
    ".env.example",
]
45
+
46
+
47
class DocIndex:
    """Document-aware index for project docs, configs, and docstrings.

    Chunks are kept in ``self.chunks`` (chunk id -> chunk dict) and persisted
    as JSON under ``<project_path>/<index_dir>``, next to a map of per-file
    content hashes that lets :meth:`build` skip files that did not change.
    """

    # File-type dispatch tables used by build() and _discover_files().
    _MARKDOWN_EXTS = (".md", ".mdx", ".rst", ".adoc")
    _CODE_EXTS = (".py", ".js", ".ts", ".tsx", ".jsx", ".go", ".rs", ".java")
    _CONFIG_EXTS = (".toml", ".yaml", ".yml", ".json", ".ini")
    # Besides the project root, only these sub-directories are scanned for
    # code files, and only one level deep.
    _CODE_SUBDIRS = ("cli", "core", "services", "src", "lib", "app", "pkg")
    _SKIP_DIRS = frozenset({
        "node_modules", ".git", "__pycache__", ".c3", "venv",
        "env", ".venv", "dist", "build", ".next", ".cache",
        "coverage", ".pytest_cache",
    })

    # Patterns compiled once; they are used inside per-line / per-chunk loops.
    _WORD_RE = re.compile(r"\w+")
    _HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)")
    _PY_SYMBOL_RE = re.compile(r"^\s*(class|def|async\s+def)\s+(\w+)")
    _PY_DOC_OPEN_RE = re.compile(r"^[rRuU]?(\"{3}|'{3})")
    _JSDOC_RE = re.compile(r"/\*\*(.*?)\*/", re.DOTALL)
    _JSDOC_GUTTER_RE = re.compile(r"^\*\s?")
    _PATH_SEP_RE = re.compile(r"[\\/]")

    def __init__(self, project_path: str, index_dir: str = ".c3/doc_index"):
        """Create the index directory if needed and load any persisted state.

        Args:
            project_path: Root directory of the project to index.
            index_dir: Index location, relative to ``project_path``.
        """
        self.project_path = Path(project_path)
        self.index_dir = self.project_path / index_dir
        self.index_dir.mkdir(parents=True, exist_ok=True)

        self.chunks: dict = {}  # chunk_id -> chunk dict
        self._file_hashes: dict = {}  # rel_path -> content hash
        self._hash_file = self.index_dir / "file_hashes.json"
        self._index_file = self.index_dir / "index.json"

        self._load_hashes()
        self._load_index()

    # --- Persistence ---

    def _load_hashes(self):
        """Load the persisted hash map; fall back to ``{}`` if unusable."""
        if not self._hash_file.exists():
            return
        try:
            data = json.loads(self._hash_file.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable or corrupt file: start over, build() will rehash.
            data = None
        # Valid JSON that is not an object would break dict() in build().
        self._file_hashes = data if isinstance(data, dict) else {}

    def _save_hashes(self):
        """Write the hash map to disk as JSON."""
        self._hash_file.write_text(json.dumps(self._file_hashes), encoding="utf-8")

    def _load_index(self):
        """Load persisted chunks; fall back to ``{}`` if unusable."""
        if not self._index_file.exists():
            return
        try:
            data = json.loads(self._index_file.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            data = None
        stored = data.get("chunks") if isinstance(data, dict) else None
        if not isinstance(stored, dict):
            self.chunks = {}
            return
        # Drop malformed entries; every consumer calls .get() on chunk dicts.
        self.chunks = {cid: c for cid, c in stored.items() if isinstance(c, dict)}

    def _save_index(self):
        """Write all chunks to disk as JSON."""
        self._index_file.write_text(
            json.dumps({"chunks": self.chunks}, default=str),
            encoding="utf-8",
        )

    @staticmethod
    def _content_hash(content: str) -> str:
        """Return the first 16 hex digits of the SHA-256 of ``content``."""
        return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]

    # --- Build ---

    def build(self, force: bool = False) -> dict:
        """Build or incrementally update the doc index.

        Args:
            force: Re-chunk every discovered file even if its hash is unchanged.

        Returns:
            Counters: ``docs_indexed``, ``chunks_created`` and ``skipped``.
        """
        stats = {"docs_indexed": 0, "chunks_created": 0, "skipped": 0}
        old_hashes = dict(self._file_hashes)
        new_hashes = {}

        for rel_path, fpath in self._discover_files():
            try:
                # Explicit UTF-8 keeps content (and hashes) identical across
                # platforms; undecodable bytes are replaced, not fatal.
                content = fpath.read_text(encoding="utf-8", errors="replace")
            except OSError:
                continue

            digest = self._content_hash(content)
            new_hashes[rel_path] = digest

            if not force and old_hashes.get(rel_path) == digest:
                stats["skipped"] += 1
                continue

            # The file changed: drop its stale chunks before re-chunking.
            self._remove_file_chunks(rel_path)

            ext = fpath.suffix.lower()
            name = fpath.name
            if ext in self._MARKDOWN_EXTS:
                new_chunks = self._chunk_markdown(content, rel_path, name)
            elif ext in self._CODE_EXTS:
                new_chunks = self._chunk_docstrings(content, rel_path, ext)
            elif name in _CONFIG_PATTERNS or ext in self._CONFIG_EXTS:
                new_chunks = self._chunk_config(content, rel_path, name)
            else:
                continue

            for chunk in new_chunks:
                self.chunks[chunk["id"]] = chunk

            stats["docs_indexed"] += 1
            stats["chunks_created"] += len(new_chunks)

        # Files hashed last time but not seen (or not readable) this time.
        for rel_path in set(old_hashes) - set(new_hashes):
            self._remove_file_chunks(rel_path)

        self._file_hashes = new_hashes
        self._save_hashes()
        self._save_index()

        log.info("DocIndex built: %s", stats)
        return stats

    def _remove_file_chunks(self, doc_id: str):
        """Delete every chunk whose ``doc_id`` equals the given one."""
        stale = [cid for cid, c in self.chunks.items() if c.get("doc_id") == doc_id]
        for cid in stale:
            del self.chunks[cid]

    def _discover_files(self) -> list[tuple[str, Path]]:
        """Find all doc, config, and code files to index.

        Returns:
            ``(relative_path, absolute_path)`` pairs.
        """
        root = self.project_path
        files = []

        # Markdown-like docs anywhere in the tree. The skip list is matched
        # against the path *relative to the project*, so a project that itself
        # lives under e.g. ``~/build/`` is not skipped wholesale.
        for ext in self._MARKDOWN_EXTS:
            for fpath in root.rglob(f"*{ext}"):
                rel = fpath.relative_to(root)
                if self._SKIP_DIRS.intersection(rel.parts):
                    continue
                files.append((str(rel), fpath))

        # Config files at project root
        for pattern in _CONFIG_PATTERNS:
            fpath = root / pattern
            if fpath.is_file():
                files.append((pattern, fpath))

        # Code files for docstring extraction (top-level only + key dirs)
        code_dirs = [root]
        code_dirs.extend(
            root / sub for sub in self._CODE_SUBDIRS if (root / sub).is_dir()
        )

        seen = set()
        for code_dir in code_dirs:
            for fpath in code_dir.glob("*"):
                if not fpath.is_file() or fpath.suffix.lower() not in self._CODE_EXTS:
                    continue
                rel = str(fpath.relative_to(root))
                if rel not in seen:
                    seen.add(rel)
                    files.append((rel, fpath))

        return files

    # --- Markdown chunking ---

    def _chunk_markdown(self, content: str, doc_id: str, filename: str) -> list:
        """Split markdown into one chunk per heading-delimited section.

        Chunk ids are ``"<doc_id>::<heading>"`` so equal headings in different
        files cannot overwrite each other in ``self.chunks``; a heading that
        repeats within one file gets an ``@<line>`` suffix. Lines inside
        fenced code blocks are never treated as headings.
        """
        lines = content.split("\n")
        sections = []
        used_ids = set()

        def flush(heading, body, path, start, end):
            text = "\n".join(body).strip()
            if not text:
                return
            chunk_id = f"{doc_id}::{heading}"
            if chunk_id in used_ids:
                chunk_id = f"{chunk_id}@{start}"
            used_ids.add(chunk_id)
            sections.append(self._make_doc_chunk(
                doc_id, chunk_id, text, path, start, end, "markdown", filename,
            ))

        heading = filename  # text before the first heading is filed under the file name
        path = [filename]
        body = []
        start = 0
        fence = None  # opening marker of the fenced code block we are inside, if any

        for i, line in enumerate(lines):
            stripped = line.lstrip()
            if stripped.startswith(("```", "~~~")):
                marker = stripped[:3]
                if fence is None:
                    fence = marker
                elif marker == fence:
                    fence = None
                body.append(line)
                continue

            match = None if fence else self._HEADING_RE.match(line)
            if match is None:
                body.append(line)
                continue

            # A new heading closes the section collected so far.
            flush(heading, body, path, start, i - 1)
            hashes = match.group(1)
            title = match.group(2).strip()
            heading = f"{hashes} {title}"
            # The path is the file name plus the current heading (H2 and deeper).
            path = [filename] if len(hashes) == 1 else [filename, title]
            body = [line]
            start = i

        flush(heading, body, path, start, len(lines) - 1)

        # Split oversized chunks
        result = []
        for chunk in sections:
            if chunk["tokens"] > _CHUNK_MAX_TOKENS:
                result.extend(self._split_chunk(chunk))
            else:
                result.append(chunk)
        return result

    def _split_chunk(self, chunk: dict) -> list:
        """Split an oversized chunk on line boundaries.

        The first piece keeps the original id; later pieces get ``::<n>``
        appended. A single line over the limit is kept whole.
        """
        parts = []
        current = []
        current_tokens = 0

        for line in chunk["content"].split("\n"):
            line_tokens = count_tokens(line)
            if current and current_tokens + line_tokens > _CHUNK_MAX_TOKENS:
                parts.append("\n".join(current))
                current = [line]
                current_tokens = line_tokens
            else:
                current.append(line)
                current_tokens += line_tokens

        if current:
            parts.append("\n".join(current))

        result = []
        for idx, part_text in enumerate(parts):
            part_text = part_text.strip()
            if not part_text:
                continue
            piece = dict(chunk)
            piece["id"] = f"{chunk['id']}::{idx}" if idx > 0 else chunk["id"]
            piece["content"] = part_text
            piece["tokens"] = count_tokens(part_text)
            result.append(piece)

        return result

    # --- Docstring chunking ---

    def _chunk_docstrings(self, content: str, doc_id: str, ext: str) -> list:
        """Extract docstring chunks from a code file.

        Only Python and JS/TS are handled; other extensions yield ``[]``.
        """
        if ext == ".py":
            return list(self._extract_python_docstrings(content, doc_id))
        if ext in (".js", ".ts", ".tsx", ".jsx"):
            return list(self._extract_jsdoc_comments(content, doc_id))
        return []

    def _extract_python_docstrings(self, content: str, doc_id: str) -> list:
        """Extract the module docstring and class/function docstrings.

        Ids are ``"<doc_id>::module"`` and ``"<doc_id>::<name>"``; a name that
        repeats in the file (e.g. ``__init__``) gets a ``:<line>`` suffix
        instead of overwriting the earlier chunk.
        """
        chunks = []
        used_ids = set()
        lines = content.split("\n")

        # Module docstring: must open within the first 5 lines. Block openers
        # are NOT skipped here, otherwise a file starting with ``class X:``
        # would have X's docstring reported as the module docstring.
        module_doc = self._find_python_docstring(lines, 0, skip_block_openers=False)
        if module_doc:
            module_id = f"{doc_id}::module"
            used_ids.add(module_id)
            chunks.append(self._make_doc_chunk(
                doc_id, module_id, module_doc["text"],
                [doc_id], module_doc["start"], module_doc["end"],
                "docstring", doc_id,
            ))

        # Class and function docstrings
        for i, line in enumerate(lines):
            m = self._PY_SYMBOL_RE.match(line)
            if not m:
                continue
            name = m.group(2)
            # Start after the (possibly multi-line) signature. Block openers
            # are not skipped, so a nested ``def`` right after ``class A:``
            # does not lend its docstring to ``A``.
            body_start = self._signature_end(lines, i) + 1
            doc = self._find_python_docstring(
                lines, body_start, skip_block_openers=False,
            )
            if not doc or not doc["text"]:
                continue
            chunk_id = f"{doc_id}::{name}"
            if chunk_id in used_ids:
                chunk_id = f"{chunk_id}:{i}"
            used_ids.add(chunk_id)
            chunks.append(self._make_doc_chunk(
                doc_id, chunk_id, doc["text"],
                [doc_id, name], doc["start"], doc["end"],
                "docstring", doc_id,
            ))

        return chunks

    @staticmethod
    def _signature_end(lines: list, start: int, max_lines: int = 40) -> int:
        """Return the index of the line on which a def/class header ends.

        Counts bracket depth from ``start`` (text after ``#`` is ignored) and
        stops at the first line where the depth is back to zero. This is a
        heuristic: if the brackets never balance within ``max_lines`` (for
        example a bracket inside a string default), ``start`` is returned.
        """
        depth = 0
        for j in range(start, min(len(lines), start + max_lines)):
            code = lines[j].split("#", 1)[0]
            depth += sum(code.count(ch) for ch in "([{")
            depth -= sum(code.count(ch) for ch in ")]}")
            if depth <= 0:
                return j
        return start

    def _find_python_docstring(self, lines: list, start: int,
                               skip_block_openers: bool = True) -> Optional[dict]:
        """Find a triple-quoted docstring that opens within 5 lines of ``start``.

        Blank lines, decorators and ``#`` comments are skipped while looking
        for the opening quotes; any other line ends the search.

        Args:
            lines: The file's lines.
            start: Index of the first line to inspect.
            skip_block_openers: Also skip lines ending in ``:`` (the historical
                behaviour, kept as the default).

        Returns:
            ``{"text", "start", "end"}`` with 0-based inclusive line indices,
            or ``None``. An empty single-line docstring, or a multi-line one
            shorter than 10 characters, yields ``None``.
        """
        limit = min(len(lines), start + 5)
        i = start
        while i < limit:
            stripped = lines[i].strip()
            opener = self._PY_DOC_OPEN_RE.match(stripped)
            if opener:
                break
            skippable = (
                not stripped
                or stripped.startswith(("@", "#"))
                or (skip_block_openers and stripped.endswith(":"))
            )
            if not skippable:
                return None
            i += 1
        else:
            # Window exhausted (or empty) without seeing an opening quote.
            return None

        quote = opener.group(1)
        doc_start = i
        # Slice after the opener; str.strip(quote) would strip *characters*,
        # eating quotes that belong to the text itself.
        first = stripped[opener.end():]

        # Single-line docstring: the closing quotes are on the opening line.
        if quote in first:
            text = first.split(quote, 1)[0].strip()
            if text:
                return {"text": text, "start": doc_start, "end": doc_start}
            return None

        # Multi-line docstring
        doc_lines = [first]
        end = len(lines) - 1  # stays in range even if the quotes never close
        for j in range(doc_start + 1, len(lines)):
            raw = lines[j]
            if quote in raw:
                # Keep only what precedes the closing quotes.
                doc_lines.append(raw.split(quote, 1)[0].strip())
                end = j
                break
            doc_lines.append(raw.strip())

        text = "\n".join(doc_lines).strip()
        if len(text) < 10:  # skip trivial docstrings
            return None
        return {"text": text, "start": doc_start, "end": end}

    def _extract_jsdoc_comments(self, content: str, doc_id: str) -> list:
        """Extract JSDoc comments (``/** ... */``) from JS/TS source.

        Comments whose cleaned text is under 20 characters are skipped. Ids
        are ``"<doc_id>::jsdoc:<start line>"``.
        """
        chunks = []

        for match in self._JSDOC_RE.finditer(content):
            # Remove only the single leading "*" gutter from each line, so
            # Markdown emphasis or bullets inside the comment survive.
            cleaned = []
            for raw in match.group(1).strip().split("\n"):
                piece = self._JSDOC_GUTTER_RE.sub("", raw.strip()).strip()
                if piece:
                    cleaned.append(piece)
            text = "\n".join(cleaned)

            if len(text) < 20:  # skip trivial comments
                continue

            start_line = content.count("\n", 0, match.start())
            end_line = content.count("\n", 0, match.end())

            chunks.append(self._make_doc_chunk(
                doc_id, f"{doc_id}::jsdoc:{start_line}",
                text, [doc_id], start_line, end_line,
                "docstring", doc_id,
            ))

        return chunks

    # --- Config chunking ---

    def _chunk_config(self, content: str, doc_id: str, filename: str) -> list:
        """Index a config file as a single chunk.

        Files under 5 tokens are ignored. Larger files are cut at the last
        whole line that fits in ``_CHUNK_MAX_TOKENS``; if not even the first
        line fits (e.g. minified JSON) nothing is indexed, rather than storing
        an empty chunk.
        """
        tokens = count_tokens(content)
        if tokens < 5:
            return []

        if tokens > _CHUNK_MAX_TOKENS:
            kept = []
            running = 0
            for line in content.split("\n"):
                cost = count_tokens(line)
                if running + cost > _CHUNK_MAX_TOKENS:
                    break
                kept.append(line)
                running += cost
            content = "\n".join(kept)
            if not content.strip():
                return []

        return [self._make_doc_chunk(
            doc_id, f"{doc_id}::config", content,
            [filename], 0, content.count("\n"),
            "config", filename,
        )]

    # --- Helpers ---

    def _make_doc_chunk(self, doc_id: str, chunk_id: str, content: str,
                        heading_path: list, line_start: int, line_end: int,
                        source_type: str, filename: str) -> dict:
        """Create a standardized chunk dict with priority scoring.

        Priority is ``_SOURCE_PRIORITY[source_type]`` (default 1.0) times a
        file boost: the ``_PRIORITY_FILES`` entry for the file's base name, at
        least 1.2 for anything under a top-level ``docs/`` directory.
        """
        base_priority = _SOURCE_PRIORITY.get(source_type, 1.0)

        # Compare whole base names; a suffix test would also boost files such
        # as "NOT_README.md".
        file_boost = 1.0
        for candidate in (filename, doc_id):
            basename = self._PATH_SEP_RE.split(candidate)[-1]
            if basename in _PRIORITY_FILES:
                file_boost = _PRIORITY_FILES[basename]
                break

        # Docs in a docs/ directory get a small boost
        if doc_id.startswith(("docs/", "docs\\")):
            file_boost = max(file_boost, 1.2)

        return {
            "id": chunk_id,
            "doc_id": doc_id,
            "content": content,
            "tokens": count_tokens(content),
            "kind": "doc",
            "heading_path": heading_path,
            "source_type": source_type,
            "priority": round(base_priority * file_boost, 2),
            "line_start": line_start,
            "line_end": line_end,
        }

    # --- Search ---

    def search(self, query: str, top_k: int = 5) -> list:
        """Rank chunks by keyword overlap with ``query``.

        The score is the Jaccard overlap of the query and content word sets,
        plus up to 0.3 for query words found in the heading path, multiplied
        by the chunk priority.

        Args:
            query: Free-text query.
            top_k: Maximum number of results; values <= 0 return ``[]``.

        Returns:
            Copies of the best chunks, each with a ``score`` rounded to four
            decimals, best first.
        """
        if not self.chunks or top_k <= 0:
            return []

        words = self._WORD_RE.findall
        query_tokens = set(words(query.lower()))
        if not query_tokens:
            return []

        scored = []
        for cid, chunk in self.chunks.items():
            content_tokens = set(words(chunk["content"].lower()))
            overlap = query_tokens & content_tokens
            if not overlap:
                continue

            # Jaccard-like overlap
            score = len(overlap) / len(query_tokens | content_tokens)

            # Boost by heading path match
            heading_text = " ".join(chunk.get("heading_path", [])).lower()
            heading_overlap = query_tokens & set(words(heading_text))
            if heading_overlap:
                score += 0.3 * len(heading_overlap) / len(query_tokens)

            # Apply priority multiplier
            score *= chunk.get("priority", 1.0)
            scored.append((cid, score))

        scored.sort(key=lambda item: item[1], reverse=True)
        return [
            {**self.chunks[cid], "score": round(score, 4)}
            for cid, score in scored[:top_k]
        ]

    def search_semantic(self, query: str, embedding_index, top_k: int = 5) -> list:
        """Search docs; currently an alias for keyword :meth:`search`.

        ``embedding_index`` is accepted but not used.
        """
        return self.search(query, top_k=top_k)

    def get_stats(self) -> dict:
        """Return the chunk total, tracked-file count and chunks per source type."""
        by_type = {}
        for chunk in self.chunks.values():
            source = chunk.get("source_type", "unknown")
            by_type[source] = by_type.get(source, 0) + 1

        return {
            "total_chunks": len(self.chunks),
            "files_tracked": len(self._file_hashes),
            "by_source_type": by_type,
        }