codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. codebase_index/__init__.py +7 -0
  2. codebase_index/__main__.py +3 -0
  3. codebase_index/cli.py +916 -0
  4. codebase_index/config.py +110 -0
  5. codebase_index/discovery/__init__.py +10 -0
  6. codebase_index/discovery/classify.py +151 -0
  7. codebase_index/discovery/ignore.py +58 -0
  8. codebase_index/discovery/walker.py +75 -0
  9. codebase_index/doctor.py +138 -0
  10. codebase_index/embeddings/__init__.py +2 -0
  11. codebase_index/embeddings/backend.py +67 -0
  12. codebase_index/embeddings/external.py +56 -0
  13. codebase_index/embeddings/local.py +41 -0
  14. codebase_index/embeddings/noop.py +15 -0
  15. codebase_index/graph/__init__.py +8 -0
  16. codebase_index/graph/analysis.py +468 -0
  17. codebase_index/graph/builder.py +160 -0
  18. codebase_index/graph/expand.py +136 -0
  19. codebase_index/graph/export.py +381 -0
  20. codebase_index/graph/navigate.py +201 -0
  21. codebase_index/indexer/__init__.py +8 -0
  22. codebase_index/indexer/doc_chunks.py +202 -0
  23. codebase_index/indexer/freshness.py +109 -0
  24. codebase_index/indexer/pipeline.py +423 -0
  25. codebase_index/mcp/__init__.py +2 -0
  26. codebase_index/mcp/server.py +354 -0
  27. codebase_index/models.py +145 -0
  28. codebase_index/output/__init__.py +6 -0
  29. codebase_index/output/json.py +13 -0
  30. codebase_index/output/markdown.py +316 -0
  31. codebase_index/output/redact.py +31 -0
  32. codebase_index/parsers/__init__.py +9 -0
  33. codebase_index/parsers/base.py +47 -0
  34. codebase_index/parsers/languages.py +290 -0
  35. codebase_index/parsers/line_chunker.py +39 -0
  36. codebase_index/parsers/symbol_chunks.py +62 -0
  37. codebase_index/parsers/treesitter.py +439 -0
  38. codebase_index/retrieval/__init__.py +9 -0
  39. codebase_index/retrieval/budget.py +82 -0
  40. codebase_index/retrieval/fusion.py +62 -0
  41. codebase_index/retrieval/intent.py +56 -0
  42. codebase_index/retrieval/pipeline.py +207 -0
  43. codebase_index/retrieval/rerank.py +69 -0
  44. codebase_index/retrieval/searchers.py +291 -0
  45. codebase_index/retrieval/skeleton.py +251 -0
  46. codebase_index/retrieval/types.py +79 -0
  47. codebase_index/scaffold.py +399 -0
  48. codebase_index/service.py +158 -0
  49. codebase_index/skill_template/SKILL.md +198 -0
  50. codebase_index/skill_template/examples/hooks/settings.json +16 -0
  51. codebase_index/skill_template/scripts/cbx +25 -0
  52. codebase_index/skill_template/scripts/cbx.ps1 +25 -0
  53. codebase_index/skill_update.py +150 -0
  54. codebase_index/storage/__init__.py +8 -0
  55. codebase_index/storage/db.py +116 -0
  56. codebase_index/storage/repo.py +701 -0
  57. codebase_index/storage/schema.sql +125 -0
  58. codebase_index/watch/__init__.py +5 -0
  59. codebase_index/watch/watcher.py +93 -0
  60. codebase_index-1.6.0.dist-info/METADATA +748 -0
  61. codebase_index-1.6.0.dist-info/RECORD +64 -0
  62. codebase_index-1.6.0.dist-info/WHEEL +4 -0
  63. codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
  64. codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,251 @@
1
+ """Retrieval-time snippet skeletonization (line-granularity StructureMask).
2
+
3
+ Turns a raw code/text snippet into a compact skeleton: signature/structural
4
+ lines are kept, function bodies (and other compressible runs) collapse into a
5
+ marker that points at the absolute line range to read for the full body. A
6
+ line-granularity port of headroom's StructureMask, adapted for a retrieval
7
+ system: the query-matching line is always preserved, routing is by file
8
+ extension, and the transform never makes output worse than the raw snippet.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import re
14
+ from dataclasses import dataclass
15
+ from typing import Callable, Optional
16
+
17
+ from ..parsers.line_chunker import estimate_tokens
18
+ from .types import Candidate, Intent
19
+
20
+
21
+ def render_skeleton(
22
+ content: str, keep: list[bool], *, line_start: int
23
+ ) -> tuple[str, int]:
24
+ """Collapse consecutive ``keep=False`` lines into one elision marker.
25
+
26
+ ``line_start`` is the absolute file line number of ``content``'s first line,
27
+ so markers cite the real range to ``Read``. Returns (text, elided_count).
28
+ """
29
+ lines = content.split("\n")
30
+ if len(keep) != len(lines):
31
+ # Defensive: mask/line mismatch must never corrupt output.
32
+ return content, 0
33
+
34
+ out: list[str] = []
35
+ elided_total = 0
36
+ i = 0
37
+ n = len(lines)
38
+ while i < n:
39
+ if keep[i]:
40
+ out.append(lines[i])
41
+ i += 1
42
+ continue
43
+ run_start = i
44
+ while i < n and not keep[i]:
45
+ i += 1
46
+ run_len = i - run_start
47
+ elided_total += run_len
48
+ a = line_start + run_start
49
+ b = line_start + i - 1
50
+ out.append(f"... {run_len} lines elided (read {a}-{b})")
51
+ return "\n".join(out), elided_total
52
+
53
+
54
+ # Languages we skeletonize via tree-sitter signatures. Mirrors
55
+ # discovery.classify._TREE_SITTER_LANGS (kept local to avoid a private import).
56
+ _CODE_LANGS = frozenset({
57
+ "python", "typescript", "javascript", "go", "java", "rust",
58
+ "c", "cpp", "csharp", "ruby", "php", "kotlin", "lua",
59
+ })
60
+ # Languages whose body opens at a line containing '{' vs. one ending in ':'.
61
+ _BRACE_LANGS = frozenset({
62
+ "typescript", "javascript", "go", "java", "rust",
63
+ "c", "cpp", "csharp", "php", "kotlin",
64
+ })
65
+ _MAX_SIG_SCAN = 5 # bound the multi-line-signature lookahead
66
+
67
+
68
+ @dataclass
69
+ class Compacted:
70
+ text: str
71
+ token_est: int
72
+ elided_lines: int
73
+ skeletonized: bool
74
+
75
+
76
+ def _raw(content: str) -> Compacted:
77
+ return Compacted(text=content, token_est=estimate_tokens(content),
78
+ elided_lines=0, skeletonized=False)
79
+
80
+
81
+ def _signature_end(lines: list[str], start: int, lang: str | None, end: int) -> int:
82
+ """0-based index of the last signature line for a def starting at ``start``.
83
+
84
+ Scans forward (bounded) for the line that opens the body so multi-line
85
+ signatures stay visible; defaults to ``start`` when nothing matches.
86
+ """
87
+ limit = min(end, start + _MAX_SIG_SCAN)
88
+ for i in range(start, limit + 1):
89
+ s = lines[i].strip()
90
+ if lang in _BRACE_LANGS and "{" in s:
91
+ return i
92
+ if lang not in _BRACE_LANGS and s.endswith(":"):
93
+ return i
94
+ return start
95
+
96
+
97
+ def _classify_code(content: str, lines: list[str], lang: str) -> list[bool] | None:
98
+ """Keep imports/signatures/headers; elide function & method bodies.
99
+
100
+ Returns None when parsing yields no usable symbols (caller falls back).
101
+ """
102
+ from ..parsers.treesitter import parse_file
103
+
104
+ try:
105
+ result = parse_file(lang, content)
106
+ except Exception:
107
+ return None
108
+ symbols = result.symbols
109
+ if not symbols:
110
+ return None
111
+
112
+ n = len(lines)
113
+ keep = [True] * n
114
+ # Pass 1: elide the interior of every callable body.
115
+ for sym in symbols:
116
+ if sym.kind not in ("function", "method"):
117
+ continue
118
+ start0 = sym.line_start - 1
119
+ if not (0 <= start0 < n):
120
+ continue
121
+ end0 = min(sym.line_end - 1, n - 1)
122
+ sig_end = _signature_end(lines, start0, lang, end0)
123
+ for i in range(sig_end + 1, end0 + 1):
124
+ keep[i] = False
125
+ # Pass 2: re-keep every symbol's signature line(s) (restores nested defs).
126
+ for sym in symbols:
127
+ start0 = sym.line_start - 1
128
+ if not (0 <= start0 < n):
129
+ continue
130
+ end0 = min(sym.line_end - 1, n - 1)
131
+ sig_end = _signature_end(lines, start0, lang, end0)
132
+ for i in range(start0, sig_end + 1):
133
+ keep[i] = True
134
+ return keep
135
+
136
+
137
+ def _apply_focus(lines: list[str], keep: list[bool],
138
+ query_terms: list[str], ctx_lines: int) -> None:
139
+ """Force-keep any line containing a query term, plus +/- ctx_lines."""
140
+ if not query_terms:
141
+ return
142
+ n = len(lines)
143
+ for i, line in enumerate(lines):
144
+ low = line.lower()
145
+ if any(t in low for t in query_terms):
146
+ for j in range(max(0, i - ctx_lines), min(n, i + ctx_lines + 1)):
147
+ keep[j] = True
148
+
149
+
150
+ _STRUCT_LANGS = frozenset({"json", "yaml", "toml", "ini"})
151
+ _HEADING_RE = re.compile(r"^\s{0,3}#{1,6}\s")
152
+ _SECTION_RE = re.compile(r"^\s*\[.*\]\s*$") # toml/ini section header
153
+ _KEY_RE = re.compile(r"[:=]") # key/value introducer
154
+ _BRACKET = {"{", "}", "[", "]", "{}", "[]", "},", "],"}
155
+
156
+
157
+ def _classify_markdown(lines: list[str]) -> list[bool]:
158
+ keep = [False] * len(lines)
159
+ for i, line in enumerate(lines):
160
+ if _HEADING_RE.match(line):
161
+ keep[i] = True
162
+ # keep the first non-blank line of the section
163
+ for j in range(i + 1, len(lines)):
164
+ if lines[j].strip():
165
+ keep[j] = True
166
+ break
167
+ return keep
168
+
169
+
170
+ def _classify_structured(lines: list[str]) -> list[bool]:
171
+ keep = [False] * len(lines)
172
+ for i, line in enumerate(lines):
173
+ s = line.strip()
174
+ if not s or s in _BRACKET or _SECTION_RE.match(s) or _KEY_RE.search(s):
175
+ keep[i] = True
176
+ return keep
177
+
178
+
179
+ def classify_lines(content: str, *, lang: str | None,
180
+ query_terms: list[str], ctx_lines: int) -> list[bool]:
181
+ lines = content.split("\n")
182
+ keep: list[bool] | None = None
183
+ if lang in _CODE_LANGS:
184
+ keep = _classify_code(content, lines, lang)
185
+ elif lang == "markdown":
186
+ keep = _classify_markdown(lines)
187
+ elif lang in _STRUCT_LANGS:
188
+ keep = _classify_structured(lines)
189
+ if keep is None:
190
+ keep = [True] * len(lines) # unknown / parse miss -> keep all (raw)
191
+ _apply_focus(lines, keep, query_terms, ctx_lines)
192
+ return keep
193
+
194
+
195
+ def compact(content: str, *, path: str, line_start: int, ctx_lines: int,
196
+ query_terms: list[str], min_reduction: float) -> Compacted:
197
+ """Route -> classify -> render -> guard. Never raises; raw fallback on any miss."""
198
+ if not content.strip():
199
+ return _raw(content)
200
+ try:
201
+ from ..discovery.classify import detect_language
202
+ lang = detect_language(path)
203
+ keep = classify_lines(content, lang=lang,
204
+ query_terms=[t.lower() for t in query_terms],
205
+ ctx_lines=ctx_lines)
206
+ if all(keep):
207
+ return _raw(content)
208
+ text, elided = render_skeleton(content, keep, line_start=line_start)
209
+ if elided == 0:
210
+ return _raw(content)
211
+ new_tok = estimate_tokens(text)
212
+ raw_tok = estimate_tokens(content)
213
+ if new_tok > raw_tok * (1.0 - min_reduction):
214
+ return _raw(content) # not a meaningful win
215
+ return Compacted(text=text, token_est=new_tok,
216
+ elided_lines=elided, skeletonized=True)
217
+ except Exception:
218
+ return _raw(content)
219
+
220
+
221
+ # Shape-first intents want pure signatures (no context around matches).
222
+ _SHAPE_INTENTS = frozenset({Intent.ARCHITECTURE, Intent.HOW_IT_WORKS, Intent.DATA_FLOW})
223
+ _TERM_RE = re.compile(r"[A-Za-z0-9_]+")
224
+ _STOPWORDS = frozenset({
225
+ "the", "a", "an", "is", "are", "how", "does", "do", "what", "where",
226
+ "which", "to", "of", "in", "on", "for", "and", "or", "with", "from",
227
+ })
228
+
229
+
230
+ def _query_terms(query: str) -> list[str]:
231
+ out: list[str] = []
232
+ for t in _TERM_RE.findall(query):
233
+ tl = t.lower()
234
+ if len(tl) >= 3 and tl not in _STOPWORDS:
235
+ out.append(tl)
236
+ return list(dict.fromkeys(out))
237
+
238
+
239
+ def make_compactor(*, intent: Intent, query: str, enabled: bool,
240
+ min_reduction: float) -> Optional[Callable[[Candidate], Compacted]]:
241
+ if not enabled:
242
+ return None
243
+ ctx_lines = 0 if intent in _SHAPE_INTENTS else 2
244
+ terms = _query_terms(query)
245
+
246
+ def _compact(c: Candidate) -> Compacted:
247
+ return compact(c.content or "", path=c.path, line_start=c.line_start,
248
+ ctx_lines=ctx_lines, query_terms=terms,
249
+ min_reduction=min_reduction)
250
+
251
+ return _compact
@@ -0,0 +1,79 @@
1
+ """Shared retrieval types: the uniform candidate + intent plan."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from enum import Enum
7
+ from typing import Optional
8
+
9
# Width, in lines, of the bucket Candidate.fuse_key uses to group co-located
# hits from different retrievers. It is wide enough that a symbol body and the
# FTS window overlapping it merge, yet narrow enough that distinct regions of a
# large file stay separate.
_FUSE_BUCKET_LINES = 40
13
+
14
+
15
+ class Intent(str, Enum):
16
+ LOCATE_IMPL = "locate_impl"
17
+ HOW_IT_WORKS = "how_it_works"
18
+ IMPACT = "impact"
19
+ FIND_REFS = "find_refs"
20
+ DATA_FLOW = "data_flow"
21
+ DEBUG_ERROR = "debug_error"
22
+ ARCHITECTURE = "architecture"
23
+ KEYWORD = "keyword"
24
+
25
+
26
+ class Confidence(str, Enum):
27
+ HIGH = "high"
28
+ MEDIUM = "medium"
29
+ LOW = "low"
30
+
31
+
32
+ @dataclass
33
+ class Candidate:
34
+ """Source-agnostic retrieval hit. `source` in {"path","symbol","fts"}."""
35
+
36
+ path: str
37
+ line_start: int
38
+ line_end: int
39
+ source: str
40
+ score: float
41
+ kind: Optional[str] = None
42
+ symbol: Optional[str] = None
43
+ content: Optional[str] = None
44
+ token_est: int = 0
45
+ in_degree: int = 0
46
+ out_degree: int = 0
47
+ ref_count: int = 0
48
+ is_generated: bool = False
49
+ exact_symbol: bool = False
50
+ reason: str = ""
51
+ agreeing_sources: int = 1
52
+
53
+ def key(self) -> tuple[str, int, int]:
54
+ return (self.path, self.line_start, self.line_end)
55
+
56
+ def fuse_key(self) -> tuple[str, int]:
57
+ """Coarse locator for RRF fusion: path + line bucket.
58
+
59
+ Different retrievers emit different granularities for the same place — a
60
+ symbol body, an 80-line FTS window, a path hit anchored at line 1 — so an
61
+ exact (path, start, end) key almost never coincides across sources and RRF
62
+ degenerates into a weighted round-robin that never rewards agreement.
63
+ Bucketing line_start collapses co-located hits onto one key so their
64
+ per-source RRF contributions actually sum, while still separating genuinely
65
+ distant regions of a large file.
66
+ """
67
+ return (self.path, (max(self.line_start, 1) - 1) // _FUSE_BUCKET_LINES)
68
+
69
+
70
+ @dataclass
71
+ class IntentPlan:
72
+ intent: Intent
73
+ weights: dict[str, float]
74
+ token_budget: int
75
+ graph_strategy: str = "none"
76
+ summaries_first: bool = False
77
+
78
+ def weight(self, source: str) -> float:
79
+ return self.weights.get(source, 0.0)