codebase-index 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_index/__init__.py +7 -0
- codebase_index/__main__.py +3 -0
- codebase_index/cli.py +916 -0
- codebase_index/config.py +110 -0
- codebase_index/discovery/__init__.py +10 -0
- codebase_index/discovery/classify.py +151 -0
- codebase_index/discovery/ignore.py +58 -0
- codebase_index/discovery/walker.py +75 -0
- codebase_index/doctor.py +138 -0
- codebase_index/embeddings/__init__.py +2 -0
- codebase_index/embeddings/backend.py +67 -0
- codebase_index/embeddings/external.py +56 -0
- codebase_index/embeddings/local.py +41 -0
- codebase_index/embeddings/noop.py +15 -0
- codebase_index/graph/__init__.py +8 -0
- codebase_index/graph/analysis.py +468 -0
- codebase_index/graph/builder.py +160 -0
- codebase_index/graph/expand.py +136 -0
- codebase_index/graph/export.py +381 -0
- codebase_index/graph/navigate.py +201 -0
- codebase_index/indexer/__init__.py +8 -0
- codebase_index/indexer/doc_chunks.py +202 -0
- codebase_index/indexer/freshness.py +109 -0
- codebase_index/indexer/pipeline.py +423 -0
- codebase_index/mcp/__init__.py +2 -0
- codebase_index/mcp/server.py +354 -0
- codebase_index/models.py +145 -0
- codebase_index/output/__init__.py +6 -0
- codebase_index/output/json.py +13 -0
- codebase_index/output/markdown.py +316 -0
- codebase_index/output/redact.py +31 -0
- codebase_index/parsers/__init__.py +9 -0
- codebase_index/parsers/base.py +47 -0
- codebase_index/parsers/languages.py +290 -0
- codebase_index/parsers/line_chunker.py +39 -0
- codebase_index/parsers/symbol_chunks.py +62 -0
- codebase_index/parsers/treesitter.py +439 -0
- codebase_index/retrieval/__init__.py +9 -0
- codebase_index/retrieval/budget.py +82 -0
- codebase_index/retrieval/fusion.py +62 -0
- codebase_index/retrieval/intent.py +56 -0
- codebase_index/retrieval/pipeline.py +207 -0
- codebase_index/retrieval/rerank.py +69 -0
- codebase_index/retrieval/searchers.py +291 -0
- codebase_index/retrieval/skeleton.py +251 -0
- codebase_index/retrieval/types.py +79 -0
- codebase_index/scaffold.py +399 -0
- codebase_index/service.py +158 -0
- codebase_index/skill_template/SKILL.md +198 -0
- codebase_index/skill_template/examples/hooks/settings.json +16 -0
- codebase_index/skill_template/scripts/cbx +25 -0
- codebase_index/skill_template/scripts/cbx.ps1 +25 -0
- codebase_index/skill_update.py +150 -0
- codebase_index/storage/__init__.py +8 -0
- codebase_index/storage/db.py +116 -0
- codebase_index/storage/repo.py +701 -0
- codebase_index/storage/schema.sql +125 -0
- codebase_index/watch/__init__.py +5 -0
- codebase_index/watch/watcher.py +93 -0
- codebase_index-1.6.0.dist-info/METADATA +748 -0
- codebase_index-1.6.0.dist-info/RECORD +64 -0
- codebase_index-1.6.0.dist-info/WHEEL +4 -0
- codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
- codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""Retrieval-time snippet skeletonization (line-granularity StructureMask).
|
|
2
|
+
|
|
3
|
+
Turns a raw code/text snippet into a compact skeleton: signature/structural
|
|
4
|
+
lines are kept, function bodies (and other compressible runs) collapse into a
|
|
5
|
+
marker that points at the absolute line range to read for the full body. A
|
|
6
|
+
line-granularity port of headroom's StructureMask, adapted for a retrieval
|
|
7
|
+
system: the query-matching line is always preserved, routing is by file
|
|
8
|
+
extension, and the transform never makes output worse than the raw snippet.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import Callable, Optional
|
|
16
|
+
|
|
17
|
+
from ..parsers.line_chunker import estimate_tokens
|
|
18
|
+
from .types import Candidate, Intent
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def render_skeleton(
    content: str, keep: list[bool], *, line_start: int
) -> tuple[str, int]:
    """Render ``content`` with each dropped run replaced by a single marker.

    ``keep`` is a per-line mask over ``content.split("\n")``. Every maximal run
    of falsy entries becomes one ``... N lines elided (read A-B)`` line, where
    ``A``/``B`` are absolute file line numbers derived from ``line_start`` (the
    file line number of the first line of ``content``).

    Returns ``(rendered_text, number_of_elided_lines)``. When the mask length
    does not match the number of lines, ``content`` is returned untouched with
    an elided count of 0.
    """
    src = content.split("\n")
    if len(src) != len(keep):
        # A mismatched mask must never corrupt the output.
        return content, 0

    rendered: list[str] = []
    dropped = 0
    run_begin = None  # 0-based index where the pending elided run started

    def close_run(stop: int) -> int:
        """Emit the marker for the run ``[run_begin, stop)``; return its length."""
        length = stop - run_begin
        first = line_start + run_begin
        last = line_start + stop - 1
        rendered.append(f"... {length} lines elided (read {first}-{last})")
        return length

    for idx, text in enumerate(src):
        if not keep[idx]:
            if run_begin is None:
                run_begin = idx
            continue
        if run_begin is not None:
            dropped += close_run(idx)
            run_begin = None
        rendered.append(text)

    # A run that reaches the end of the snippet still needs its marker.
    if run_begin is not None:
        dropped += close_run(len(src))

    return "\n".join(rendered), dropped
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Languages we skeletonize via tree-sitter signatures. Mirrors
|
|
55
|
+
# discovery.classify._TREE_SITTER_LANGS (kept local to avoid a private import).
|
|
56
|
+
_CODE_LANGS = frozenset({
|
|
57
|
+
"python", "typescript", "javascript", "go", "java", "rust",
|
|
58
|
+
"c", "cpp", "csharp", "ruby", "php", "kotlin", "lua",
|
|
59
|
+
})
|
|
60
|
+
# Languages whose body opens at a line containing '{' vs. one ending in ':'.
|
|
61
|
+
_BRACE_LANGS = frozenset({
|
|
62
|
+
"typescript", "javascript", "go", "java", "rust",
|
|
63
|
+
"c", "cpp", "csharp", "php", "kotlin",
|
|
64
|
+
})
|
|
65
|
+
_MAX_SIG_SCAN = 5 # bound the multi-line-signature lookahead
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@dataclass
class Compacted:
    """Result of compacting one snippet (raw passthrough or skeleton)."""

    # Text to emit: either the untouched snippet or its skeleton.
    text: str
    # Token estimate for ``text``.
    token_est: int
    # Number of source lines replaced by elision markers (0 for raw output).
    elided_lines: int
    # True only when ``text`` is a skeleton rather than the raw snippet.
    skeletonized: bool
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _raw(content: str) -> Compacted:
    """Wrap ``content`` unchanged: nothing elided, not skeletonized."""
    tokens = estimate_tokens(content)
    return Compacted(
        text=content,
        token_est=tokens,
        elided_lines=0,
        skeletonized=False,
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _signature_end(lines: list[str], start: int, lang: str | None, end: int) -> int:
    """Return the 0-based index of the last signature line of a definition.

    Starting at ``start``, looks at no more than ``_MAX_SIG_SCAN`` further lines
    (and never past ``end``) for the line that opens the body: one containing
    ``{`` for brace languages, one whose stripped text ends with ``:`` for
    everything else. Falls back to ``start`` when no such line is found, so a
    multi-line signature stays visible only when its opener is detected.
    """
    brace_style = lang in _BRACE_LANGS
    last = min(end, start + _MAX_SIG_SCAN)
    for idx in range(start, last + 1):
        stripped = lines[idx].strip()
        opens_body = ("{" in stripped) if brace_style else stripped.endswith(":")
        if opens_body:
            return idx
    return start
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _classify_code(content: str, lines: list[str], lang: str) -> list[bool] | None:
    """Build a keep-mask that hides function/method bodies.

    Everything is kept by default; the lines after each callable's signature
    (through the callable's last line) are dropped, and then the signature
    line(s) of every symbol are restored so definitions nested inside an elided
    body stay visible.

    Returns None when parsing raises or yields no symbols, letting the caller
    fall back to the raw snippet.
    """
    from ..parsers.treesitter import parse_file

    try:
        result = parse_file(lang, content)
    except Exception:
        return None
    symbols = result.symbols
    if not symbols:
        return None

    total = len(lines)

    def span(sym):
        """(first, signature_last, body_last) as 0-based indices, or None if out of range."""
        first = sym.line_start - 1
        if first < 0 or first >= total:
            return None
        body_last = min(sym.line_end - 1, total - 1)
        return first, _signature_end(lines, first, lang, body_last), body_last

    mask = [True] * total

    # First sweep: drop the interior of every callable.
    for sym in symbols:
        if sym.kind not in ("function", "method"):
            continue
        bounds = span(sym)
        if bounds is None:
            continue
        _, sig_last, body_last = bounds
        for idx in range(sig_last + 1, body_last + 1):
            mask[idx] = False

    # Second sweep: bring back every symbol's signature (restores nested defs).
    for sym in symbols:
        bounds = span(sym)
        if bounds is None:
            continue
        first, sig_last, _ = bounds
        for idx in range(first, sig_last + 1):
            mask[idx] = True

    return mask
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _apply_focus(lines: list[str], keep: list[bool],
|
|
138
|
+
query_terms: list[str], ctx_lines: int) -> None:
|
|
139
|
+
"""Force-keep any line containing a query term, plus +/- ctx_lines."""
|
|
140
|
+
if not query_terms:
|
|
141
|
+
return
|
|
142
|
+
n = len(lines)
|
|
143
|
+
for i, line in enumerate(lines):
|
|
144
|
+
low = line.lower()
|
|
145
|
+
if any(t in low for t in query_terms):
|
|
146
|
+
for j in range(max(0, i - ctx_lines), min(n, i + ctx_lines + 1)):
|
|
147
|
+
keep[j] = True
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
_STRUCT_LANGS = frozenset({"json", "yaml", "toml", "ini"})
|
|
151
|
+
_HEADING_RE = re.compile(r"^\s{0,3}#{1,6}\s")
|
|
152
|
+
_SECTION_RE = re.compile(r"^\s*\[.*\]\s*$") # toml/ini section header
|
|
153
|
+
_KEY_RE = re.compile(r"[:=]") # key/value introducer
|
|
154
|
+
_BRACKET = {"{", "}", "[", "]", "{}", "[]", "},", "],"}
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _classify_markdown(lines: list[str]) -> list[bool]:
    """Keep each heading and the first non-blank line that follows it.

    Every other line starts out dropped.
    """
    total = len(lines)
    mask = [False] * total
    for idx in range(total):
        if not _HEADING_RE.match(lines[idx]):
            continue
        mask[idx] = True
        # First line after the heading with any non-whitespace content.
        lead = next(
            (j for j in range(idx + 1, total) if lines[j].strip()),
            None,
        )
        if lead is not None:
            mask[lead] = True
    return mask
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _classify_structured(lines: list[str]) -> list[bool]:
    """Keep blank lines, bare brackets, section headers and key/value lines.

    A line is kept when its stripped text is empty, is one of ``_BRACKET``,
    matches ``_SECTION_RE``, or contains a ``:``/``=`` introducer; every other
    line is dropped.
    """
    stripped_lines = (raw.strip() for raw in lines)
    return [
        bool(
            not text
            or text in _BRACKET
            or _SECTION_RE.match(text)
            or _KEY_RE.search(text)
        )
        for text in stripped_lines
    ]
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def classify_lines(content: str, *, lang: str | None,
                   query_terms: list[str], ctx_lines: int) -> list[bool]:
    """Compute the per-line keep-mask for ``content``.

    Routes by ``lang`` to the code, markdown or structured classifier. An
    unrecognized language, or a code classifier that returns None, results in
    an all-True mask (raw output). Query-term focus is applied last, so
    matching lines are force-kept whichever classifier produced the mask.
    """
    lines = content.split("\n")

    if lang in _CODE_LANGS:
        mask = _classify_code(content, lines, lang)
    elif lang == "markdown":
        mask = _classify_markdown(lines)
    elif lang in _STRUCT_LANGS:
        mask = _classify_structured(lines)
    else:
        mask = None

    if mask is None:
        # Unknown language or parse miss: keep every line.
        mask = [True] * len(lines)

    _apply_focus(lines, mask, query_terms, ctx_lines)
    return mask
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def compact(content: str, *, path: str, line_start: int, ctx_lines: int,
            query_terms: list[str], min_reduction: float) -> Compacted:
    """Route -> classify -> render -> guard, with a raw fallback on any miss.

    The snippet is returned raw when it is blank, when every line would be
    kept, when rendering elides nothing, when the skeleton's token estimate
    exceeds ``raw_estimate * (1 - min_reduction)``, or when any step raises.
    Otherwise the skeleton is returned with ``skeletonized=True``.
    """
    if not content.strip():
        return _raw(content)

    try:
        from ..discovery.classify import detect_language

        language = detect_language(path)
        lowered = [term.lower() for term in query_terms]
        mask = classify_lines(content, lang=language,
                              query_terms=lowered, ctx_lines=ctx_lines)
        if not all(mask):
            skeleton, dropped = render_skeleton(content, mask,
                                                line_start=line_start)
            if dropped != 0:
                skeleton_tokens = estimate_tokens(skeleton)
                raw_tokens = estimate_tokens(content)
                ceiling = raw_tokens * (1.0 - min_reduction)
                if skeleton_tokens > ceiling:
                    # Savings too small to justify hiding lines.
                    return _raw(content)
                return Compacted(text=skeleton, token_est=skeleton_tokens,
                                 elided_lines=dropped, skeletonized=True)
        return _raw(content)
    except Exception:
        return _raw(content)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
# Intents that are about overall shape: they get pure signatures, with no
# context lines kept around query matches.
_SHAPE_INTENTS = frozenset({
    Intent.ARCHITECTURE,
    Intent.HOW_IT_WORKS,
    Intent.DATA_FLOW,
})
# A query term is a run of ASCII letters, digits and underscores.
_TERM_RE = re.compile(r"[A-Za-z0-9_]+")
# Common words that are never used as focus terms.
_STOPWORDS = frozenset({
    "the", "a", "an", "is", "are", "how", "does", "do", "what", "where",
    "which", "to", "of", "in", "on", "for", "and", "or", "with", "from",
})
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def _query_terms(query: str) -> list[str]:
    """Extract lower-cased focus terms from ``query``.

    Terms shorter than three characters and stopwords are discarded; duplicates
    are removed while preserving first-seen order.
    """
    ordered: dict[str, None] = {}
    for found in _TERM_RE.finditer(query):
        word = found.group().lower()
        if len(word) < 3 or word in _STOPWORDS:
            continue
        ordered.setdefault(word, None)
    return list(ordered)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def make_compactor(*, intent: Intent, query: str, enabled: bool,
                   min_reduction: float) -> Optional[Callable[[Candidate], Compacted]]:
    """Build a per-candidate compaction callable, or None when disabled.

    The context radius and the focus terms are derived once from ``intent`` and
    ``query``; the returned callable reuses them for every candidate. Shape
    intents use a radius of 0, all other intents a radius of 2.
    """
    if not enabled:
        return None

    if intent in _SHAPE_INTENTS:
        radius = 0
    else:
        radius = 2
    focus_terms = _query_terms(query)

    def _compact(c: Candidate) -> Compacted:
        # A candidate without content is compacted as an empty snippet.
        snippet = c.content or ""
        return compact(
            snippet,
            path=c.path,
            line_start=c.line_start,
            ctx_lines=radius,
            query_terms=focus_terms,
            min_reduction=min_reduction,
        )

    return _compact
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Shared retrieval types: the uniform candidate + intent plan."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from enum import Enum
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
# Line window used by Candidate.fuse_key to group co-located hits across retrievers.
|
|
10
|
+
# Wide enough to merge a symbol body and the FTS window that overlaps it, narrow
|
|
11
|
+
# enough to keep distinct regions of a large file separate.
|
|
12
|
+
_FUSE_BUCKET_LINES = 40
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Intent(str, Enum):
    """Closed set of query intents; each member is also its string value."""

    LOCATE_IMPL = "locate_impl"
    HOW_IT_WORKS = "how_it_works"
    IMPACT = "impact"
    FIND_REFS = "find_refs"
    DATA_FLOW = "data_flow"
    DEBUG_ERROR = "debug_error"
    ARCHITECTURE = "architecture"
    KEYWORD = "keyword"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Confidence(str, Enum):
    """Three-level confidence label; each member is also its string value."""

    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class Candidate:
    """Source-agnostic retrieval hit. `source` in {"path","symbol","fts"}."""

    # Location of the hit.
    path: str
    line_start: int
    line_end: int
    # Which retriever produced the hit, and the score it assigned.
    source: str
    score: float
    # Optional details; all default to "absent"/zero.
    kind: Optional[str] = None
    symbol: Optional[str] = None
    content: Optional[str] = None
    token_est: int = 0
    in_degree: int = 0
    out_degree: int = 0
    ref_count: int = 0
    is_generated: bool = False
    exact_symbol: bool = False
    reason: str = ""
    agreeing_sources: int = 1

    def key(self) -> tuple[str, int, int]:
        """Exact identity of the hit: path plus its start and end lines."""
        return self.path, self.line_start, self.line_end

    def fuse_key(self) -> tuple[str, int]:
        """Coarse locator for RRF fusion: path + line bucket.

        Retrievers report the same place at different granularities (a symbol
        body, an 80-line FTS window, a path hit anchored at line 1), so the
        exact (path, start, end) key almost never coincides across sources and
        RRF degenerates into a weighted round-robin that never rewards
        agreement. Bucketing ``line_start`` puts co-located hits on one key so
        their per-source contributions sum, while genuinely distant regions of
        a large file still get different keys.
        """
        # Clamp to line 1 so a start of 0 or below lands in the first bucket.
        first_line = max(self.line_start, 1)
        bucket = (first_line - 1) // _FUSE_BUCKET_LINES
        return self.path, bucket
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class IntentPlan:
    """Retrieval plan for one intent: per-source weights plus a token budget."""

    intent: Intent
    # Weight per retrieval source name; sources not listed count as 0.0.
    weights: dict[str, float]
    token_budget: int
    graph_strategy: str = "none"
    summaries_first: bool = False

    def weight(self, source: str) -> float:
        """Return the configured weight for ``source``, or 0.0 when unset."""
        table = self.weights
        return table.get(source, 0.0)
|