codebase-index 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_index/__init__.py +7 -0
- codebase_index/__main__.py +3 -0
- codebase_index/cli.py +916 -0
- codebase_index/config.py +110 -0
- codebase_index/discovery/__init__.py +10 -0
- codebase_index/discovery/classify.py +151 -0
- codebase_index/discovery/ignore.py +58 -0
- codebase_index/discovery/walker.py +75 -0
- codebase_index/doctor.py +138 -0
- codebase_index/embeddings/__init__.py +2 -0
- codebase_index/embeddings/backend.py +67 -0
- codebase_index/embeddings/external.py +56 -0
- codebase_index/embeddings/local.py +41 -0
- codebase_index/embeddings/noop.py +15 -0
- codebase_index/graph/__init__.py +8 -0
- codebase_index/graph/analysis.py +468 -0
- codebase_index/graph/builder.py +160 -0
- codebase_index/graph/expand.py +136 -0
- codebase_index/graph/export.py +381 -0
- codebase_index/graph/navigate.py +201 -0
- codebase_index/indexer/__init__.py +8 -0
- codebase_index/indexer/doc_chunks.py +202 -0
- codebase_index/indexer/freshness.py +109 -0
- codebase_index/indexer/pipeline.py +423 -0
- codebase_index/mcp/__init__.py +2 -0
- codebase_index/mcp/server.py +354 -0
- codebase_index/models.py +145 -0
- codebase_index/output/__init__.py +6 -0
- codebase_index/output/json.py +13 -0
- codebase_index/output/markdown.py +316 -0
- codebase_index/output/redact.py +31 -0
- codebase_index/parsers/__init__.py +9 -0
- codebase_index/parsers/base.py +47 -0
- codebase_index/parsers/languages.py +290 -0
- codebase_index/parsers/line_chunker.py +39 -0
- codebase_index/parsers/symbol_chunks.py +62 -0
- codebase_index/parsers/treesitter.py +439 -0
- codebase_index/retrieval/__init__.py +9 -0
- codebase_index/retrieval/budget.py +82 -0
- codebase_index/retrieval/fusion.py +62 -0
- codebase_index/retrieval/intent.py +56 -0
- codebase_index/retrieval/pipeline.py +207 -0
- codebase_index/retrieval/rerank.py +69 -0
- codebase_index/retrieval/searchers.py +291 -0
- codebase_index/retrieval/skeleton.py +251 -0
- codebase_index/retrieval/types.py +79 -0
- codebase_index/scaffold.py +399 -0
- codebase_index/service.py +158 -0
- codebase_index/skill_template/SKILL.md +198 -0
- codebase_index/skill_template/examples/hooks/settings.json +16 -0
- codebase_index/skill_template/scripts/cbx +25 -0
- codebase_index/skill_template/scripts/cbx.ps1 +25 -0
- codebase_index/skill_update.py +150 -0
- codebase_index/storage/__init__.py +8 -0
- codebase_index/storage/db.py +116 -0
- codebase_index/storage/repo.py +701 -0
- codebase_index/storage/schema.sql +125 -0
- codebase_index/watch/__init__.py +5 -0
- codebase_index/watch/watcher.py +93 -0
- codebase_index-1.6.0.dist-info/METADATA +748 -0
- codebase_index-1.6.0.dist-info/RECORD +64 -0
- codebase_index-1.6.0.dist-info/WHEEL +4 -0
- codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
- codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Fallback chunker: overlapping fixed-size line windows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import Chunk
|
|
6
|
+
|
|
7
|
+
_CHARS_PER_TOKEN = 4


def estimate_tokens(text: str) -> int:
    """Approximate the token count of ``text`` from its character length.

    The length is divided by a fixed characters-per-token ratio and rounded;
    the result is floored at 1, so even an empty string costs one token.
    """
    approx = round(len(text) / _CHARS_PER_TOKEN)
    return approx if approx > 1 else 1
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def chunk_text(text: str, *, window_lines: int, overlap_lines: int) -> list[Chunk]:
    """Split ``text`` into overlapping fixed-size windows of lines.

    Args:
        text: Source text; empty or whitespace-only text yields no chunks.
        window_lines: Maximum number of lines per window. Values below 1 are
            clamped to 1 so every window carries at least one line.
        overlap_lines: Number of lines shared by consecutive windows. Clamped
            to ``0 .. window_lines - 1`` so the stride is always between 1 and
            ``window_lines`` (no stalled loop, no silently skipped lines).

    Returns:
        ``Chunk`` objects of kind ``"window"`` with 1-based inclusive line
        ranges, in file order. The last window ends at the final line.
    """
    if not text or not text.strip():
        return []

    # Sanitize the window geometry: a non-positive window would emit empty
    # chunks, and a negative overlap would make the stride jump over lines.
    window_lines = max(1, window_lines)
    overlap_lines = min(max(0, overlap_lines), window_lines - 1)
    stride = window_lines - overlap_lines

    lines = text.splitlines()
    total = len(lines)
    chunks: list[Chunk] = []
    for start in range(0, total, stride):
        end = min(start + window_lines, total)
        body = "\n".join(lines[start:end])
        chunks.append(
            Chunk(
                line_start=start + 1,
                line_end=end,
                content=body,
                token_est=estimate_tokens(body),
                kind="window",
            )
        )
        # Stop once a window reaches the end, so no trailing window is emitted
        # that only repeats the overlap.
        if end >= total:
            break
    return chunks
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Symbol-aligned chunking with fallback line windows."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import Chunk, Symbol
|
|
6
|
+
from .line_chunker import chunk_text, estimate_tokens
|
|
7
|
+
|
|
8
|
+
# Window size (in lines) used when chunking the text that lies between
# top-level symbols.
_GAP_WINDOW = 80
# Gap windows are emitted back to back, without shared lines.
_GAP_OVERLAP = 0
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def build_chunks(text: str, symbols: list[Symbol]) -> list[Chunk]:
    """Chunk ``text`` along top-level symbol boundaries.

    Each symbol without a parent becomes one ``"symbol_body"`` chunk covering
    its line range; the text before, between and after those symbols is cut
    into ``"window"`` chunks. Without any symbols the whole text falls back to
    80-line windows with a 10-line overlap.

    Args:
        text: Full file content.
        symbols: Symbols extracted for the file; ``symbol_index`` on the
            produced chunks refers to positions in this list.

    Returns:
        Chunks in file order; empty when ``text`` is blank.

    NOTE(review): ``str.splitlines`` also breaks on form feed, ``\\r`` and
    U+2028/U+2029. If symbol line numbers come from a parser that only counts
    ``\\n``, ranges could drift in files containing those characters — confirm.
    """
    if not text.strip():
        return []
    if not symbols:
        return chunk_text(text, window_lines=80, overlap_lines=10)

    lines = text.splitlines()
    # Carry each symbol's position along instead of calling symbols.index()
    # later: index() is a linear scan per symbol and is equality-based, so it
    # would report the first of two equal symbols rather than the actual one.
    top = sorted(
        ((idx, sym) for idx, sym in enumerate(symbols) if sym.parent_index is None),
        key=lambda pair: pair[1].line_start,
    )

    chunks: list[Chunk] = []
    cursor = 1  # next 1-based line not yet covered by a symbol chunk
    for symbol_index, symbol in top:
        if symbol.line_start > cursor:
            chunks.extend(_gap(lines, cursor, symbol.line_start - 1))
        body = "\n".join(lines[symbol.line_start - 1 : symbol.line_end])
        chunks.append(
            Chunk(
                line_start=symbol.line_start,
                line_end=symbol.line_end,
                content=body,
                token_est=estimate_tokens(body),
                kind="symbol_body",
                symbol_index=symbol_index,
            )
        )
        # max() keeps the cursor from moving backwards when top-level symbols
        # overlap or nest in line terms.
        cursor = max(cursor, symbol.line_end + 1)
    if cursor <= len(lines):
        chunks.extend(_gap(lines, cursor, len(lines)))
    return chunks
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _gap(lines: list[str], start: int, end: int) -> list[Chunk]:
    """Window-chunk the 1-based inclusive line range ``start..end``.

    Returns no chunks when the range is blank. Line numbers of the produced
    chunks are shifted back into whole-file coordinates.
    """
    segment = "\n".join(lines[start - 1 : end])
    if segment.strip() == "":
        return []
    offset = start - 1
    return [
        Chunk(
            line_start=piece.line_start + offset,
            line_end=piece.line_end + offset,
            content=piece.content,
            token_est=piece.token_est,
            kind="window",
        )
        for piece in chunk_text(
            segment, window_lines=_GAP_WINDOW, overlap_lines=_GAP_OVERLAP
        )
    ]
|
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
"""Tree-sitter parsing: text -> symbols, intra-file call edges, and chunks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
from tree_sitter import Parser, Query, QueryCursor
|
|
8
|
+
from tree_sitter_language_pack import get_language
|
|
9
|
+
|
|
10
|
+
from .base import Edge, ParseResult, Symbol
|
|
11
|
+
from .languages import CONTAINER_KINDS, has_grammar, spec_for
|
|
12
|
+
from .symbol_chunks import build_chunks
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class UnsupportedLanguage(Exception):
    """Raised when a language has neither a tuned spec nor a loadable grammar."""
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_file(lang: str, text: str) -> ParseResult:
    """Parse ``text`` as ``lang`` and return its chunks, symbols and edges.

    A language with a hand-tuned spec gets spec-driven symbol extraction plus
    graph edges from the spec's query; a language that only has a loadable
    grammar gets the generic symbol harvest. Call edges are extracted in both
    cases.

    Raises:
        UnsupportedLanguage: no spec exists and no grammar can be loaded.
        ValueError: the parser returned no tree.
    """
    spec = spec_for(lang)
    tuned = spec is not None
    # Only give up when there is no grammar at all, so any language with a
    # grammar still produces symbols instead of silently falling back.
    if not tuned and not has_grammar(lang):
        raise UnsupportedLanguage(lang)

    grammar = get_language(spec.ts_name if tuned else lang)
    parser = Parser(grammar)
    source = text.encode("utf-8")
    tree = parser.parse(source)
    if tree is None:
        raise ValueError("tree-sitter parser returned no tree")
    root = tree.root_node

    if tuned:
        symbols = _extract_symbols(root, lang, source)
        edges = _extract_edges(root, symbols, source)
        edges.extend(_extract_graph_edges(spec, grammar, root, symbols))
    else:
        symbols = _extract_symbols_generic(root, source)
        edges = _extract_edges(root, symbols, source)
    del grammar
    return ParseResult(chunks=build_chunks(text, symbols), symbols=symbols, edges=edges)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _row(point) -> int:
|
|
48
|
+
return point.row if hasattr(point, "row") else point[0]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _text(node) -> str:
|
|
52
|
+
raw = getattr(node, "text", None)
|
|
53
|
+
if callable(raw):
|
|
54
|
+
raw = raw()
|
|
55
|
+
if isinstance(raw, bytes):
|
|
56
|
+
return raw.decode("utf-8", errors="ignore")
|
|
57
|
+
return raw if isinstance(raw, str) else ""
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class _Sym:
|
|
61
|
+
__slots__ = ("symbol", "start_byte", "end_byte")
|
|
62
|
+
|
|
63
|
+
def __init__(self, symbol: Symbol, def_node) -> None:
|
|
64
|
+
self.symbol = symbol
|
|
65
|
+
self.start_byte, self.end_byte = _byte_range(def_node)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _extract_symbols(root, lang: str, source: bytes) -> list[Symbol]:
    """Collect symbols for a language that has a hand-tuned spec.

    Every node that maps to a definition kind and has a name node becomes a
    ``Symbol``. Symbols are ordered by start byte (widest span first on ties),
    then linked to their tightest enclosing symbol: ``parent_index`` points
    into the returned list, a function inside a container kind is relabelled
    ``"method"``, and ``qualified`` is the dotted path through the parents.
    """
    is_python = lang == "python"
    found: list[_Sym] = []
    for node in _walk(root):
        kind = _definition_kind(node, lang)
        if kind is None:
            continue
        ident = _name_node(node)
        if ident is None:
            continue
        symbol = Symbol(
            name=_node_text(ident, source),
            kind=kind,
            line_start=_row(_start_point(node)) + 1,
            line_end=_row(_end_point(node)) + 1,
            signature=_signature(node, source),
            docstring=_python_docstring(node, source) if is_python else None,
        )
        found.append(_Sym(symbol, node))

    # Parents sort before their children, so a parent's qualified name is
    # already final when a child reads it below.
    found.sort(key=lambda entry: (entry.start_byte, entry.start_byte - entry.end_byte))
    position = {id(entry): index for index, entry in enumerate(found)}
    for entry in found:
        owner = _enclosing(found, entry)
        sym = entry.symbol
        if owner is None:
            sym.qualified = sym.name
            continue
        sym.parent_index = position[id(owner)]
        if sym.kind == "function" and owner.symbol.kind in CONTAINER_KINDS:
            sym.kind = "method"
        sym.qualified = f"{owner.symbol.qualified or owner.symbol.name}.{sym.name}"
    return [entry.symbol for entry in found]
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# Maps a tree-sitter node `type` to a coarse symbol kind. Node types are largely unique across
# grammars, so a single table covers Java/Go/Rust/C/C++/C#/Ruby/PHP/Kotlin/JS/TS at once.
# Python is resolved before this table is consulted, and Go `type_spec` nodes are refined
# separately from their underlying type (see `_definition_kind`).
_DEF_KINDS: dict[str, str] = {
    # functions
    "function_declaration": "function",  # go, kotlin, js
    "function_definition": "function",  # c, cpp, php (python handled separately below)
    "function_item": "function",  # rust
    "function_signature_item": "function",  # rust trait method signatures
    # methods
    "method_declaration": "method",  # go, java, csharp
    "method_definition": "method",  # js/ts
    "constructor_declaration": "method",  # java, csharp
    "method": "method",  # ruby
    # classes / type-like containers
    "class_declaration": "class",  # java, csharp, php, kotlin, js/ts
    "class_specifier": "class",  # cpp
    "class": "class",  # ruby
    "object_declaration": "class",  # kotlin
    "record_declaration": "record",  # java
    "struct_item": "struct",  # rust
    "struct_specifier": "struct",  # c, cpp
    "struct_declaration": "struct",  # csharp
    "interface_declaration": "interface",  # java, csharp, php, ts
    "trait_item": "trait",  # rust
    "trait_declaration": "trait",  # php
    "enum_declaration": "enum",  # java, csharp, ts
    "enum_item": "enum",  # rust
    "enum_specifier": "enum",  # c/cpp
    "impl_item": "impl",  # rust
    # modules / namespaces (NOT containers — a function inside stays a function, not a method)
    "mod_item": "module",  # rust
    "module": "module",  # ruby
    "namespace_definition": "module",  # cpp
    "namespace_declaration": "module",  # csharp
    "type_alias_declaration": "type",  # ts
}
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _definition_kind(node, lang: str) -> Optional[str]:
    """Return the symbol kind ``node`` defines, or ``None`` if it defines nothing.

    Python only recognises function and class definitions. For other
    languages a ``type_spec`` is refined from its underlying type, then the
    shared ``_DEF_KINDS`` table is consulted, and finally a
    ``variable_declarator`` whose value is an arrow function or function
    expression counts as a function.
    """
    node_type = _kind(node)
    if lang == "python":
        return {"function_definition": "function", "class_definition": "class"}.get(node_type)

    if node_type == "type_spec":  # go: refine struct/interface from the underlying type
        underlying = _field(node, "type")
        underlying_type = "" if underlying is None else _kind(underlying)
        return {"struct_type": "struct", "interface_type": "interface"}.get(
            underlying_type, "type"
        )

    mapped = _DEF_KINDS.get(node_type)
    if mapped is not None:
        return mapped

    if node_type != "variable_declarator":
        return None
    value = _field(node, "value")
    if value is None:
        return None
    return "function" if _kind(value) in {"arrow_function", "function_expression"} else None
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Identifier-like node types that can carry a definition's name across grammars.
# Used by `_name_node` as a last resort: the first named child with one of these
# types is taken as the name when no dedicated field identifies it.
_NAME_NODE_TYPES = {
    "identifier",
    "type_identifier",
    "field_identifier",
    "property_identifier",
    "simple_identifier",
    "constant",
    "name",
    "namespace_identifier",
}
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _name_node(def_node):
    """Locate the node holding a definition's name, or ``None`` if there is none.

    Resolution order: the ``name`` field; for Rust ``impl_item`` the ``type``
    field; for C/C++ ``function_definition`` the identifier nested in the
    declarator chain; otherwise the first named child whose type is in
    ``_NAME_NODE_TYPES`` (covers grammars without fields, e.g. Kotlin).
    """
    direct = _field(def_node, "name")
    if direct is not None:
        return direct

    node_type = _kind(def_node)
    if node_type == "impl_item":
        return _field(def_node, "type")
    if node_type == "function_definition":  # c / cpp: descend the declarator chain
        declarator = _field(def_node, "declarator")
        if declarator is None:
            return None
        return _declarator_identifier(declarator)

    return next(
        (child for child in _named_children(def_node) if _kind(child) in _NAME_NODE_TYPES),
        None,
    )
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def _declarator_identifier(node):
    """Find the identifier inside a (possibly nested) declarator.

    Follows the ``declarator`` field when present; otherwise searches the
    named children depth-first and returns the first identifier found.
    Returns ``None`` when ``node`` is ``None`` or holds no identifier.
    """
    if node is None:
        return None
    if _kind(node) in {"identifier", "field_identifier"}:
        return node

    nested = _field(node, "declarator")
    if nested is not None:
        return _declarator_identifier(nested)

    # Lazy generator: children after the first hit are never visited.
    hits = (_declarator_identifier(child) for child in _named_children(node))
    return next((hit for hit in hits if hit is not None), None)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _extract_symbols_generic(root, source: bytes) -> list[Symbol]:
    """Tier B: harvest definition-like nodes from an untuned grammar.

    A node qualifies when its type ends in ``declaration``, ``definition``,
    ``_item`` or ``_specifier`` and a name node can be found for it; its kind
    is a coarse keyword mapping of the node type. Parent linking, method
    relabelling and qualified names follow the same rules as the tuned path.
    """
    suffixes = ("declaration", "definition", "_item", "_specifier")
    found: list[_Sym] = []
    for node in _walk(root):
        node_type = _kind(node)
        if not node_type.endswith(suffixes):
            continue
        ident = _name_node(node)
        if ident is None:
            continue
        symbol = Symbol(
            name=_node_text(ident, source),
            kind=_generic_kind(node_type),
            line_start=_row(_start_point(node)) + 1,
            line_end=_row(_end_point(node)) + 1,
            signature=_signature(node, source),
        )
        found.append(_Sym(symbol, node))

    # Parents sort before their children, so a parent's qualified name is
    # already final when a child reads it below.
    found.sort(key=lambda entry: (entry.start_byte, entry.start_byte - entry.end_byte))
    position = {id(entry): index for index, entry in enumerate(found)}
    for entry in found:
        owner = _enclosing(found, entry)
        sym = entry.symbol
        if owner is None:
            sym.qualified = sym.name
            continue
        sym.parent_index = position[id(owner)]
        if sym.kind == "function" and owner.symbol.kind in CONTAINER_KINDS:
            sym.kind = "method"
        sym.qualified = f"{owner.symbol.qualified or owner.symbol.name}.{sym.name}"
    return [entry.symbol for entry in found]
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def _generic_kind(ntype: str) -> str:
|
|
259
|
+
low = ntype.lower()
|
|
260
|
+
for key in ("class", "struct", "enum", "interface", "trait", "module", "namespace"):
|
|
261
|
+
if key in low:
|
|
262
|
+
return "struct" if key == "struct" else key
|
|
263
|
+
return "function"
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _signature(def_node, source: bytes) -> str:
    """Return the first line of a definition as its signature.

    The line is stripped of surrounding whitespace and of any trailing ``{``.
    A node with empty text yields ``""`` instead of raising ``IndexError``
    (``"".splitlines()`` is an empty list).
    """
    first_line = next(iter(_node_text(def_node, source).splitlines()), "")
    return first_line.strip().rstrip("{").strip()
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _python_docstring(def_node, source: bytes) -> Optional[str]:
    """Return the docstring of a Python definition, or ``None`` if it has none.

    Only the first statement of the ``body`` field is considered: either a
    bare ``string`` node or an ``expression_statement`` whose first named
    child is a ``string``. ``comment`` nodes ahead of it are skipped, since a
    comment is not a statement (assumes the grammar can expose comments as
    named children of the body — confirm for the grammar in use).
    """
    body = _field(def_node, "body")
    if body is None:
        return None
    for stmt in _named_children(body):
        stmt_kind = _kind(stmt)
        if stmt_kind == "comment":
            continue
        literal = None
        if stmt_kind == "string":
            literal = stmt
        elif stmt_kind == "expression_statement":
            children = _named_children(stmt)
            if children and _kind(children[0]) == "string":
                literal = children[0]
        if literal is None:
            # The first real statement is not a string: no docstring.
            return None
        return _strip_string_literal(_node_text(literal, source))
    return None


def _strip_string_literal(raw: str) -> str:
    """Strip an optional r/u prefix and the surrounding quotes from a string literal."""
    text = raw.strip()
    # Drop the prefix only when a quote follows it, so plain content that
    # happens to start with r/u is left alone.
    unprefixed = text.lstrip("rRuU")
    if unprefixed[:1] in ('"', "'"):
        text = unprefixed
    return text.strip('"').strip("'").strip()
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _enclosing(raw: list[_Sym], child: _Sym) -> Optional[_Sym]:
|
|
286
|
+
best: Optional[_Sym] = None
|
|
287
|
+
for other in raw:
|
|
288
|
+
if other is child:
|
|
289
|
+
continue
|
|
290
|
+
if other.start_byte <= child.start_byte and other.end_byte >= child.end_byte:
|
|
291
|
+
other_span = other.end_byte - other.start_byte
|
|
292
|
+
child_span = child.end_byte - child.start_byte
|
|
293
|
+
if other_span <= child_span:
|
|
294
|
+
continue
|
|
295
|
+
if best is None or other_span < best.end_byte - best.start_byte:
|
|
296
|
+
best = other
|
|
297
|
+
return best
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def _extract_edges(root, symbols: list[Symbol], source: bytes) -> list[Edge]:
    """Build one ``"call"`` edge per call-like node found under ``root``.

    The edge records the callee's text, the line of the callee node, and the
    index of the narrowest symbol whose line range contains that line
    (``None`` when the call is outside every symbol).
    """
    calls: list[Edge] = []
    for node in _walk(root):
        target = _callee_node(node)
        if target is None:
            continue
        line_no = _row(_start_point(target)) + 1
        edge = Edge(
            edge_type="call",
            callee_name=_node_text(target, source),
            line=line_no,
            src_symbol_index=_enclosing_symbol_index(symbols, line_no),
        )
        calls.append(edge)
    return calls
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
_EDGE_PREFIXES = {"import.": "import", "extends.": "extends", "implements.": "implements"}


def _extract_graph_edges(spec, grammar, root, symbols) -> list[Edge]:
    """Run the spec's imports query and turn its captures into edges.

    A capture whose name starts with one of the ``_EDGE_PREFIXES`` keys yields
    an edge of the mapped type for each captured node; other captures are
    ignored. Import edges carry no source symbol; the other edge types are
    attributed to the narrowest symbol containing the node's line. Returns an
    empty list when the spec has no imports query.
    """
    if not spec.imports_query:
        return []
    query = Query(grammar, spec.imports_query)
    cursor = QueryCursor(query)

    edges: list[Edge] = []
    for _pattern_idx, captures in cursor.matches(root):
        for capture_name, nodes in captures.items():
            # The edge type depends only on the capture name, so resolve it
            # once per capture rather than once per node.
            edge_type = None
            for prefix, mapped in _EDGE_PREFIXES.items():
                if capture_name.startswith(prefix):
                    edge_type = mapped
                    break
            if edge_type is None:
                continue
            for node in nodes:
                line = _row(node.start_point) + 1
                if edge_type == "import":
                    src_idx = None
                else:
                    src_idx = _enclosing_symbol_index(symbols, line)
                edges.append(
                    Edge(
                        edge_type=edge_type,
                        callee_name=_text(node).strip().strip('"').strip("'"),
                        line=line,
                        src_symbol_index=src_idx,
                    )
                )
    return edges
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _enclosing_symbol_index(symbols: list[Symbol], line: int) -> Optional[int]:
|
|
348
|
+
best_idx: Optional[int] = None
|
|
349
|
+
best_span: Optional[int] = None
|
|
350
|
+
for idx, symbol in enumerate(symbols):
|
|
351
|
+
if symbol.line_start <= line <= symbol.line_end:
|
|
352
|
+
span = symbol.line_end - symbol.line_start
|
|
353
|
+
if best_span is None or span < best_span:
|
|
354
|
+
best_idx = idx
|
|
355
|
+
best_span = span
|
|
356
|
+
return best_idx
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
_CALLEE_LEAVES = {"identifier", "property_identifier", "field_identifier", "simple_identifier"}


def _callee_node(node):
    """Return the identifier node naming what ``node`` calls, or ``None``.

    Java ``method_invocation`` and Rust ``macro_invocation`` expose the callee
    through a dedicated field. For the generic call node types the callee is
    the ``function`` (or Ruby's ``method``) field itself when it is a plain
    identifier, otherwise the trailing identifier of a member/selector/scoped
    access. Anything else is not a call this function can name.
    """
    kind = _kind(node)
    direct_field = {
        "method_invocation": "name",  # java: obj.method(...) / method(...)
        "macro_invocation": "macro",  # rust: name!(...)
    }
    if kind in direct_field:
        return _field(node, direct_field[kind])
    if kind not in {"call", "call_expression", "invocation_expression", "function_call_expression"}:
        return None

    # ruby `call` uses field "method"; everything else uses field "function".
    fn = _field(node, "function") or _field(node, "method")
    if fn is None:
        return None
    if _kind(fn) in _CALLEE_LEAVES:
        return fn

    # member / selector / scoped / field access: take the trailing identifier.
    attr = None
    for field_name in ("attribute", "property", "field", "name"):
        attr = _field(fn, field_name)
        if attr:
            break
    if attr is not None and _kind(attr) in _CALLEE_LEAVES:
        return attr
    return None
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def _kind(node) -> str:
|
|
389
|
+
value = getattr(node, "type", None)
|
|
390
|
+
if value is None:
|
|
391
|
+
value = getattr(node, "kind", None)
|
|
392
|
+
resolved = value() if callable(value) else value
|
|
393
|
+
return resolved if isinstance(resolved, str) else ""
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def _field(node, name: str):
|
|
397
|
+
return node.child_by_field_name(name)
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _start_point(node):
|
|
401
|
+
value = getattr(node, "start_point", None)
|
|
402
|
+
if value is None:
|
|
403
|
+
value = getattr(node, "start_position", None)
|
|
404
|
+
return value() if callable(value) else value
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def _end_point(node):
|
|
408
|
+
value = getattr(node, "end_point", None)
|
|
409
|
+
if value is None:
|
|
410
|
+
value = getattr(node, "end_position", None)
|
|
411
|
+
return value() if callable(value) else value
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _byte_range(node) -> tuple[int, int]:
|
|
415
|
+
start = getattr(node, "start_byte", None)
|
|
416
|
+
end = getattr(node, "end_byte", None)
|
|
417
|
+
if start is not None and end is not None:
|
|
418
|
+
return (start() if callable(start) else start, end() if callable(end) else end)
|
|
419
|
+
br = node.byte_range()
|
|
420
|
+
return br.start, br.end
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def _node_text(node, source: bytes) -> str:
    """Decode the slice of ``source`` covered by ``node``.

    The bytes are decoded as UTF-8 and undecodable bytes are dropped.
    """
    first, last = _byte_range(node)
    covered = source[first:last]
    return covered.decode("utf-8", errors="ignore")
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def _named_children(node) -> list[object]:
|
|
429
|
+
children = getattr(node, "named_children", None)
|
|
430
|
+
if children is not None:
|
|
431
|
+
return list(children() if callable(children) else children)
|
|
432
|
+
count = node.named_child_count() if callable(node.named_child_count) else node.named_child_count
|
|
433
|
+
return [node.named_child(i) for i in range(count)]
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def _walk(node):
    """Yield ``node`` and all of its named descendants in pre-order.

    Uses an explicit stack instead of recursion, so a deeply nested tree
    (for example a very long chained expression) cannot exhaust Python's
    recursion limit. The visiting order matches a recursive depth-first walk:
    a node first, then each named child's subtree from first to last.
    """
    pending = [node]
    while pending:
        current = pending.pop()
        yield current
        # Push children reversed so the first child is popped (visited) next.
        pending.extend(reversed(_named_children(current)))
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Hybrid retrieval engine. See docs/RETRIEVAL.md for the full pipeline.
|
|
2
|
+
|
|
3
|
+
intent.py : classify the query into an Intent + retriever weights + graph strategy.
|
|
4
|
+
searchers.py : path / symbol / fts / vector searchers -> uniform Candidate lists.
|
|
5
|
+
fusion.py : Reciprocal Rank Fusion across retriever lists (rrf_k, per-intent weights).
|
|
6
|
+
rerank.py : feature-based reordering (symbol-kind, path proximity, centrality, recency) +
|
|
7
|
+
produces the human-readable `reason` per result.
|
|
8
|
+
budget.py : greedy token-budgeted assembly of snippets vs. recommended_reads; secret redaction.
|
|
9
|
+
"""
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Greedy token budgeting (RETRIEVAL.md §6).
|
|
2
|
+
|
|
3
|
+
Metadata for every result is always emitted (cheap). Snippets are attached to the
|
|
4
|
+
highest-ranked results until the budget is hit; the remainder become
|
|
5
|
+
recommended_reads. All snippet text is secret-redacted before emission.
|
|
6
|
+
|
|
7
|
+
A result is added to recommended_reads when:
|
|
8
|
+
- it has no snippet (budget exceeded or no content), OR
|
|
9
|
+
- its snippet is below _MIN_USEFUL_TOKENS (e.g. a bare function signature).
|
|
10
|
+
Claude still gets the short preview but also receives the read plan.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Callable, Optional
|
|
16
|
+
|
|
17
|
+
from ..output.redact import redact_snippet
|
|
18
|
+
from .skeleton import Compacted
|
|
19
|
+
from .types import Candidate
|
|
20
|
+
|
|
21
|
+
# Snippets whose token cost is below this threshold are treated as previews
# only: the snippet is still emitted, but the result is also added to
# recommended_reads so Claude knows where to read the full body.
_MIN_USEFUL_TOKENS = 40
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _meta(c: Candidate) -> dict:
|
|
27
|
+
return {
|
|
28
|
+
"path": c.path,
|
|
29
|
+
"line_start": c.line_start,
|
|
30
|
+
"line_end": c.line_end,
|
|
31
|
+
"symbols": [c.symbol] if c.symbol else [],
|
|
32
|
+
"score": round(c.score, 4),
|
|
33
|
+
"reason": c.reason if c.reason else c.source,
|
|
34
|
+
"token_est": c.token_est,
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def apply_budget(
    candidates: list[Candidate],
    *,
    token_budget: int,
    compactor: Optional[Callable[[Candidate], Compacted]] = None,
) -> tuple[list[dict], list[dict]]:
    """Attach snippets to ranked candidates until the token budget is spent.

    Every candidate yields a metadata dict with its 1-based ``rank``. A
    snippet (secret-redacted) is attached when the candidate has text and its
    cost still fits in the remaining budget; candidates that do not fit are
    skipped for snippets, and later, cheaper ones may still fit. When a
    ``compactor`` returns a skeletonized form, its text and cost replace the
    raw content.

    Returns:
        ``(results, recommended)``: the metadata dicts in rank order, and the
        locations to read in full — every candidate without a snippet or whose
        snippet costs fewer than ``_MIN_USEFUL_TOKENS`` tokens.
    """
    results: list[dict] = []
    recommended: list[dict] = []
    spent = 0

    for rank, cand in enumerate(candidates, start=1):
        entry = _meta(cand)
        entry.update(rank=rank, skeletonized=False, elided_lines=0)

        # Default to the raw content; only a real skeleton overrides it.
        text, cost = cand.content, cand.token_est
        if compactor is not None and cand.content:
            compacted = compactor(cand)
            if compacted.skeletonized:
                text, cost = compacted.text, compacted.token_est
                entry["skeletonized"] = True
                entry["elided_lines"] = compacted.elided_lines

        fits = bool(text) and spent + cost <= token_budget
        snippet = None
        if fits:
            snippet = redact_snippet(text)
            spent += cost
            entry["token_est"] = cost

        # No snippet, or one too short to be useful: point at the full range.
        if not fits or cost < _MIN_USEFUL_TOKENS:
            recommended.append(
                {"path": cand.path, "line_start": cand.line_start, "line_end": cand.line_end}
            )
        entry["snippet"] = snippet
        results.append(entry)

    return results, recommended
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Reciprocal Rank Fusion across per-source ranked candidate lists.
|
|
2
|
+
|
|
3
|
+
RRF(d) = Σ_r w_r · k / (k + rank_r(d)) — robust to incomparable raw scores.
|
|
4
|
+
|
|
5
|
+
Two deliberate departures from the textbook formula:
|
|
6
|
+
|
|
7
|
+
* Scaled by k. Raw RRF tops out at w/k (≈0.017 for k=60), an order of magnitude
|
|
8
|
+
below the bounded bonuses the reranker layers on top, so rerank would silently
|
|
9
|
+
become the primary ranker and RRF a mere tiebreak. Multiplying by k is a pure
|
|
10
|
+
monotonic rescale (fusion order is identical) that lifts the top contribution to
|
|
11
|
+
≈w, putting fused scores and rerank bonuses on the same O(1) scale.
|
|
12
|
+
* Fused on a coarse (path, line-bucket) key, not (path, start, end). Different
|
|
13
|
+
retrievers report different line ranges for the same place; an exact key almost
|
|
14
|
+
never coincides across sources, so cross-source agreement — RRF's whole point —
|
|
15
|
+
would never fire. `agreeing_sources` is therefore counted at file granularity.
|
|
16
|
+
|
|
17
|
+
On merge, the candidate carrying the most signal (symbol > fts = vector > path) is kept as
|
|
18
|
+
the representative so downstream rerank/snippet logic has the richest fields.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from dataclasses import replace as _replace
|
|
24
|
+
|
|
25
|
+
from .types import Candidate
|
|
26
|
+
|
|
27
|
+
_SOURCE_RICHNESS = {"symbol": 3, "fts": 2, "vector": 2, "path": 1}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _richer(a: Candidate, b: Candidate) -> Candidate:
|
|
31
|
+
return a if _SOURCE_RICHNESS.get(a.source, 0) >= _SOURCE_RICHNESS.get(b.source, 0) else b
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def fuse(
    lists: dict[str, list[Candidate]],
    *,
    weights: dict[str, float],
    k: int,
) -> list[Candidate]:
    """Fuse per-source ranked lists into one list ordered by fused score.

    Each source with a positive weight contributes ``w * k / (k + rank)`` per
    fuse key, where ``rank`` is the 0-based position in that source's list and
    only the first occurrence of a key within a source counts. The returned
    candidates are the richest representative per key, carrying the fused
    ``score`` and ``agreeing_sources`` set to the number of contributing
    sources that matched the candidate's file.
    """
    scores: dict[tuple, float] = {}
    representative: dict[tuple, Candidate] = {}
    counted: set[tuple] = set()
    sources_by_path: dict[str, set[str]] = {}

    for source, candidates in lists.items():
        weight = weights.get(source, 0.0)
        if weight <= 0.0:
            continue
        for rank, cand in enumerate(candidates):
            sources_by_path.setdefault(cand.path, set()).add(source)
            key = cand.fuse_key()
            # One contribution per source per locator: several hits from the
            # same source in the same bucket are one signal, not many.
            marker = (source, key)
            if marker in counted:
                continue
            counted.add(marker)
            scores[key] = scores.get(key, 0.0) + weight * k / (k + rank)
            previous = representative.get(key)
            representative[key] = cand if previous is None else _richer(previous, cand)

    fused = sorted(
        (_replace(representative[key], score=total) for key, total in scores.items()),
        key=lambda item: item.score,
        reverse=True,
    )
    return [_replace(item, agreeing_sources=len(sources_by_path[item.path])) for item in fused]
|