codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. codebase_index/__init__.py +7 -0
  2. codebase_index/__main__.py +3 -0
  3. codebase_index/cli.py +916 -0
  4. codebase_index/config.py +110 -0
  5. codebase_index/discovery/__init__.py +10 -0
  6. codebase_index/discovery/classify.py +151 -0
  7. codebase_index/discovery/ignore.py +58 -0
  8. codebase_index/discovery/walker.py +75 -0
  9. codebase_index/doctor.py +138 -0
  10. codebase_index/embeddings/__init__.py +2 -0
  11. codebase_index/embeddings/backend.py +67 -0
  12. codebase_index/embeddings/external.py +56 -0
  13. codebase_index/embeddings/local.py +41 -0
  14. codebase_index/embeddings/noop.py +15 -0
  15. codebase_index/graph/__init__.py +8 -0
  16. codebase_index/graph/analysis.py +468 -0
  17. codebase_index/graph/builder.py +160 -0
  18. codebase_index/graph/expand.py +136 -0
  19. codebase_index/graph/export.py +381 -0
  20. codebase_index/graph/navigate.py +201 -0
  21. codebase_index/indexer/__init__.py +8 -0
  22. codebase_index/indexer/doc_chunks.py +202 -0
  23. codebase_index/indexer/freshness.py +109 -0
  24. codebase_index/indexer/pipeline.py +423 -0
  25. codebase_index/mcp/__init__.py +2 -0
  26. codebase_index/mcp/server.py +354 -0
  27. codebase_index/models.py +145 -0
  28. codebase_index/output/__init__.py +6 -0
  29. codebase_index/output/json.py +13 -0
  30. codebase_index/output/markdown.py +316 -0
  31. codebase_index/output/redact.py +31 -0
  32. codebase_index/parsers/__init__.py +9 -0
  33. codebase_index/parsers/base.py +47 -0
  34. codebase_index/parsers/languages.py +290 -0
  35. codebase_index/parsers/line_chunker.py +39 -0
  36. codebase_index/parsers/symbol_chunks.py +62 -0
  37. codebase_index/parsers/treesitter.py +439 -0
  38. codebase_index/retrieval/__init__.py +9 -0
  39. codebase_index/retrieval/budget.py +82 -0
  40. codebase_index/retrieval/fusion.py +62 -0
  41. codebase_index/retrieval/intent.py +56 -0
  42. codebase_index/retrieval/pipeline.py +207 -0
  43. codebase_index/retrieval/rerank.py +69 -0
  44. codebase_index/retrieval/searchers.py +291 -0
  45. codebase_index/retrieval/skeleton.py +251 -0
  46. codebase_index/retrieval/types.py +79 -0
  47. codebase_index/scaffold.py +399 -0
  48. codebase_index/service.py +158 -0
  49. codebase_index/skill_template/SKILL.md +198 -0
  50. codebase_index/skill_template/examples/hooks/settings.json +16 -0
  51. codebase_index/skill_template/scripts/cbx +25 -0
  52. codebase_index/skill_template/scripts/cbx.ps1 +25 -0
  53. codebase_index/skill_update.py +150 -0
  54. codebase_index/storage/__init__.py +8 -0
  55. codebase_index/storage/db.py +116 -0
  56. codebase_index/storage/repo.py +701 -0
  57. codebase_index/storage/schema.sql +125 -0
  58. codebase_index/watch/__init__.py +5 -0
  59. codebase_index/watch/watcher.py +93 -0
  60. codebase_index-1.6.0.dist-info/METADATA +748 -0
  61. codebase_index-1.6.0.dist-info/RECORD +64 -0
  62. codebase_index-1.6.0.dist-info/WHEEL +4 -0
  63. codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
  64. codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,39 @@
1
+ """Fallback chunker: overlapping fixed-size line windows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base import Chunk
6
+
7
# Heuristic: how many characters are assumed to make up one token.
_CHARS_PER_TOKEN = 4


def estimate_tokens(text: str) -> int:
    """Return a rough token count for ``text``.

    The estimate is the character count divided by ``_CHARS_PER_TOKEN``,
    rounded, and never lower than 1 (so even an empty string costs one token).
    """
    approx = round(len(text) / _CHARS_PER_TOKEN)
    return approx if approx > 1 else 1
12
+
13
+
14
def chunk_text(text: str, *, window_lines: int, overlap_lines: int) -> list[Chunk]:
    """Split ``text`` into fixed-size, optionally overlapping line windows.

    Args:
        text: Source text; blank or whitespace-only text yields no chunks.
        window_lines: Maximum number of lines per chunk. Must be at least 1.
        overlap_lines: Number of lines shared between consecutive chunks. It is
            clamped into ``[0, window_lines - 1]`` so the window always advances
            and never skips lines.

    Returns:
        Chunks of kind ``"window"`` with 1-based, inclusive line ranges.

    Raises:
        ValueError: ``window_lines`` is smaller than 1 (such a window would only
            produce empty chunks whose ``line_end`` precedes ``line_start``).
    """
    if not text or not text.strip():
        return []
    if window_lines < 1:
        raise ValueError(f"window_lines must be >= 1, got {window_lines}")

    # A negative overlap would make the stride larger than the window and
    # silently drop lines; an overlap >= window would stall the loop.
    overlap_lines = min(max(overlap_lines, 0), window_lines - 1)
    stride = window_lines - overlap_lines

    lines = text.splitlines()
    total = len(lines)
    chunks: list[Chunk] = []
    for start in range(0, total, stride):
        end = min(start + window_lines, total)
        body = "\n".join(lines[start:end])
        chunks.append(
            Chunk(
                line_start=start + 1,
                line_end=end,
                content=body,
                token_est=estimate_tokens(body),
                kind="window",
            )
        )
        # The window already reached the last line; further strides would only
        # emit chunks fully contained in this one.
        if end >= total:
            break
    return chunks
@@ -0,0 +1,62 @@
1
+ """Symbol-aligned chunking with fallback line windows."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base import Chunk, Symbol
6
+ from .line_chunker import chunk_text, estimate_tokens
7
+
8
# Window settings used for the text that lies between / around symbols.
_GAP_WINDOW = 80
_GAP_OVERLAP = 0


def build_chunks(text: str, symbols: list[Symbol]) -> list[Chunk]:
    """Chunk ``text`` along top-level symbol boundaries.

    Each symbol without a parent becomes one ``"symbol_body"`` chunk covering its
    full line range; text before, between and after those symbols is chunked
    into line windows via ``_gap``. When there are no symbols at all the whole
    text falls back to overlapping 80-line windows.

    Args:
        text: Full file content.
        symbols: Symbols extracted from ``text``; ``symbol_index`` on the
            resulting chunks is the position of the symbol in this list.

    Returns:
        Chunks in source order; empty when ``text`` is blank.
    """
    if not text.strip():
        return []
    if not symbols:
        return chunk_text(text, window_lines=80, overlap_lines=10)

    lines = text.splitlines()
    # Carry each symbol's position along instead of calling symbols.index():
    # index() is an O(n) equality search and returns the first *equal* symbol,
    # which is the wrong index when two symbols compare equal.
    top = sorted(
        (
            (index, symbol)
            for index, symbol in enumerate(symbols)
            if symbol.parent_index is None
        ),
        key=lambda pair: pair[1].line_start,
    )

    chunks: list[Chunk] = []
    cursor = 1  # next 1-based line not yet covered by a chunk
    for symbol_index, symbol in top:
        if symbol.line_start > cursor:
            chunks.extend(_gap(lines, cursor, symbol.line_start - 1))
        body = "\n".join(lines[symbol.line_start - 1 : symbol.line_end])
        chunks.append(
            Chunk(
                line_start=symbol.line_start,
                line_end=symbol.line_end,
                content=body,
                token_est=estimate_tokens(body),
                kind="symbol_body",
                symbol_index=symbol_index,
            )
        )
        # max() keeps the cursor from moving backwards when symbols overlap.
        cursor = max(cursor, symbol.line_end + 1)
    if cursor <= len(lines):
        chunks.extend(_gap(lines, cursor, len(lines)))
    return chunks
45
+
46
+
47
+ def _gap(lines: list[str], start: int, end: int) -> list[Chunk]:
48
+ segment = "\n".join(lines[start - 1 : end])
49
+ if not segment.strip():
50
+ return []
51
+ out: list[Chunk] = []
52
+ for chunk in chunk_text(segment, window_lines=_GAP_WINDOW, overlap_lines=_GAP_OVERLAP):
53
+ out.append(
54
+ Chunk(
55
+ line_start=start + chunk.line_start - 1,
56
+ line_end=start + chunk.line_end - 1,
57
+ content=chunk.content,
58
+ token_est=chunk.token_est,
59
+ kind="window",
60
+ )
61
+ )
62
+ return out
@@ -0,0 +1,439 @@
1
+ """Tree-sitter parsing: text -> symbols, intra-file call edges, and chunks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Optional
6
+
7
+ from tree_sitter import Parser, Query, QueryCursor
8
+ from tree_sitter_language_pack import get_language
9
+
10
+ from .base import Edge, ParseResult, Symbol
11
+ from .languages import CONTAINER_KINDS, has_grammar, spec_for
12
+ from .symbol_chunks import build_chunks
13
+
14
+
15
class UnsupportedLanguage(Exception):
    """Raised by ``parse_file`` when a language has neither a spec nor a grammar."""
17
+
18
+
19
def parse_file(lang: str, text: str) -> ParseResult:
    """Parse ``text`` as ``lang`` and return its chunks, symbols and edges.

    Languages with a hand-tuned spec (Tier A) use the tuned symbol extractor and
    additionally run the spec's graph-edge query. Languages that merely have a
    loadable grammar (Tier B) use the generic extractor, so any language with a
    grammar yields symbols instead of silently falling back.

    Raises:
        UnsupportedLanguage: there is neither a spec nor a grammar for ``lang``.
        ValueError: the tree-sitter parser returned no tree.
    """
    spec = spec_for(lang)
    tuned = spec is not None
    if not tuned and not has_grammar(lang):
        raise UnsupportedLanguage(lang)

    grammar = get_language(spec.ts_name if tuned else lang)
    parser = Parser(grammar)
    source = text.encode("utf-8")
    tree = parser.parse(source)
    if tree is None:
        raise ValueError("tree-sitter parser returned no tree")
    root = tree.root_node

    if tuned:
        symbols = _extract_symbols(root, lang, source)
    else:
        symbols = _extract_symbols_generic(root, source)

    edges = _extract_edges(root, symbols, source)
    if tuned:
        edges.extend(_extract_graph_edges(spec, grammar, root, symbols))
    del grammar

    return ParseResult(chunks=build_chunks(text, symbols), symbols=symbols, edges=edges)
45
+
46
+
47
+ def _row(point) -> int:
48
+ return point.row if hasattr(point, "row") else point[0]
49
+
50
+
51
+ def _text(node) -> str:
52
+ raw = getattr(node, "text", None)
53
+ if callable(raw):
54
+ raw = raw()
55
+ if isinstance(raw, bytes):
56
+ return raw.decode("utf-8", errors="ignore")
57
+ return raw if isinstance(raw, str) else ""
58
+
59
+
60
+ class _Sym:
61
+ __slots__ = ("symbol", "start_byte", "end_byte")
62
+
63
+ def __init__(self, symbol: Symbol, def_node) -> None:
64
+ self.symbol = symbol
65
+ self.start_byte, self.end_byte = _byte_range(def_node)
66
+
67
+
68
def _extract_symbols(root, lang: str, source: bytes) -> list[Symbol]:
    """Tier A: collect symbols for a language that has a hand-tuned spec.

    Every node that ``_definition_kind`` recognises and that has a name node
    becomes a ``Symbol``. Symbols are returned ordered by start byte (outer
    before inner on ties) with ``parent_index`` pointing into that same list,
    ``qualified`` set to the dotted path, and functions directly inside a
    container kind re-labelled as methods.
    """
    is_python = lang == "python"
    found: list[_Sym] = []
    for node in _walk(root):
        kind = _definition_kind(node, lang)
        if kind is None:
            continue
        ident = _name_node(node)
        if ident is None:
            continue
        symbol = Symbol(
            name=_node_text(ident, source),
            kind=kind,
            line_start=_row(_start_point(node)) + 1,
            line_end=_row(_end_point(node)) + 1,
            signature=_signature(node, source),
            docstring=_python_docstring(node, source) if is_python else None,
        )
        found.append(_Sym(symbol, node))

    # Earlier start first; for equal starts the longer span first, so a parent
    # is always processed (and qualified) before the symbols it contains.
    found.sort(key=lambda entry: (entry.start_byte, entry.start_byte - entry.end_byte))

    for entry in found:
        sym = entry.symbol
        owner = _enclosing(found, entry)
        if owner is None:
            sym.qualified = sym.name
            continue
        sym.parent_index = found.index(owner)
        if sym.kind == "function" and owner.symbol.kind in CONTAINER_KINDS:
            sym.kind = "method"
        prefix = owner.symbol.qualified or owner.symbol.name
        sym.qualified = f"{prefix}.{sym.name}"
    return [entry.symbol for entry in found]
102
+
103
+
104
+ # Maps a tree-sitter node `type` to a coarse symbol kind. Node types are largely unique across
105
+ # grammars, so a single table covers Java/Go/Rust/C/C++/C#/Ruby/PHP/Kotlin/JS/TS at once.
106
+ _DEF_KINDS: dict[str, str] = {
107
+ # functions
108
+ "function_declaration": "function", # go, kotlin, js
109
+ "function_definition": "function", # c, cpp, php (python handled separately below)
110
+ "function_item": "function", # rust
111
+ "function_signature_item": "function", # rust trait method signatures
112
+ # methods
113
+ "method_declaration": "method", # go, java, csharp
114
+ "method_definition": "method", # js/ts
115
+ "constructor_declaration": "method", # java, csharp
116
+ "method": "method", # ruby
117
+ # classes / type-like containers
118
+ "class_declaration": "class", # java, csharp, php, kotlin, js/ts
119
+ "class_specifier": "class", # cpp
120
+ "class": "class", # ruby
121
+ "object_declaration": "class", # kotlin
122
+ "record_declaration": "record", # java
123
+ "struct_item": "struct", # rust
124
+ "struct_specifier": "struct", # c, cpp
125
+ "struct_declaration": "struct", # csharp
126
+ "interface_declaration": "interface", # java, csharp, php, ts
127
+ "trait_item": "trait", # rust
128
+ "trait_declaration": "trait", # php
129
+ "enum_declaration": "enum", # java, csharp, ts
130
+ "enum_item": "enum", # rust
131
+ "enum_specifier": "enum", # c/cpp
132
+ "impl_item": "impl", # rust
133
+ # modules / namespaces (NOT containers — a function inside stays a function, not a method)
134
+ "mod_item": "module", # rust
135
+ "module": "module", # ruby
136
+ "namespace_definition": "module", # cpp
137
+ "namespace_declaration": "module", # csharp
138
+ "type_alias_declaration": "type", # ts
139
+ }
140
+
141
+
142
def _definition_kind(node, lang: str) -> Optional[str]:
    """Return the symbol kind ``node`` defines, or ``None`` if it defines nothing.

    Python only recognises function and class definitions. For every other
    language: Go ``type_spec`` nodes are refined by their underlying type, then
    ``_DEF_KINDS`` is consulted, and finally a ``variable_declarator`` whose
    value is an arrow function or function expression counts as a function.
    """
    node_type = _kind(node)

    if lang == "python":
        python_kinds = {"function_definition": "function", "class_definition": "class"}
        return python_kinds.get(node_type)

    if node_type == "type_spec":  # go: struct/interface are told apart by the aliased type
        underlying = _field(node, "type")
        underlying_type = "" if underlying is None else _kind(underlying)
        go_kinds = {"struct_type": "struct", "interface_type": "interface"}
        return go_kinds.get(underlying_type, "type")

    mapped = _DEF_KINDS.get(node_type)
    if mapped is not None:
        return mapped

    if node_type != "variable_declarator":
        return None
    value = _field(node, "value")
    if value is None:
        return None
    is_function_value = _kind(value) in {"arrow_function", "function_expression"}
    return "function" if is_function_value else None
166
+
167
+
168
# Node types that may hold a definition's name in grammars lacking a "name" field.
_NAME_NODE_TYPES = {
    "identifier",
    "type_identifier",
    "field_identifier",
    "property_identifier",
    "simple_identifier",
    "constant",
    "name",
    "namespace_identifier",
}


def _name_node(def_node):
    """Locate the node carrying the name of ``def_node``; ``None`` if there is none.

    Lookup order: the "name" field (most grammars); the "type" field for Rust
    ``impl_item``; the identifier at the bottom of the declarator chain for
    C/C++ ``function_definition``; otherwise the first named child whose type is
    listed in ``_NAME_NODE_TYPES`` (fieldless grammars such as Kotlin).
    """
    direct = _field(def_node, "name")
    if direct is not None:
        return direct

    node_type = _kind(def_node)
    if node_type == "impl_item":
        return _field(def_node, "type")
    if node_type == "function_definition":
        declarator = _field(def_node, "declarator")
        if declarator is None:
            return None
        return _declarator_identifier(declarator)

    return next(
        (child for child in _named_children(def_node) if _kind(child) in _NAME_NODE_TYPES),
        None,
    )
200
+
201
+
202
def _declarator_identifier(node):
    """Find the identifier inside a (possibly nested) declarator, depth-first.

    Returns ``node`` itself when it is an ``identifier``/``field_identifier``,
    follows the "declarator" field when present, and otherwise searches the
    named children in order. Returns ``None`` when nothing is found.
    """
    if node is None:
        return None
    if _kind(node) in {"identifier", "field_identifier"}:
        return node

    nested = _field(node, "declarator")
    if nested is not None:
        return _declarator_identifier(nested)

    # Lazy generator: stops descending as soon as one child yields a match.
    hits = (_declarator_identifier(child) for child in _named_children(node))
    return next((hit for hit in hits if hit is not None), None)
215
+
216
+
217
def _extract_symbols_generic(root, source: bytes) -> list[Symbol]:
    """Tier B: harvest definition-like nodes from a grammar without a tuned spec.

    A node qualifies when its type ends in ``declaration``, ``definition``,
    ``_item`` or ``_specifier`` and ``_name_node`` finds a name for it; the kind
    comes from the keyword heuristic in ``_generic_kind``. Ordering, parent
    links, qualified names and function-to-method promotion work the same way
    as in the tuned extractor.
    """
    suffixes = ("declaration", "definition", "_item", "_specifier")
    found: list[_Sym] = []
    for node in _walk(root):
        node_type = _kind(node)
        if not node_type.endswith(suffixes):
            continue
        ident = _name_node(node)
        if ident is None:
            continue
        symbol = Symbol(
            name=_node_text(ident, source),
            kind=_generic_kind(node_type),
            line_start=_row(_start_point(node)) + 1,
            line_end=_row(_end_point(node)) + 1,
            signature=_signature(node, source),
        )
        found.append(_Sym(symbol, node))

    # Earlier start first; for equal starts the longer span first, so parents
    # are handled before the symbols nested inside them.
    found.sort(key=lambda entry: (entry.start_byte, entry.start_byte - entry.end_byte))

    for entry in found:
        sym = entry.symbol
        owner = _enclosing(found, entry)
        if owner is None:
            sym.qualified = sym.name
            continue
        sym.parent_index = found.index(owner)
        if sym.kind == "function" and owner.symbol.kind in CONTAINER_KINDS:
            sym.kind = "method"
        prefix = owner.symbol.qualified or owner.symbol.name
        sym.qualified = f"{prefix}.{sym.name}"
    return [entry.symbol for entry in found]
256
+
257
+
258
+ def _generic_kind(ntype: str) -> str:
259
+ low = ntype.lower()
260
+ for key in ("class", "struct", "enum", "interface", "trait", "module", "namespace"):
261
+ if key in low:
262
+ return "struct" if key == "struct" else key
263
+ return "function"
264
+
265
+
266
def _signature(def_node, source: bytes) -> str:
    """Return the first source line of ``def_node`` as a one-line signature.

    Surrounding whitespace and trailing ``{`` characters are removed. A node
    whose text is empty gives ``""`` instead of raising ``IndexError``
    (``"".splitlines()`` is an empty list).
    """
    first_line = next(iter(_node_text(def_node, source).splitlines()), "")
    return first_line.strip().rstrip("{").strip()
268
+
269
+
270
def _python_docstring(def_node, source: bytes) -> Optional[str]:
    """Return the docstring of a Python definition node, or ``None``.

    Only the first statement of the "body" field can be a docstring: either a
    bare ``string`` node or an ``expression_statement`` whose first named child
    is a ``string``. ``comment`` nodes ahead of that statement are skipped.

    The literal's ``r``/``u`` prefix (any case) and its surrounding quotes are
    removed, so ``r\"\"\"text\"\"\"`` yields ``text`` rather than keeping the
    prefix and opening quotes.
    """
    body = _field(def_node, "body")
    if body is None:
        return None

    def _unquote(literal: str) -> str:
        # The prefix must go first, otherwise it shields the opening quotes
        # from strip(). A string literal always starts with prefix or quote,
        # so lstrip() cannot eat into the docstring's own text.
        unprefixed = literal.strip().lstrip("rRuU")
        return unprefixed.strip('"').strip("'").strip()

    for stmt in _named_children(body):
        stmt_kind = _kind(stmt)
        if stmt_kind == "comment":
            continue  # comments are not statements; keep looking
        if stmt_kind == "string":
            return _unquote(_node_text(stmt, source))
        if stmt_kind == "expression_statement":
            children = _named_children(stmt)
            if children and _kind(children[0]) == "string":
                return _unquote(_node_text(children[0], source))
        break  # first real statement is not a string: no docstring
    return None
283
+
284
+
285
+ def _enclosing(raw: list[_Sym], child: _Sym) -> Optional[_Sym]:
286
+ best: Optional[_Sym] = None
287
+ for other in raw:
288
+ if other is child:
289
+ continue
290
+ if other.start_byte <= child.start_byte and other.end_byte >= child.end_byte:
291
+ other_span = other.end_byte - other.start_byte
292
+ child_span = child.end_byte - child.start_byte
293
+ if other_span <= child_span:
294
+ continue
295
+ if best is None or other_span < best.end_byte - best.start_byte:
296
+ best = other
297
+ return best
298
+
299
+
300
def _extract_edges(root, symbols: list[Symbol], source: bytes) -> list[Edge]:
    """Return one ``"call"`` edge per call site found under ``root``.

    The edge stores the callee's text, the 1-based line of the callee node, and
    the index of the innermost symbol covering that line (``None`` if no symbol
    covers it).
    """

    def call_edge(callee) -> Edge:
        line = _row(_start_point(callee)) + 1
        return Edge(
            edge_type="call",
            callee_name=_node_text(callee, source),
            line=line,
            src_symbol_index=_enclosing_symbol_index(symbols, line),
        )

    callees = (_callee_node(node) for node in _walk(root))
    return [call_edge(callee) for callee in callees if callee is not None]
316
+
317
+
318
+ _EDGE_PREFIXES = {"import.": "import", "extends.": "extends", "implements.": "implements"}
319
+
320
+
321
+ def _extract_graph_edges(spec, grammar, root, symbols) -> list[Edge]:
322
+ if not spec.imports_query:
323
+ return []
324
+ query = Query(grammar, spec.imports_query)
325
+ cursor = QueryCursor(query)
326
+ edges: list[Edge] = []
327
+ for _pattern_idx, captures in cursor.matches(root):
328
+ for capture_name, nodes in captures.items():
329
+ for node in nodes:
330
+ edge_type = next(
331
+ (et for pfx, et in _EDGE_PREFIXES.items() if capture_name.startswith(pfx)),
332
+ None,
333
+ )
334
+ if edge_type is None:
335
+ continue
336
+ line = _row(node.start_point) + 1
337
+ src_idx = None if edge_type == "import" else _enclosing_symbol_index(symbols, line)
338
+ edges.append(Edge(
339
+ edge_type=edge_type,
340
+ callee_name=_text(node).strip().strip('"').strip("'"),
341
+ line=line,
342
+ src_symbol_index=src_idx,
343
+ ))
344
+ return edges
345
+
346
+
347
+ def _enclosing_symbol_index(symbols: list[Symbol], line: int) -> Optional[int]:
348
+ best_idx: Optional[int] = None
349
+ best_span: Optional[int] = None
350
+ for idx, symbol in enumerate(symbols):
351
+ if symbol.line_start <= line <= symbol.line_end:
352
+ span = symbol.line_end - symbol.line_start
353
+ if best_span is None or span < best_span:
354
+ best_idx = idx
355
+ best_span = span
356
+ return best_idx
357
+
358
+
359
# Node types that can directly name the thing being called.
_CALLEE_LEAVES = {"identifier", "property_identifier", "field_identifier", "simple_identifier"}


def _callee_node(node):
    """Return the node naming the callee of a call-like ``node``, else ``None``.

    Java ``method_invocation`` and Rust ``macro_invocation`` expose the callee
    through a dedicated field. For the remaining call node types the "function"
    field (or "method", as used by Ruby's ``call``) is inspected: a plain
    identifier is returned as-is, while member/selector/scoped access yields its
    trailing identifier.
    """
    node_type = _kind(node)
    if node_type == "method_invocation":  # java: obj.method(...) / method(...)
        return _field(node, "name")
    if node_type == "macro_invocation":  # rust: name!(...)
        return _field(node, "macro")

    call_types = {"call", "call_expression", "invocation_expression", "function_call_expression"}
    if node_type not in call_types:
        return None

    target = _field(node, "function") or _field(node, "method")
    if target is None:
        return None
    if _kind(target) in _CALLEE_LEAVES:
        return target

    # Qualified callee: use the first field that yields a trailing name node.
    trailing = None
    for field_name in ("attribute", "property", "field", "name"):
        trailing = _field(target, field_name)
        if trailing:
            break
    if trailing is None or _kind(trailing) not in _CALLEE_LEAVES:
        return None
    return trailing
386
+
387
+
388
+ def _kind(node) -> str:
389
+ value = getattr(node, "type", None)
390
+ if value is None:
391
+ value = getattr(node, "kind", None)
392
+ resolved = value() if callable(value) else value
393
+ return resolved if isinstance(resolved, str) else ""
394
+
395
+
396
+ def _field(node, name: str):
397
+ return node.child_by_field_name(name)
398
+
399
+
400
+ def _start_point(node):
401
+ value = getattr(node, "start_point", None)
402
+ if value is None:
403
+ value = getattr(node, "start_position", None)
404
+ return value() if callable(value) else value
405
+
406
+
407
+ def _end_point(node):
408
+ value = getattr(node, "end_point", None)
409
+ if value is None:
410
+ value = getattr(node, "end_position", None)
411
+ return value() if callable(value) else value
412
+
413
+
414
+ def _byte_range(node) -> tuple[int, int]:
415
+ start = getattr(node, "start_byte", None)
416
+ end = getattr(node, "end_byte", None)
417
+ if start is not None and end is not None:
418
+ return (start() if callable(start) else start, end() if callable(end) else end)
419
+ br = node.byte_range()
420
+ return br.start, br.end
421
+
422
+
423
def _node_text(node, source: bytes) -> str:
    """Return the slice of ``source`` covered by ``node``, decoded as UTF-8.

    Byte sequences that are not valid UTF-8 are dropped rather than raising.
    """
    first, last = _byte_range(node)
    covered = source[first:last]
    return covered.decode("utf-8", errors="ignore")
426
+
427
+
428
+ def _named_children(node) -> list[object]:
429
+ children = getattr(node, "named_children", None)
430
+ if children is not None:
431
+ return list(children() if callable(children) else children)
432
+ count = node.named_child_count() if callable(node.named_child_count) else node.named_child_count
433
+ return [node.named_child(i) for i in range(count)]
434
+
435
+
436
+ def _walk(node):
437
+ yield node
438
+ for child in _named_children(node):
439
+ yield from _walk(child)
@@ -0,0 +1,9 @@
1
+ """Hybrid retrieval engine. See docs/RETRIEVAL.md for the full pipeline.
2
+
3
+ intent.py : classify the query into an Intent + retriever weights + graph strategy.
4
+ searchers.py : path / symbol / fts / vector searchers -> uniform Candidate lists.
5
+ fusion.py : Reciprocal Rank Fusion across retriever lists (rrf_k, per-intent weights).
6
+ rerank.py : feature-based reordering (symbol-kind, path proximity, centrality, recency) +
7
+ produces the human-readable `reason` per result.
8
+ budget.py : greedy token-budgeted assembly of snippets vs. recommended_reads; secret redaction.
9
+ """
@@ -0,0 +1,82 @@
1
+ """Greedy token budgeting (RETRIEVAL.md §6).
2
+
3
+ Metadata for every result is always emitted (cheap). Snippets are attached to the
4
+ highest-ranked results until the budget is hit; the remainder become
5
+ recommended_reads. All snippet text is secret-redacted before emission.
6
+
7
+ A result is added to recommended_reads when:
8
+ - it has no snippet (budget exceeded or no content), OR
9
+ - its snippet is below _MIN_USEFUL_TOKENS (e.g. a bare function signature).
10
+ Claude still gets the short preview but also receives the read plan.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Callable, Optional
16
+
17
+ from ..output.redact import redact_snippet
18
+ from .skeleton import Compacted
19
+ from .types import Candidate
20
+
21
+ # Snippets shorter than this threshold are treated as previews only; the result
22
+ # is still added to recommended_reads so Claude knows where to read the full body.
23
+ _MIN_USEFUL_TOKENS = 40
24
+
25
+
26
+ def _meta(c: Candidate) -> dict:
27
+ return {
28
+ "path": c.path,
29
+ "line_start": c.line_start,
30
+ "line_end": c.line_end,
31
+ "symbols": [c.symbol] if c.symbol else [],
32
+ "score": round(c.score, 4),
33
+ "reason": c.reason if c.reason else c.source,
34
+ "token_est": c.token_est,
35
+ }
36
+
37
+
38
def apply_budget(
    candidates: list[Candidate],
    *,
    token_budget: int,
    compactor: Optional[Callable[[Candidate], Compacted]] = None,
) -> tuple[list[dict], list[dict]]:
    """Attach snippets to ranked candidates while staying inside ``token_budget``.

    Every candidate yields a metadata dict (with 1-based ``rank``). A snippet is
    attached, secret-redacted, whenever the candidate has text and its cost
    still fits into the remaining budget. When a ``compactor`` is given and
    reports a skeleton for a candidate with content, the skeleton's text and
    cost replace the raw ones.

    Returns:
        ``(results, recommended)``: ``results`` holds one metadata dict per
        candidate; ``recommended`` lists path and line range of every candidate
        that got no snippet or whose snippet costs fewer than
        ``_MIN_USEFUL_TOKENS`` tokens.
    """
    results: list[dict] = []
    recommended: list[dict] = []
    spent = 0

    for rank, c in enumerate(candidates, start=1):
        entry = _meta(c)
        entry.update(rank=rank, skeletonized=False, elided_lines=0)

        # Raw content is the default; only a real skeleton overrides it.
        text, cost = c.content, c.token_est
        if compactor is not None and c.content:
            compacted = compactor(c)
            if compacted.skeletonized:
                text, cost = compacted.text, compacted.token_est
                entry["skeletonized"] = True
                entry["elided_lines"] = compacted.elided_lines

        # bool(text) first: the cost is never touched for empty candidates.
        fits = bool(text) and spent + cost <= token_budget
        if fits:
            snippet = redact_snippet(text)
            spent += cost
            entry["token_est"] = cost
            useful = cost >= _MIN_USEFUL_TOKENS
        else:
            snippet = None
            useful = False

        if not useful:
            recommended.append(
                {"path": c.path, "line_start": c.line_start, "line_end": c.line_end}
            )
        entry["snippet"] = snippet
        results.append(entry)

    return results, recommended
@@ -0,0 +1,62 @@
1
+ """Reciprocal Rank Fusion across per-source ranked candidate lists.
2
+
3
+ RRF(d) = Σ_r w_r · k / (k + rank_r(d)) — robust to incomparable raw scores.
4
+
5
+ Two deliberate departures from the textbook formula:
6
+
7
+ * Scaled by k. Raw RRF tops out at w/k (≈0.017 for k=60), an order of magnitude
8
+ below the bounded bonuses the reranker layers on top, so rerank would silently
9
+ become the primary ranker and RRF a mere tiebreak. Multiplying by k is a pure
10
+ monotonic rescale (fusion order is identical) that lifts the top contribution to
11
+ ≈w, putting fused scores and rerank bonuses on the same O(1) scale.
12
+ * Fused on a coarse (path, line-bucket) key, not (path, start, end). Different
13
+ retrievers report different line ranges for the same place; an exact key almost
14
+ never coincides across sources, so cross-source agreement — RRF's whole point —
15
+ would never fire. `agreeing_sources` is therefore counted at file granularity.
16
+
17
+ On merge, the candidate carrying the most signal (symbol > fts = vector > path) is kept as
18
+ the representative so downstream rerank/snippet logic has the richest fields.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from dataclasses import replace as _replace
24
+
25
+ from .types import Candidate
26
+
27
+ _SOURCE_RICHNESS = {"symbol": 3, "fts": 2, "vector": 2, "path": 1}
28
+
29
+
30
+ def _richer(a: Candidate, b: Candidate) -> Candidate:
31
+ return a if _SOURCE_RICHNESS.get(a.source, 0) >= _SOURCE_RICHNESS.get(b.source, 0) else b
32
+
33
+
34
def fuse(
    lists: dict[str, list[Candidate]],
    *,
    weights: dict[str, float],
    k: int,
) -> list[Candidate]:
    """Merge per-source ranked lists into one list ordered by fused score.

    Each source with a positive weight contributes ``w * k / (k + rank)`` to a
    candidate's fuse key, where ``rank`` is the 0-based position in that
    source's list. A source contributes at most once per key. Sources without a
    positive weight are ignored entirely.

    Returns:
        One candidate per fuse key (the richest representative seen), carrying
        the fused ``score`` and ``agreeing_sources`` set to the number of
        contributing sources that returned the candidate's path; sorted by
        score, highest first.
    """
    totals: dict[tuple, float] = {}
    best: dict[tuple, Candidate] = {}
    counted: set[tuple] = set()
    sources_by_path: dict[str, set[str]] = {}

    for source, ranked in lists.items():
        weight = weights.get(source, 0.0)
        if weight <= 0.0:
            continue
        for position, cand in enumerate(ranked):
            sources_by_path.setdefault(cand.path, set()).add(source)
            key = cand.fuse_key()
            marker = (source, key)
            # Several hits from one source in the same bucket are a single
            # signal, so only the first (best-ranked) one is scored.
            if marker in counted:
                continue
            counted.add(marker)
            totals[key] = totals.get(key, 0.0) + weight * k / (k + position)
            best[key] = _richer(best[key], cand) if key in best else cand

    ordered = sorted(
        (_replace(best[key], score=total) for key, total in totals.items()),
        key=lambda cand: cand.score,
        reverse=True,
    )
    return [
        _replace(cand, agreeing_sources=len(sources_by_path[cand.path]))
        for cand in ordered
    ]