codebase-retrieval-context-engine 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
- codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
- corbell/__init__.py +6 -0
- corbell/cli/__init__.py +1 -0
- corbell/cli/commands/__init__.py +1 -0
- corbell/cli/commands/index.py +86 -0
- corbell/cli/commands/query.py +71 -0
- corbell/cli/main.py +57 -0
- corbell/core/__init__.py +1 -0
- corbell/core/constants.py +52 -0
- corbell/core/embeddings/__init__.py +6 -0
- corbell/core/embeddings/base.py +68 -0
- corbell/core/embeddings/extractor.py +201 -0
- corbell/core/embeddings/factory.py +48 -0
- corbell/core/embeddings/model.py +401 -0
- corbell/core/embeddings/search_cache.py +95 -0
- corbell/core/embeddings/sqlite_store.py +271 -0
- corbell/core/gitignore.py +76 -0
- corbell/core/graph/__init__.py +1 -0
- corbell/core/graph/builder.py +696 -0
- corbell/core/graph/method_graph.py +1077 -0
- corbell/core/graph/providers/__init__.py +6 -0
- corbell/core/graph/providers/aws_patterns.py +62 -0
- corbell/core/graph/providers/azure_patterns.py +64 -0
- corbell/core/graph/providers/gcp_patterns.py +59 -0
- corbell/core/graph/schema.py +175 -0
- corbell/core/graph/sqlite_store.py +500 -0
- corbell/core/indexing/__init__.py +1 -0
- corbell/core/indexing/builder.py +608 -0
- corbell/core/indexing/lock.py +150 -0
- corbell/core/indexing/tracker.py +245 -0
- corbell/core/llm_client.py +677 -0
- corbell/core/mcp/__init__.py +1 -0
- corbell/core/mcp/server.py +214 -0
- corbell/core/query/__init__.py +1 -0
- corbell/core/query/diagnostics.py +38 -0
- corbell/core/query/engine.py +321 -0
- corbell/core/query/enhancer.py +102 -0
- corbell/core/query/formatter.py +98 -0
- corbell/core/query/graph_expander.py +284 -0
- corbell/core/query/merger.py +171 -0
- corbell/core/query/reranker.py +131 -0
- corbell/core/workspace.py +408 -0
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""Output formatter for code retrieval results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, List, TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from corbell.core.query.graph_expander import ScoredChunk
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def format_results(
    chunks: List["ScoredChunk"],
    repo_paths: Dict[str, str],
) -> str:
    """Render scored chunks as line-numbered code blocks for LLM context.

    Every chunk becomes one block of the form::

        <resolved_path>#L<start>-<end>
        <start>: <code line>
        <start+1>: <code line>
        ...

    The code lines are re-read from disk for the chunk's line range; when the
    file cannot be read, the text stored on the chunk (``chunk.content``) is
    used instead.

    Args:
        chunks: Chunks to render, emitted in the order given.
        repo_paths: Mapping of repo_id -> repo root path string, used to turn
            relative chunk paths into absolute ones.

    Returns:
        All blocks joined by a blank line, or ``""`` when ``chunks`` is empty.
    """
    rendered: List[str] = []

    for chunk in chunks:
        location = _resolve_absolute_path(chunk.file_path, chunk.repo_id, repo_paths)

        # Prefer the current file contents; None signals an unreadable file.
        source_lines = _read_chunk_lines(location, chunk.start_line, chunk.end_line)
        if source_lines is None:
            source_lines = chunk.content.splitlines()

        # Number the lines starting from the chunk's first line.
        numbered = "\n".join(
            f"{number}: {text}"
            for number, text in enumerate(source_lines, start=chunk.start_line)
        )
        heading = f"{location}#L{chunk.start_line}-{chunk.end_line}"
        rendered.append(heading + "\n" + numbered)

    return "\n\n".join(rendered)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _resolve_absolute_path(
    file_path: str,
    repo_id: str,
    repo_paths: Dict[str, str],
) -> str:
    """Turn ``file_path`` into an absolute path string where possible.

    An already-absolute path is returned as its ``Path`` string form. A
    relative path is joined onto the root registered for ``repo_id`` and
    resolved. If no (non-empty) root is registered, the input is returned
    unchanged as a best effort.
    """
    candidate = Path(file_path)
    if not candidate.is_absolute():
        root = repo_paths.get(repo_id, "")
        if not root:
            return file_path  # best effort: nothing to anchor the path to
        candidate = (Path(root) / file_path).resolve()
    return str(candidate)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _read_chunk_lines(
    file_path: str,
    start_line: int,
    end_line: int,
) -> List[str] | None:
    """Return lines ``start_line``..``end_line`` of a file (1-based, inclusive).

    The range is clamped to the file: a start below 1 begins at the first
    line and an end past the last line stops at the end of the file. The file
    is decoded as UTF-8 with undecodable bytes ignored.

    Returns:
        The selected lines without line terminators, or ``None`` when reading
        the file fails for any reason.
    """
    try:
        text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
        file_lines = text.splitlines()
        first = max(0, start_line - 1)  # 1-based inclusive -> 0-based slice start
        last = min(len(file_lines), end_line)
        return file_lines[first:last]
    except Exception:
        return None
|
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""Graph-based call-chain expansion for query results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from corbell.core.query.diagnostics import QueryDiagnostics
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class ScoredChunk:
    """A span of source code paired with a relevance score.

    Instances are produced by embedding search and by graph expansion, and
    are consumed by the merge, rerank and formatting steps of the query
    pipeline.
    """

    # Identifier of the chunk (graph expansion reuses the method node id).
    chunk_id: str
    # Relevance score; higher means more relevant.
    score: float
    file_path: str  # absolute path
    # 1-based, inclusive line range of the chunk within file_path.
    start_line: int
    end_line: int
    # Source text of the chunk.
    content: str
    # Identifier of the repository the chunk belongs to.
    repo_id: str
    # Name of the symbol the chunk covers, when known.
    symbol: Optional[str] = None
    # Kind of chunk, e.g. "block" or "method".
    chunk_type: str = "block"
    # Programming language of the chunk.
    language: str = "python"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def expand_via_graph(
    embedding_results: List[ScoredChunk],
    graph_store: Any,
    repos: List,
    max_depth: int = 2,
    max_chunks: int = 30,
    diagnostics: Optional["QueryDiagnostics"] = None,
) -> List[ScoredChunk]:
    """Expand embedding results by following call-chain edges in the graph.

    For each embedding result that overlaps a method node (same file and an
    overlapping line range), the call graph is walked recursively: callers
    are added with ``score * 0.6`` and callees with ``score * 0.5``, and
    chains whose score drops below 0.15 are not followed. The walk recurses
    into each neighbour as soon as it is added, i.e. it is depth-first. A
    global cap of ``max_chunks`` bonus chunks is enforced, and a method is
    expanded at most once per call (the visited set is shared by all results).

    Args:
        embedding_results: Initial scored chunks from embedding search.
        graph_store: Graph store providing ``get_all_services`` plus the
            lookups used by the expansion helpers.
        repos: Repo config objects exposing ``id`` and ``resolved_path``
            (used to resolve relative paths).
        max_depth: Depth limit for the expansion.
        max_chunks: Maximum number of bonus chunks to return.
        diagnostics: Optional diagnostics object; its
            ``graph_expansion_failures`` counter is incremented when the
            graph store cannot be queried.

    Returns:
        List of bonus ScoredChunk objects (excluding the original results).
        Empty when there are no input results or the service list cannot be
        loaded from the graph store.
    """
    if not embedding_results:
        return []

    # repo_id -> root path. Coerced to Path so the helpers' ``/`` joins work
    # even if a repo config hands back a plain string.
    repo_path_map: Dict[str, Path] = {
        repo.id: Path(repo.resolved_path) for repo in repos if repo.resolved_path
    }

    try:
        service_ids = [service.id for service in graph_store.get_all_services()]
    except Exception:
        # Best effort: without the graph there is nothing to expand, but the
        # failure is recorded so it is not indistinguishable from "no hits".
        if diagnostics:
            diagnostics.graph_expansion_failures += 1
        return []

    bonus_chunks: List[ScoredChunk] = []
    visited: Set[str] = set()  # method ids already expanded or emitted

    for base_chunk in embedding_results:
        if len(bonus_chunks) >= max_chunks:
            break

        # Method nodes that overlap this chunk's file + line range.
        matching_methods = _find_matching_methods(
            base_chunk, graph_store, repo_path_map, service_ids
        )

        for method_node in matching_methods:
            if len(bonus_chunks) >= max_chunks:
                break
            if method_node.id in visited:
                continue

            visited.add(method_node.id)
            _bfs_expand(
                method_node=method_node,
                parent_score=base_chunk.score,
                depth=0,
                max_depth=max_depth,
                graph_store=graph_store,
                repo_path_map=repo_path_map,
                bonus_chunks=bonus_chunks,
                max_chunks=max_chunks,
                visited=visited,
                diagnostics=diagnostics,
            )

    return bonus_chunks
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _find_matching_methods(
    chunk: ScoredChunk,
    graph_store: Any,
    repo_path_map: Dict[str, Path],
    service_ids: List[str],
) -> List[Any]:
    """Find method nodes whose file and line range overlap the given chunk.

    Relative paths are anchored before comparison: a method path is resolved
    against the root of its service, and a relative chunk path against the
    root registered for ``chunk.repo_id``. Absolute paths are compared as
    given. When no root is registered, the path is left relative.

    Args:
        chunk: Chunk to match; reads ``file_path``, ``start_line``,
            ``end_line`` and, for relative paths, ``repo_id``.
        graph_store: Store providing ``get_methods_for_service(service_id)``.
        repo_path_map: Mapping of repo/service id -> root path.
        service_ids: Services whose methods are scanned.

    Returns:
        Matching method nodes in scan order. Any error while scanning ends
        the scan and returns the matches collected so far.
    """
    results: List[Any] = []

    try:
        # The chunk path is the same for every method, so build it once.
        chunk_path = Path(chunk.file_path)
        if not chunk_path.is_absolute():
            chunk_root = repo_path_map.get(chunk.repo_id)
            if chunk_root:
                chunk_path = (chunk_root / chunk_path).resolve()

        for service_id in service_ids:
            service_root = repo_path_map.get(service_id)
            for method in graph_store.get_methods_for_service(service_id):
                method_path = Path(method.file_path)
                if not method_path.is_absolute() and service_root:
                    method_path = (service_root / method.file_path).resolve()

                if method_path != chunk_path:
                    continue

                # Inclusive ranges overlap unless one ends before the other starts.
                if method.line_end < chunk.start_line:
                    continue
                if method.line_start > chunk.end_line:
                    continue

                results.append(method)
    except Exception:
        # Deliberate best effort: a graph lookup failure must not break the
        # query, so whatever was matched before the error is returned.
        pass

    return results
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _bfs_expand(
    method_node: Any,
    parent_score: float,
    depth: int,
    max_depth: int,
    graph_store: Any,
    repo_path_map: Dict[str, Path],
    bonus_chunks: List[ScoredChunk],
    max_chunks: int,
    visited: Set[str],
    diagnostics: Optional["QueryDiagnostics"],
) -> None:
    """Recursively add callers and callees of ``method_node`` as bonus chunks.

    Callers are scored ``parent_score * 0.6`` and callees (``method_call``
    edges only) ``parent_score * 0.5``; neighbours scoring below 0.15 or
    already in ``visited`` are skipped. ``bonus_chunks`` and ``visited`` are
    mutated in place. Despite the name, the walk recurses into each neighbour
    immediately, so it proceeds depth-first.

    Nothing happens once ``depth`` reaches ``max_depth`` or ``bonus_chunks``
    holds ``max_chunks`` entries. Errors in either phase are swallowed and
    counted in ``diagnostics.graph_expansion_failures``; a callee id with no
    method node is counted in ``diagnostics.skipped_methods``.
    """
    if depth >= max_depth or len(bonus_chunks) >= max_chunks:
        return

    def descend(node: Any, score: float) -> None:
        # Continue the walk one level deeper from ``node``.
        _bfs_expand(
            method_node=node,
            parent_score=score,
            depth=depth + 1,
            max_depth=max_depth,
            graph_store=graph_store,
            repo_path_map=repo_path_map,
            bonus_chunks=bonus_chunks,
            max_chunks=max_chunks,
            visited=visited,
            diagnostics=diagnostics,
        )

    # NOTE(review): this block was reconstructed from a listing whose
    # indentation was stripped; confirm that the recursive call belongs
    # inside the ``is not None`` branch in both phases below.

    # Phase 1: callers (score * 0.6)
    try:
        for caller in graph_store.get_callers_of_method(method_node.id):
            if len(bonus_chunks) >= max_chunks:
                return
            caller_score = parent_score * 0.6
            if caller_score < 0.15 or caller.id in visited:
                continue
            visited.add(caller.id)

            caller_chunk = _method_to_scored_chunk(
                caller, caller_score, repo_path_map, diagnostics
            )
            if caller_chunk is not None:
                bonus_chunks.append(caller_chunk)
                descend(caller, caller_score)
    except Exception:
        if diagnostics:
            diagnostics.graph_expansion_failures += 1

    # Phase 2: callees (score * 0.5)
    try:
        for edge in graph_store.get_dependencies(method_node.id):
            if edge.kind != "method_call":
                continue
            if len(bonus_chunks) >= max_chunks:
                return
            callee_score = parent_score * 0.5
            if callee_score < 0.15:
                continue

            callee_id = edge.target_id
            if callee_id in visited:
                continue
            visited.add(callee_id)

            callee_node = graph_store.get_method(callee_id)
            if callee_node is None:
                # Edge points at a method the store does not know about.
                if diagnostics:
                    diagnostics.skipped_methods += 1
                continue

            callee_chunk = _method_to_scored_chunk(
                callee_node, callee_score, repo_path_map, diagnostics
            )
            if callee_chunk is not None:
                bonus_chunks.append(callee_chunk)
                descend(callee_node, callee_score)
    except Exception:
        if diagnostics:
            diagnostics.graph_expansion_failures += 1
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def _method_to_scored_chunk(
    method_node: Any,
    score: float,
    repo_path_map: Dict[str, Path],
    diagnostics: Optional["QueryDiagnostics"],
) -> Optional[ScoredChunk]:
    """Build a ScoredChunk for a method node by reading its source lines.

    A relative ``method_node.file_path`` is resolved against the root
    registered for ``method_node.service_id`` when one exists. The chunk's
    content is the text of lines ``line_start``..``line_end`` (1-based,
    inclusive, clamped to the file).

    Returns:
        The new chunk, or ``None`` when the file does not exist or cannot be
        read; in both cases ``diagnostics.skipped_files`` is incremented if a
        diagnostics object was supplied.
    """
    source_path = Path(method_node.file_path)
    root = None if source_path.is_absolute() else repo_path_map.get(method_node.service_id)
    if root:
        source_path = (root / method_node.file_path).resolve()

    if not source_path.exists():
        if diagnostics:
            diagnostics.skipped_files += 1
        return None

    try:
        file_lines = source_path.read_text(encoding="utf-8", errors="ignore").splitlines()
        first = max(0, method_node.line_start - 1)
        last = min(len(file_lines), method_node.line_end)
        snippet = "\n".join(file_lines[first:last])
    except Exception:
        if diagnostics:
            diagnostics.skipped_files += 1
        return None

    return ScoredChunk(
        chunk_id=method_node.id,
        score=score,
        file_path=str(source_path),
        start_line=method_node.line_start,
        end_line=method_node.line_end,
        content=snippet,
        repo_id=method_node.service_id,
        symbol=method_node.method_name,
        chunk_type="method",
        language="python",  # best effort; MethodNode doesn't store language
    )
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""Deduplication and adjacent chunk merging for query results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Dict, List, TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from corbell.core.query.graph_expander import ScoredChunk
|
|
9
|
+
|
|
10
|
+
# Maximum number of lines in a merged chunk block
|
|
11
|
+
_MAX_MERGED_LINES = 60
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def merge_and_dedup(chunks: List["ScoredChunk"]) -> List["ScoredChunk"]:
    """Collapse duplicate and overlapping chunks into a compact result list.

    Pipeline:
        1. For each chunk_id keep only the highest-scoring chunk.
        2. Bucket the survivors by file path.
        3. Per file, merge adjacent/overlapping chunks (subject to the
           merged-size cap applied by ``_merge_file_chunks``).
        4. Per file, drop chunks whose range lies inside another chunk.
        5. Order everything by score, highest first.

    Args:
        chunks: Scored chunks, possibly containing duplicates from multiple
            queries.

    Returns:
        A new list of deduplicated and merged chunks sorted by descending
        score; ``[]`` for empty input.
    """
    if not chunks:
        return []

    # Highest-scoring chunk per id; the first strictly greater score wins.
    top_by_id: Dict[str, "ScoredChunk"] = {}
    for item in chunks:
        incumbent = top_by_id.get(item.chunk_id)
        if incumbent is None or item.score > incumbent.score:
            top_by_id[item.chunk_id] = item

    # Bucket by file so merging only ever looks at one file at a time.
    grouped: Dict[str, List["ScoredChunk"]] = {}
    for item in top_by_id.values():
        grouped.setdefault(item.file_path, []).append(item)

    output: List["ScoredChunk"] = []
    for group in grouped.values():
        output.extend(_drop_contained_ranges(_merge_file_chunks(group)))

    return sorted(output, key=lambda c: c.score, reverse=True)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _merge_file_chunks(chunks: List["ScoredChunk"]) -> List["ScoredChunk"]:
    """Coalesce touching or overlapping chunks that belong to one file.

    Chunks are processed in ascending ``start_line`` order. The next chunk is
    folded into the running one when it starts no later than one line past
    the running chunk's end AND the span from the running chunk's start to
    the next chunk's end is at most ``_MAX_MERGED_LINES`` lines. Otherwise
    the running chunk is emitted and the next chunk becomes the running one.

    Returns:
        The merged chunks in ascending start-line order (empty for no input).
    """
    ordered = sorted(chunks, key=lambda c: c.start_line)
    if not ordered:
        return []

    output: List["ScoredChunk"] = []
    pending = ordered[0]

    for candidate in ordered[1:]:
        touches = candidate.start_line <= pending.end_line + 1
        if touches and candidate.end_line - pending.start_line + 1 <= _MAX_MERGED_LINES:
            # Adjacent and within the size cap: grow the running chunk.
            pending = _merge_two(pending, candidate)
            continue
        # Gap between chunks, or the merge would exceed the cap: flush.
        output.append(pending)
        pending = candidate

    output.append(pending)
    return output
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _drop_contained_ranges(chunks: List["ScoredChunk"]) -> List["ScoredChunk"]:
    """Discard chunks whose line range sits entirely inside a kept chunk.

    Chunk B is contained in chunk A when
    ``A.start_line <= B.start_line and B.end_line <= A.end_line``.
    Candidates are visited widest range first (higher score first among equal
    widths), so for identical ranges only the best-scoring chunk survives.

    Returns:
        The input list itself when it has at most one element; otherwise a
        new list of surviving chunks ordered widest first.
    """
    if len(chunks) <= 1:
        return chunks

    survivors: List["ScoredChunk"] = []
    widest_first = sorted(
        chunks,
        key=lambda c: (-(c.end_line - c.start_line), -c.score),
    )
    for candidate in widest_first:
        # Only already-kept (wider or equal) chunks can contain the candidate.
        covered = any(
            outer.start_line <= candidate.start_line
            and candidate.end_line <= outer.end_line
            for outer in survivors
        )
        if not covered:
            survivors.append(candidate)

    return survivors
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _merge_two(a: "ScoredChunk", b: "ScoredChunk") -> "ScoredChunk":
    """Merge two chunks of the same file into one covering both ranges.

    The merged chunk spans ``min(start_line)``..``max(end_line)``, takes the
    higher of the two scores, gets the id ``"<a.chunk_id>+<b.chunk_id>"`` and
    inherits file path, repo, symbol, chunk type and language from ``a``.

    The content is re-read from ``a.file_path`` for the merged range. If that
    read raises, the content is stitched from the two chunks' stored text,
    skipping the lines of the later chunk that overlap the earlier one so
    nothing is duplicated.
    """
    from corbell.core.query.graph_expander import ScoredChunk  # local import to avoid circular

    new_start = min(a.start_line, b.start_line)
    new_end = max(a.end_line, b.end_line)

    try:
        # Reading the merged range from disk yields each line exactly once.
        content = "\n".join(_read_lines(a.file_path, new_start, new_end))
    except Exception:
        # Fall back to the stored text, ordered by position in the file.
        first, second = (a, b) if a.start_line <= b.start_line else (b, a)
        overlap = first.end_line - second.start_line + 1
        if overlap <= 0:
            content = first.content + "\n" + second.content
        else:
            # Assumes stored content holds one text line per source line of
            # the chunk's range, so the first `overlap` lines of the later
            # chunk repeat the tail of the earlier one.
            tail = second.content.splitlines()[overlap:]
            content = first.content
            if tail:
                content = content + "\n" + "\n".join(tail)

    return ScoredChunk(
        chunk_id=f"{a.chunk_id}+{b.chunk_id}",
        score=max(a.score, b.score),
        file_path=a.file_path,
        start_line=new_start,
        end_line=new_end,
        content=content,
        repo_id=a.repo_id,
        symbol=a.symbol,
        chunk_type=a.chunk_type,
        language=a.language,
    )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _read_lines(file_path: str, start_line: int, end_line: int) -> List[str]:
    """Return lines ``start_line``..``end_line`` of a file (1-based, inclusive).

    The range is clamped to the file's bounds. The file is decoded as UTF-8
    with undecodable bytes ignored, and lines are returned without their
    terminators. Errors from reading the file propagate to the caller.
    """
    from pathlib import Path

    text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
    file_lines = text.splitlines()
    first = max(0, start_line - 1)  # 1-based inclusive -> 0-based slice start
    last = min(len(file_lines), end_line)
    return file_lines[first:last]
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""LLM-based result reranker for the query pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from typing import Any, Dict, List, Optional, TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from corbell.core.query.graph_expander import ScoredChunk
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def rerank_chunks(
    query: str,
    chunks: List["ScoredChunk"],
    llm_client: Optional[Any],
    graph_meta: Optional[Dict[str, Dict]] = None,
) -> List[str]:
    """Rerank and filter query results using an LLM.

    Each chunk is sent as a numbered entry (metadata header + code, with
    chunks longer than 100 lines cut to their first and last 50 lines). The
    LLM is asked for a JSON array of 0-based chunk indices ordered by
    relevance, omitting irrelevant chunks.

    Only in-range integer indices are honoured; booleans and other values are
    ignored, and an index that appears more than once is used only at its
    first position, so no chunk id is returned twice for the same index.

    Args:
        query: The original user query.
        chunks: Scored chunks to rerank.
        llm_client: Client exposing ``is_configured`` and
            ``call(system, user, max_tokens=..., temperature=...)``; may be
            None or unconfigured.
        graph_meta: Optional dict mapping chunk_id to graph metadata
            (``callers``, ``callees``, ``flow``). When present for a chunk,
            it is included in that chunk's header.

    Returns:
        List of chunk_ids in reranked order (most relevant first), excluding
        chunks the LLM omitted. All chunk ids in their original order are
        returned when there is no usable client, the call or parsing fails,
        the response is not a JSON list, or no valid index remains.
    """
    if not chunks:
        return []

    all_ids = [c.chunk_id for c in chunks]

    if llm_client is None or not getattr(llm_client, "is_configured", False):
        return all_ids

    # Build one numbered entry per chunk so the LLM can answer with indices.
    entries = []
    for i, chunk in enumerate(chunks):
        meta_parts = [f"score={chunk.score:.2f}"]
        meta = graph_meta.get(chunk.chunk_id) if graph_meta else None
        if meta:
            meta_parts.append(f"callers={meta.get('callers', 0)}")
            meta_parts.append(f"callees={meta.get('callees', 0)}")
            flow = meta.get("flow")
            if flow:
                meta_parts.append(f"flow={flow}")
        meta_str = ", ".join(meta_parts)

        content = chunk.content
        content_lines = content.splitlines()
        if len(content_lines) > 100:
            # Keep the head and tail; the middle is least likely to matter.
            truncated_count = len(content_lines) - 100
            content = "\n".join(
                content_lines[:50]
                + [f"... ({truncated_count} lines truncated) ..."]
                + content_lines[-50:]
            )

        entries.append(
            f"[{i}] {meta_str} | {chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
            f" ({chunk.chunk_type}, {chunk.symbol or 'no symbol'})\n"
            f"{content}"
        )

    system = (
        "You are a code search relevance ranker. "
        "Given a query and numbered code chunks with metadata (relevance score, callers count, "
        "callees count, flow membership), return a JSON array of chunk indices ordered from most "
        "relevant to least relevant. "
        "OMIT chunks that are not relevant to the query. "
        "Higher score, more callers, and flow membership indicate higher structural importance. "
        "Return ONLY a valid JSON array of integers, e.g. [2,0,5]."
    )

    # Put the separator on its own line: chunk content usually has no
    # trailing newline, so joining with "---\n" alone would glue the marker
    # onto the chunk's last code line.
    chunks_text = "\n---\n".join(entry.rstrip("\n") for entry in entries)

    user = (
        f"Query: {query}\n\n"
        f"Chunks:\n{chunks_text}\n\n"
        "Return JSON array of relevant chunk indices (most relevant first):"
    )

    try:
        response = llm_client.call(
            system, user,
            max_tokens=200,
            temperature=0.0,
        )

        text = response.strip()
        if text.startswith("```"):
            # Strip markdown code-fence lines around the JSON payload.
            text = "\n".join(
                line for line in text.splitlines()
                if not line.startswith("```")
            ).strip()

        indices = json.loads(text)
        if not isinstance(indices, list):
            return all_ids

        n = len(chunks)
        seen = set()
        ranked: List[str] = []
        for idx in indices:
            # bool is a subclass of int; JSON true/false are not indices.
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            if not 0 <= idx < n or idx in seen:
                continue
            seen.add(idx)
            ranked.append(all_ids[idx])

        return ranked or all_ids

    except Exception:
        # Graceful fallback: any LLM or parsing failure keeps original order.
        return all_ids
|