codebase-retrieval-context-engine 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. codebase_retrieval_context_engine-2.0.0.dist-info/METADATA +505 -0
  2. codebase_retrieval_context_engine-2.0.0.dist-info/RECORD +46 -0
  3. codebase_retrieval_context_engine-2.0.0.dist-info/WHEEL +4 -0
  4. codebase_retrieval_context_engine-2.0.0.dist-info/entry_points.txt +3 -0
  5. codebase_retrieval_context_engine-2.0.0.dist-info/licenses/LICENSE +201 -0
  6. corbell/__init__.py +6 -0
  7. corbell/cli/__init__.py +1 -0
  8. corbell/cli/commands/__init__.py +1 -0
  9. corbell/cli/commands/index.py +86 -0
  10. corbell/cli/commands/query.py +71 -0
  11. corbell/cli/main.py +57 -0
  12. corbell/core/__init__.py +1 -0
  13. corbell/core/constants.py +52 -0
  14. corbell/core/embeddings/__init__.py +6 -0
  15. corbell/core/embeddings/base.py +68 -0
  16. corbell/core/embeddings/extractor.py +201 -0
  17. corbell/core/embeddings/factory.py +48 -0
  18. corbell/core/embeddings/model.py +401 -0
  19. corbell/core/embeddings/search_cache.py +95 -0
  20. corbell/core/embeddings/sqlite_store.py +271 -0
  21. corbell/core/gitignore.py +76 -0
  22. corbell/core/graph/__init__.py +1 -0
  23. corbell/core/graph/builder.py +696 -0
  24. corbell/core/graph/method_graph.py +1077 -0
  25. corbell/core/graph/providers/__init__.py +6 -0
  26. corbell/core/graph/providers/aws_patterns.py +62 -0
  27. corbell/core/graph/providers/azure_patterns.py +64 -0
  28. corbell/core/graph/providers/gcp_patterns.py +59 -0
  29. corbell/core/graph/schema.py +175 -0
  30. corbell/core/graph/sqlite_store.py +500 -0
  31. corbell/core/indexing/__init__.py +1 -0
  32. corbell/core/indexing/builder.py +608 -0
  33. corbell/core/indexing/lock.py +150 -0
  34. corbell/core/indexing/tracker.py +245 -0
  35. corbell/core/llm_client.py +677 -0
  36. corbell/core/mcp/__init__.py +1 -0
  37. corbell/core/mcp/server.py +214 -0
  38. corbell/core/query/__init__.py +1 -0
  39. corbell/core/query/diagnostics.py +38 -0
  40. corbell/core/query/engine.py +321 -0
  41. corbell/core/query/enhancer.py +102 -0
  42. corbell/core/query/formatter.py +98 -0
  43. corbell/core/query/graph_expander.py +284 -0
  44. corbell/core/query/merger.py +171 -0
  45. corbell/core/query/reranker.py +131 -0
  46. corbell/core/workspace.py +408 -0
@@ -0,0 +1,98 @@
1
+ """Output formatter for code retrieval results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Dict, List, TYPE_CHECKING
7
+
8
+ if TYPE_CHECKING:
9
+ from corbell.core.query.graph_expander import ScoredChunk
10
+
11
+
12
def format_results(
    chunks: List["ScoredChunk"],
    repo_paths: Dict[str, str],
) -> str:
    """Format scored chunks as annotated code blocks for LLM context injection.

    Output format (one block per chunk, blocks separated by a blank line):
        <absolute_path>#L<start>-<end>
        <start>: <code line>
        <start+1>: <code line>
        ...
        <end>: <code line>

    The lines are read from disk. When the file cannot be read, or the
    requested range yields no lines (for example the file is now shorter than
    ``start_line``), the text stored on the chunk itself is used instead so the
    block is not emitted as a bare header.

    Args:
        chunks: Scored chunks to format (pre-sorted by score descending).
        repo_paths: Mapping of repo_id -> absolute repo path string.
            Used to resolve relative file paths to absolute paths.

    Returns:
        Formatted string with all chunks, separated by blank lines. Empty
        string when there are no chunks.
    """
    if not chunks:
        return ""

    blocks: List[str] = []
    for chunk in chunks:
        abs_path = _resolve_absolute_path(chunk.file_path, chunk.repo_id, repo_paths)

        # Prefer the file on disk; ``None`` (unreadable) and ``[]`` (range
        # outside the file) both fall back to the content captured on the chunk.
        lines = _read_chunk_lines(abs_path, chunk.start_line, chunk.end_line)
        if not lines:
            lines = chunk.content.splitlines()

        header = f"{abs_path}#L{chunk.start_line}-{chunk.end_line}"
        numbered_lines = [
            f"{line_num}: {line}"
            for line_num, line in enumerate(lines, start=chunk.start_line)
        ]
        blocks.append(header + "\n" + "\n".join(numbered_lines))

    return "\n\n".join(blocks)
60
+
61
+
62
def _resolve_absolute_path(
    file_path: str,
    repo_id: str,
    repo_paths: Dict[str, str],
) -> str:
    """Turn a chunk's file path into an absolute path string where possible.

    An already-absolute path is returned unchanged. A relative path is joined
    onto the root registered for ``repo_id`` and resolved. If no (non-empty)
    root is registered for that repo, the input is handed back untouched.
    """
    candidate = Path(file_path)
    if not candidate.is_absolute():
        root = repo_paths.get(repo_id, "")
        if not root:
            # Nothing to anchor the relative path to: best effort.
            return file_path
        candidate = (Path(root) / file_path).resolve()
    return str(candidate)
81
+
82
+
83
def _read_chunk_lines(
    file_path: str,
    start_line: int,
    end_line: int,
) -> List[str] | None:
    """Return lines ``start_line``..``end_line`` (1-based, inclusive) of a file.

    The range is clamped to the file's actual extent. Undecodable bytes are
    dropped while reading. ``None`` signals that the file could not be read.
    """
    try:
        text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
        rows = text.splitlines()
        first = max(start_line - 1, 0)
        last = min(end_line, len(rows))
        window = rows[first:last]
    except Exception:
        return None
    return window
@@ -0,0 +1,284 @@
1
+ """Graph-based call-chain expansion for query results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ from corbell.core.query.diagnostics import QueryDiagnostics
11
+
12
+
13
@dataclass
class ScoredChunk:
    """A span of source code together with the relevance score assigned to it.

    Instances are produced both by embedding search and by graph expansion and
    are what the merge, rerank and format stages of the query pipeline pass
    around.
    """

    # Identifier of the chunk; duplicates are collapsed on this value.
    chunk_id: str
    # Relevance score; results are ordered by this value, highest first.
    score: float
    file_path: str  # absolute path
    # Line range covered by the chunk (1-based, inclusive).
    start_line: int
    end_line: int
    # Source text of the chunk.
    content: str
    # Identifier of the repository the file belongs to.
    repo_id: str
    # Name of the symbol the chunk represents, when known.
    symbol: Optional[str] = None
    chunk_type: str = "block"
    language: str = "python"
27
+
28
+
29
def expand_via_graph(
    embedding_results: List[ScoredChunk],
    graph_store: Any,
    repos: List,
    max_depth: int = 2,
    max_chunks: int = 30,
    diagnostics: Optional["QueryDiagnostics"] = None,
) -> List[ScoredChunk]:
    """Expand embedding results by following call-chain edges in the graph.

    Every embedding result is matched against the graph's method nodes by file
    path and line range. From each matching method the call graph is walked
    outwards: callers are scored at ``score * 0.6`` and callees at
    ``score * 0.5``, chains whose score drops below 0.15 are not followed, and
    at most ``max_chunks`` bonus chunks are collected overall.

    Args:
        embedding_results: Initial scored chunks from embedding search.
        graph_store: SQLiteGraphStore instance.
        repos: List of RepoConfig objects (for resolving relative paths).
        max_depth: Depth limit for the expansion.
        max_chunks: Maximum total expanded (bonus) chunks to add.
        diagnostics: Optional diagnostics object for tracking warnings.

    Returns:
        List of bonus ScoredChunk objects (excluding the original results).
        Empty when there is nothing to expand or the graph's service list
        cannot be read.
    """
    if not embedding_results:
        return []

    # repo_id -> repository root. Coerced to Path because the helpers join it
    # with ``/``; a no-op when resolved_path is already a Path.
    repo_path_map: Dict[str, Path] = {
        repo.id: Path(repo.resolved_path) for repo in repos if repo.resolved_path
    }

    # Only the service ids are fetched up front; methods are looked up per
    # service while matching each chunk.
    try:
        service_ids = [service.id for service in graph_store.get_all_services()]
    except Exception:
        # Expansion is optional, so degrade to "no bonus chunks" - but record
        # the failure the same way the traversal itself does.
        if diagnostics:
            diagnostics.graph_expansion_failures += 1
        return []

    bonus_chunks: List[ScoredChunk] = []
    visited: Set[str] = set()  # method ids already used as a seed or emitted

    for base_chunk in embedding_results:
        if len(bonus_chunks) >= max_chunks:
            break

        # Find MethodNodes that overlap with this chunk's file + line range.
        matching_methods = _find_matching_methods(
            base_chunk, graph_store, repo_path_map, service_ids
        )

        for method_node in matching_methods:
            if len(bonus_chunks) >= max_chunks:
                break
            if method_node.id in visited:
                continue

            visited.add(method_node.id)
            _bfs_expand(
                method_node=method_node,
                parent_score=base_chunk.score,
                depth=0,
                max_depth=max_depth,
                graph_store=graph_store,
                repo_path_map=repo_path_map,
                bonus_chunks=bonus_chunks,
                max_chunks=max_chunks,
                visited=visited,
                diagnostics=diagnostics,
            )

    return bonus_chunks
104
+
105
+
106
def _find_matching_methods(
    chunk: ScoredChunk,
    graph_store: Any,
    repo_path_map: Dict[str, Path],
    service_ids: List[str],
) -> List[Any]:
    """Find MethodNodes whose file_path and line range overlap with the given chunk.

    A method matches when its path equals the chunk's path and the two line
    ranges share at least one line. A relative method path is made absolute by
    joining it with the repo root registered under the method's service id.

    Lookup is best effort: an error while reading one service's methods skips
    the rest of that service only, so the remaining services are still
    searched.

    Args:
        chunk: The chunk to locate in the graph.
        graph_store: Store exposing ``get_methods_for_service(service_id)``.
        repo_path_map: Mapping of service/repo id -> repository root.
        service_ids: Ids of the services whose methods are searched.

    Returns:
        Matching method nodes, in service order then store order.
    """
    results: List[Any] = []
    chunk_abs = Path(chunk.file_path)  # same for every method, so build it once

    for service_id in service_ids:
        repo_path = repo_path_map.get(service_id)
        try:
            for method in graph_store.get_methods_for_service(service_id):
                method_abs = Path(method.file_path)
                if repo_path and not method_abs.is_absolute():
                    method_abs = (repo_path / method.file_path).resolve()

                if method_abs != chunk_abs:
                    continue

                # Ranges are disjoint when one ends before the other starts.
                if method.line_end < chunk.start_line:
                    continue
                if method.line_start > chunk.end_line:
                    continue

                results.append(method)
        except Exception:
            # Keep whatever matched so far and move on to the next service.
            continue

    return results
143
+
144
+
145
def _bfs_expand(
    method_node: Any,
    parent_score: float,
    depth: int,
    max_depth: int,
    graph_store: Any,
    repo_path_map: Dict[str, Path],
    bonus_chunks: List[ScoredChunk],
    max_chunks: int,
    visited: Set[str],
    diagnostics: Optional["QueryDiagnostics"],
) -> None:
    """Expand from a method node, adding its callers and callees as bonus chunks.

    Despite the name, the walk is recursive and therefore depth-first: each
    accepted neighbour is expanded fully before the next one is looked at.
    Callers are visited before callees.

    Scores decay by 0.6 per caller hop and 0.5 per callee hop. A direction
    whose decayed score is below 0.15 is skipped without querying the graph.
    ``bonus_chunks`` and ``visited`` are mutated in place and shared across the
    whole expansion, so a method is emitted at most once and the
    ``max_chunks`` cap is global.

    Args:
        method_node: Node to expand from (only ``id`` is read here).
        parent_score: Score of ``method_node``; neighbours get a fraction of it.
        depth: Current recursion depth (0 for a seed method).
        max_depth: Depth at which expansion stops.
        graph_store: Store exposing ``get_callers_of_method``,
            ``get_dependencies`` and ``get_method``.
        repo_path_map: Mapping of service/repo id -> repository root.
        bonus_chunks: Output list that accepted chunks are appended to.
        max_chunks: Cap on ``len(bonus_chunks)``.
        visited: Ids of methods already seeded or considered.
        diagnostics: Optional counters for failures and skipped methods.
    """
    if depth >= max_depth or len(bonus_chunks) >= max_chunks:
        return

    def _emit_and_descend(node: Any, score: float) -> None:
        # Turn the neighbour into a chunk; only neighbours whose source could
        # be read are emitted and expanded further.
        bonus = _method_to_scored_chunk(node, score, repo_path_map, diagnostics)
        if bonus is None:
            return
        bonus_chunks.append(bonus)
        _bfs_expand(
            method_node=node,
            parent_score=score,
            depth=depth + 1,
            max_depth=max_depth,
            graph_store=graph_store,
            repo_path_map=repo_path_map,
            bonus_chunks=bonus_chunks,
            max_chunks=max_chunks,
            visited=visited,
            diagnostics=diagnostics,
        )

    # Callers (score * 0.6). The floor depends only on parent_score, so it is
    # checked once instead of per neighbour after a wasted graph query.
    caller_score = parent_score * 0.6
    if caller_score >= 0.15:
        try:
            for caller in graph_store.get_callers_of_method(method_node.id):
                if len(bonus_chunks) >= max_chunks:
                    return
                if caller.id in visited:
                    continue
                visited.add(caller.id)
                _emit_and_descend(caller, caller_score)
        except Exception:
            if diagnostics:
                diagnostics.graph_expansion_failures += 1

    # Callees (score * 0.5): only "method_call" edges are followed.
    callee_score = parent_score * 0.5
    if callee_score >= 0.15:
        try:
            for edge in graph_store.get_dependencies(method_node.id):
                if edge.kind != "method_call":
                    continue
                if len(bonus_chunks) >= max_chunks:
                    return

                callee_id = edge.target_id
                if callee_id in visited:
                    continue
                visited.add(callee_id)

                callee_node = graph_store.get_method(callee_id)
                if callee_node is None:
                    # Edge points at a method the store does not know about.
                    if diagnostics:
                        diagnostics.skipped_methods += 1
                    continue

                _emit_and_descend(callee_node, callee_score)
        except Exception:
            if diagnostics:
                diagnostics.graph_expansion_failures += 1
240
+
241
+
242
def _method_to_scored_chunk(
    method_node: Any,
    score: float,
    repo_path_map: Dict[str, Path],
    diagnostics: Optional["QueryDiagnostics"],
) -> Optional[ScoredChunk]:
    """Build a ScoredChunk for a MethodNode from the method's source lines.

    A relative ``file_path`` is anchored at the repo root registered for the
    node's ``service_id``. The method's 1-based inclusive line range is clamped
    to the file and its text becomes the chunk content.

    Returns None - and bumps ``diagnostics.skipped_files`` when diagnostics are
    supplied - if the file does not exist or cannot be read.
    """

    def _skipped() -> None:
        # Shared bookkeeping for both "missing" and "unreadable".
        if diagnostics:
            diagnostics.skipped_files += 1
        return None

    source = Path(method_node.file_path)
    if not source.is_absolute():
        root = repo_path_map.get(method_node.service_id)
        if root:
            source = (root / method_node.file_path).resolve()

    if not source.exists():
        return _skipped()

    try:
        rows = source.read_text(encoding="utf-8", errors="ignore").splitlines()
        first = max(0, method_node.line_start - 1)
        last = min(len(rows), method_node.line_end)
        body = "\n".join(rows[first:last])
    except Exception:
        return _skipped()

    return ScoredChunk(
        chunk_id=method_node.id,
        score=score,
        file_path=str(source),
        start_line=method_node.line_start,
        end_line=method_node.line_end,
        content=body,
        repo_id=method_node.service_id,
        symbol=method_node.method_name,
        chunk_type="method",
        language="python",  # best effort; MethodNode doesn't store language
    )
@@ -0,0 +1,171 @@
1
+ """Deduplication and adjacent chunk merging for query results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Dict, List, TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from corbell.core.query.graph_expander import ScoredChunk
9
+
10
# Maximum number of lines in a merged chunk block. Adjacent/overlapping chunks
# are only merged while the combined span stays within this many lines.
_MAX_MERGED_LINES = 60
12
+
13
+
14
def merge_and_dedup(chunks: List["ScoredChunk"]) -> List["ScoredChunk"]:
    """Collapse duplicate chunks, merge neighbours per file, and rank the result.

    Pipeline:
        1. For each chunk_id keep only the highest-scoring occurrence.
        2. Bucket the survivors by file path.
        3. Per file, sort by start_line and merge adjacent/overlapping chunks
           (merged blocks are capped at 60 lines).
        4. Per file, drop chunks whose range lies entirely inside another one.
        5. Order everything by score, highest first.

    Args:
        chunks: List of ScoredChunk objects (may have duplicates from multiple queries).

    Returns:
        Deduplicated and merged list of ScoredChunk objects.
    """
    if not chunks:
        return []

    # 1. Highest score wins per chunk_id.
    top_by_id: Dict[str, "ScoredChunk"] = {}
    for item in chunks:
        incumbent = top_by_id.get(item.chunk_id)
        if incumbent is None or item.score > incumbent.score:
            top_by_id[item.chunk_id] = item

    # 2. Bucket by file.
    grouped: Dict[str, List["ScoredChunk"]] = {}
    for item in top_by_id.values():
        if item.file_path not in grouped:
            grouped[item.file_path] = []
        grouped[item.file_path].append(item)

    # 3 + 4. Merge within each file, then prune fully-contained ranges.
    combined: List["ScoredChunk"] = []
    for group in grouped.values():
        combined.extend(_drop_contained_ranges(_merge_file_chunks(group)))

    # 5. Best first.
    return sorted(combined, key=lambda c: c.score, reverse=True)
57
+
58
+
59
def _merge_file_chunks(chunks: List["ScoredChunk"]) -> List["ScoredChunk"]:
    """Fold adjacent or overlapping chunks of one file into larger blocks.

    Chunks are walked in start_line order while a running block is maintained.
    The next chunk is absorbed into the running block when it starts no later
    than one line past the block's end AND the span from the block's start to
    the chunk's end stays within the 60-line cap. Otherwise the running block
    is emitted and the chunk starts a new one.
    """
    ordered = sorted(chunks, key=lambda c: c.start_line)
    out: List["ScoredChunk"] = []
    if not ordered:
        return out

    run = ordered[0]
    for candidate in ordered[1:]:
        touches = candidate.start_line <= run.end_line + 1
        if touches and candidate.end_line - run.start_line + 1 <= _MAX_MERGED_LINES:
            run = _merge_two(run, candidate)
            continue
        # Either a gap or the cap would be exceeded: close the running block.
        out.append(run)
        run = candidate

    out.append(run)
    return out
94
+
95
+
96
def _drop_contained_ranges(chunks: List["ScoredChunk"]) -> List["ScoredChunk"]:
    """Discard chunks whose line range sits entirely inside another chunk's range.

    Chunk B counts as contained in chunk A when
        A.start_line <= B.start_line and B.end_line <= A.end_line

    Candidates are examined widest range first (higher score first among equal
    widths), so for two chunks with the identical range the higher-scoring one
    is kept. A list of zero or one chunk is returned as-is.

    The containing chunk already carries everything the inner chunk has, so
    passing both on would only spend tokens.
    """
    if len(chunks) <= 1:
        return chunks

    def _widest_then_best(c: "ScoredChunk"):
        # start - end is the negated width: wider ranges sort earlier.
        return (c.start_line - c.end_line, -c.score)

    survivors: List["ScoredChunk"] = []
    for cand in sorted(chunks, key=_widest_then_best):
        swallowed = any(
            k.start_line <= cand.start_line and cand.end_line <= k.end_line
            for k in survivors
        )
        if not swallowed:
            survivors.append(cand)

    return survivors
129
+
130
+
131
def _merge_two(a: "ScoredChunk", b: "ScoredChunk") -> "ScoredChunk":
    """Merge two chunks of the same file into one covering both line ranges.

    The merged chunk spans ``min(start)``..``max(end)``, takes the higher of the
    two scores, gets the id ``"<a.chunk_id>+<b.chunk_id>"`` and inherits file
    path, repo, symbol, chunk type and language from ``a``.

    Content is re-read from the file so overlapping lines appear once. If the
    file cannot be read, or the range yields nothing, the content is rebuilt
    from the two chunks' own text by line number, so an overlap is still not
    duplicated.
    """
    from corbell.core.query.graph_expander import ScoredChunk  # local import to avoid circular

    new_start = min(a.start_line, b.start_line)
    new_end = max(a.end_line, b.end_line)

    try:
        lines = _read_lines(a.file_path, new_start, new_end)
    except Exception:
        lines = []

    if not lines:
        # Fallback: place each chunk's lines at their line numbers. Writing
        # ``a`` last means its text wins where the two ranges overlap.
        by_line: Dict[int, str] = {}
        for part in (b, a):
            for offset, text in enumerate(part.content.splitlines()):
                by_line[part.start_line + offset] = text
        lines = [by_line[line_no] for line_no in sorted(by_line)]

    return ScoredChunk(
        chunk_id=f"{a.chunk_id}+{b.chunk_id}",
        score=max(a.score, b.score),
        file_path=a.file_path,
        start_line=new_start,
        end_line=new_end,
        content="\n".join(lines),
        repo_id=a.repo_id,
        symbol=a.symbol,
        chunk_type=a.chunk_type,
        language=a.language,
    )
162
+
163
+
164
def _read_lines(file_path: str, start_line: int, end_line: int) -> List[str]:
    """Return lines ``start_line``..``end_line`` (1-based, inclusive) of a file.

    The range is clamped to the file's extent and undecodable bytes are
    dropped. Errors from reading the file propagate to the caller.
    """
    from pathlib import Path

    text = Path(file_path).read_text(encoding="utf-8", errors="ignore")
    rows = text.splitlines()
    lo = start_line - 1 if start_line > 1 else 0
    hi = end_line if end_line < len(rows) else len(rows)
    return rows[lo:hi]
@@ -0,0 +1,131 @@
1
+ """LLM-based result reranker for the query pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from typing import Any, Dict, List, Optional, TYPE_CHECKING
7
+
8
+ if TYPE_CHECKING:
9
+ from corbell.core.query.graph_expander import ScoredChunk
10
+
11
+
12
def rerank_chunks(
    query: str,
    chunks: List["ScoredChunk"],
    llm_client: Optional[Any],
    graph_meta: Optional[Dict[str, Dict]] = None,
) -> List[str]:
    """Rerank and filter query results using an LLM.

    Sends chunk content + metadata to the LLM. The LLM returns a JSON array
    of 0-based chunk indices ordered by relevance, omitting irrelevant chunks.
    Indices that are not integers, are out of range, or repeat an earlier
    index are ignored.

    On any failure (LLM error, unparsable reply, non-list reply, no usable
    index) all chunk IDs are returned in their original order.

    Args:
        query: The original user query.
        chunks: Scored chunks to rerank.
        llm_client: An LLMClient instance (or None / unconfigured).
        graph_meta: Optional dict mapping chunk_id to graph metadata
            (callers count, callees count, flow name). When provided,
            metadata is included in the chunk header sent to the LLM.

    Returns:
        List of chunk_ids in reranked order (most relevant first), each at
        most once. Irrelevant chunks are excluded.
        Falls back to original order on any failure.
    """
    if not chunks:
        return []

    all_ids = [c.chunk_id for c in chunks]

    if llm_client is None or not getattr(llm_client, "is_configured", False):
        return all_ids

    # One numbered entry per chunk so the LLM can answer with bare indices.
    entries = []
    for i, chunk in enumerate(chunks):
        meta = graph_meta.get(chunk.chunk_id) if graph_meta else None
        meta_str = f"score={chunk.score:.2f}"
        if meta:
            meta_str += (
                f", callers={meta.get('callers', 0)}, callees={meta.get('callees', 0)}"
            )
            flow = meta.get("flow")
            if flow:
                meta_str += f", flow={flow}"

        # Long chunks keep their first and last 50 lines to bound prompt size.
        content = chunk.content
        content_lines = content.splitlines()
        if len(content_lines) > 100:
            truncated_count = len(content_lines) - 100
            content = "\n".join(
                content_lines[:50]
                + [f"... ({truncated_count} lines truncated) ..."]
                + content_lines[-50:]
            )

        entries.append(
            f"[{i}] {meta_str} | {chunk.file_path}:{chunk.start_line}-{chunk.end_line}"
            f" ({chunk.chunk_type}, {chunk.symbol or 'no symbol'})\n"
            f"{content}"
        )

    system = (
        "You are a code search relevance ranker. "
        "Given a query and numbered code chunks with metadata (relevance score, callers count, "
        "callees count, flow membership), return a JSON array of chunk indices ordered from most "
        "relevant to least relevant. "
        "OMIT chunks that are not relevant to the query. "
        "Higher score, more callers, and flow membership indicate higher structural importance. "
        "Return ONLY a valid JSON array of integers, e.g. [2,0,5]."
    )

    # Entries do not end with a newline, so the separator must start with one;
    # otherwise "---" is glued onto the last code line of every chunk.
    chunks_text = "\n---\n".join(entries)

    user = (
        f"Query: {query}\n\n"
        f"Chunks:\n{chunks_text}\n\n"
        "Return JSON array of relevant chunk indices (most relevant first):"
    )

    try:
        response = llm_client.call(
            system, user,
            max_tokens=200,
            temperature=0.0,
        )

        text = response.strip()
        if text.startswith("```"):
            # Strip a Markdown code fence around the JSON.
            text = "\n".join(
                line for line in text.splitlines()
                if not line.startswith("```")
            ).strip()

        indices = json.loads(text)
    except Exception:
        return all_ids

    if not isinstance(indices, list):
        return all_ids

    n = len(chunks)
    seen = set()
    ranked: List[str] = []
    for idx in indices:
        # bool is a subclass of int; JSON true/false must not count as 1/0.
        if isinstance(idx, bool) or not isinstance(idx, int):
            continue
        if not 0 <= idx < n or idx in seen:
            continue
        seen.add(idx)
        ranked.append(all_ids[idx])

    return ranked or all_ids