claude-toolstack-cli 1.0.0 (claude_toolstack_cli-1.0.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. claude_toolstack_cli-1.0.0.dist-info/METADATA +354 -0
  2. claude_toolstack_cli-1.0.0.dist-info/RECORD +48 -0
  3. claude_toolstack_cli-1.0.0.dist-info/WHEEL +5 -0
  4. claude_toolstack_cli-1.0.0.dist-info/entry_points.txt +2 -0
  5. claude_toolstack_cli-1.0.0.dist-info/licenses/LICENSE +21 -0
  6. claude_toolstack_cli-1.0.0.dist-info/top_level.txt +1 -0
  7. cts/__init__.py +3 -0
  8. cts/__main__.py +5 -0
  9. cts/autopilot.py +633 -0
  10. cts/bundle.py +958 -0
  11. cts/cli.py +2858 -0
  12. cts/confidence.py +218 -0
  13. cts/config.py +19 -0
  14. cts/corpus/__init__.py +139 -0
  15. cts/corpus/apply.py +305 -0
  16. cts/corpus/archive.py +309 -0
  17. cts/corpus/baseline.py +294 -0
  18. cts/corpus/evaluate.py +409 -0
  19. cts/corpus/experiment_eval.py +585 -0
  20. cts/corpus/experiment_schema.py +380 -0
  21. cts/corpus/extract.py +353 -0
  22. cts/corpus/load.py +44 -0
  23. cts/corpus/model.py +114 -0
  24. cts/corpus/patch.py +467 -0
  25. cts/corpus/registry.py +420 -0
  26. cts/corpus/report.py +745 -0
  27. cts/corpus/scan.py +87 -0
  28. cts/corpus/store.py +63 -0
  29. cts/corpus/trends.py +478 -0
  30. cts/corpus/tuning_schema.py +313 -0
  31. cts/corpus/variants.py +335 -0
  32. cts/ctags.py +133 -0
  33. cts/diff_context.py +92 -0
  34. cts/errors.py +109 -0
  35. cts/http.py +89 -0
  36. cts/ranking.py +466 -0
  37. cts/render.py +388 -0
  38. cts/schema.py +96 -0
  39. cts/semantic/__init__.py +47 -0
  40. cts/semantic/candidates.py +150 -0
  41. cts/semantic/chunker.py +184 -0
  42. cts/semantic/config.py +120 -0
  43. cts/semantic/embedder.py +151 -0
  44. cts/semantic/indexer.py +159 -0
  45. cts/semantic/search.py +252 -0
  46. cts/semantic/store.py +330 -0
  47. cts/sidecar.py +431 -0
  48. cts/structural.py +305 -0
cts/render.py ADDED
@@ -0,0 +1,388 @@
+"""Output renderers: --json, --text, --claude (v1 + v2 bundles)."""
+
+from __future__ import annotations
+
+import json
+import time
+from typing import Any, Dict, List
+
+
+def _strip_meta(data: Dict[str, Any]) -> Dict[str, Any]:
+    """Remove internal _fields from output."""
+    return {k: v for k, v in data.items() if not k.startswith("_")}
+
+
+def render_json(data: Dict[str, Any]) -> None:
+    print(json.dumps(_strip_meta(data), indent=2))
+
+
+def render_json_with_debug(data: Dict[str, Any]) -> None:
+    """Render full JSON including _debug key (for --debug-json)."""
+    print(json.dumps(data, indent=2, default=str))
+
+
+def render_text_status(data: Dict[str, Any]) -> None:
+    rid = data.get("_request_id", "")
+    print(f"Request-ID: {rid}")
+    print(f"Gateway v{data.get('version', '?')} ok={data.get('ok')}")
+    print(f" Repo root: {data.get('repo_root')}")
+    print(f" Cache root: {data.get('cache_root')}")
+    print(f" RG threads: {data.get('rg_threads')}")
+    print(f" RG concurrency: {data.get('rg_concurrency')}")
+    print(f" Job concurrency: {data.get('job_concurrency')}")
+    print(f" Max response: {data.get('max_response_bytes')} bytes")
+    print(f" Timeout: {data.get('timeout_sec')}s")
+    print(f" Docker: {data.get('docker_host')}")
+    print(f" Containers: {', '.join(data.get('allowed_containers', []))}")
+    print(f" Allowed repos: {', '.join(data.get('allowed_repos', []))}")
+
+
+def render_text_search(data: Dict[str, Any]) -> None:
+    rid = data.get("_request_id", "")
+    matches = data.get("matches", [])
+    print(f"Request-ID: {rid}")
+    repo = data.get("repo")
+    cnt = data.get("count")
+    print(f"Search: {data.get('query')!r} in {repo} ({cnt} matches)")
+    if data.get("truncated"):
+        print(" [truncated — output exceeded 512 KB]")
+    print()
+    for m in matches:
+        path = m.get("path", "?")
+        line = m.get("line", "?")
+        snippet = m.get("snippet", "").rstrip()
+        print(f" {path}:{line} {snippet}")
+
+
+def render_text_slice(data: Dict[str, Any]) -> None:
+    rid = data.get("_request_id", "")
+    print(f"Request-ID: {rid}")
+    path = data.get("path", "?")
+    start = data.get("start", "?")
+    print(f"File: {data.get('repo')}/{path} (from line {start})")
+    if data.get("truncated"):
+        print(" [truncated]")
+    print()
+    for i, line in enumerate(data.get("lines", []), start=int(start)):
+        print(f" {i:>6} {line}")
+
+
+def render_text_symbol(data: Dict[str, Any]) -> None:
+    rid = data.get("_request_id", "")
+    defs = data.get("defs", [])
+    print(f"Request-ID: {rid}")
+    repo = data.get("repo")
+    cnt = data.get("count")
+    print(f"Symbol: {data.get('symbol')!r} in {repo} ({cnt} defs)")
+    print()
+    for d in defs:
+        kind = d.get("kind", "?")
+        name = d.get("name", "?")
+        fpath = d.get("file", "?")
+        print(f" [{kind}] {name} {fpath}")
+
+
+def render_text_job(data: Dict[str, Any]) -> None:
+    rid = data.get("_request_id", "")
+    ok = data.get("ok", False)
+    tag = "PASS" if ok else "FAIL"
+    print(f"Request-ID: {rid}")
+    print(
+        f"Job: {data.get('job')} ({data.get('preset')}) on {data.get('repo')} "
+        f"[{tag}] exit={data.get('exit_code')} {data.get('duration_sec')}s"
+    )
+    if data.get("truncated"):
+        print(" [output truncated]")
+    stdout = data.get("stdout", "").rstrip()
+    stderr = data.get("stderr", "").rstrip()
+    if stdout:
+        print("\n--- stdout ---")
+        print(stdout)
+    if stderr:
+        print("\n--- stderr ---")
+        print(stderr)
+
+
+def render_text_index(data: Dict[str, Any]) -> None:
+    rid = data.get("_request_id", "")
+    ok = data.get("ok", False)
+    tag = "OK" if ok else "FAIL"
+    print(f"Request-ID: {rid}")
+    print(f"Index ctags: {data.get('repo')} [{tag}] {data.get('duration_sec')}s")
+    stderr = data.get("stderr", "").rstrip()
+    if stderr:
+        print(f" stderr: {stderr[:200]}")
+
+
+# ---------------------------------------------------------------------------
+# Claude evidence bundle renderers
+# ---------------------------------------------------------------------------
+
+
+def render_claude_search(
+    data: Dict[str, Any], slices: List[Dict[str, Any]] | None = None
+) -> None:
+    """Render a compact evidence bundle for Claude."""
+    rid = data.get("_request_id", "")
+    repo = data.get("repo", "?")
+    query = data.get("query", "?")
+    matches = data.get("matches", [])
+
+    lines: List[str] = []
+    lines.append("## Evidence Bundle")
+    lines.append(f"repo: {repo} query: {query!r} request_id: {rid}")
+    lines.append(f"matches: {data.get('count', 0)}")
+    if data.get("truncated"):
+        lines.append("[search results truncated at 512 KB]")
+    lines.append("")
+
+    # Match summary
+    lines.append("### Matches")
+    for m in matches:
+        path = m.get("path", "?")
+        line_no = m.get("line", "?")
+        snippet = m.get("snippet", "").rstrip()
+        if len(snippet) > 200:
+            snippet = snippet[:200] + "..."
+        lines.append(f" {path}:{line_no} {snippet}")
+    lines.append("")
+
+    # Inline slices (if provided)
+    if slices:
+        lines.append("### Context Slices")
+        for s in slices:
+            spath = s.get("path", "?")
+            sstart = s.get("start", 0)
+            lines.append(f"--- {s.get('repo', repo)}/{spath} (from line {sstart}) ---")
+            for i, sl in enumerate(s.get("lines", []), start=int(sstart)):
+                lines.append(f"{i:>6} {sl}")
+            lines.append("")
+
+    lines.append(
+        "If you need more: request wider slices, more matches, or specific files."
+    )
+    print("\n".join(lines))
+
+
+def render_claude_job(data: Dict[str, Any]) -> None:
+    """Render job result as a compact evidence bundle."""
+    rid = data.get("_request_id", "")
+    ok = data.get("ok", False)
+    tag = "PASS" if ok else "FAIL"
+
+    lines: List[str] = []
+    lines.append("## Job Result")
+    repo = data.get("repo")
+    job = data.get("job")
+    preset = data.get("preset")
+    ec = data.get("exit_code")
+    dur = data.get("duration_sec")
+    lines.append(
+        f"repo: {repo} job: {job} preset: {preset} "
+        f"result: {tag} exit: {ec} "
+        f"duration: {dur}s request_id: {rid}"
+    )
+
+    stdout = data.get("stdout", "").rstrip()
+    stderr = data.get("stderr", "").rstrip()
+
+    # Trim to last N lines for Claude
+    max_lines = 80
+    if stdout:
+        stdout_lines = stdout.splitlines()
+        if len(stdout_lines) > max_lines:
+            lines.append(
+                f"\n### stdout (last {max_lines} of {len(stdout_lines)} lines)"
+            )
+            lines.extend(stdout_lines[-max_lines:])
+        else:
+            lines.append("\n### stdout")
+            lines.append(stdout)
+
+    if stderr:
+        stderr_lines = stderr.splitlines()
+        if len(stderr_lines) > max_lines:
+            lines.append(
+                f"\n### stderr (last {max_lines} of {len(stderr_lines)} lines)"
+            )
+            lines.extend(stderr_lines[-max_lines:])
+        else:
+            lines.append("\n### stderr")
+            lines.append(stderr)
+
+    if data.get("truncated"):
+        lines.append("[output truncated at 512 KB]")
+
+    print("\n".join(lines))
+
+
+# ---------------------------------------------------------------------------
+# v2 structured bundle renderer
+# ---------------------------------------------------------------------------
+
+
+def render_bundle(bundle: Dict[str, Any]) -> None:
+    """Render a v2 structured evidence bundle."""
+    lines: List[str] = []
+
+    mode = bundle.get("mode", "default")
+    repo = bundle.get("repo", "?")
+    rid = bundle.get("request_id", "")
+    query = bundle.get("query", "")
+    ts = bundle.get("timestamp", 0)
+    ts_str = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime(ts))
+
+    # Header
+    lines.append("# Evidence Bundle")
+    lines.append("")
+
+    # Metadata
+    lines.append("## Metadata")
+    lines.append(f" repo: {repo}")
+    lines.append(f" mode: {mode}")
+    lines.append(f" request_id: {rid}")
+    lines.append(f" timestamp: {ts_str}")
+    if bundle.get("truncated"):
+        lines.append(" [search results truncated at 512 KB]")
+    lines.append("")
+
+    # Query
+    if query:
+        lines.append("## Query")
+        lines.append(f" {query}")
+        lines.append("")
+
+    # Ranked evidence sources
+    sources = bundle.get("ranked_sources", [])
+    if sources:
+        lines.append(f"## Ranked Evidence Sources ({len(sources)})")
+        for s in sources:
+            path = s.get("path", "?")
+            line = s.get("line", 0)
+            score = s.get("score", 0.0)
+            extra = ""
+            if s.get("in_trace"):
+                extra = " [trace]"
+            lines.append(f" {score:+.2f} {path}:{line}{extra}")
+        lines.append("")
+
+    # Top matches
+    matches = bundle.get("matches", [])
+    if matches:
+        lines.append(f"## Top Matches ({len(matches)})")
+        for m in matches:
+            path = m.get("path", "?")
+            line = m.get("line", 0)
+            snippet = m.get("snippet", "")
+            lines.append(f" {path}:{line} {snippet}")
+        lines.append("")
+
+    # File slices
+    slices = bundle.get("slices", [])
+    if slices:
+        lines.append(f"## File Slices ({len(slices)})")
+        for s in slices:
+            spath = s.get("path", "?")
+            srepo = s.get("repo", repo)
+            sstart = s.get("start", 0)
+            lines.append(f"--- {srepo}/{spath} (from line {sstart}) ---")
+            for i, sl in enumerate(s.get("lines", []), start=int(sstart)):
+                lines.append(f"{i:>6} {sl}")
+            lines.append("")
+
+    # Symbols (symbol mode)
+    symbols = bundle.get("symbols", [])
+    if symbols:
+        lines.append(f"## Symbols ({len(symbols)})")
+        for sym in symbols:
+            kind = sym.get("kind", "?")
+            name = sym.get("name", "?")
+            fpath = sym.get("file", "?")
+            lines.append(f" [{kind}] {name} {fpath}")
+        lines.append("")
+
+    # Diff (change mode)
+    diff = bundle.get("diff", "")
+    if diff:
+        lines.append("## Diff")
+        # Trim to reasonable size
+        diff_lines = diff.splitlines()
+        if len(diff_lines) > 200:
+            lines.append(f"(showing last 200 of {len(diff_lines)} lines)")
+            lines.extend(diff_lines[-200:])
+        else:
+            lines.extend(diff_lines)
+        lines.append("")
+
+    # Suggested next commands
+    cmds = bundle.get("suggested_commands", [])
+    if cmds:
+        lines.append("## Suggested Next Commands")
+        for c in cmds:
+            lines.append(f" {c}")
+        lines.append("")
+
+    # Notes
+    notes = bundle.get("notes", [])
+    if notes:
+        lines.append("## Notes")
+        for n in notes:
+            lines.append(f" {n}")
+        lines.append("")
+
+    # Debug telemetry (when --debug-bundle)
+    debug = bundle.get("_debug")
+    if debug:
+        lines.append("## Debug Telemetry")
+        lines.append("")
+
+        # Timings
+        timings = debug.get("timings_ms", {})
+        if timings:
+            lines.append("### Timings (ms)")
+            for step, ms in timings.items():
+                lines.append(f" {step}: {ms:.2f}")
+            lines.append("")
+
+        # Bundle size
+        if "bundle_bytes" in debug:
+            kb = debug["bundle_bytes"] / 1024
+            lines.append(
+                f"### Bundle Size: {kb:.1f} KB ({debug['bundle_lines']} lines)"
+            )
+            lines.append("")
+
+        # Section sizes
+        sections = debug.get("sections", {})
+        if sections:
+            lines.append("### Section Sizes")
+            for name, info in sections.items():
+                skb = info.get("bytes", 0) / 1024
+                items = info.get("items", 0)
+                lines.append(f" {name}: {skb:.1f} KB ({items} items)")
+            lines.append("")
+
+        # Limits
+        limits = debug.get("limits", {})
+        if limits:
+            lines.append("### Limits")
+            for k, v in limits.items():
+                lines.append(f" {k}: {v}")
+            lines.append("")
+
+        # Score cards
+        cards = debug.get("score_cards", [])
+        if cards:
+            lines.append(f"### Score Cards (top {len(cards)})")
+            for card in cards:
+                cpath = card.get("path", "?")
+                ctotal = card.get("score_total", 0.0)
+                lines.append(f" {ctotal:+.2f} {cpath}")
+                signals = card.get("signals", {})
+                nonzero = {k: v for k, v in signals.items() if v != 0.0}
+                if nonzero:
+                    parts = [f"{k}={v:+.2f}" for k, v in nonzero.items()]
+                    lines.append(f" {', '.join(parts)}")
+            lines.append("")
+
+    print("\n".join(lines))
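For orientation, a minimal invocation sketch for render_bundle. The bundle values below are hypothetical (real bundles are produced by the builders in cts/bundle.py); the keys match what render_bundle reads above.

    from cts.render import render_bundle

    # Hypothetical bundle for illustration only.
    bundle = {
        "mode": "default",
        "repo": "myrepo",          # assumed repo name
        "request_id": "req-123",   # assumed request id
        "query": "where is retry configured",
        "timestamp": 1700000000,
        "matches": [
            {"path": "src/http.py", "line": 42, "snippet": "RETRY_LIMIT = 3"},
        ],
        "slices": [
            {"path": "src/http.py", "start": 41, "lines": ["RETRY_LIMIT = 3", "..."]},
        ],
    }
    render_bundle(bundle)  # prints "# Evidence Bundle" with Metadata, Query,
                           # Top Matches, and File Slices sections to stdout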
cts/schema.py ADDED
@@ -0,0 +1,96 @@
+"""Sidecar JSON schema wrapper for evidence bundles.
+
+Wraps raw bundles with stable metadata for downstream consumers:
+- bundle_schema_version: integer, bumped on breaking changes
+- tool info: CLI version, gateway version
+- inputs: the original query parameters
+- passes: list of refinement passes (for autopilot)
+- final: the final bundle
+
+Consumers check bundle_schema_version to decide if they can parse
+the payload. Non-breaking additions (new keys) don't bump the version.
+"""
+
+from __future__ import annotations
+
+import time
+from typing import Any, Dict, List, Optional
+
+BUNDLE_SCHEMA_VERSION = 1
+
+
+def wrap_bundle(
+    raw_bundle: Dict[str, Any],
+    *,
+    mode: str,
+    request_id: str = "",
+    cli_version: str = "",
+    gateway_version: Optional[str] = None,
+    repo: str = "",
+    query: Optional[str] = None,
+    created_at: Optional[float] = None,
+    inputs: Optional[Dict[str, Any]] = None,
+    debug: bool = False,
+    passes: Optional[List[Dict[str, Any]]] = None,
+) -> Dict[str, Any]:
+    """Wrap a raw evidence bundle in the sidecar schema envelope.
+
+    Args:
+        raw_bundle: The evidence bundle dict from bundle.py builders.
+        mode: Bundle mode (default, error, symbol, change).
+        request_id: Request identifier.
+        cli_version: cts CLI version string.
+        gateway_version: Gateway version string (None if not available).
+        repo: Repository identifier.
+        query: Original search query or symbol name.
+        created_at: Unix timestamp (defaults to now).
+        inputs: Dict of original CLI inputs / query parameters.
+        debug: Whether debug telemetry was enabled.
+        passes: List of refinement pass dicts (for autopilot).
+
+    Returns:
+        Sidecar-wrapped dict with stable schema version.
+    """
+    ts = created_at if created_at is not None else time.time()
+
+    sidecar: Dict[str, Any] = {
+        "bundle_schema_version": BUNDLE_SCHEMA_VERSION,
+        "created_at": ts,
+        "tool": {
+            "name": "cts",
+            "cli_version": cli_version,
+        },
+        "request_id": request_id,
+        "repo": repo,
+        "mode": mode,
+    }
+
+    if gateway_version is not None:
+        sidecar["tool"]["gateway_version"] = gateway_version
+
+    if query is not None:
+        sidecar["query"] = query
+
+    if inputs is not None:
+        sidecar["inputs"] = inputs
+
+    sidecar["debug"] = debug
+
+    # Refinement passes (autopilot stores intermediate bundles here)
+    sidecar["passes"] = passes or []
+
+    # The final bundle payload
+    sidecar["final"] = _strip_debug_if_needed(raw_bundle, debug)
+
+    return sidecar
+
+
+def _strip_debug_if_needed(bundle: Dict[str, Any], keep_debug: bool) -> Dict[str, Any]:
+    """Optionally strip _debug from the final bundle.
+
+    When debug=False, remove _debug to keep sidecar clean.
+    When debug=True, keep it in the final bundle for introspection.
+    """
+    if keep_debug or "_debug" not in bundle:
+        return bundle
+    return {k: v for k, v in bundle.items() if k != "_debug"}
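A short usage sketch of wrap_bundle, derived directly from the signature above; the raw bundle and identifiers are hypothetical placeholders.

    from cts.schema import wrap_bundle

    # Hypothetical raw bundle and identifiers for illustration.
    sidecar = wrap_bundle(
        {"mode": "default", "matches": [], "_debug": {"timings_ms": {}}},
        mode="default",
        request_id="req-123",
        cli_version="1.0.0",
        repo="myrepo",
        query="retry logic",
        debug=False,  # with debug off, _debug is stripped from the final payload
    )
    assert sidecar["bundle_schema_version"] == 1
    assert "_debug" not in sidecar["final"]
    assert sidecar["passes"] == []  # autopilot would append refinement passes here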
cts/semantic/__init__.py ADDED
@@ -0,0 +1,47 @@
+"""Semantic search augmentation for Claude Toolstack.
+
+Optional module — requires ``pip install .[semantic]`` for dependencies.
+
+Provides:
+- Chunking strategies for source files
+- SQLite-backed embedding persistence
+- Pluggable embedder (sentence-transformers default)
+- Pure Python cosine retrieval
+- Indexing pipeline (incremental + rebuild)
+"""
+
+from __future__ import annotations
+
+SEMANTIC_SCHEMA_VERSION = 1
+
+# Default knobs (workstation-safe)
+DEFAULTS = {
+    "chunk_lines": 180,
+    "overlap_lines": 30,
+    "topk_chunks": 8,
+    "max_slices": 4,
+    "max_seconds": 4,
+    "max_file_bytes": 512 * 1024,  # 512 KB
+    "confidence_gate": 0.45,
+    "match_gate": 5,
+}
+
+
+def _check_deps() -> None:
+    """Verify optional dependencies are installed."""
+    missing = []
+    try:
+        import numpy  # noqa: F401
+    except ImportError:
+        missing.append("numpy")
+
+    try:
+        import sentence_transformers  # noqa: F401
+    except ImportError:
+        missing.append("sentence-transformers")
+
+    if missing:
+        deps = ", ".join(missing)
+        raise ImportError(
+            f"Semantic search requires: {deps}. Install with: pip install .[semantic]"
+        )
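A hypothetical sketch of how a caller might consume these knobs. The gating decision itself lives elsewhere in the package (plausibly cts/semantic/search.py, not shown here), so should_run_semantic below is an assumption, not the package's actual logic; only DEFAULTS and _check_deps come from the module above.

    from cts import semantic

    def should_run_semantic(confidence: float, match_count: int) -> bool:
        # Hypothetical gate: only fall back to semantic search when
        # lexical evidence is weak (low confidence or too few matches).
        return (
            confidence < semantic.DEFAULTS["confidence_gate"]  # below 0.45
            or match_count < semantic.DEFAULTS["match_gate"]   # fewer than 5
        )

    if should_run_semantic(confidence=0.3, match_count=2):
        semantic._check_deps()  # raises ImportError with an install hint if deps are absent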
cts/semantic/candidates.py ADDED
@@ -0,0 +1,150 @@
+"""Candidate narrowing for semantic search.
+
+Selects which files semantic search should consider, based on
+lexical ranking signals. The goal: search only where lexical
+didn't already produce good evidence, reducing both latency
+and irrelevant noise.
+
+Strategies:
+- exclude_top_k: drop the top K lexically-ranked files and
+  take the next best candidates (default, conservative)
+- none: no narrowing — all files are candidates
+
+All strategies respect max_files and path prefer/avoid filters.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Set
+
+
+@dataclass
+class CandidateSelection:
+    """Result of candidate selection with debug metadata."""
+
+    strategy: str
+    allowed_paths: List[str]
+    excluded_top_k: int = 0
+    candidate_files: int = 0
+    excluded_files_sample: List[str] = field(default_factory=list)
+    candidate_rules_hit: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "strategy": self.strategy,
+            "excluded_top_k": self.excluded_top_k,
+            "candidate_files": self.candidate_files,
+            "excluded_files_sample": self.excluded_files_sample,
+            "candidate_rules_hit": self.candidate_rules_hit,
+        }
+
+
+def _extract_paths(
+    ranked_sources: List[Dict[str, Any]],
+) -> List[str]:
+    """Extract unique file paths from ranked sources in order."""
+    seen: Set[str] = set()
+    paths: List[str] = []
+    for src in ranked_sources:
+        p = src.get("path", "")
+        if p and p not in seen:
+            seen.add(p)
+            paths.append(p)
+    return paths
+
+
+def _apply_path_filters(
+    paths: List[str],
+    *,
+    prefer_paths: Optional[List[str]] = None,
+    avoid_paths: Optional[List[str]] = None,
+    rules_hit: List[str],
+) -> List[str]:
+    """Filter paths by prefer/avoid patterns.
+
+    - prefer_paths: if set, only include paths containing at least
+      one preferred prefix (e.g. "src/", "app/")
+    - avoid_paths: exclude paths containing any avoided prefix
+      (e.g. "vendor/", "test/")
+    """
+    result = paths
+
+    if avoid_paths:
+        before = len(result)
+        result = [p for p in result if not any(avoid in p for avoid in avoid_paths)]
+        if len(result) < before:
+            rules_hit.append(f"avoid_paths removed {before - len(result)}")
+
+    if prefer_paths:
+        preferred = [p for p in result if any(pref in p for pref in prefer_paths)]
+        if preferred:
+            rules_hit.append(f"prefer_paths kept {len(preferred)} of {len(result)}")
+            result = preferred
+
+    return result
+
+
+def select_candidates(
+    ranked_sources: List[Dict[str, Any]],
+    *,
+    strategy: str = "exclude_top_k",
+    exclude_top_k: int = 10,
+    max_files: int = 200,
+    prefer_paths: Optional[List[str]] = None,
+    avoid_paths: Optional[List[str]] = None,
+) -> CandidateSelection:
+    """Select candidate files for semantic search.
+
+    Args:
+        ranked_sources: Lexically-ranked source list from the bundle.
+        strategy: Selection strategy ("exclude_top_k" or "none").
+        exclude_top_k: Number of top-ranked files to exclude.
+        max_files: Maximum candidate files to return.
+        prefer_paths: Path prefixes to prefer (e.g. ["src/", "app/"]).
+        avoid_paths: Path prefixes to avoid (e.g. ["vendor/", "test/"]).
+
+    Returns:
+        CandidateSelection with allowed_paths and debug metadata.
+    """
+    if strategy == "none":
+        return CandidateSelection(
+            strategy="none",
+            allowed_paths=[],  # empty = "search everything"
+            candidate_files=0,
+            candidate_rules_hit=["no narrowing"],
+        )
+
+    # Default: exclude_top_k
+    all_paths = _extract_paths(ranked_sources)
+    rules_hit: List[str] = []
+
+    # Split into excluded top-K and remaining candidates
+    k = min(exclude_top_k, len(all_paths))
+    excluded = all_paths[:k]
+    candidates = all_paths[k:]
+
+    if k > 0:
+        rules_hit.append(f"excluded top {k} lexical files")
+
+    # Apply path filters
+    candidates = _apply_path_filters(
+        candidates,
+        prefer_paths=prefer_paths,
+        avoid_paths=avoid_paths,
+        rules_hit=rules_hit,
+    )
+
+    # Cap at max_files
+    if len(candidates) > max_files:
+        rules_hit.append(f"capped at {max_files} files")
+        candidates = candidates[:max_files]
+
+    return CandidateSelection(
+        strategy="exclude_top_k",
+        allowed_paths=candidates,
+        excluded_top_k=k,
+        candidate_files=len(candidates),
+        excluded_files_sample=excluded[:10],
+        candidate_rules_hit=rules_hit,
+    )
+ )