@ictechgy/context-guard 0.4.1 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/CHANGELOG.md +9 -0
  2. package/README.ko.md +61 -32
  3. package/README.md +90 -22
  4. package/context-guard-kit/README.md +39 -26
  5. package/context-guard-kit/benchmark_runner.py +273 -8
  6. package/context-guard-kit/claude_transcript_cost_audit.py +325 -12
  7. package/context-guard-kit/context_compress.py +153 -1
  8. package/context-guard-kit/context_filter.py +446 -0
  9. package/context-guard-kit/context_guard_cli.py +3 -0
  10. package/context-guard-kit/context_guard_diet.py +677 -2
  11. package/context-guard-kit/context_pack.py +1694 -2
  12. package/context-guard-kit/cost_guard.py +1870 -0
  13. package/context-guard-kit/setup_wizard.py +820 -29
  14. package/context-guard-kit/trim_command_output.py +396 -45
  15. package/docs/benchmark-fixtures/learned-compression.tasks.example.json +24 -0
  16. package/docs/benchmark-fixtures/learned-compression.variants.example.json +10 -0
  17. package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +24 -0
  18. package/docs/benchmark-fixtures/visual-ocr.variants.example.json +10 -0
  19. package/docs/benchmark-workflow-examples.md +40 -0
  20. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +169 -0
  21. package/docs/benchmark-workflows/measured-token-workflow.example.json +170 -0
  22. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +170 -0
  23. package/docs/cache-diagnostics-schema.md +75 -0
  24. package/docs/cache-diagnostics.example.json +116 -0
  25. package/docs/cache-diagnostics.schema.json +460 -0
  26. package/docs/distribution.md +4 -2
  27. package/docs/experimental-benchmark-fixtures.md +36 -0
  28. package/package.json +11 -2
  29. package/packaging/homebrew/context-guard.rb.template +3 -2
  30. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  31. package/plugins/context-guard/README.ko.md +21 -13
  32. package/plugins/context-guard/README.md +24 -10
  33. package/plugins/context-guard/bin/context-guard +3 -0
  34. package/plugins/context-guard/bin/context-guard-audit +325 -12
  35. package/plugins/context-guard/bin/context-guard-bench +273 -8
  36. package/plugins/context-guard/bin/context-guard-compress +153 -1
  37. package/plugins/context-guard/bin/context-guard-cost +1870 -0
  38. package/plugins/context-guard/bin/context-guard-diet +677 -2
  39. package/plugins/context-guard/bin/context-guard-filter +446 -0
  40. package/plugins/context-guard/bin/context-guard-pack +1694 -2
  41. package/plugins/context-guard/bin/context-guard-setup +820 -29
  42. package/plugins/context-guard/bin/context-guard-trim-output +396 -45
  43. package/plugins/context-guard/brief/README.md +10 -3
  44. package/plugins/context-guard/skills/optimize/SKILL.md +5 -2
  45. package/plugins/context-guard/skills/setup/SKILL.md +3 -1
@@ -10,16 +10,19 @@ retrieval when the path is safe to display.
10
10
  from __future__ import annotations
11
11
 
12
12
  import argparse
13
+ import ast
13
14
  import copy
14
15
  import hashlib
15
16
  import importlib.machinery
16
17
  import importlib.util
17
18
  import json
18
19
  import os
20
+ import posixpath
19
21
  from pathlib import Path
20
22
  import re
21
23
  import shlex
22
24
  import stat
25
+ import subprocess
23
26
  import sys
24
27
  import time
25
28
  from dataclasses import dataclass
@@ -35,22 +38,79 @@ MAX_MANIFEST_BYTES = 1_000_000
35
38
  MAX_LABEL_CHARS = 160
36
39
  MAX_REASON_CHARS = 120
37
40
  TOKEN_PROXY_CHARS_PER_TOKEN = 4
41
+ SUGGEST_SCHEMA_VERSION = "contextguard.pack-suggest.v1"
42
+ AUTO_SCHEMA_VERSION = "contextguard.pack-auto.v1"
43
+ AUTO_EXPLAIN_SCHEMA_VERSION = "contextguard.pack-auto-explain.v1"
44
+ REPO_MAP_SCHEMA_VERSION = "contextguard.pack-repo-map.v1"
45
+ DEFAULT_SUGGEST_TOP = 8
46
+ MAX_SUGGEST_TOP = 50
47
+ DEFAULT_SUGGEST_CONTEXT_LINES = 20
48
+ MAX_SUGGEST_CONTEXT_LINES = 120
49
+ SUGGEST_WHOLE_FILE_MAX_LINES = 120
50
+ MAX_SUGGEST_INPUT_BYTES = 256_000
51
+ MAX_QUERY_SCAN_FILES = 2_000
52
+ MAX_QUERY_SCAN_BYTES_PER_FILE = 200_000
53
+ MAX_REPO_MAP_FILES = 1_000
54
+ MAX_REPO_MAP_BYTES_PER_FILE = 120_000
55
+ MAX_REPO_MAP_TREE_ENTRIES = 30
56
+ MAX_REPO_MAP_SIGNATURE_ENTRIES = 40
57
+ MAX_REPO_MAP_GRAPH_RANK_ENTRIES = 30
58
+ MAX_REPO_MAP_RETRIEVAL_HINTS = 30
59
+ MAX_REPO_MAP_SECRET_RISK_FILES = 20
38
60
  PACK_DIR = ".context-guard/packs"
39
61
  REDACTED_PATH_COMPONENT = "[REDACTED-PATH-COMPONENT]"
62
+ CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f-\x9f]")
40
63
  SECRET_CONTENT_RE = re.compile(
41
64
  r"(?is)("
42
65
  r"-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----|"
43
66
  r"AKIA[0-9A-Z]{16}|"
67
+ r"ASIA[0-9A-Z]{16}|"
44
68
  r"gh[pousr]_[A-Za-z0-9_]{20,}|"
45
69
  r"github_pat_[A-Za-z0-9_]{20,}|"
70
+ r"glpat-[A-Za-z0-9_-]{12,}|"
46
71
  r"xox[abprs]-[A-Za-z0-9-]{10,}|"
47
72
  r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|"
48
73
  r"sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
74
+ r"(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{16,}|"
75
+ r"npm_[A-Za-z0-9]{20,}|"
49
76
  r"AIza[0-9A-Za-z_\-]{20,}|"
50
77
  r"(?i:Authorization)\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+|"
51
78
  r"(?<![A-Za-z0-9])(?:api[_-]?key|token|secret|password|client[_-]?secret)\s*[:=]\s*[^\s]+"
52
79
  r")"
53
80
  )
81
+ SECRET_PATH_COMPONENT_RE = re.compile(
82
+ r"(?i)("
83
+ r"SG\.[A-Za-z0-9_-]{16,256}\.[A-Za-z0-9_-]{16,512}|"
84
+ r"eyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}|"
85
+ r"\b(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]{12,}|"
86
+ r"[a-z][a-z0-9+.-]{0,31}:/+(?:[^/\s:@]{0,256}:[^/\s@]{0,2048}|[^/\s@]{1,2048})@"
87
+ r")"
88
+ )
89
+ SECRET_RISK_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = (
90
+ ("private_key_block", re.compile(r"(?is)-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----")),
91
+ ("github_token", re.compile(r"gh[pousr]_[A-Za-z0-9_]{20,}|github_pat_[A-Za-z0-9_]{20,}|glpat-[A-Za-z0-9_-]{12,}")),
92
+ ("provider_api_key", re.compile(r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|AIza[0-9A-Za-z_\-]{20,}")),
93
+ ("authorization_header", re.compile(r"(?i)Authorization\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+")),
94
+ ("generic_secret_assignment", re.compile(r"(?i)(?:api[_-]?key|token|secret|password|client[_-]?secret)\s*[:=]\s*[^\s]+")),
95
+ )
96
+ REPO_MAP_TEXT_EXTENSIONS = {
97
+ ".py", ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
98
+ ".go", ".rs", ".java", ".kt", ".kts", ".swift", ".c", ".cc", ".cpp", ".h", ".hpp",
99
+ ".md", ".mdx", ".txt", ".json", ".yaml", ".yml", ".toml", ".sh", ".css", ".html",
100
+ }
101
+ SYMBOL_HINT_EXTENSIONS = {".py", ".js", ".jsx", ".ts", ".tsx", ".go", ".rs"}
102
+ SIGNATURE_LINE_RE = re.compile(
103
+ r"^\s*(?:export\s+)?(?:(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(|class\s+([A-Za-z_$][\w$]*)|"
104
+ r"(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>|"
105
+ r"func\s+(?:\([^)]*\)\s*)?([A-Za-z_]\w*)\s*\(|(?:pub\s+)?(?:async\s+)?fn\s+([A-Za-z_]\w*)\s*\()"
106
+ )
107
+ IMPORT_PATH_RE = re.compile(
108
+ r"(?:from\s+['\"](?P<jsfrom>[^'\"]+)['\"]|"
109
+ r"import(?:\s+[^;\n'\"]+?\s+from)?\s+['\"](?P<jsimport>[^'\"]+)['\"]|"
110
+ r"from\s+(?P<pyfrom>\.*[A-Za-z_][\w.]*|\.+)\s+import|"
111
+ r"import\s+(?P<pyimport>[A-Za-z_][\w.]*))"
112
+ )
113
+ PY_FROM_IMPORT_LINE_RE = re.compile(r"^\s*from\s+(?P<module>\.*[A-Za-z_][\w.]*|\.+)\s+import\s+(?P<names>[^\n#;]+)")
54
114
 
55
115
 
56
116
  @dataclass(frozen=True)
@@ -87,6 +147,16 @@ class ResolvedSource:
87
147
  redacted_lines: int
88
148
 
89
149
 
150
@dataclass
class SuggestCandidate:
    """A scored suggestion for one source path produced by the suggest heuristics."""

    # Candidate path text as supplied by the collector that found it.
    path: str
    # Ranking score; also passed on as the source priority when the candidate is resolved.
    score: int
    # Short explanation of why the candidate was suggested.
    reason: str
    # Optional line window to include; None means no specific range was requested.
    lines: LineRange | None = None
    # Optional display label.
    label: str | None = None
    # Position of the candidate in its list at the time it was appended.
    input_index: int = 0
158
+
159
+
90
160
  class PackError(ValueError):
91
161
  pass
92
162
 
@@ -176,9 +246,10 @@ def display_root(root: Path) -> str:
176
246
 
177
247
 
178
248
  def display_rel_path(rel: str) -> tuple[str, bool]:
249
+ normalized = rel.replace("\\", "/")
179
250
  parts: list[str] = []
180
251
  redacted = False
181
- for part in rel.replace("\\", "/").split("/"):
252
+ for part in normalized.split("/"):
182
253
  if not part:
183
254
  continue
184
255
  safe, did = sanitize_path_component(part)
@@ -187,6 +258,24 @@ def display_rel_path(rel: str) -> tuple[str, bool]:
187
258
  return "/".join(parts), redacted
188
259
 
189
260
 
261
def repo_map_path_has_sensitive_evidence(value: str) -> bool:
    """Return True when *value* contains a control character or a secret-looking path fragment."""
    if CONTROL_CHAR_RE.search(value):
        return True
    return bool(SECRET_PATH_COMPONENT_RE.search(value))
263
+
264
+
265
def repo_map_display_rel_path(rel: str) -> tuple[str, bool]:
    """Return the repo-map display form of a relative path and whether it was redacted.

    Backslashes are normalized to forward slashes first. A path with
    sensitive evidence is replaced wholesale by a short hash-based
    placeholder; any other path is delegated to ``display_rel_path``.
    """
    posix_rel = rel.replace("\\", "/")
    if not repo_map_path_has_sensitive_evidence(posix_rel):
        return display_rel_path(posix_rel)
    digest = sha256_text(posix_rel)[:12]
    return f"redacted-path#path:{digest}", True
270
+
271
+
272
def repo_map_safe_raw_path_label(raw: str) -> str:
    """Return a display-safe label for a raw (unresolved) path in repo-map output.

    Paths with sensitive evidence collapse to a short hash-based
    placeholder; all others are labelled by ``safe_raw_path_label``.
    """
    posix_raw = raw.replace("\\", "/")
    if not repo_map_path_has_sensitive_evidence(posix_raw):
        return safe_raw_path_label(posix_raw)
    digest = sha256_text(posix_raw)[:12]
    return f"redacted-path#path:{digest}"
277
+
278
+
190
279
  def parse_line_range(value: object) -> LineRange | None:
191
280
  if value is None or value == "":
192
281
  return None
@@ -484,7 +573,7 @@ def retrieval_cli(root_arg: str, display_path: str, lines: LineRange) -> str:
484
573
 
485
574
  def safe_root_arg_for_retrieval(root_arg: str) -> str | None:
486
575
  text = str(root_arg)
487
- if SECRET_CONTENT_RE.search(text):
576
+ if CONTROL_CHAR_RE.search(text) or SECRET_CONTENT_RE.search(text) or SECRET_PATH_COMPONENT_RE.search(text):
488
577
  return None
489
578
  for part in text.replace("\\", "/").split("/"):
490
579
  if not part:
@@ -495,6 +584,13 @@ def safe_root_arg_for_retrieval(root_arg: str) -> str | None:
495
584
  return text
496
585
 
497
586
 
587
def safe_repo_map_root_arg_for_retrieval(root_arg: str) -> str | None:
    """Return the root argument if it is safe to echo in repo-map retrieval hints, else None.

    Applies the repo-map sensitivity check first, then defers to
    ``safe_root_arg_for_retrieval`` for the remaining checks.
    """
    root_text = str(root_arg)
    return None if repo_map_path_has_sensitive_evidence(root_text) else safe_root_arg_for_retrieval(root_text)
592
+
593
+
498
594
  def retrieval_for(root_arg: str, display_path: str, lines: LineRange, *, redacted_path: bool) -> tuple[str | None, str | None]:
499
595
  if redacted_path:
500
596
  return None, "redacted_path"
@@ -866,6 +962,1559 @@ def slice_source(root: Path, *, raw_path: str, lines: LineRange) -> tuple[dict[s
866
962
  return payload, 0
867
963
 
868
964
 
965
def suggest_tokens(text: str) -> set[str]:
    """Split *text* into a set of lowercase search terms.

    Spans matching ``SECRET_CONTENT_RE`` are blanked out before tokenizing
    so they never become query terms.
    """
    scrubbed = SECRET_CONTENT_RE.sub(" ", text.lower())
    terms: set[str] = set()
    for token in re.findall(r"[a-z0-9_][a-z0-9_.-]{1,}", scrubbed):
        if len(token) >= 2:
            terms.add(token)
    return terms
968
+
969
+
970
def suggest_score_path(path: str, query_terms: set[str]) -> int:
    """Score a path by how many query terms occur in it, case-insensitively on the path side.

    Each term found as a substring of the lowercased path contributes 120 points.
    """
    haystack = path.lower()
    matched = sum(1 for term in query_terms if term in haystack)
    return 120 * matched
977
+
978
+
979
def suggest_reason(*parts: str) -> str:
    """Join the non-empty *parts* with ``"; "`` and cap the result via ``cap_label``.

    Falls back to ``"local heuristic"`` when the capped text is empty.
    """
    joined = "; ".join(filter(None, parts))
    capped = cap_label(joined, default="local heuristic", limit=MAX_REASON_CHARS)
    return capped or "local heuristic"
981
+
982
+
983
def split_suggest_files(values: list[str] | None) -> list[str]:
    """Flatten repeated and comma-separated file arguments into a list of paths.

    Each value is split on commas; surrounding whitespace is stripped and
    empty pieces are dropped. Order is preserved.
    """
    pieces = (piece.strip() for value in values or [] for piece in str(value).split(","))
    return [piece for piece in pieces if piece]
991
+
992
+
993
def line_window(line_number: int, total_lines: int | None, context_lines: int) -> LineRange:
    """Build a line range of *context_lines* on each side of *line_number*.

    The start is clamped to line 1. When *total_lines* is given, the end is
    also clamped to it (treating a total below 1 as 1).
    """
    # NOTE(review): if line_number lies beyond total_lines the clamped end can be
    # smaller than the start - confirm callers never pass such a pair.
    first = max(1, line_number - context_lines)
    last = max(first, line_number + context_lines)
    if total_lines is not None:
        last = min(last, max(1, total_lines))
    return LineRange(first, last)
1000
+
1001
+
1002
def merge_line_window(existing: LineRange | None, line_number: int, context_lines: int) -> LineRange:
    """Widen *existing* so it also covers the context window around *line_number*.

    With no existing range, the fresh (unbounded-end) window is returned as is.
    """
    fresh = line_window(line_number, None, context_lines)
    if existing is not None:
        return LineRange(min(existing.start, fresh.start), max(existing.end, fresh.end))
    return fresh
1007
+
1008
+
1009
def add_suggest_candidate(
    candidates: list[SuggestCandidate],
    *,
    path: str,
    score: int,
    reason: str,
    lines: LineRange | None = None,
    label: str | None = None,
) -> None:
    """Append a new ``SuggestCandidate`` to *candidates* in place.

    The reason is normalized through ``suggest_reason``, the label through
    ``cap_label``, and ``input_index`` records the list length before the append.
    """
    position = len(candidates)
    candidate = SuggestCandidate(
        path=path,
        score=score,
        reason=suggest_reason(reason),
        lines=lines,
        label=cap_label(label),
        input_index=position,
    )
    candidates.append(candidate)
1028
+
1029
+
1030
def run_git_diff(root: Path, diff_ref: str) -> str:
    """Run ``git diff`` inside *root* and return its sanitized, length-capped output.

    *diff_ref* may be a staged alias (``staged``, ``--staged``, ``cached``,
    ``--cached``), a working-tree alias (``worktree``, ``unstaged``,
    ``working-tree``), or any other value, which is passed to git as one
    extra argument.

    Raises:
        PackError: if the reference is empty, starts with ``-`` without being
            a known alias, or git cannot be run or exits non-zero.
    """
    ref = diff_ref.strip()
    if not ref:
        raise PackError("empty --diff")

    staged_aliases = {"staged", "--staged", "cached", "--cached"}
    worktree_aliases = {"worktree", "unstaged", "working-tree"}
    argv = ["git", "-C", str(root), "diff", "--no-ext-diff", "--no-textconv", "--unified=3"]
    if ref in staged_aliases:
        argv.append("--cached")
    elif ref not in worktree_aliases:
        # Anything else is handed to git verbatim, so refuse option-like values.
        if ref.startswith("-"):
            raise PackError("invalid --diff: revision must not start with '-'")
        argv.append(ref)

    try:
        proc = subprocess.run(argv, text=True, errors="replace", capture_output=True, timeout=10, check=False)
    except (OSError, UnicodeError, subprocess.TimeoutExpired) as exc:
        raise PackError(f"could not read diff: {exc.__class__.__name__}") from exc

    if proc.returncode != 0:
        # Report only the first sanitized line of git's own message.
        detail_lines = sanitize_text(proc.stderr or proc.stdout or "git diff failed")[0].strip().splitlines()
        first_line = detail_lines[0] if detail_lines else "git diff failed"
        raise PackError(f"could not read diff: {cap_label(first_line, default='git diff failed', limit=160)}")

    capped_output = proc.stdout[:MAX_SUGGEST_INPUT_BYTES]
    return sanitize_text(capped_output)[0]
1052
+
1053
+
1054
def collect_diff_candidates(root: Path, diff_ref: str, query_terms: set[str], context_lines: int) -> list[SuggestCandidate]:
    """Turn every hunk of a git diff into a suggestion candidate.

    Each ``@@`` hunk yields one candidate for the file named by the most
    recent ``diff --git`` header. Its window is the hunk's new-side range
    widened by *context_lines*, and its score is 7000 plus the path score.
    """
    header_re = re.compile(r"^diff --git a/(.+?) b/(.+)$")
    hunk_re = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
    found: list[SuggestCandidate] = []
    active_path: str | None = None

    for row in run_git_diff(root, diff_ref).splitlines():
        if row.startswith("diff --git "):
            header = header_re.match(row)
            if header is None:
                # Unparseable header: ignore hunks until the next file header.
                active_path = None
            else:
                old_side, new_side = header.groups()
                active_path = old_side if new_side == "/dev/null" else new_side
            continue
        if active_path is None:
            continue
        hunk = hunk_re.match(row)
        if hunk is None:
            continue

        new_start = int(hunk.group(1))
        new_count = int(hunk.group(2) or "1")
        last_changed = max(new_start, new_start + max(1, new_count) - 1)
        window_start = max(1, new_start - context_lines)
        window_end = max(window_start, last_changed + context_lines)
        add_suggest_candidate(
            found,
            path=active_path,
            score=7_000 + suggest_score_path(active_path, query_terms),
            reason="changed diff hunk",
            lines=LineRange(window_start, window_end),
            label=f"diff:{safe_raw_path_label(active_path)}",
        )
    return found
1086
+
1087
+
1088
# Matches a relative file path ending in one of the listed extensions, optionally
# prefixed with "./" and optionally followed by ":<line>". The path is captured
# as group "path" and the line number as group "line". The leading lookbehind
# stops a match from starting in the middle of a longer path-like token.
OUTPUT_PATH_RE = re.compile(
    r"(?<![A-Za-z0-9_./-])"
    r"(?P<path>(?:\.\/)?(?:[A-Za-z0-9_.-]+/)*[A-Za-z0-9_.-]+\."
    r"(?:py|js|jsx|ts|tsx|mjs|cjs|md|json|yml|yaml|toml|sh|css|html|txt|rb|go|rs|java|kt|swift|c|cc|cpp|h|hpp))"
    r"(?::(?P<line>\d+))?"
)
1094
+
1095
+
1096
def read_text_input_under_root(root: Path, raw_path: str) -> tuple[str | None, dict[str, Any] | None]:
    """Read a text input file under *root*, capped to a byte budget and sanitized.

    Returns:
        ``(text, None)`` on success, or ``(None, omission)`` where *omission*
        is a dict with ``path``, ``status`` ("omitted") and ``reason`` keys
        describing why the file was not read.
    """
    rel, reason = lexical_rel(raw_path)
    display = safe_raw_path_label(raw_path)
    if rel is None:
        return None, {"path": display, "status": "omitted", "reason": reason}
    display, redacted = display_rel_path(rel.as_posix())
    if redacted:
        return None, {"path": display, "status": "omitted", "reason": "redacted_path", "retrieval_omitted_reason": "redacted_path"}
    handle, reason = open_regular_under_root(root, rel)
    if handle is None:
        return None, {"path": display, "status": "omitted", "reason": reason}
    try:
        with handle:
            # One extra character is enough to detect that the file exceeds the cap.
            text = handle.read(MAX_SUGGEST_INPUT_BYTES + 1)
    except (OSError, UnicodeError):
        return None, {"path": display, "status": "omitted", "reason": "unsafe_path"}
    encoded = text.encode("utf-8", errors="replace")
    if len(encoded) > MAX_SUGGEST_INPUT_BYTES:
        # Truncate by bytes, not characters: slicing the str would leave multibyte
        # text far above the byte budget. "ignore" drops a character that the
        # cut may have split in half.
        text = encoded[:MAX_SUGGEST_INPUT_BYTES].decode("utf-8", errors="ignore")
    sanitized, _redacted = sanitize_text(text)
    return sanitized, None
1116
+
1117
+
1118
def collect_output_candidates(
    root: Path,
    raw_paths: list[str] | None,
    query_terms: set[str],
    context_lines: int,
    *,
    origin: str,
) -> tuple[list[SuggestCandidate], list[dict[str, Any]]]:
    """Extract file references from captured output files and turn them into candidates.

    Each input file is read under *root*; every ``path`` or ``path:line``
    reference matched by ``OUTPUT_PATH_RE`` becomes one candidate per
    distinct path (score 5000 plus the path score). Line references to the
    same path are merged into a single window.

    Returns:
        ``(candidates, omitted)`` where *omitted* lists the inputs that could
        not be read, each tagged with *origin*.
    """
    found: list[SuggestCandidate] = []
    skipped: list[dict[str, Any]] = []
    for raw in raw_paths or []:
        text, omission_item = read_text_input_under_root(root, raw)
        if omission_item is not None:
            omission_item["origin"] = origin
            skipped.append(omission_item)
            continue
        assert text is not None

        windows: dict[str, LineRange | None] = {}
        for hit in OUTPUT_PATH_RE.finditer(text):
            ref_path = hit.group("path")
            if ref_path.startswith("./"):
                ref_path = ref_path[2:]
            line_text = hit.group("line")
            if not line_text:
                # Bare path reference: remember it without overwriting a known window.
                windows.setdefault(ref_path, None)
                continue
            try:
                line_number = int(line_text)
            except ValueError:
                line_number = 1
            windows[ref_path] = merge_line_window(windows.get(ref_path), line_number, context_lines)

        for ref_path, window in sorted(windows.items()):
            add_suggest_candidate(
                found,
                path=ref_path,
                score=5_000 + suggest_score_path(ref_path, query_terms),
                reason=f"{origin} referenced path",
                lines=window,
                label=f"{origin}:{safe_raw_path_label(ref_path)}",
            )
    return found, skipped
1160
+
1161
+
1162
def git_ls_files(root: Path) -> list[str]:
    """List up to ``MAX_QUERY_SCAN_FILES`` candidate file paths under *root*.

    Prefers ``git ls-files -z``. When git is unavailable, times out, or
    exits non-zero, falls back to walking the directory tree while skipping
    a fixed set of tool and build directories.

    Returns:
        Paths relative to *root*; the fallback walk yields POSIX-style paths.
    """
    try:
        proc = subprocess.run(
            ["git", "-C", str(root), "ls-files", "-z"],
            text=False,
            capture_output=True,
            timeout=10,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        proc = None
    if proc is not None and proc.returncode == 0:
        byte_cap = MAX_QUERY_SCAN_FILES * 512
        raw = proc.stdout
        if len(raw) > byte_cap:
            head = raw[:byte_cap]
            # A fixed-offset cut can land inside a path. Unless the cut fell exactly
            # on an entry boundary, drop the incomplete trailing entry so a bogus
            # partial path is never returned.
            if raw[byte_cap:byte_cap + 1] != b"\0":
                head = head[: head.rfind(b"\0") + 1]
            raw = head
        return [part.decode("utf-8", "replace") for part in raw.split(b"\0") if part][:MAX_QUERY_SCAN_FILES]
    out: list[str] = []
    skip_dirs = {".git", ".omx", ".context-guard", "node_modules", "dist", "build", "__pycache__"}
    for current, dirs, files in os.walk(root):
        # Prune in place so os.walk does not descend into skipped directories.
        dirs[:] = [name for name in dirs if name not in skip_dirs and not name.startswith(".pytest")]
        current_path = Path(current)
        for name in files:
            rel = (current_path / name).relative_to(root).as_posix()
            out.append(rel)
            if len(out) >= MAX_QUERY_SCAN_FILES:
                return out
    return out
1187
+
1188
+
1189
def collect_query_candidates(root: Path, query_terms: set[str], context_lines: int) -> list[SuggestCandidate]:
    """Scan listed files for query terms and return matching candidates.

    A file becomes a candidate when a term occurs in its display path or in
    its scanned content. Content hits add 250 points per matching term per
    line, and the first matching line anchors the suggested window. Scanning
    stops at the per-file byte cap, and early for files that matched nothing
    within the first ``SUGGEST_WHOLE_FILE_MAX_LINES`` lines.
    """
    if not query_terms:
        return []

    found: list[SuggestCandidate] = []
    for listed in git_ls_files(root):
        rel, rel_reason = lexical_rel(listed)
        if rel is None or rel_reason:
            continue
        display, redacted = display_rel_path(rel.as_posix())
        if redacted:
            continue
        path_score = suggest_score_path(display, query_terms)
        handle, _open_reason = open_regular_under_root(root, rel)
        if handle is None:
            continue

        first_hit: int | None = None
        content_score = 0
        try:
            with handle:
                consumed = 0
                for line_no, raw_line in enumerate(handle, start=1):
                    consumed += byte_len(raw_line)
                    if consumed > MAX_QUERY_SCAN_BYTES_PER_FILE:
                        break
                    nothing_matched = content_score == 0 and path_score == 0
                    if line_no > SUGGEST_WHOLE_FILE_MAX_LINES and nothing_matched:
                        break
                    haystack = raw_line.lower()
                    hits = sum(1 for term in query_terms if term in haystack)
                    if not hits:
                        continue
                    content_score += 250 * hits
                    if first_hit is None:
                        first_hit = line_no
        except (OSError, UnicodeError):
            # Unreadable or undecodable file: skip it silently.
            continue

        if path_score == 0 and content_score == 0:
            continue
        if first_hit is None:
            window = None
            why = "query matched file path"
        else:
            window = line_window(first_hit, None, context_lines)
            why = "query matched file content"
        add_suggest_candidate(
            found,
            path=display,
            score=3_000 + path_score + content_score,
            reason=why,
            lines=window,
            label=f"query:{display}",
        )
    return found
1241
+
1242
+
1243
def source_selected_range(source: ResolvedSource) -> LineRange:
    """Return the line range actually covered by ``source.selected_lines``.

    The range starts at the requested start line (or 1 when no range was
    requested) and spans at least one line.
    """
    requested = source.requested_lines
    first = requested.start if requested else 1
    span = max(len(source.selected_lines), 1)
    return LineRange(first, first + span - 1)
1246
+
1247
+
1248
def resolved_block_bytes(source: ResolvedSource, *, root_arg: str) -> int:
    """Return the byte length of *source* rendered as a fully included block."""
    rendered = render_block(
        source,
        source.selected_lines,
        root_arg=root_arg,
        status="included",
        included=source_selected_range(source),
    )
    return byte_len(rendered)
1251
+
1252
+
1253
def manifest_source_for_candidate(source: ResolvedSource, *, priority: int, label: str | None) -> dict[str, Any]:
    """Build the manifest entry for a resolved source.

    ``label`` is included only when truthy, and ``lines`` only when the
    source was resolved with an explicit requested range.
    """
    entry: dict[str, Any] = {"path": source.display_path, "priority": priority}
    if label:
        entry["label"] = label
    has_requested_range = source.requested_lines is not None
    if has_requested_range:
        entry["lines"] = source_selected_range(source).as_dict()
    return entry
1260
+
1261
+
1262
def suggested_source_payload(source: ResolvedSource, candidate: SuggestCandidate, *, root_arg: str) -> dict[str, Any]:
    """Build the JSON-ready description of one suggested source.

    Carries the path, score (also reported as priority), reason, covered
    lines and byte size. Adds the label when present, and either a
    retrieval command or the reason it was omitted.
    """
    covered = source_selected_range(source)
    selected_text = "".join(source.selected_lines)
    payload: dict[str, Any] = {
        "path": source.display_path,
        "priority": candidate.score,
        "score": candidate.score,
        "reason": candidate.reason,
        "lines": covered.as_dict(),
        "bytes": byte_len(selected_text),
    }
    if candidate.label:
        payload["label"] = candidate.label

    command, why_omitted = retrieval_for(root_arg, source.display_path, covered, redacted_path=source.redacted_path)
    if command:
        payload["retrieval_cli"] = command
    elif why_omitted:
        payload["retrieval_omitted_reason"] = why_omitted
    return payload
1280
+
1281
+
1282
def normalize_suggest_source(root: Path, candidate: SuggestCandidate) -> tuple[ResolvedSource | None, dict[str, Any] | None]:
    """Resolve a candidate into a source, or explain why it was omitted.

    Candidates without an explicit range that resolve to a file longer than
    ``SUGGEST_WHOLE_FILE_MAX_LINES`` are re-resolved with a range capped to
    that many leading lines.

    Returns:
        ``(source, None)`` on success or ``(None, omission)`` otherwise.
    """

    def spec_with(lines: LineRange | None) -> SourceSpec:
        # All specs for this candidate differ only in their line range.
        return SourceSpec(
            path=candidate.path,
            priority=candidate.score,
            lines=lines,
            label=candidate.label,
            input_index=candidate.input_index,
            origin="suggest",
        )

    spec = spec_with(candidate.lines)
    source, omitted_item = resolve_source(root, spec)
    if omitted_item is not None:
        omitted_item["reason"] = omitted_item.get("reason") or candidate.reason
        omitted_item["suggest_reason"] = candidate.reason
        return None, omitted_item
    assert source is not None
    if source.redacted_path:
        return None, omission(spec, "redacted_path", path=source.display_path, redacted_path=True)

    needs_cap = spec.lines is None and source.total_lines > SUGGEST_WHOLE_FILE_MAX_LINES
    if needs_cap:
        head_range = LineRange(1, min(SUGGEST_WHOLE_FILE_MAX_LINES, source.total_lines))
        source, omitted_item = resolve_source(root, spec_with(head_range))
        if omitted_item is not None:
            omitted_item["suggest_reason"] = candidate.reason
            return None, omitted_item
        assert source is not None
    return source, None
1314
+
1315
+
1316
def write_manifest_under_root(root: Path, raw_path: str, manifest: dict[str, Any]) -> str:
    """Serialize *manifest* as indented, key-sorted JSON and write it under *root*.

    Returns whatever ``write_text_under_root`` returns for the written path.
    Errors are reported against the ``--manifest-out`` option.
    """
    serialized = json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True)
    return write_text_under_root(root, raw_path, serialized + "\n", "--manifest-out")
1319
+
1320
+
1321
def validate_output_path_under_root(root: Path, raw_path: str, option_name: str) -> str:
    """Check that *raw_path* is a usable output target under *root* without writing to it.

    The parent directories are opened one component at a time through
    ``open_dir_no_follow``. An existing target is opened write-only, with no
    create or truncate flag, and must be a regular file. A missing target is
    checked by creating and then removing a temporary file in the same
    parent directory.

    Returns:
        The display form of the relative path.

    Raises:
        PackError: if the path is rejected lexically, has a redacted display
            form, is not a regular file, or cannot be opened or created.
    """
    rel, reason = lexical_rel(raw_path)
    if rel is None:
        raise PackError(f"invalid {option_name}: {reason}")
    display, redacted = display_rel_path(rel.as_posix())
    if redacted:
        raise PackError(f"invalid {option_name}: redacted_path")
    parent_parts = rel.parts[:-1]
    filename = rel.parts[-1]
    current_fd: int | None = None
    file_fd = -1
    try:
        # Descend into the parent directory, holding exactly one directory fd at a time.
        current_fd = open_dir_no_follow(root)
        for part in parent_parts:
            next_fd = open_dir_no_follow(part, dir_fd=current_fd)
            os.close(current_fd)
            current_fd = next_fd
        # No O_CREAT / O_TRUNC here: an existing target must not be modified by validation.
        flags = os.O_WRONLY
        if hasattr(os, "O_NOFOLLOW"):
            flags |= os.O_NOFOLLOW
        if hasattr(os, "O_CLOEXEC"):
            flags |= os.O_CLOEXEC
        if hasattr(os, "O_NONBLOCK"):
            flags |= os.O_NONBLOCK
        try:
            file_fd = os.open(filename, flags, dir_fd=current_fd)
            st = os.fstat(file_fd)
            if not stat.S_ISREG(st.st_mode):
                raise PackError(f"invalid {option_name}: unsafe_path")
        except FileNotFoundError:
            # Target does not exist yet: probe writability of the parent directory
            # with a uniquely named temporary file (O_EXCL refuses to reuse an existing name).
            temp_fd = -1
            temp_name = f".context-guard-pack-preflight-{os.getpid()}-{hashlib.sha256(raw_path.encode('utf-8', 'replace')).hexdigest()[:10]}"
            try:
                create_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL
                if hasattr(os, "O_NOFOLLOW"):
                    create_flags |= os.O_NOFOLLOW
                if hasattr(os, "O_CLOEXEC"):
                    create_flags |= os.O_CLOEXEC
                if hasattr(os, "O_NONBLOCK"):
                    create_flags |= os.O_NONBLOCK
                temp_fd = os.open(temp_name, create_flags, 0o600, dir_fd=current_fd)
            except OSError as exc:
                raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
            finally:
                # Clean up only a probe file this call actually created.
                if temp_fd >= 0:
                    try:
                        os.close(temp_fd)
                    except OSError:
                        pass
                    try:
                        os.unlink(temp_name, dir_fd=current_fd)
                    except OSError:
                        pass
        except IsADirectoryError as exc:
            raise PackError(f"invalid {option_name}: unsafe_path") from exc
        except OSError as exc:
            raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    except PackError:
        raise
    except FileNotFoundError as exc:
        # Reached when a parent directory component is missing.
        raise PackError(f"invalid {option_name}: missing") from exc
    except OSError as exc:
        raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    finally:
        # Best-effort close of whatever descriptors are still open.
        if file_fd >= 0:
            try:
                os.close(file_fd)
            except OSError:
                pass
        if current_fd is not None:
            try:
                os.close(current_fd)
            except OSError:
                pass
    return display
1396
+
1397
+
1398
def output_rel_for_collision_check(raw_path: str, option_name: str) -> Path:
    """Return the lexically resolved relative path for an output option.

    Raises:
        PackError: if the path is rejected by ``lexical_rel`` or its display
            form would be redacted.
    """
    rel, reason = lexical_rel(raw_path)
    if rel is None:
        raise PackError(f"invalid {option_name}: {reason}")
    was_redacted = display_rel_path(rel.as_posix())[1]
    if was_redacted:
        raise PackError(f"invalid {option_name}: redacted_path")
    return rel
1406
+
1407
+
1408
def existing_output_identity_under_root(root: Path, rel: Path) -> tuple[int, int] | None:
    """Return ``(st_dev, st_ino)`` of an existing regular file at *rel* under *root*.

    Returns None when the file is missing, is not a regular file, or cannot
    be inspected. The final component is stat'ed without following symlinks.
    """
    dir_fd: int | None = None
    try:
        dir_fd = open_dir_no_follow(root)
        for component in rel.parts[:-1]:
            child_fd = open_dir_no_follow(component, dir_fd=dir_fd)
            os.close(dir_fd)
            dir_fd = child_fd
        info = os.stat(rel.parts[-1], dir_fd=dir_fd, follow_symlinks=False)
        if stat.S_ISREG(info.st_mode):
            return int(info.st_dev), int(info.st_ino)
        return None
    except (OSError, NotImplementedError):
        # FileNotFoundError is an OSError, so a missing file also lands here.
        return None
    finally:
        if dir_fd is not None:
            try:
                os.close(dir_fd)
            except OSError:
                pass
1428
+
1429
+
1430
def reject_matching_output_targets(
    root: Path,
    *,
    first_rel: Path,
    second_rel: Path,
    second_option: str,
    reason: str,
) -> None:
    """Raise ``PackError`` when two output paths would address the same file.

    Two targets collide when their relative paths are equal, equal after
    case-folding, or when both already exist and share the same
    device/inode identity.
    """
    if first_rel == second_rel:
        lexical_clash = True
    else:
        lexical_clash = first_rel.as_posix().casefold() == second_rel.as_posix().casefold()

    # Both identities are always looked up, matching the original evaluation order.
    first_identity = existing_output_identity_under_root(root, first_rel)
    second_identity = existing_output_identity_under_root(root, second_rel)
    identity_clash = first_identity is not None and first_identity == second_identity

    if lexical_clash or identity_clash:
        raise PackError(f"invalid {second_option}: {reason}")
1444
+
1445
+
1446
def write_text_under_root(root: Path, raw_path: str, content: str, option_name: str) -> str:
    """Write *content* as UTF-8 to *raw_path* under *root*, creating or truncating the file.

    Parent directories are opened one component at a time through
    ``open_dir_no_follow``. The target is opened relative to the final
    directory descriptor and must turn out to be a regular file.

    Returns:
        The display form of the relative path.

    Raises:
        PackError: if the path is rejected lexically, has a redacted display
            form, is not a regular file, or cannot be opened or written.
    """
    rel, reason = lexical_rel(raw_path)
    if rel is None:
        raise PackError(f"invalid {option_name}: {reason}")
    display, redacted = display_rel_path(rel.as_posix())
    if redacted:
        raise PackError(f"invalid {option_name}: redacted_path")
    parent_parts = rel.parts[:-1]
    filename = rel.parts[-1]
    current_fd: int | None = None
    file_fd = -1
    try:
        # Descend into the parent directory, holding exactly one directory fd at a time.
        current_fd = open_dir_no_follow(root)
        for part in parent_parts:
            next_fd = open_dir_no_follow(part, dir_fd=current_fd)
            os.close(current_fd)
            current_fd = next_fd
        flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        if hasattr(os, "O_NOFOLLOW"):
            flags |= os.O_NOFOLLOW
        if hasattr(os, "O_CLOEXEC"):
            flags |= os.O_CLOEXEC
        if hasattr(os, "O_NONBLOCK"):
            flags |= os.O_NONBLOCK
        # New files are created with owner-only permissions (0o600).
        file_fd = os.open(filename, flags, 0o600, dir_fd=current_fd)
        st = os.fstat(file_fd)
        if not stat.S_ISREG(st.st_mode):
            raise PackError(f"invalid {option_name}: unsafe_path")
        with os.fdopen(file_fd, "w", encoding="utf-8") as handle:
            # The file object now owns the descriptor; clear file_fd so the
            # finally block does not close it a second time.
            file_fd = -1
            handle.write(content)
    except PackError:
        raise
    except FileNotFoundError as exc:
        raise PackError(f"invalid {option_name}: missing") from exc
    except OSError as exc:
        raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    finally:
        # Best-effort close of whatever descriptors are still open.
        if file_fd >= 0:
            try:
                os.close(file_fd)
            except OSError:
                pass
        if current_fd is not None:
            try:
                os.close(current_fd)
            except OSError:
                pass
    return display
1495
+
1496
+
1497
def manifest_to_source_specs(manifest: dict[str, Any]) -> list[SourceSpec]:
    """Convert a pack manifest dict into ``SourceSpec`` objects with origin ``"auto"``.

    An unparseable ``lines`` value does not abort the conversion; it is
    replaced by the placeholder ``LineRange(-1, -1)``.

    Raises:
        PackError: on an unsupported version, a non-list ``sources`` value,
            a non-dict source entry, or an entry without ``path``.
    """
    version = manifest.get("version", VERSION)
    if version != VERSION:
        raise PackError(f"unsupported manifest version: {version}")
    entries = manifest.get("sources")
    if not isinstance(entries, list):
        raise PackError("manifest sources must be a list")

    specs: list[SourceSpec] = []
    for position, entry in enumerate(entries):
        if not isinstance(entry, dict):
            raise PackError("manifest sources must be objects")
        if "path" not in entry:
            raise PackError("manifest source missing path")
        try:
            line_range = parse_line_range(entry.get("lines"))
        except PackError:
            line_range = LineRange(-1, -1)
        spec = SourceSpec(
            path=str(entry["path"]),
            priority=bounded_int(entry.get("priority"), 0, -1_000_000, 1_000_000),
            lines=line_range,
            label=cap_label(entry.get("label")),
            input_index=position,
            origin="auto",
        )
        specs.append(spec)
    return specs
1523
+
1524
+
1525
def build_suggest_manifest(sources: list[dict[str, Any]]) -> dict[str, Any]:
    """Wrap suggested sources in a manifest document tagged with ``VERSION``.

    Each entry keeps ``path`` and ``priority`` (both required) and copies
    ``label`` and ``lines`` only when the input item carries them.
    """
    optional_keys = ("label", "lines")
    entries = [
        {
            "path": seed["path"],
            "priority": seed["priority"],
            **{key: seed[key] for key in optional_keys if key in seed},
        }
        for seed in sources
    ]
    return {"version": VERSION, "sources": entries}
1535
+
1536
+
1537
def suggest_build_hint(root_arg: str, manifest_path: str | None, budget: int) -> tuple[str | None, str | None]:
    """Compose a shell command that rebuilds the pack from a manifest.

    Returns:
        ``(command, None)`` on success, or ``(None, "unsafe_root_path")`` when
        the root argument is not accepted by ``safe_root_arg_for_retrieval``.
        When the accepted root is not ``"."`` or empty, the command is
        prefixed with a ``cd`` into that root.
    """
    safe_root = safe_root_arg_for_retrieval(root_arg)
    if safe_root is None:
        return None, "unsafe_root_path"

    argv = [
        "context-guard-pack",
        "build",
        "--root",
        ".",
        "--manifest",
        manifest_path or "<manifest.json>",
        "--budget-bytes",
        str(budget),
        "--json",
    ]
    # shlex.join quotes every argument exactly like " ".join(shlex.quote(...)).
    build_command = shlex.join(argv)
    if safe_root not in {".", ""}:
        build_command = f"cd {shlex.quote(safe_root)} && {build_command}"
    return build_command, None
1547
+
1548
+
1549
def suggest_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tuple[dict[str, Any], int]:
    """Rank candidate sources and report which of them fit the byte budget.

    Candidates come from explicit ``--files``, ``--diff``, ``--output`` /
    ``--test-output`` and the sanitized ``--query`` terms. They are sorted by
    descending score, de-duplicated (first by lexical path and requested
    range, then by resolved display path and selected range) and accepted
    until the budget or the ``--top`` limit is reached. If the first
    acceptable source alone exceeds the budget, a partial line range is tried.

    Args:
        root: Directory that all sources are resolved against.
        args: Parsed CLI namespace (query, files, diff, output, test_output,
            context_lines, top, budget_bytes, manifest_out are read).
        root_arg: Root as given on the command line; used for retrieval hints.

    Returns:
        ``(payload, 0)`` where ``payload`` is the suggest-mode document.

    Raises:
        PackError: when none of the signal options is provided.
    """

    def without_none(item: dict[str, Any]) -> dict[str, Any]:
        # Omission entries are emitted without their None-valued keys.
        return {key: value for key, value in item.items() if value is not None}

    query_text, _query_redactions = sanitize_text(args.query or "")
    query = " ".join(query_text.split())
    query_terms = suggest_tokens(query)
    context_lines = bounded_int(args.context_lines, DEFAULT_SUGGEST_CONTEXT_LINES, 0, MAX_SUGGEST_CONTEXT_LINES)
    top = bounded_int(args.top, DEFAULT_SUGGEST_TOP, 1, MAX_SUGGEST_TOP)
    budget = bounded_int(args.budget_bytes, DEFAULT_BUDGET_BYTES, MIN_BUDGET_BYTES, MAX_BUDGET_BYTES)
    candidates: list[SuggestCandidate] = []
    omitted: list[dict[str, Any]] = []
    file_inputs = split_suggest_files(args.files)
    has_signal = bool(query or file_inputs or args.diff or args.output or args.test_output)
    if not has_signal:
        raise PackError("provide --query, --files, --diff, --output, or --test-output")

    for raw_path in file_inputs:
        add_suggest_candidate(
            candidates,
            path=raw_path,
            score=9_000 + suggest_score_path(raw_path, query_terms),
            reason="explicit file request",
            label=f"file:{safe_raw_path_label(raw_path)}",
        )
    if args.diff:
        candidates.extend(collect_diff_candidates(root, args.diff, query_terms, context_lines))
    output_candidates, output_omitted = collect_output_candidates(root, args.output, query_terms, context_lines, origin="output")
    test_candidates, test_omitted = collect_output_candidates(root, args.test_output, query_terms, context_lines, origin="test-output")
    candidates.extend(output_candidates)
    candidates.extend(test_candidates)
    omitted.extend(output_omitted)
    omitted.extend(test_omitted)
    candidates.extend(collect_query_candidates(root, query_terms, context_lines))

    # Highest score first; ties fall back to input order, path and range.
    candidates.sort(key=lambda item: (-item.score, item.input_index, item.path, item.lines.identity() if item.lines else "0:0"))
    seen: set[tuple[str, str]] = set()
    final_seen: set[tuple[str, str]] = set()
    selected: list[dict[str, Any]] = []
    manifest_seed: list[dict[str, Any]] = []
    # The running total starts at the size of the fixed pack header.
    current_bytes = byte_len("# Context Pack\n\nGenerated by context-guard-pack. Token counts are estimated proxies; byte counts are observed.\n\n")
    for candidate in candidates:
        rel, _reason = lexical_rel(candidate.path)
        identity_path = rel.as_posix() if rel is not None else safe_raw_path_label(candidate.path)
        identity_lines = candidate.lines.identity() if candidate.lines else "all"
        identity = (identity_path, identity_lines)
        if rel is not None and identity in seen:
            display, redacted = display_rel_path(rel.as_posix())
            omitted.append(without_none({
                "path": display,
                "status": "omitted",
                "reason": "duplicate_source",
                "suggest_reason": candidate.reason,
                "priority": candidate.score,
                "retrieval_omitted_reason": "redacted_path" if redacted else None,
            }))
            continue
        if rel is not None:
            seen.add(identity)

        source, omitted_item = normalize_suggest_source(root, candidate)
        if omitted_item is not None:
            omitted_item["priority"] = candidate.score
            omitted_item["suggest_reason"] = candidate.reason
            omitted.append(without_none(omitted_item))
            continue
        assert source is not None

        # Second de-duplication pass on the resolved path and selected range.
        final_identity = (source.display_path, source_selected_range(source).identity() if source.requested_lines is not None else "all")
        if final_identity in final_seen:
            omitted.append({
                "path": source.display_path,
                "status": "omitted",
                "reason": "duplicate_source",
                "suggest_reason": candidate.reason,
                "priority": candidate.score,
            })
            continue
        final_seen.add(final_identity)

        source_bytes = resolved_block_bytes(source, root_arg=root_arg)
        remaining = budget - current_bytes
        if source_bytes > remaining:
            budget_omission = {"path": source.display_path, "status": "omitted", "reason": "budget_exhausted", "priority": candidate.score}
            # A partial fit is only attempted while nothing has been selected.
            if selected or remaining <= 0:
                omitted.append(budget_omission)
                continue
            partial_lines, _partial_block, partial_range = fit_partial_lines(source, remaining, root_arg=root_arg)
            if partial_range is None or not partial_lines:
                omitted.append(budget_omission)
                continue
            partial_spec = SourceSpec(
                path=candidate.path,
                priority=candidate.score,
                lines=partial_range,
                label=candidate.label,
                input_index=candidate.input_index,
                origin="suggest",
            )
            source, omitted_item = resolve_source(root, partial_spec)
            if omitted_item is not None:
                omitted_item["priority"] = candidate.score
                omitted_item["suggest_reason"] = candidate.reason
                # Strip None-valued keys here too, matching the other
                # omission paths above.
                omitted.append(without_none(omitted_item))
                continue
            assert source is not None
            source_bytes = resolved_block_bytes(source, root_arg=root_arg)

        selected.append(suggested_source_payload(source, candidate, root_arg=root_arg))
        manifest_seed.append(manifest_source_for_candidate(source, priority=candidate.score, label=candidate.label))
        current_bytes += source_bytes
        if len(selected) >= top:
            break

    manifest = build_suggest_manifest(manifest_seed)
    estimated_pack_bytes = current_bytes if selected else 0
    manifest_path: str | None = None
    if args.manifest_out:
        manifest_path = write_manifest_under_root(root, args.manifest_out, manifest)
    build_hint, build_hint_omitted_reason = suggest_build_hint(root_arg, manifest_path, budget)
    payload: dict[str, Any] = {
        "tool": TOOL_NAME,
        "schema_version": SUGGEST_SCHEMA_VERSION,
        "version": VERSION,
        "mode": "suggest",
        "root": display_root(root),
        "query": query,
        "budget_bytes": budget,
        "estimated_pack_bytes": estimated_pack_bytes,
        "token_proxy": {
            "measurement": "estimated",
            "method": f"chars_div_{TOKEN_PROXY_CHARS_PER_TOKEN}",
            "estimated_pack": estimated_pack_bytes // TOKEN_PROXY_CHARS_PER_TOKEN,
        },
        "sources": selected,
        "omitted_sources": sorted(omitted, key=lambda item: (str(item.get("path", "")), str(item.get("reason", "")), int(item.get("priority", 0) or 0))),
        "manifest": manifest,
        "manifest_path": manifest_path,
        "build_hint": build_hint,
        "caveats": [
            "Deterministic local heuristics only; no model, network, embedding, or provider-cost estimate is used.",
            "Byte and token values are pack-size proxies, not billing claims.",
        ],
    }
    if build_hint_omitted_reason:
        payload["build_hint_omitted_reason"] = build_hint_omitted_reason
    return payload, 0
1692
+
1693
+
1694
def line_range_identity(value: object) -> str:
    """Return a comparable string for a line-range value.

    ``None`` becomes ``"all"``, a dict becomes ``"<start>:<end>"`` from its
    ``start`` and ``end`` keys, and anything else is passed through ``str``.
    """
    if value is None:
        return "all"
    if not isinstance(value, dict):
        return str(value)
    start, end = value.get("start"), value.get("end")
    return f"{start}:{end}"
1700
+
1701
+
1702
def copy_explain_fields(item: dict[str, Any], fields: tuple[str, ...]) -> dict[str, Any]:
    """Deep-copy the listed fields of ``item`` that are present and not None.

    Keys appear in the result in the order given by ``fields``.
    """
    return {
        name: copy.deepcopy(item[name])
        for name in fields
        if item.get(name) is not None
    }
1708
+
1709
+
1710
def build_source_matches_exact(suggest_item: dict[str, Any], build_item: dict[str, Any]) -> bool:
    """Tell whether a build source corresponds exactly to a suggested source.

    Path and priority must be equal. The suggested line identity must then
    equal the build item's requested or included identity, or be ``"all"``.
    """
    if any(build_item.get(key) != suggest_item.get(key) for key in ("path", "priority")):
        return False
    wanted = line_range_identity(suggest_item.get("lines"))
    acceptable = {
        line_range_identity(build_item.get("requested_lines")),
        line_range_identity(build_item.get("included_lines")),
        "all",
    }
    return wanted in acceptable
1719
+
1720
+
1721
def find_exact_build_source_for_explain(
    suggest_item: dict[str, Any],
    build_sources: list[dict[str, Any]],
    used_indexes: set[int],
) -> dict[str, Any] | None:
    """Claim the first unclaimed build source that exactly matches ``suggest_item``.

    The index of the returned source is added to ``used_indexes`` so it cannot
    be claimed twice. Returns ``None`` when nothing matches.
    """
    for position, candidate in enumerate(build_sources):
        unclaimed = position not in used_indexes
        if unclaimed and build_source_matches_exact(suggest_item, candidate):
            used_indexes.add(position)
            return candidate
    return None
1733
+
1734
+
1735
def find_fallback_build_source_for_explain(
    suggest_item: dict[str, Any],
    build_sources: list[dict[str, Any]],
    used_indexes: set[int],
) -> dict[str, Any] | None:
    """Claim the first unclaimed build source that shares ``suggest_item``'s path.

    The index of the returned source is added to ``used_indexes``. Returns
    ``None`` when no unclaimed source has the same ``path``.
    """
    wanted_path = suggest_item.get("path")
    hit = next(
        (
            (position, candidate)
            for position, candidate in enumerate(build_sources)
            if position not in used_indexes and candidate.get("path") == wanted_path
        ),
        None,
    )
    if hit is None:
        return None
    position, chosen = hit
    used_indexes.add(position)
    return chosen
1747
+
1748
+
1749
def explain_omission_key(item: dict[str, Any]) -> tuple[str, str, str, str, str]:
    """Build the key used to de-duplicate and sort explain omission entries.

    The key is (phase, path, reason, suggest_reason, JSON of the line range),
    where the range is ``requested_lines`` if present, otherwise ``lines``.
    """
    phase, path, reason, suggest_reason = (
        str(item.get(name, "")) for name in ("phase", "path", "reason", "suggest_reason")
    )
    if "requested_lines" in item:
        range_value = item["requested_lines"]
    else:
        range_value = item.get("lines", "")
    encoded_range = json.dumps(range_value, ensure_ascii=False, sort_keys=True)
    return (phase, path, reason, suggest_reason, encoded_range)
1757
+
1758
+
1759
def sanitize_explain_text(value: str, *, limit: int = MAX_LABEL_CHARS) -> str:
    """Sanitize ``value`` and cap it to ``limit`` via ``cap_label``.

    Always returns a string; a falsy result from ``cap_label`` becomes ``""``.
    """
    cleaned = sanitize_text(str(value))[0]
    capped = cap_label(cleaned, default="", limit=limit)
    return capped if capped else ""
1762
+
1763
+
1764
def is_repo_map_text_path(path: str) -> bool:
    """Tell whether ``path`` names a file the repo map treats as text.

    A path qualifies through a well-known extensionless file name (compared
    case-insensitively) or a suffix listed in ``REPO_MAP_TEXT_EXTENSIONS``.
    """
    candidate = Path(path)
    well_known_names = {"readme", "license", "dockerfile", "makefile"}
    return (
        candidate.name.lower() in well_known_names
        or candidate.suffix.lower() in REPO_MAP_TEXT_EXTENSIONS
    )
1769
+
1770
+
1771
def read_repo_map_text(root: Path, rel_path: str) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
    """Load one repository file as a sanitized repo-map record.

    Returns:
        ``(record, None)`` on success, or ``(None, omission)`` when the path
        is rejected by ``lexical_rel``, is not a repo-map text path, cannot be
        opened, or fails while being read.
    """
    rel, rejection = lexical_rel(rel_path)
    if rel is None:
        return None, {"path": repo_map_safe_raw_path_label(rel_path), "reason": rejection}

    display, redacted_path = repo_map_display_rel_path(rel.as_posix())
    if not is_repo_map_text_path(display):
        return None, {"path": display, "reason": "unsupported_file_type"}

    def omission(why: Any) -> dict[str, Any]:
        # Shared shape for the open/read failures below.
        return {
            "path": display,
            "reason": why,
            "retrieval_omitted_reason": "redacted_path" if redacted_path else None,
        }

    handle, open_reason = open_regular_under_root(root, rel)
    if handle is None:
        return None, omission(open_reason)
    try:
        with handle:
            # Ask for one unit more than the cap so an oversized file is detectable.
            content = handle.read(MAX_REPO_MAP_BYTES_PER_FILE + 1)
    except (OSError, UnicodeError):
        return None, omission("unsafe_path")

    capped = byte_len(content) > MAX_REPO_MAP_BYTES_PER_FILE
    if capped:
        encoded = content.encode("utf-8", errors="replace")
        # Cut on the byte cap and drop any incomplete trailing sequence.
        content = encoded[:MAX_REPO_MAP_BYTES_PER_FILE].decode("utf-8", errors="ignore")

    # Risk patterns are counted on the text before sanitization.
    risk_counts = secret_risk_counts(content)
    sanitized_text, redacted_lines = sanitize_text(content)
    sanitized_bytes = byte_len(sanitized_text)
    line_total = len(sanitized_text.splitlines())
    if line_total == 0 and sanitized_text:
        line_total = 1

    record: dict[str, Any] = {
        "path": display,
        "raw_path": rel.as_posix(),
        "redacted_path": redacted_path,
        "text": sanitized_text,
        "bytes": sanitized_bytes,
        "bytes_capped": capped,
        "line_count": line_total,
        "redacted_lines": redacted_lines,
        "secret_risk_counts": risk_counts,
    }
    return record, None
1802
+
1803
+
1804
def repo_map_records(root: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]:
    """Read the files listed by ``git_ls_files`` into repo-map records.

    Returns:
        ``(records, omitted, caps)``. ``omitted`` lists files that could not
        be read, excluding those skipped only for an unsupported file type.
        ``caps`` reports the configured limits and which of them were hit.
    """
    listed = git_ls_files(root)
    files_capped = len(listed) > MAX_REPO_MAP_FILES

    loaded: list[dict[str, Any]] = []
    skipped: list[dict[str, Any]] = []
    for rel_path in listed[:MAX_REPO_MAP_FILES]:
        record, omission_item = read_repo_map_text(root, rel_path)
        if record is not None:
            loaded.append(record)
            continue
        if omission_item is None or omission_item.get("reason") == "unsupported_file_type":
            continue
        skipped.append({key: value for key, value in omission_item.items() if value is not None})

    byte_capped_files = sum(1 for record in loaded if record.get("bytes_capped"))
    caps: dict[str, Any] = {
        "max_files": MAX_REPO_MAP_FILES,
        "files_capped": files_capped,
        "max_bytes_per_file": MAX_REPO_MAP_BYTES_PER_FILE,
        "bytes_per_file_capped_count": byte_capped_files,
        "max_tree_entries": MAX_REPO_MAP_TREE_ENTRIES,
        "max_signature_entries": MAX_REPO_MAP_SIGNATURE_ENTRIES,
        "max_graph_rank_entries": MAX_REPO_MAP_GRAPH_RANK_ENTRIES,
        "max_retrieval_hints": MAX_REPO_MAP_RETRIEVAL_HINTS,
        "max_secret_risk_files": MAX_REPO_MAP_SECRET_RISK_FILES,
    }
    return loaded, skipped, caps
1827
+
1828
+
1829
def secret_risk_counts(text: str) -> dict[str, int]:
    """Count matches of each ``SECRET_RISK_PATTERNS`` entry in ``text``.

    Only pattern names with at least one match appear in the result.
    """
    return {
        name: hits
        for name, pattern in SECRET_RISK_PATTERNS
        if (hits := len(pattern.findall(text)))
    }
1836
+
1837
+
1838
def build_secret_scan(records: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-file secret-risk counts into the repo-map scan section.

    Files are ordered by descending total count, then path, and the list is
    truncated to ``MAX_REPO_MAP_SECRET_RISK_FILES``.
    """
    totals: dict[str, int] = {}
    risky_files: list[dict[str, Any]] = []
    for record in records:
        raw_counts = record.get("secret_risk_counts")
        per_file = dict(raw_counts) if isinstance(raw_counts, dict) else {}
        if not per_file:
            continue
        for pattern_name, hits in per_file.items():
            totals[pattern_name] = totals.get(pattern_name, 0) + hits
        risky_files.append(
            {
                "path": record["path"],
                "counts": per_file,
                "redacted_path": bool(record.get("redacted_path")),
            }
        )

    risky_files.sort(key=lambda entry: (-sum(entry["counts"].values()), entry["path"]))
    over_cap = len(risky_files) - MAX_REPO_MAP_SECRET_RISK_FILES
    return {
        "risk_counts": dict(sorted(totals.items())),
        "files_with_risks": risky_files[:MAX_REPO_MAP_SECRET_RISK_FILES],
        "files_omitted_by_cap": max(0, over_cap),
        "caveat": "Counts are local best-effort secret-pattern risk signals; raw matched values are never emitted.",
    }
1859
+
1860
+
1861
def build_token_tree(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build the size-ranked list of file and directory entries.

    Each record yields a ``file`` entry, and its byte count is added to every
    ancestor directory. Directory token proxies are derived from the byte
    totals, file proxies from ``token_proxy`` on the text. Entries are sorted
    by descending bytes, then path, and capped at
    ``MAX_REPO_MAP_TREE_ENTRIES``.
    """
    dir_totals: dict[str, dict[str, int]] = {}
    files: list[dict[str, Any]] = []
    for record in records:
        file_path = str(record["path"])
        size = int(record.get("bytes", 0) or 0)
        files.append(
            {
                "kind": "file",
                "path": file_path,
                "bytes": size,
                "token_proxy": token_proxy(str(record.get("text", ""))),
                "line_count": int(record.get("line_count", 0) or 0),
                "bytes_capped": bool(record.get("bytes_capped")),
            }
        )
        # Walk the ancestor directories from the outermost inwards.
        ancestor = ""
        for segment in file_path.split("/")[:-1]:
            ancestor = f"{ancestor}/{segment}" if ancestor else segment
            totals = dir_totals.setdefault(ancestor, {"bytes": 0, "file_count": 0})
            totals["bytes"] += size
            totals["file_count"] += 1

    directories: list[dict[str, Any]] = []
    for dir_path, totals in dir_totals.items():
        directories.append(
            {
                "kind": "directory",
                "path": dir_path,
                "bytes": totals["bytes"],
                "token_proxy": max(0, round(totals["bytes"] / TOKEN_PROXY_CHARS_PER_TOKEN)),
                "file_count": totals["file_count"],
            }
        )

    ranked = sorted(
        directories + files,
        key=lambda entry: (-int(entry.get("bytes", 0) or 0), str(entry.get("path", ""))),
    )
    return ranked[:MAX_REPO_MAP_TREE_ENTRIES]
1896
+
1897
+
1898
def signature_range(line_number: int, total_lines: int) -> LineRange:
    """Return the range from ``line_number`` to at most 24 lines further on.

    Both ends are clamped to at least 1, and the end is also clamped to
    ``total_lines``.
    """
    first = max(1, line_number)
    file_end = max(1, total_lines)
    last = min(file_end, first + 24)
    return LineRange(first, last)
1900
+
1901
+
1902
def signature_entry(record: dict[str, Any], *, kind: str, name: str, raw_signature: str, line_number: int) -> dict[str, Any]:
    """Build one signature-index entry for ``record``.

    The name is sanitized and capped at 80 characters and the signature at
    180; ``lines`` is the range produced by ``signature_range``.
    """
    total_lines = int(record.get("line_count", 0) or 1)
    span = signature_range(line_number, total_lines)
    entry: dict[str, Any] = {"path": record["path"], "kind": kind}
    entry["name"] = sanitize_explain_text(name, limit=80)
    entry["signature"] = sanitize_explain_text(raw_signature, limit=180)
    entry["line"] = line_number
    entry["lines"] = span.as_dict()
    return entry
1913
+
1914
+
1915
def python_signatures(record: dict[str, Any], text: str) -> list[dict[str, Any]]:
    """Extract top-level classes, their methods and top-level functions.

    Returns an empty list when ``text`` cannot be parsed by ``ast.parse``.
    """
    try:
        tree = ast.parse(text)
    except (SyntaxError, ValueError, RecursionError):
        return []

    source_lines = text.splitlines()
    function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)
    found: list[dict[str, Any]] = []

    def header(lineno: int, fallback: str) -> str:
        # Use the real source line when the reported line number is in range.
        if 0 < lineno <= len(source_lines):
            return source_lines[lineno - 1].strip()
        return fallback

    def emit(kind: str, node: Any, keyword: str) -> None:
        found.append(
            signature_entry(
                record,
                kind=kind,
                name=node.name,
                raw_signature=header(node.lineno, f"{keyword} {node.name}"),
                line_number=node.lineno,
            )
        )

    for node in tree.body:
        if isinstance(node, ast.ClassDef):
            emit("class", node, "class")
            for member in node.body:
                if isinstance(member, function_nodes):
                    emit("method", member, "def")
        elif isinstance(node, function_nodes):
            emit("function", node, "def")
    return found
1934
+
1935
+
1936
def regex_signatures(record: dict[str, Any], text: str) -> list[dict[str, Any]]:
    """Extract signatures line by line using regular expressions.

    In ``.md`` / ``.mdx`` files, ATX headings become ``heading`` entries.
    Every other line is tested against ``SIGNATURE_LINE_RE`` and classified as
    ``class`` or ``function``.
    """
    is_markdown = Path(str(record.get("path", ""))).suffix.lower() in {".md", ".mdx"}
    found: list[dict[str, Any]] = []
    for line_number, raw in enumerate(text.splitlines(), start=1):
        stripped = raw.strip()
        if is_markdown:
            heading = re.match(r"^(#{1,6})\s+(.+)$", stripped)
            if heading:
                found.append(
                    signature_entry(record, kind="heading", name=heading.group(2), raw_signature=stripped, line_number=line_number)
                )
                continue
        match = SIGNATURE_LINE_RE.match(raw)
        if match is None:
            continue
        # The first non-empty capture group is taken as the symbol name.
        name = next((group for group in match.groups() if group), "signature")
        if re.search(r"\bclass\s+" + re.escape(name), raw):
            kind = "class"
        else:
            kind = "function"
        found.append(signature_entry(record, kind=kind, name=name, raw_signature=stripped, line_number=line_number))
    return found
1953
+
1954
+
1955
def extract_signatures(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect signature entries for all records, sorted and capped.

    ``.py`` files are parsed with ``python_signatures``. When that yields
    nothing, or for any other suffix, ``regex_signatures`` is used. The result
    is ordered by path, line and name and truncated to
    ``MAX_REPO_MAP_SIGNATURE_ENTRIES``.
    """
    collected: list[dict[str, Any]] = []
    for record in records:
        text = str(record.get("text", ""))
        is_python = Path(str(record.get("path", ""))).suffix.lower() == ".py"
        entries = python_signatures(record, text) if is_python else []
        if not entries:
            entries = regex_signatures(record, text)
        collected.extend(entries)

    def order(entry: dict[str, Any]) -> tuple[str, int, str]:
        return (str(entry.get("path", "")), int(entry.get("line", 0) or 0), str(entry.get("name", "")))

    collected.sort(key=order)
    return collected[:MAX_REPO_MAP_SIGNATURE_ENTRIES]
1968
+
1969
+
1970
def normalize_repo_map_candidate(path: str) -> str:
    """Normalize a candidate path to POSIX form without leading slashes.

    Backslashes are treated as separators, the path is collapsed with
    ``posixpath.normpath``, and a result of ``"."`` becomes the empty string.
    """
    collapsed = posixpath.normpath(path.replace("\\", "/"))
    return "" if collapsed == "." else collapsed.lstrip("/")
1975
+
1976
+
1977
def resolve_import_target(raw_target: str, source_path: str, known_paths: set[str]) -> str | None:
    """Map an import specifier to a path in ``known_paths``, if any.

    ``./`` and ``../`` specifiers are joined onto the importing file's
    directory. Other dot-prefixed specifiers are treated as dotted relative
    modules, climbing one directory per extra leading dot. Anything else is
    treated as a dotted module path from the root.

    Returns the first candidate found in ``known_paths``, otherwise ``None``.
    """
    target = raw_target.strip()
    if not target:
        return None

    suffixes = (".py", ".ts", ".tsx", ".js", ".jsx", "/index.ts", "/index.js")
    source_dir = Path(source_path).parent.as_posix()

    if not target.startswith("."):
        module_path = target.replace(".", "/")
        candidates = [f"{module_path}{suffix}" for suffix in suffixes]
    else:
        if target.startswith(("./", "../")):
            base = normalize_repo_map_candidate(posixpath.join(source_dir, target))
        else:
            dots = len(target) - len(target.lstrip("."))
            remainder = target[dots:].replace(".", "/")
            base_dir = source_dir
            # One leading dot means the current directory; each further dot
            # climbs one level.
            for _ in range(max(0, dots - 1)):
                base_dir = posixpath.dirname(base_dir)
            if remainder:
                base = normalize_repo_map_candidate(posixpath.join(base_dir, remainder))
            else:
                base = normalize_repo_map_candidate(base_dir)
        # Relative targets may already name the file, so try the bare base first.
        candidates = [base, *(f"{base}{suffix}" for suffix in suffixes)]

    for candidate in candidates:
        resolved = normalize_repo_map_candidate(candidate)
        if resolved in known_paths:
            return resolved
    return None
2002
+
2003
+
2004
def python_from_import_targets(module_name: str, imported_names: str) -> list[str]:
    """List the import targets implied by ``from <module_name> import <names>``.

    The module itself is always first. When ``module_name`` consists only of
    dots (or is empty), each imported identifier is also appended to it as a
    further target; aliases after `` as `` and non-identifier names are ignored.
    """
    targets = [module_name]
    if module_name.strip("."):
        return targets

    # Parentheses from multi-name imports are treated as whitespace.
    without_parens = imported_names.translate(str.maketrans("()", "  "))
    identifier = re.compile(r"[A-Za-z_]\w*")
    for chunk in without_parens.split(","):
        imported = chunk.strip().split(" as ", 1)[0].strip()
        if identifier.fullmatch(imported):
            targets.append(f"{module_name}{imported}")
    return targets
2014
+
2015
+
2016
def collect_import_edges(records: list[dict[str, Any]]) -> list[dict[str, str]]:
    """Collect unique ``{"from", "to"}`` import edges between scanned files.

    Self-edges and targets outside the scanned set are ignored. Collection
    stops as soon as ``MAX_REPO_MAP_FILES`` edges have been gathered.
    """
    known = {str(record.get("path", "")) for record in records}
    recorded: set[tuple[str, str]] = set()
    edges: list[dict[str, str]] = []

    def specifiers(line: str) -> list[str]:
        # Python "from X import Y" lines get dedicated handling; every other
        # line is searched for generic import paths.
        from_match = PY_FROM_IMPORT_LINE_RE.match(line)
        if from_match:
            return python_from_import_targets(from_match.group("module"), from_match.group("names"))
        return [
            next((value for value in found.groupdict().values() if value), "")
            for found in IMPORT_PATH_RE.finditer(line)
        ]

    for record in records:
        source = str(record.get("path", ""))
        for line in str(record.get("text", "")).splitlines():
            for specifier in specifiers(line):
                target = resolve_import_target(specifier, source, known)
                if target is None or target == source:
                    continue
                pair = (source, target)
                if pair in recorded:
                    continue
                recorded.add(pair)
                edges.append({"from": source, "to": target})
                if len(edges) >= MAX_REPO_MAP_FILES:
                    return edges
    return edges
2040
+
2041
+
2042
def repo_map_seed_paths(args: argparse.Namespace, suggest_payload: dict[str, Any], build_payload: dict[str, Any]) -> set[str]:
    """Gather the paths that count as seeds for graph ranking.

    Seeds are the explicitly requested files (when lexically valid and not
    redacted), the suggested sources, and the sources included by the build.
    """
    seeds: set[str] = set()

    for raw in split_suggest_files(getattr(args, "files", None)):
        rel, _reason = lexical_rel(raw)
        if rel is None:
            continue
        display, redacted = repo_map_display_rel_path(rel.as_posix())
        if not redacted:
            seeds.add(display)

    payload_lists = (
        (suggest_payload, "sources"),
        (build_payload, "included_sources"),
    )
    for payload, key in payload_lists:
        for source in payload.get(key, []):
            if isinstance(source, dict) and isinstance(source.get("path"), str):
                seeds.add(source["path"])
    return seeds
2057
+
2058
+
2059
def build_graph_rank(
    records: list[dict[str, Any]],
    signatures: list[dict[str, Any]],
    edges: list[dict[str, str]],
    *,
    query_terms: set[str],
    seed_paths: set[str],
    secret_scan: dict[str, Any],
) -> list[dict[str, Any]]:
    """Score every record and return the highest-ranked entries.

    The score is the sum of the seed, query-path, query-content, signature,
    graph-degree and secret-risk components. Records scoring zero or less are
    dropped. The result is sorted by descending score, then path, and capped
    at ``MAX_REPO_MAP_GRAPH_RANK_ENTRIES``.
    """
    paths_with_signatures = {str(entry.get("path", "")) for entry in signatures}
    risky_paths = {
        str(entry.get("path", ""))
        for entry in secret_scan.get("files_with_risks", [])
        if isinstance(entry, dict)
    }

    # Each edge adds one to the degree of both of its endpoints.
    degree: dict[str, int] = {}
    for edge in edges:
        for endpoint in (edge["from"], edge["to"]):
            degree[endpoint] = degree.get(endpoint, 0) + 1

    ranked: list[dict[str, Any]] = []
    for record in records:
        path = str(record.get("path", ""))
        lowered = str(record.get("text", "")).lower()
        term_hits = sum(lowered.count(term) for term in query_terms)
        components = {
            "seed": 1000 if path in seed_paths else 0,
            "query_path": suggest_score_path(path, query_terms),
            "query_content": min(500, 25 * term_hits),
            "signature": 80 if path in paths_with_signatures else 0,
            "graph_degree": 25 * degree.get(path, 0),
            "secret_risk_penalty": -25 if path in risky_paths else 0,
        }
        score = sum(components.values())
        if score > 0:
            ranked.append(
                {
                    "path": path,
                    "score": score,
                    "components": components,
                    "explain_only": True,
                    "line_count": int(record.get("line_count", 0) or 0),
                }
            )

    ranked.sort(key=lambda entry: (-int(entry["score"]), str(entry["path"])))
    return ranked[:MAX_REPO_MAP_GRAPH_RANK_ENTRIES]
2098
+
2099
+
2100
def repo_map_retrieval_for(root_arg: str, display_path: str, lines: LineRange, *, redacted_path: bool) -> tuple[str | None, str | None]:
    """Return a retrieval command for a slice, or the reason none is offered.

    Returns:
        ``(command, None)`` on success, ``(None, "redacted_path")`` for a
        redacted path, or ``(None, "unsafe_root_path")`` when the root
        argument is not accepted.
    """
    if not redacted_path:
        safe_root = safe_repo_map_root_arg_for_retrieval(root_arg)
        if safe_root is not None:
            return retrieval_cli(safe_root, display_path, lines), None
        return None, "unsafe_root_path"
    return None, "redacted_path"
2107
+
2108
+
2109
def repo_map_retrieval(
    record_by_path: dict[str, dict[str, Any]],
    signatures: list[dict[str, Any]],
    graph_rank: list[dict[str, Any]],
    *,
    root_arg: str,
) -> list[dict[str, Any]]:
    """Build retrieval hints from signatures first, then from graph rank.

    Hints are unique per (path, line range, source kind) and the list is
    limited to ``MAX_REPO_MAP_RETRIEVAL_HINTS`` entries.
    """
    hints: list[dict[str, Any]] = []
    emitted: set[tuple[str, str, str]] = set()

    def add(path: str, line_range: LineRange, source: str, name: str | None = None) -> None:
        record = record_by_path.get(path)
        if record is None:
            return
        retrieval, reason = repo_map_retrieval_for(root_arg, path, line_range, redacted_path=bool(record.get("redacted_path")))
        key = (path, line_range.identity(), source)
        if key in emitted:
            return
        emitted.add(key)
        hint: dict[str, Any] = {"path": path, "source": source, "lines": line_range.as_dict()}
        if retrieval:
            hint["slice_cli"] = retrieval
        elif reason:
            hint["retrieval_omitted_reason"] = reason
        # A symbol command is only offered alongside a working slice command.
        wants_symbol = bool(name and retrieval)
        if wants_symbol and Path(path).suffix.lower() in SYMBOL_HINT_EXTENSIONS:
            symbol_argv = ["context-guard-read-symbol", "--json", path, name]
            hint["symbol_cli"] = " ".join(shlex.quote(part) for part in symbol_argv)
        hints.append(hint)

    for signature in signatures:
        span = signature.get("lines")
        if isinstance(span, dict):
            try:
                line_range = LineRange(int(span.get("start")), int(span.get("end")))
            except (TypeError, ValueError):
                continue
            add(str(signature.get("path", "")), line_range, "signature", str(signature.get("name", "")) or None)
        if len(hints) >= MAX_REPO_MAP_RETRIEVAL_HINTS:
            return hints[:MAX_REPO_MAP_RETRIEVAL_HINTS]

    for ranked in graph_rank:
        path = str(ranked.get("path", ""))
        record = record_by_path.get(path)
        if record is None:
            continue
        total_lines = int(record.get("line_count", 0) or 1)
        # Ranked files get a hint for their first lines (80 at most).
        add(path, LineRange(1, min(total_lines, 80)), "graph_rank")
        if len(hints) >= MAX_REPO_MAP_RETRIEVAL_HINTS:
            break
    return hints[:MAX_REPO_MAP_RETRIEVAL_HINTS]
2157
+
2158
+
2159
def build_repo_map_payload(
    root: Path,
    args: argparse.Namespace,
    suggest_payload: dict[str, Any],
    build_payload: dict[str, Any],
    *,
    root_arg: str,
) -> dict[str, Any]:
    """Assemble the explain-only repo-map section.

    Scans the repository records, then derives the signature index, secret
    scan, import graph, graph rank, retrieval hints and token tree from them.

    Args:
        root: Repository root to scan.
        args: Parsed CLI namespace; used to find the seed paths.
        suggest_payload: Suggest-phase payload (its ``query`` and ``sources``
            are read).
        build_payload: Build-phase payload (its ``included_sources`` are read).
        root_arg: Root as given on the command line; used for retrieval hints.

    Returns:
        The repo-map document: summary, caps, token tree, secret scan,
        signature index, graph, graph rank, retrieval hints, omitted files
        and safety notes.
    """
    records, omitted, caps = repo_map_records(root)
    record_by_path = {str(record["path"]): record for record in records}
    signatures = extract_signatures(records)
    secret_scan = build_secret_scan(records)
    edges = collect_import_edges(records)
    query_terms = suggest_tokens(str(suggest_payload.get("query", "")))
    graph_rank = build_graph_rank(
        records,
        signatures,
        edges,
        query_terms=query_terms,
        seed_paths=repo_map_seed_paths(args, suggest_payload, build_payload),
        secret_scan=secret_scan,
    )
    retrieval = repo_map_retrieval(record_by_path, signatures, graph_rank, root_arg=root_arg)
    tree = build_token_tree(records)
    total_bytes = sum(int(record.get("bytes", 0) or 0) for record in records)
    # Sum the proxy once per scanned file. Summing over the tree entries would
    # count each file again for every ancestor directory and would be cut off
    # by the tree-entry cap, so it would not line up with tree_bytes.
    total_token_proxy = sum(token_proxy(str(record.get("text", ""))) for record in records)
    return {
        "schema_version": REPO_MAP_SCHEMA_VERSION,
        "summary": {
            "files_scanned": len(records),
            "files_capped": bool(caps["files_capped"]),
            "bytes_per_file_capped_count": int(caps["bytes_per_file_capped_count"]),
            "tree_bytes": total_bytes,
            "tree_token_proxy": total_token_proxy,
            "signature_files": len({str(item.get("path", "")) for item in signatures}),
            "signature_count": len(signatures),
            "secret_risk_files": len(secret_scan.get("files_with_risks", [])),
            "graph_edges": len(edges),
        },
        "caps": caps,
        "token_tree": tree,
        "secret_scan": secret_scan,
        "signature_index": signatures,
        "graph": {
            "edges": edges[:MAX_REPO_MAP_GRAPH_RANK_ENTRIES],
            "edges_omitted_by_cap": max(0, len(edges) - MAX_REPO_MAP_GRAPH_RANK_ENTRIES),
        },
        "graph_rank": graph_rank,
        "retrieval": retrieval,
        "omitted_files": omitted[:MAX_REPO_MAP_TREE_ENTRIES],
        "safety": {
            "deterministic_local_only": True,
            "no_network": True,
            "no_model_or_embedding": True,
            "explain_only": True,
            "redacted_before_output": True,
            "tree_sitter": {"status": "unavailable_without_optional_dependency", "fallback": "python_ast_and_regex_signatures"},
            "caveats": [
                "Repo-map bytes are local sampled UTF-8 bytes and estimated chars_div_4 token proxies, not provider-token or savings claims.",
                "Graph ranking is deterministic explain metadata only; it does not change pack selection in this stage.",
            ],
        },
    }
2221
+
2222
+
2223
def build_auto_explain_payload(
    args: argparse.Namespace,
    suggest_payload: dict[str, Any],
    build_payload: dict[str, Any],
    payload: dict[str, Any],
    *,
    root: Path | None = None,
    root_arg: str = ".",
) -> dict[str, Any]:
    """Assemble the ``explain`` section attached to an ``auto`` payload.

    The result correlates every suggested source with the build source that
    was matched to it, lists de-duplicated omissions from both the suggest
    and build phases, and echoes the budget, inputs, and safety metadata.

    Args:
        args: Parsed CLI namespace; ``files``, ``diff``, ``output``,
            ``test_output``, ``top``, ``context_lines`` and ``no_artifact``
            are read from it.
        suggest_payload: Payload produced by the suggest phase.
        build_payload: Payload produced by the build phase.
        payload: The enclosing ``auto`` payload (source counts, pack and
            budget bytes, written paths, caveats, token proxy).
        root: When given, a ``repo_map`` section is added via
            ``build_repo_map_payload``.
        root_arg: Root argument forwarded to ``build_repo_map_payload``.

    Returns:
        The explain dictionary.
    """

    def dict_section(container: dict[str, Any], key: str) -> dict[str, Any]:
        # Only accept the stored value when it really is a mapping.
        value = container.get(key)
        return value if isinstance(value, dict) else {}

    def dict_entries(container: dict[str, Any], key: str) -> list[dict[str, Any]]:
        # Non-dict entries are ignored throughout the explain payload.
        return [entry for entry in container.get(key, []) if isinstance(entry, dict)]

    built = dict_entries(build_payload, "included_sources")
    claimed: set[int] = set()
    suggested = dict_entries(suggest_payload, "sources")

    # Pass 1: resolve exact matches for every suggestion before any fallback
    # lookup runs; both helpers share the same ``claimed`` index set.
    exact_by_position: dict[int, dict[str, Any]] = {}
    for position, candidate in enumerate(suggested):
        match = find_exact_build_source_for_explain(candidate, built, claimed)
        if match is not None:
            exact_by_position[position] = match

    # Pass 2: build one selection row per suggestion, using the fallback
    # matcher only where no exact match was recorded.
    selection_fields = (
        "path", "score", "priority", "reason", "label", "lines", "bytes", "retrieval_cli", "retrieval_omitted_reason",
    )
    selection: list[dict[str, Any]] = []
    for position, candidate in enumerate(suggested):
        row = copy_explain_fields(candidate, selection_fields)
        match = exact_by_position.get(position)
        if match is None:
            match = find_fallback_build_source_for_explain(candidate, built, claimed)
        if match is None:
            row["build_status"] = "not_built"
        else:
            row["build_status"] = match.get("status", "included")
            for line_key in ("requested_lines", "included_lines"):
                if line_key in match:
                    row[line_key] = copy.deepcopy(match[line_key])
            if "bytes" in match:
                row["build_bytes"] = match["bytes"]
        selection.append(row)

    omission_fields = (
        "path",
        "status",
        "reason",
        "suggest_reason",
        "priority",
        "label",
        "requested_lines",
        "included_lines",
        "lines",
        "total_lines",
        "retrieval_cli",
        "retrieval_omitted_reason",
        "input_index",
    )
    # The first entry seen for a given omission key wins; suggest-phase
    # entries are visited before build-phase entries.
    unique_omissions: dict[tuple[str, str, str, str, str], dict[str, Any]] = {}
    for phase, phase_payload in (("suggest", suggest_payload), ("build", build_payload)):
        for raw in phase_payload.get("omitted_sources", []):
            if not isinstance(raw, dict):
                continue
            row = copy_explain_fields(raw, omission_fields)
            row["phase"] = phase
            unique_omissions.setdefault(explain_omission_key(row), row)
    omissions: list[dict[str, Any]] = sorted(unique_omissions.values(), key=explain_omission_key)

    build_source_counts = dict_section(build_payload, "sources")
    auto_source_counts = dict_section(payload, "sources")
    artifact = dict_section(build_payload, "artifact")

    def source_count(key: str) -> int:
        # Prefer the auto payload's count, then the build payload's, then 0.
        return int(auto_source_counts.get(key, build_source_counts.get(key, 0)) or 0)

    pack_bytes = int(payload.get("pack_bytes", build_payload.get("pack_bytes", 0)) or 0)
    budget_bytes = int(payload.get("budget_bytes", build_payload.get("budget_bytes", 0)) or 0)
    budget_omitted_count = len([row for row in omissions if row.get("reason") == "budget_exhausted"])
    explicit_files = split_suggest_files(args.files)
    query = str(suggest_payload.get("query", ""))
    if getattr(args, "diff", None):
        diff_label = cap_label(args.diff)
    else:
        diff_label = None

    explain = {
        "schema_version": AUTO_EXPLAIN_SCHEMA_VERSION,
        "summary": {
            "suggested": int(auto_source_counts.get("suggested", len(selection)) or 0),
            "included": source_count("included"),
            "partial": source_count("partial"),
            "omitted": source_count("omitted"),
            "suggest_omitted": len(dict_entries(suggest_payload, "omitted_sources")),
            "explain_omissions": len(omissions),
            "pack_bytes": pack_bytes,
            "budget_bytes": budget_bytes,
            "manifest_written": bool(payload.get("manifest_path")),
            "pack_written": bool(payload.get("pack_path")),
            "artifact_stored": bool(artifact.get("stored")),
            "artifact_capped": bool(artifact.get("capped")),
        },
        "inputs": {
            "query": query,
            "query_present": bool(query),
            "diff": diff_label,
            "diff_present": bool(diff_label),
            "explicit_file_count": len(explicit_files),
            "output_count": len(args.output or []),
            "test_output_count": len(args.test_output or []),
            "top": bounded_int(args.top, DEFAULT_SUGGEST_TOP, 1, MAX_SUGGEST_TOP),
            "context_lines": bounded_int(args.context_lines, DEFAULT_SUGGEST_CONTEXT_LINES, 0, MAX_SUGGEST_CONTEXT_LINES),
            "no_artifact": bool(args.no_artifact),
            "manifest_path": payload.get("manifest_path"),
            "pack_path": payload.get("pack_path"),
        },
        "selection": selection,
        "omissions": omissions,
        "budget": {
            "pack_bytes": pack_bytes,
            "budget_bytes": budget_bytes,
            "remaining_bytes": budget_bytes - pack_bytes,
            "partial_count": int(build_source_counts.get("partial", 0) or 0),
            "budget_omitted_count": budget_omitted_count,
            "token_proxy": copy.deepcopy(payload.get("token_proxy", {})),
            "measurement": "observed_bytes_estimated_tokens",
            "caveat": "Byte counts are observed pack bytes; token counts are estimated chars_div_4 proxies, not provider-token savings.",
        },
        "safety": {
            "redaction": copy.deepcopy(build_payload.get("redaction", {})),
            "caveats": copy.deepcopy(payload.get("caveats", [])),
            "deterministic_local_only": True,
            "raw_output_embedded": False,
            "raw_test_output_embedded": False,
        },
    }
    if root is not None:
        explain["repo_map"] = build_repo_map_payload(root, args, suggest_payload, build_payload, root_arg=root_arg)
    return explain
2361
+
2362
+
2363
def auto_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tuple[dict[str, Any], int]:
    """Run the suggest and build phases back to back and return the ``auto`` payload.

    Output targets are checked for collisions and validated before anything
    is suggested or built. The pack is built with ``store_artifact=False``;
    the receipt is stored afterwards, and only when ``--no-artifact`` was not
    given. The pack file is written before the manifest file.

    Args:
        root: Project root the pack is built from.
        args: Parsed CLI namespace for the ``auto`` sub-command.
        root_arg: Root argument forwarded to the suggest/build helpers.

    Returns:
        ``(payload, rc)`` where ``rc`` is the code returned by ``suggest_pack``.
    """
    manifest_out = args.manifest_out
    pack_out = args.pack_out
    store_artifact = not args.no_artifact

    manifest_rel = output_rel_for_collision_check(manifest_out, "--manifest-out") if manifest_out else None
    pack_rel = output_rel_for_collision_check(pack_out, "--pack-out") if pack_out else None
    if not (manifest_rel is None or pack_rel is None):
        reject_matching_output_targets(
            root,
            first_rel=manifest_rel,
            second_rel=pack_rel,
            second_option="--pack-out",
            reason="same_as_manifest_out",
        )
    for target, option in ((manifest_out, "--manifest-out"), (pack_out, "--pack-out")):
        if target:
            validate_output_path_under_root(root, target, option)

    # The suggest phase works on a shallow copy with manifest_out cleared, so
    # the manifest is written by this function rather than by suggest_pack.
    suggest_args = copy.copy(args)
    suggest_args.manifest_out = None
    suggest_payload, rc = suggest_pack(root, suggest_args, root_arg=root_arg)
    manifest = suggest_payload["manifest"]
    specs = manifest_to_source_specs(manifest)
    budget = bounded_int(args.budget_bytes, DEFAULT_BUDGET_BYTES, MIN_BUDGET_BYTES, MAX_BUDGET_BYTES)
    build_payload = build_pack(root, specs, budget_bytes=budget, root_arg=root_arg, store_artifact=False)

    if store_artifact:
        # The receipt path depends on the pack id, so these collision checks
        # can only run once the pack has been built.
        receipt_rel = Path(PACK_DIR) / f"{build_payload['pack_id']}.json"
        for other_rel, option in ((manifest_rel, "--manifest-out"), (pack_rel, "--pack-out")):
            if other_rel is not None:
                reject_matching_output_targets(
                    root,
                    first_rel=receipt_rel,
                    second_rel=other_rel,
                    second_option=option,
                    reason="same_as_artifact_receipt",
                )

    pack_path: str | None = None
    manifest_path: str | None = None
    if pack_out:
        pack_path = write_text_under_root(root, pack_out, str(build_payload["pack"]), "--pack-out")
    if manifest_out:
        manifest_path = write_manifest_under_root(root, manifest_out, manifest)
    if store_artifact:
        build_payload["artifact"] = store_receipt(root, build_payload)

    build_hint, build_hint_omitted_reason = suggest_build_hint(root_arg, manifest_path, budget)
    suggest_payload["manifest_path"] = manifest_path
    suggest_payload["build_hint"] = build_hint
    # Drop any reason left over from the suggest phase before re-adding ours.
    suggest_payload.pop("build_hint_omitted_reason", None)
    if build_hint_omitted_reason:
        suggest_payload["build_hint_omitted_reason"] = build_hint_omitted_reason

    built_counts = build_payload.get("sources", {})
    payload: dict[str, Any] = {
        "tool": TOOL_NAME,
        "schema_version": AUTO_SCHEMA_VERSION,
        "version": VERSION,
        "mode": "auto",
        "root": display_root(root),
        "query": suggest_payload.get("query", ""),
        "budget_bytes": budget,
        "manifest": manifest,
        "manifest_path": manifest_path,
        "pack_path": pack_path,
        "suggest": suggest_payload,
        "build": build_payload,
        "sources": {
            "suggested": len(suggest_payload.get("sources", [])),
            "included": built_counts.get("included", 0),
            "partial": built_counts.get("partial", 0),
            "omitted": built_counts.get("omitted", 0),
        },
        "pack_bytes": build_payload.get("pack_bytes", 0),
        "token_proxy": build_payload.get("token_proxy", {}),
        "caveats": [
            "Deterministic local heuristics only; no model, network, embedding, or provider-cost estimate is used.",
            "Byte and token values are pack-size proxies, not billing claims.",
        ],
    }
    if build_hint_omitted_reason:
        payload["build_hint_omitted_reason"] = build_hint_omitted_reason
    if args.explain:
        payload["explain"] = build_auto_explain_payload(
            args, suggest_payload, build_payload, payload, root=root, root_arg=root_arg
        )
    return payload, rc
2448
+
2449
+
2450
def print_suggest_text(payload: dict[str, Any]) -> None:
    """Print a plain-text summary of a ``suggest`` payload to stdout.

    Output consists of a headline (source count and estimated/budget bytes),
    one bullet per suggested source, with a ``:START:END`` suffix when the
    source carries a ``lines`` mapping, followed by the manifest path and the
    build hint when present. When there is no build hint but an omission
    reason is recorded, that reason is printed instead.
    """
    sources = payload["sources"]
    print(
        f"context-guard-pack suggest: {len(sources)} source(s), "
        f"estimated {payload['estimated_pack_bytes']}/{payload['budget_bytes']} bytes"
    )
    for source in sources:
        span = source.get("lines")
        if isinstance(span, dict):
            suffix = f":{span['start']}:{span['end']}"
        else:
            suffix = ""
        print(f"- {source['path']}{suffix} priority={source['priority']} reason={source['reason']}")

    manifest_path = payload.get("manifest_path")
    if manifest_path:
        print(f"manifest: {manifest_path}")

    build_hint = payload.get("build_hint")
    if build_hint:
        print(f"build: {build_hint}")
        return
    omitted_reason = payload.get("build_hint_omitted_reason")
    if omitted_reason:
        print(f"build hint omitted: {omitted_reason}")
2465
+
2466
+
2467
def print_auto_text(payload: dict[str, Any]) -> None:
    """Print a plain-text summary of an ``auto`` payload to stdout.

    Always prints a headline. When the payload carries an ``explain`` mapping,
    it also prints a one-line explain summary, up to the first five selection
    entries, and a per-reason tally of omissions. The manifest path is printed
    when present. If a pack path is present it is printed; otherwise a blank
    line is emitted and the built pack text itself is written to stdout.
    """
    counts = payload["sources"]
    print(
        f"context-guard-pack auto: {counts['suggested']} suggested source(s), "
        f"pack {payload['pack_bytes']}/{payload['budget_bytes']} bytes"
    )

    explain = payload.get("explain")
    if isinstance(explain, dict):

        def section(key: str, expected: type) -> Any:
            # Fall back to an empty container when the stored value has the wrong type.
            value = explain.get(key)
            return value if isinstance(value, expected) else expected()

        summary = section("summary", dict)
        budget = section("budget", dict)
        used = budget.get("pack_bytes", payload.get("pack_bytes", 0))
        limit = budget.get("budget_bytes", payload.get("budget_bytes", 0))
        print(
            "explain: "
            f"selected={summary.get('suggested', 0)} "
            f"included={summary.get('included', 0)} "
            f"partial={summary.get('partial', 0)} "
            f"omitted={summary.get('omitted', 0)} "
            f"budget={used}/{limit} "
            "heuristic=local"
        )

        # The five-entry cap is applied before non-dict entries are skipped.
        for entry in section("selection", list)[:5]:
            if not isinstance(entry, dict):
                continue
            span = entry.get("included_lines") or entry.get("lines")
            suffix = f":{span.get('start')}:{span.get('end')}" if isinstance(span, dict) else ""
            print(
                f"- {entry.get('path')}{suffix} "
                f"status={entry.get('build_status', 'unknown')} "
                f"score={entry.get('score', entry.get('priority', 0))} "
                f"reason={entry.get('reason', 'local heuristic')}"
            )

        omissions = section("omissions", list)
        if omissions:
            reasons = [str(entry.get("reason", "unknown")) for entry in omissions if isinstance(entry, dict)]
            tally = ", ".join(f"{reason}={reasons.count(reason)}" for reason in sorted(set(reasons)))
            print(f"omitted reasons: {tally}")

    manifest_path = payload.get("manifest_path")
    if manifest_path:
        print(f"manifest: {manifest_path}")

    pack_path = payload.get("pack_path")
    if not pack_path:
        # No pack file was written, so the pack itself goes to stdout.
        print()
        sys.stdout.write(str(payload["build"]["pack"]))
    else:
        print(f"pack: {pack_path}")
2516
+
2517
+
869
2518
  def build_parser() -> argparse.ArgumentParser:
870
2519
  parser = argparse.ArgumentParser(description="Build budgeted local context packs with exact retrieval hints.")
871
2520
  sub = parser.add_subparsers(dest="command", required=True)
@@ -881,6 +2530,33 @@ def build_parser() -> argparse.ArgumentParser:
881
2530
  slice_cmd.add_argument("--path", required=True, help="relative file path under root")
882
2531
  slice_cmd.add_argument("--lines", required=True, help="inclusive 1-indexed START:END")
883
2532
  slice_cmd.add_argument("--json", action="store_true", help="emit JSON payload")
2533
+ suggest = sub.add_parser("suggest", help="suggest a build-compatible context pack manifest from local signals")
2534
+ suggest.add_argument("--root", default=".", help="project root; must not be a symlink")
2535
+ suggest.add_argument("--query", default="", help="task or question to match against local files")
2536
+ suggest.add_argument("--diff", help="git diff range, or staged/worktree, to seed changed-file ranges")
2537
+ suggest.add_argument("--files", "--file", dest="files", action="append", help="explicit relative file path(s), comma-separated or repeated")
2538
+ suggest.add_argument("--output", action="append", help="relative path to sanitized command output text under root")
2539
+ suggest.add_argument("--test-output", action="append", help="relative path to sanitized test output text under root")
2540
+ suggest.add_argument("--budget-bytes", type=int, default=DEFAULT_BUDGET_BYTES)
2541
+ suggest.add_argument("--top", type=int, default=DEFAULT_SUGGEST_TOP, help="maximum suggested sources")
2542
+ suggest.add_argument("--context-lines", type=int, default=DEFAULT_SUGGEST_CONTEXT_LINES, help="line context around diff/output hits")
2543
+ suggest.add_argument("--manifest-out", help="write the suggested build manifest to this relative path under root")
2544
+ suggest.add_argument("--json", action="store_true", help="emit JSON payload")
2545
+ auto = sub.add_parser("auto", help="suggest a context pack manifest and build the budgeted pack in one local step")
2546
+ auto.add_argument("--root", default=".", help="project root; must not be a symlink")
2547
+ auto.add_argument("--query", default="", help="task or question to match against local files")
2548
+ auto.add_argument("--diff", help="git diff range, or staged/worktree, to seed changed-file ranges")
2549
+ auto.add_argument("--files", "--file", dest="files", action="append", help="explicit relative file path(s), comma-separated or repeated")
2550
+ auto.add_argument("--output", action="append", help="relative path to sanitized command output text under root")
2551
+ auto.add_argument("--test-output", action="append", help="relative path to sanitized test output text under root")
2552
+ auto.add_argument("--budget-bytes", type=int, default=DEFAULT_BUDGET_BYTES)
2553
+ auto.add_argument("--top", type=int, default=DEFAULT_SUGGEST_TOP, help="maximum suggested sources")
2554
+ auto.add_argument("--context-lines", type=int, default=DEFAULT_SUGGEST_CONTEXT_LINES, help="line context around diff/output hits")
2555
+ auto.add_argument("--manifest-out", help="write the suggested build manifest to this relative path under root")
2556
+ auto.add_argument("--pack-out", help="write the built Markdown pack to this relative path under root")
2557
+ auto.add_argument("--json", action="store_true", help="emit JSON payload")
2558
+ auto.add_argument("--no-artifact", action="store_true", help="do not write .context-guard/packs receipt")
2559
+ auto.add_argument("--explain", action="store_true", help="include deterministic local selection/build explanation metadata")
884
2560
  return parser
885
2561
 
886
2562
 
@@ -919,6 +2595,22 @@ def main(argv: list[str] | None = None) -> int:
919
2595
  else:
920
2596
  print(f"context-guard-pack: {payload.get('reason')}", file=sys.stderr)
921
2597
  return rc
2598
+ if args.command == "suggest":
2599
+ payload, rc = suggest_pack(root, args, root_arg=str(args.root))
2600
+ if args.json:
2601
+ json.dump(payload, sys.stdout, ensure_ascii=False, indent=2, sort_keys=True)
2602
+ sys.stdout.write("\n")
2603
+ else:
2604
+ print_suggest_text(payload)
2605
+ return rc
2606
+ if args.command == "auto":
2607
+ payload, rc = auto_pack(root, args, root_arg=str(args.root))
2608
+ if args.json:
2609
+ json.dump(payload, sys.stdout, ensure_ascii=False, indent=2, sort_keys=True)
2610
+ sys.stdout.write("\n")
2611
+ else:
2612
+ print_auto_text(payload)
2613
+ return rc
922
2614
  raise PackError("unknown command")
923
2615
  except PackError as exc:
924
2616
  print(f"context-guard-pack: {exc}", file=sys.stderr)