@ictechgy/context-guard 0.4.1 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +9 -0
- package/README.ko.md +61 -32
- package/README.md +90 -22
- package/context-guard-kit/README.md +39 -26
- package/context-guard-kit/benchmark_runner.py +273 -8
- package/context-guard-kit/claude_transcript_cost_audit.py +325 -12
- package/context-guard-kit/context_compress.py +153 -1
- package/context-guard-kit/context_filter.py +446 -0
- package/context-guard-kit/context_guard_cli.py +3 -0
- package/context-guard-kit/context_guard_diet.py +677 -2
- package/context-guard-kit/context_pack.py +1694 -2
- package/context-guard-kit/cost_guard.py +1870 -0
- package/context-guard-kit/setup_wizard.py +820 -29
- package/context-guard-kit/trim_command_output.py +396 -45
- package/docs/benchmark-fixtures/learned-compression.tasks.example.json +24 -0
- package/docs/benchmark-fixtures/learned-compression.variants.example.json +10 -0
- package/docs/benchmark-fixtures/visual-ocr.tasks.example.json +24 -0
- package/docs/benchmark-fixtures/visual-ocr.variants.example.json +10 -0
- package/docs/benchmark-workflow-examples.md +40 -0
- package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +169 -0
- package/docs/benchmark-workflows/measured-token-workflow.example.json +170 -0
- package/docs/benchmark-workflows/provider-cache-telemetry.example.json +170 -0
- package/docs/cache-diagnostics-schema.md +75 -0
- package/docs/cache-diagnostics.example.json +116 -0
- package/docs/cache-diagnostics.schema.json +460 -0
- package/docs/distribution.md +4 -2
- package/docs/experimental-benchmark-fixtures.md +36 -0
- package/package.json +11 -2
- package/packaging/homebrew/context-guard.rb.template +3 -2
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +21 -13
- package/plugins/context-guard/README.md +24 -10
- package/plugins/context-guard/bin/context-guard +3 -0
- package/plugins/context-guard/bin/context-guard-audit +325 -12
- package/plugins/context-guard/bin/context-guard-bench +273 -8
- package/plugins/context-guard/bin/context-guard-compress +153 -1
- package/plugins/context-guard/bin/context-guard-cost +1870 -0
- package/plugins/context-guard/bin/context-guard-diet +677 -2
- package/plugins/context-guard/bin/context-guard-filter +446 -0
- package/plugins/context-guard/bin/context-guard-pack +1694 -2
- package/plugins/context-guard/bin/context-guard-setup +820 -29
- package/plugins/context-guard/bin/context-guard-trim-output +396 -45
- package/plugins/context-guard/brief/README.md +10 -3
- package/plugins/context-guard/skills/optimize/SKILL.md +5 -2
- package/plugins/context-guard/skills/setup/SKILL.md +3 -1
|
@@ -10,16 +10,19 @@ retrieval when the path is safe to display.
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
12
|
import argparse
|
|
13
|
+
import ast
|
|
13
14
|
import copy
|
|
14
15
|
import hashlib
|
|
15
16
|
import importlib.machinery
|
|
16
17
|
import importlib.util
|
|
17
18
|
import json
|
|
18
19
|
import os
|
|
20
|
+
import posixpath
|
|
19
21
|
from pathlib import Path
|
|
20
22
|
import re
|
|
21
23
|
import shlex
|
|
22
24
|
import stat
|
|
25
|
+
import subprocess
|
|
23
26
|
import sys
|
|
24
27
|
import time
|
|
25
28
|
from dataclasses import dataclass
|
|
@@ -35,22 +38,79 @@ MAX_MANIFEST_BYTES = 1_000_000
|
|
|
35
38
|
# --- Label / reason display caps --------------------------------------------
MAX_LABEL_CHARS = 160
MAX_REASON_CHARS = 120
TOKEN_PROXY_CHARS_PER_TOKEN = 4

# --- Schema identifiers emitted in JSON payloads ----------------------------
SUGGEST_SCHEMA_VERSION = "contextguard.pack-suggest.v1"
AUTO_SCHEMA_VERSION = "contextguard.pack-auto.v1"
AUTO_EXPLAIN_SCHEMA_VERSION = "contextguard.pack-auto-explain.v1"
REPO_MAP_SCHEMA_VERSION = "contextguard.pack-repo-map.v1"

# --- Suggest-mode limits ----------------------------------------------------
DEFAULT_SUGGEST_TOP = 8
MAX_SUGGEST_TOP = 50
DEFAULT_SUGGEST_CONTEXT_LINES = 20
MAX_SUGGEST_CONTEXT_LINES = 120
SUGGEST_WHOLE_FILE_MAX_LINES = 120
MAX_SUGGEST_INPUT_BYTES = 256_000
MAX_QUERY_SCAN_FILES = 2_000
MAX_QUERY_SCAN_BYTES_PER_FILE = 200_000

# --- Repo-map limits --------------------------------------------------------
MAX_REPO_MAP_FILES = 1_000
MAX_REPO_MAP_BYTES_PER_FILE = 120_000
MAX_REPO_MAP_TREE_ENTRIES = 30
MAX_REPO_MAP_SIGNATURE_ENTRIES = 40
MAX_REPO_MAP_GRAPH_RANK_ENTRIES = 30
MAX_REPO_MAP_RETRIEVAL_HINTS = 30
MAX_REPO_MAP_SECRET_RISK_FILES = 20

# --- Paths and redaction markers --------------------------------------------
PACK_DIR = ".context-guard/packs"
REDACTED_PATH_COMPONENT = "[REDACTED-PATH-COMPONENT]"

# C0 controls, DEL and C1 controls.
CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f-\x9f]")

# Secret-looking content: key blocks, vendor token prefixes, Authorization
# headers and generic "name = value" secret assignments.  The leading (?is)
# makes the whole pattern case-insensitive and dot-all.
SECRET_CONTENT_RE = re.compile(
    r"(?is)("
    r"-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----|"
    r"AKIA[0-9A-Z]{16}|"
    r"ASIA[0-9A-Z]{16}|"
    r"gh[pousr]_[A-Za-z0-9_]{20,}|"
    r"github_pat_[A-Za-z0-9_]{20,}|"
    r"glpat-[A-Za-z0-9_-]{12,}|"
    r"xox[abprs]-[A-Za-z0-9-]{10,}|"
    r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|"
    r"sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
    r"(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{16,}|"
    r"npm_[A-Za-z0-9]{20,}|"
    r"AIza[0-9A-Za-z_\-]{20,}|"
    r"(?i:Authorization)\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+|"
    r"(?<![A-Za-z0-9])(?:api[_-]?key|token|secret|password|client[_-]?secret)\s*[:=]\s*[^\s]+"
    r")"
)

# Secret-looking material inside a path string: three-part dotted tokens
# ("SG." / "eyJ" prefixes), Bearer/Basic credentials, and URLs carrying
# userinfo before an "@".
SECRET_PATH_COMPONENT_RE = re.compile(
    r"(?i)("
    r"SG\.[A-Za-z0-9_-]{16,256}\.[A-Za-z0-9_-]{16,512}|"
    r"eyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}|"
    r"\b(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]{12,}|"
    r"[a-z][a-z0-9+.-]{0,31}:/+(?:[^/\s:@]{0,256}:[^/\s@]{0,2048}|[^/\s@]{1,2048})@"
    r")"
)

# (category name, detector) pairs.
SECRET_RISK_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = (
    (
        "private_key_block",
        re.compile(r"(?is)-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----"),
    ),
    (
        "github_token",
        re.compile(
            r"gh[pousr]_[A-Za-z0-9_]{20,}|"
            r"github_pat_[A-Za-z0-9_]{20,}|"
            r"glpat-[A-Za-z0-9_-]{12,}"
        ),
    ),
    (
        "provider_api_key",
        re.compile(
            r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|"
            r"sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
            r"AIza[0-9A-Za-z_\-]{20,}"
        ),
    ),
    (
        "authorization_header",
        re.compile(r"(?i)Authorization\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+"),
    ),
    (
        "generic_secret_assignment",
        re.compile(r"(?i)(?:api[_-]?key|token|secret|password|client[_-]?secret)\s*[:=]\s*[^\s]+"),
    ),
)

# --- Repo-map file classification -------------------------------------------
REPO_MAP_TEXT_EXTENSIONS = {
    # scripting / web
    ".py", ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
    # compiled languages
    ".go", ".rs", ".java", ".kt", ".kts", ".swift",
    ".c", ".cc", ".cpp", ".h", ".hpp",
    # docs, config, shell, markup
    ".md", ".mdx", ".txt", ".json", ".yaml", ".yml", ".toml",
    ".sh", ".css", ".html",
}
SYMBOL_HINT_EXTENSIONS = {".py", ".js", ".jsx", ".ts", ".tsx", ".go", ".rs"}

# Declaration lines; the name is captured in one of five groups, in order:
# `function`, `class`, arrow-function binding, `func`, `fn`.
SIGNATURE_LINE_RE = re.compile(
    r"^\s*(?:export\s+)?(?:(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(|class\s+([A-Za-z_$][\w$]*)|"
    r"(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>|"
    r"func\s+(?:\([^)]*\)\s*)?([A-Za-z_]\w*)\s*\(|(?:pub\s+)?(?:async\s+)?fn\s+([A-Za-z_]\w*)\s*\()"
)

# Import targets: quoted specifiers (groups jsfrom / jsimport) and dotted
# module names (groups pyfrom / pyimport).
IMPORT_PATH_RE = re.compile(
    r"(?:from\s+['\"](?P<jsfrom>[^'\"]+)['\"]|"
    r"import(?:\s+[^;\n'\"]+?\s+from)?\s+['\"](?P<jsimport>[^'\"]+)['\"]|"
    r"from\s+(?P<pyfrom>\.*[A-Za-z_][\w.]*|\.+)\s+import|"
    r"import\s+(?P<pyimport>[A-Za-z_][\w.]*))"
)

# A whole `from <module> import <names>` line; `names` stops at a comment,
# semicolon or newline.
PY_FROM_IMPORT_LINE_RE = re.compile(
    r"^\s*from\s+(?P<module>\.*[A-Za-z_][\w.]*|\.+)\s+import\s+(?P<names>[^\n#;]+)"
)
|
|
54
114
|
|
|
55
115
|
|
|
56
116
|
@dataclass(frozen=True)
|
|
@@ -87,6 +147,16 @@ class ResolvedSource:
|
|
|
87
147
|
redacted_lines: int
|
|
88
148
|
|
|
89
149
|
|
|
150
|
+
@dataclass
class SuggestCandidate:
    """One file (optionally a line window) proposed for inclusion in a pack.

    Instances are created by ``add_suggest_candidate`` and later converted
    into a ``SourceSpec`` for resolution.
    """

    # Path as reported by the producing heuristic (diff, output scan, query).
    path: str
    # Ranking score; also used as the source priority when resolved.
    score: int
    # Short explanation of why the candidate was proposed.
    reason: str
    # Line window to include, or None when no specific window is known.
    lines: LineRange | None = None
    # Optional display label.
    label: str | None = None
    # Position in the candidate list at the time of insertion.
    input_index: int = 0
|
|
158
|
+
|
|
159
|
+
|
|
90
160
|
class PackError(ValueError):
    """Error raised for invalid pack inputs (bad options, unreadable diffs, unsafe paths)."""
|
|
92
162
|
|
|
@@ -176,9 +246,10 @@ def display_root(root: Path) -> str:
|
|
|
176
246
|
|
|
177
247
|
|
|
178
248
|
def display_rel_path(rel: str) -> tuple[str, bool]:
|
|
249
|
+
normalized = rel.replace("\\", "/")
|
|
179
250
|
parts: list[str] = []
|
|
180
251
|
redacted = False
|
|
181
|
-
for part in
|
|
252
|
+
for part in normalized.split("/"):
|
|
182
253
|
if not part:
|
|
183
254
|
continue
|
|
184
255
|
safe, did = sanitize_path_component(part)
|
|
@@ -187,6 +258,24 @@ def display_rel_path(rel: str) -> tuple[str, bool]:
|
|
|
187
258
|
return "/".join(parts), redacted
|
|
188
259
|
|
|
189
260
|
|
|
261
|
+
def repo_map_path_has_sensitive_evidence(value: str) -> bool:
    """Return True when *value* contains a control character or secret-looking path material."""
    if CONTROL_CHAR_RE.search(value) is not None:
        return True
    return SECRET_PATH_COMPONENT_RE.search(value) is not None
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def repo_map_display_rel_path(rel: str) -> tuple[str, bool]:
    """Return a displayable form of *rel* plus a flag telling whether it was redacted.

    Backslashes are normalised to ``/`` first.  A path with sensitive evidence
    is replaced wholesale by a short hash-based placeholder; anything else is
    delegated to ``display_rel_path``.
    """
    posix_style = rel.replace("\\", "/")
    if not repo_map_path_has_sensitive_evidence(posix_style):
        return display_rel_path(posix_style)
    digest = sha256_text(posix_style)[:12]
    return f"redacted-path#path:{digest}", True
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def repo_map_safe_raw_path_label(raw: str) -> str:
    """Return a label for the raw path *raw* that is safe to display.

    Paths with sensitive evidence collapse to a hash-based placeholder; all
    others go through ``safe_raw_path_label`` after backslash normalisation.
    """
    posix_style = raw.replace("\\", "/")
    if not repo_map_path_has_sensitive_evidence(posix_style):
        return safe_raw_path_label(posix_style)
    digest = sha256_text(posix_style)[:12]
    return f"redacted-path#path:{digest}"
|
|
277
|
+
|
|
278
|
+
|
|
190
279
|
def parse_line_range(value: object) -> LineRange | None:
|
|
191
280
|
if value is None or value == "":
|
|
192
281
|
return None
|
|
@@ -484,7 +573,7 @@ def retrieval_cli(root_arg: str, display_path: str, lines: LineRange) -> str:
|
|
|
484
573
|
|
|
485
574
|
def safe_root_arg_for_retrieval(root_arg: str) -> str | None:
|
|
486
575
|
text = str(root_arg)
|
|
487
|
-
if SECRET_CONTENT_RE.search(text):
|
|
576
|
+
if CONTROL_CHAR_RE.search(text) or SECRET_CONTENT_RE.search(text) or SECRET_PATH_COMPONENT_RE.search(text):
|
|
488
577
|
return None
|
|
489
578
|
for part in text.replace("\\", "/").split("/"):
|
|
490
579
|
if not part:
|
|
@@ -495,6 +584,13 @@ def safe_root_arg_for_retrieval(root_arg: str) -> str | None:
|
|
|
495
584
|
return text
|
|
496
585
|
|
|
497
586
|
|
|
587
|
+
def safe_repo_map_root_arg_for_retrieval(root_arg: str) -> str | None:
    """Return the root argument if it may appear in a retrieval command, else None.

    The repo-map sensitivity check runs first; the remaining checks are
    delegated to ``safe_root_arg_for_retrieval``.
    """
    as_text = str(root_arg)
    sensitive = repo_map_path_has_sensitive_evidence(as_text)
    return None if sensitive else safe_root_arg_for_retrieval(as_text)
|
|
592
|
+
|
|
593
|
+
|
|
498
594
|
def retrieval_for(root_arg: str, display_path: str, lines: LineRange, *, redacted_path: bool) -> tuple[str | None, str | None]:
|
|
499
595
|
if redacted_path:
|
|
500
596
|
return None, "redacted_path"
|
|
@@ -866,6 +962,1559 @@ def slice_source(root: Path, *, raw_path: str, lines: LineRange) -> tuple[dict[s
|
|
|
866
962
|
return payload, 0
|
|
867
963
|
|
|
868
964
|
|
|
965
|
+
def suggest_tokens(text: str) -> set[str]:
    """Return the lower-cased search terms found in *text*.

    Secret-looking spans are blanked out before tokenising so they never
    become query terms.  Terms are at least two characters long.
    """
    scrubbed = SECRET_CONTENT_RE.sub(" ", text.lower())
    words = re.findall(r"[a-z0-9_][a-z0-9_.-]{1,}", scrubbed)
    return {word for word in words if len(word) >= 2}
|
|
968
|
+
|
|
969
|
+
|
|
970
|
+
def suggest_score_path(path: str, query_terms: set[str]) -> int:
    """Score *path* at 120 points per query term that occurs in its lower-cased form."""
    haystack = path.lower()
    return sum(120 for term in query_terms if term in haystack)
|
|
977
|
+
|
|
978
|
+
|
|
979
|
+
def suggest_reason(*parts: str) -> str:
    """Join the non-empty *parts* with "; " and cap the result for display.

    Falls back to "local heuristic" when nothing usable remains.
    """
    joined = "; ".join(filter(None, parts))
    capped = cap_label(joined, default="local heuristic", limit=MAX_REASON_CHARS)
    return capped or "local heuristic"
|
|
981
|
+
|
|
982
|
+
|
|
983
|
+
def split_suggest_files(values: list[str] | None) -> list[str]:
    """Flatten comma-separated path arguments into a list of non-empty, stripped entries."""
    pieces = (chunk.strip() for value in values or [] for chunk in str(value).split(","))
    return [piece for piece in pieces if piece]
|
|
991
|
+
|
|
992
|
+
|
|
993
|
+
def line_window(line_number: int, total_lines: int | None, context_lines: int) -> LineRange:
    """Return the line range covering *line_number* plus *context_lines* on each side.

    Args:
        line_number: 1-based anchor line.
        total_lines: Number of lines in the file, or None when unknown.
        context_lines: Lines of context to include before and after the anchor.

    Returns:
        A ``LineRange`` whose start is at least 1.  When *total_lines* is
        known, both ends are clamped to the file so the range is never
        inverted, even if the anchor lies beyond the last line.
    """
    start = max(1, line_number - context_lines)
    end = max(start, line_number + context_lines)
    if total_lines is not None:
        last_line = max(1, total_lines)
        end = min(end, last_line)
        # Previously only `end` was clamped, so an anchor past the end of the
        # file produced start > end.
        start = min(start, end)
    return LineRange(start, end)
|
|
1000
|
+
|
|
1001
|
+
|
|
1002
|
+
def merge_line_window(existing: LineRange | None, line_number: int, context_lines: int) -> LineRange:
    """Widen *existing* so it also covers the context window around *line_number*.

    When there is no existing range, the fresh window is returned as is.
    """
    fresh = line_window(line_number, None, context_lines)
    if existing is not None:
        lowest = min(existing.start, fresh.start)
        highest = max(existing.end, fresh.end)
        return LineRange(lowest, highest)
    return fresh
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
def add_suggest_candidate(
    candidates: list[SuggestCandidate],
    *,
    path: str,
    score: int,
    reason: str,
    lines: LineRange | None = None,
    label: str | None = None,
) -> None:
    """Append a new candidate to *candidates* in place.

    The reason and label are capped for display, and the candidate records
    its insertion position as ``input_index``.
    """
    entry = SuggestCandidate(
        path=path,
        score=score,
        reason=suggest_reason(reason),
        lines=lines,
        label=cap_label(label),
        input_index=len(candidates),
    )
    candidates.append(entry)
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
def run_git_diff(root: Path, diff_ref: str) -> str:
    """Run ``git diff`` in *root* and return its sanitized, size-capped output.

    *diff_ref* is either an alias ("staged"/"cached" for the index,
    "worktree"/"unstaged"/"working-tree" for unstaged changes) or a revision
    expression handed to git as a single argument.

    Raises:
        PackError: for an empty reference, a reference starting with "-",
            a failure to launch git, a timeout, or a non-zero git exit status.
    """
    revision = diff_ref.strip()
    if not revision:
        raise PackError("empty --diff")

    if revision in {"staged", "--staged", "cached", "--cached"}:
        extra_args = ["--cached"]
    elif revision in {"worktree", "unstaged", "working-tree"}:
        extra_args = []
    elif revision.startswith("-"):
        # Never let user input be parsed by git as an option.
        raise PackError("invalid --diff: revision must not start with '-'")
    else:
        extra_args = [revision]

    argv = ["git", "-C", str(root), "diff", "--no-ext-diff", "--no-textconv", "--unified=3", *extra_args]
    try:
        completed = subprocess.run(argv, text=True, errors="replace", capture_output=True, timeout=10, check=False)
    except (OSError, UnicodeError, subprocess.TimeoutExpired) as exc:
        raise PackError(f"could not read diff: {exc.__class__.__name__}") from exc

    if completed.returncode != 0:
        # Surface only the first sanitized line of git's complaint.
        complaint = completed.stderr or completed.stdout or "git diff failed"
        detail = sanitize_text(complaint)[0].strip().splitlines()
        message = detail[0] if detail else "git diff failed"
        raise PackError(f"could not read diff: {cap_label(message, default='git diff failed', limit=160)}")

    capped_output = completed.stdout[:MAX_SUGGEST_INPUT_BYTES]
    return sanitize_text(capped_output)[0]
|
|
1052
|
+
|
|
1053
|
+
|
|
1054
|
+
def collect_diff_candidates(root: Path, diff_ref: str, query_terms: set[str], context_lines: int) -> list[SuggestCandidate]:
    """Turn every hunk of ``git diff`` output into a suggestion candidate.

    Each hunk yields one candidate for the file named in the preceding
    ``diff --git`` header, covering the hunk's new-side lines widened by
    *context_lines*.  The base score is 7000 plus the path/query score.
    """
    diff_text = run_git_diff(root, diff_ref)
    found: list[SuggestCandidate] = []
    header_re = re.compile(r"^diff --git a/(.+?) b/(.+)$")
    hunk_re = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
    active_path: str | None = None

    for row in diff_text.splitlines():
        if row.startswith("diff --git "):
            header = header_re.match(row)
            if header is None:
                # Unparseable header: ignore hunks until the next file.
                active_path = None
            else:
                old_side, new_side = header.groups()
                active_path = new_side if new_side != "/dev/null" else old_side
            continue
        if active_path is None:
            continue
        hunk = hunk_re.match(row)
        if hunk is None:
            continue

        first_new = int(hunk.group(1))
        new_count = int(hunk.group(2) or "1")
        last_new = max(first_new, first_new + max(1, new_count) - 1)
        window_start = max(1, first_new - context_lines)
        window_end = max(window_start, last_new + context_lines)
        window = LineRange(window_start, window_end)
        score = 7_000 + suggest_score_path(active_path, query_terms)
        add_suggest_candidate(
            found,
            path=active_path,
            score=score,
            reason="changed diff hunk",
            lines=window,
            label=f"diff:{safe_raw_path_label(active_path)}",
        )
    return found
|
|
1086
|
+
|
|
1087
|
+
|
|
1088
|
+
# Matches a relative file reference in tool output: optional "./", optional
# directories, a file name with a known extension, then an optional ":<line>".
#
# The extension alternation is ordered and Python takes the first branch that
# matches, so without a trailing boundary "app.tsx:42" matched as "app.ts"
# (losing the line number), "x.json" as "x.js", "x.cpp" as "x.c" and
# "mod.pyc" as "mod.py".  The negative lookahead forces the whole extension
# to be consumed before the match is accepted.
OUTPUT_PATH_RE = re.compile(
    r"(?<![A-Za-z0-9_./-])"
    r"(?P<path>(?:\.\/)?(?:[A-Za-z0-9_.-]+/)*[A-Za-z0-9_.-]+\."
    r"(?:py|js|jsx|ts|tsx|mjs|cjs|md|json|yml|yaml|toml|sh|css|html|txt|rb|go|rs|java|kt|swift|c|cc|cpp|h|hpp)"
    r"(?![A-Za-z0-9_]))"
    r"(?::(?P<line>\d+))?"
)
|
|
1094
|
+
|
|
1095
|
+
|
|
1096
|
+
def read_text_input_under_root(root: Path, raw_path: str) -> tuple[str | None, dict[str, Any] | None]:
    """Read a capped, sanitized text file located under *root*.

    Returns:
        ``(text, None)`` on success, or ``(None, omission)`` where *omission*
        is a dict with ``path``, ``status`` ("omitted") and ``reason``.
    """

    def _omitted(shown: str, why: object, **extra: Any) -> tuple[None, dict[str, Any]]:
        return None, {"path": shown, "status": "omitted", "reason": why, **extra}

    rel, reason = lexical_rel(raw_path)
    display = safe_raw_path_label(raw_path)
    if rel is None:
        return _omitted(display, reason)

    display, redacted = display_rel_path(rel.as_posix())
    if redacted:
        return _omitted(display, "redacted_path", retrieval_omitted_reason="redacted_path")

    handle, reason = open_regular_under_root(root, rel)
    if handle is None:
        return _omitted(display, reason)

    try:
        with handle:
            # Read one unit past the cap so oversized input can be detected.
            text = handle.read(MAX_SUGGEST_INPUT_BYTES + 1)
    except (OSError, UnicodeError):
        return _omitted(display, "unsafe_path")

    encoded_size = len(text.encode("utf-8", errors="replace"))
    if encoded_size > MAX_SUGGEST_INPUT_BYTES:
        text = text[:MAX_SUGGEST_INPUT_BYTES]
    sanitized, _was_redacted = sanitize_text(text)
    return sanitized, None
|
|
1116
|
+
|
|
1117
|
+
|
|
1118
|
+
def collect_output_candidates(
    root: Path,
    raw_paths: list[str] | None,
    query_terms: set[str],
    context_lines: int,
    *,
    origin: str,
) -> tuple[list[SuggestCandidate], list[dict[str, Any]]]:
    """Scan saved tool-output files for file references and suggest them.

    Each readable input is searched with ``OUTPUT_PATH_RE``.  References that
    carry a line number get a merged context window; plain references get no
    window.  Unreadable inputs are reported in the second list, tagged with
    *origin*.

    Returns:
        ``(candidates, omitted)``.
    """
    suggestions: list[SuggestCandidate] = []
    skipped: list[dict[str, Any]] = []

    for raw in raw_paths or []:
        text, omission_item = read_text_input_under_root(root, raw)
        if omission_item is not None:
            omission_item["origin"] = origin
            skipped.append(omission_item)
            continue
        assert text is not None

        windows: dict[str, LineRange | None] = {}
        for hit in OUTPUT_PATH_RE.finditer(text):
            referenced = hit.group("path")
            if referenced.startswith("./"):
                referenced = referenced[2:]
            line_digits = hit.group("line")
            if not line_digits:
                # Keep any window already recorded for this path.
                windows.setdefault(referenced, None)
                continue
            try:
                line_number = int(line_digits)
            except ValueError:
                line_number = 1
            windows[referenced] = merge_line_window(windows.get(referenced), line_number, context_lines)

        for referenced in sorted(windows):
            score = 5_000 + suggest_score_path(referenced, query_terms)
            add_suggest_candidate(
                suggestions,
                path=referenced,
                score=score,
                reason=f"{origin} referenced path",
                lines=windows[referenced],
                label=f"{origin}:{safe_raw_path_label(referenced)}",
            )
    return suggestions, skipped
|
|
1160
|
+
|
|
1161
|
+
|
|
1162
|
+
def git_ls_files(root: Path) -> list[str]:
    """List candidate files under *root*, capped at ``MAX_QUERY_SCAN_FILES`` entries.

    ``git ls-files -z`` is tried first.  If git cannot be run, times out or
    exits non-zero, the tree is walked with ``os.walk`` instead, skipping a
    fixed set of tool/build directories and anything starting with ".pytest".
    """
    try:
        listing = subprocess.run(
            ["git", "-C", str(root), "ls-files", "-z"],
            text=False,
            capture_output=True,
            timeout=10,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        listing = None

    if listing is not None and listing.returncode == 0:
        # Bound the bytes examined before splitting on NUL separators.
        capped = listing.stdout[: MAX_QUERY_SCAN_FILES * 512]
        names = [chunk.decode("utf-8", "replace") for chunk in capped.split(b"\0") if chunk]
        return names[:MAX_QUERY_SCAN_FILES]

    ignored = {".git", ".omx", ".context-guard", "node_modules", "dist", "build", "__pycache__"}
    collected: list[str] = []
    for folder, subdirs, filenames in os.walk(root):
        # Prune in place so os.walk does not descend into ignored directories.
        subdirs[:] = [d for d in subdirs if not (d in ignored or d.startswith(".pytest"))]
        base = Path(folder)
        for filename in filenames:
            collected.append((base / filename).relative_to(root).as_posix())
            if len(collected) >= MAX_QUERY_SCAN_FILES:
                return collected
    return collected
|
|
1187
|
+
|
|
1188
|
+
|
|
1189
|
+
def collect_query_candidates(root: Path, query_terms: set[str], context_lines: int) -> list[SuggestCandidate]:
    """Suggest files whose path or content matches any of *query_terms*.

    Every listed file is scored on its display path and then scanned line by
    line, 250 points per term per matching line.  Scanning stops at the
    per-file byte budget, or after ``SUGGEST_WHOLE_FILE_MAX_LINES`` lines if
    nothing has matched yet.  Files with no match at all are dropped; the
    others score 3000 plus path and content points, with a window around the
    first matching line when there is one.
    """
    if not query_terms:
        return []

    results: list[SuggestCandidate] = []
    for listed in git_ls_files(root):
        rel, rel_problem = lexical_rel(listed)
        if rel is None or rel_problem:
            continue
        display, redacted = display_rel_path(rel.as_posix())
        if redacted:
            continue
        path_score = suggest_score_path(display, query_terms)
        handle, _open_reason = open_regular_under_root(root, rel)
        if handle is None:
            continue

        first_hit: int | None = None
        content_score = 0
        try:
            with handle:
                consumed = 0
                for lineno, raw_line in enumerate(handle, start=1):
                    consumed += byte_len(raw_line)
                    if consumed > MAX_QUERY_SCAN_BYTES_PER_FILE:
                        break
                    # Give up early on long files that show no promise so far.
                    if lineno > SUGGEST_WHOLE_FILE_MAX_LINES and not (content_score or path_score):
                        break
                    haystack = raw_line.lower()
                    hits = sum(term in haystack for term in query_terms)
                    if hits:
                        content_score += 250 * hits
                        if first_hit is None:
                            first_hit = lineno
        except (OSError, UnicodeError):
            # Unreadable or undecodable file: skip it silently.
            continue

        if not (path_score or content_score):
            continue
        if first_hit is None:
            window = None
            why = "query matched file path"
        else:
            window = line_window(first_hit, None, context_lines)
            why = "query matched file content"
        add_suggest_candidate(
            results,
            path=display,
            score=3_000 + path_score + content_score,
            reason=why,
            lines=window,
            label=f"query:{display}",
        )
    return results
|
|
1241
|
+
|
|
1242
|
+
|
|
1243
|
+
def source_selected_range(source: ResolvedSource) -> LineRange:
    """Return the line range covered by ``source.selected_lines``.

    The range starts at the requested start line (1 when no range was
    requested) and always spans at least one line.
    """
    requested = source.requested_lines
    first = requested.start if requested else 1
    span = max(len(source.selected_lines), 1)
    return LineRange(first, first + span - 1)
|
|
1246
|
+
|
|
1247
|
+
|
|
1248
|
+
def resolved_block_bytes(source: ResolvedSource, *, root_arg: str) -> int:
    """Return the byte length of *source* rendered as an "included" block."""
    rendered = render_block(
        source,
        source.selected_lines,
        root_arg=root_arg,
        status="included",
        included=source_selected_range(source),
    )
    return byte_len(rendered)
|
|
1251
|
+
|
|
1252
|
+
|
|
1253
|
+
def manifest_source_for_candidate(source: ResolvedSource, *, priority: int, label: str | None) -> dict[str, Any]:
    """Build the manifest entry for a resolved suggestion.

    ``label`` is included only when truthy, and ``lines`` only when the
    source was resolved with an explicit line range.
    """
    entry: dict[str, Any] = {"path": source.display_path, "priority": priority}
    if label:
        entry["label"] = label
    if source.requested_lines is None:
        return entry
    entry["lines"] = source_selected_range(source).as_dict()
    return entry
|
|
1260
|
+
|
|
1261
|
+
|
|
1262
|
+
def suggested_source_payload(source: ResolvedSource, candidate: SuggestCandidate, *, root_arg: str) -> dict[str, Any]:
    """Build the JSON-ready description of one accepted suggestion.

    The payload carries the display path, score (also reported as priority),
    reason, included line range and byte size of the selected text.  It also
    carries the label when set, and either a retrieval command or the reason
    that command was omitted.
    """
    included = source_selected_range(source)
    selected_text = "".join(source.selected_lines)
    payload: dict[str, Any] = {
        "path": source.display_path,
        "priority": candidate.score,
        "score": candidate.score,
        "reason": candidate.reason,
        "lines": included.as_dict(),
        "bytes": byte_len(selected_text),
    }
    if candidate.label:
        payload["label"] = candidate.label

    command, omitted_why = retrieval_for(
        root_arg,
        source.display_path,
        included,
        redacted_path=source.redacted_path,
    )
    if command:
        payload["retrieval_cli"] = command
    elif omitted_why:
        payload["retrieval_omitted_reason"] = omitted_why
    return payload
|
|
1280
|
+
|
|
1281
|
+
|
|
1282
|
+
def normalize_suggest_source(root: Path, candidate: SuggestCandidate) -> tuple[ResolvedSource | None, dict[str, Any] | None]:
    """Resolve *candidate* into a source, or explain why it was omitted.

    Candidates without a line window that point at a file longer than
    ``SUGGEST_WHOLE_FILE_MAX_LINES`` are resolved a second time, limited to
    the leading lines of the file.

    Returns:
        ``(source, None)`` on success, otherwise ``(None, omission)``.
    """

    def _spec(lines: LineRange | None) -> SourceSpec:
        return SourceSpec(
            path=candidate.path,
            priority=candidate.score,
            lines=lines,
            label=candidate.label,
            input_index=candidate.input_index,
            origin="suggest",
        )

    spec = _spec(candidate.lines)
    source, omitted_item = resolve_source(root, spec)
    if omitted_item is not None:
        omitted_item["reason"] = omitted_item.get("reason") or candidate.reason
        omitted_item["suggest_reason"] = candidate.reason
        return None, omitted_item
    assert source is not None

    if source.redacted_path:
        return None, omission(spec, "redacted_path", path=source.display_path, redacted_path=True)

    whole_file_too_long = spec.lines is None and source.total_lines > SUGGEST_WHOLE_FILE_MAX_LINES
    if not whole_file_too_long:
        return source, None

    head_only = LineRange(1, min(SUGGEST_WHOLE_FILE_MAX_LINES, source.total_lines))
    source, omitted_item = resolve_source(root, _spec(head_only))
    if omitted_item is not None:
        omitted_item["suggest_reason"] = candidate.reason
        return None, omitted_item
    assert source is not None
    return source, None
|
|
1314
|
+
|
|
1315
|
+
|
|
1316
|
+
def write_manifest_under_root(root: Path, raw_path: str, manifest: dict[str, Any]) -> str:
    """Serialise *manifest* as indented, key-sorted JSON and write it under *root*.

    Returns whatever ``write_text_under_root`` returns for the written path.
    """
    serialized = json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True)
    return write_text_under_root(root, raw_path, serialized + "\n", "--manifest-out")
|
|
1319
|
+
|
|
1320
|
+
|
|
1321
|
+
def validate_output_path_under_root(root: Path, raw_path: str, option_name: str) -> str:
    """Check that *raw_path* names a writable regular-file location under *root*.

    Parent directories are opened one component at a time through
    ``open_dir_no_follow``.  An existing target must open for writing and be
    a regular file.  A missing target is probed by creating and removing a
    temporary file in the same directory.

    Returns:
        The display form of the validated relative path.

    Raises:
        PackError: if the path is lexically invalid, redacted, missing a
            parent directory, not a regular file, or not writable.
    """

    def _with_optional_flags(base: int) -> int:
        # Add the hardening flags this platform actually provides.
        for flag_name in ("O_NOFOLLOW", "O_CLOEXEC", "O_NONBLOCK"):
            base |= getattr(os, flag_name, 0)
        return base

    def _close_quietly(fd: int) -> None:
        try:
            os.close(fd)
        except OSError:
            pass

    rel, reason = lexical_rel(raw_path)
    if rel is None:
        raise PackError(f"invalid {option_name}: {reason}")
    display, redacted = display_rel_path(rel.as_posix())
    if redacted:
        raise PackError(f"invalid {option_name}: redacted_path")

    parent_parts = rel.parts[:-1]
    filename = rel.parts[-1]
    current_fd: int | None = None
    file_fd = -1
    try:
        current_fd = open_dir_no_follow(root)
        for part in parent_parts:
            next_fd = open_dir_no_follow(part, dir_fd=current_fd)
            os.close(current_fd)
            current_fd = next_fd

        try:
            file_fd = os.open(filename, _with_optional_flags(os.O_WRONLY), dir_fd=current_fd)
            if not stat.S_ISREG(os.fstat(file_fd).st_mode):
                raise PackError(f"invalid {option_name}: unsafe_path")
        except FileNotFoundError:
            # Target does not exist yet: prove the directory is writable by
            # creating, then removing, a uniquely named probe file.
            temp_fd = -1
            digest = hashlib.sha256(raw_path.encode("utf-8", "replace")).hexdigest()[:10]
            temp_name = f".context-guard-pack-preflight-{os.getpid()}-{digest}"
            try:
                create_flags = _with_optional_flags(os.O_WRONLY | os.O_CREAT | os.O_EXCL)
                temp_fd = os.open(temp_name, create_flags, 0o600, dir_fd=current_fd)
            except OSError as exc:
                raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
            finally:
                if temp_fd >= 0:
                    _close_quietly(temp_fd)
                    try:
                        os.unlink(temp_name, dir_fd=current_fd)
                    except OSError:
                        pass
        except IsADirectoryError as exc:
            raise PackError(f"invalid {option_name}: unsafe_path") from exc
        except OSError as exc:
            raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    except PackError:
        raise
    except FileNotFoundError as exc:
        # Raised while opening root or a parent directory.
        raise PackError(f"invalid {option_name}: missing") from exc
    except OSError as exc:
        raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    finally:
        if file_fd >= 0:
            _close_quietly(file_fd)
        if current_fd is not None:
            _close_quietly(current_fd)
    return display
|
|
1396
|
+
|
|
1397
|
+
|
|
1398
|
+
def output_rel_for_collision_check(raw_path: str, option_name: str) -> Path:
    """Return the lexically normalised relative path for *raw_path*.

    Raises:
        PackError: if the path is lexically invalid or would be redacted
            for display.
    """
    rel, reason = lexical_rel(raw_path)
    if rel is None:
        raise PackError(f"invalid {option_name}: {reason}")
    was_redacted = display_rel_path(rel.as_posix())[1]
    if was_redacted:
        raise PackError(f"invalid {option_name}: redacted_path")
    return rel
|
|
1406
|
+
|
|
1407
|
+
|
|
1408
|
+
def existing_output_identity_under_root(root: Path, rel: Path) -> tuple[int, int] | None:
    """Return ``(st_dev, st_ino)`` for the regular file at *rel* under *root*.

    Directories are traversed through ``open_dir_no_follow`` and the final
    component is stat'ed without following symlinks.  Returns None when the
    target is missing, is not a regular file, or cannot be inspected.
    """
    dir_fd: int | None = None
    try:
        dir_fd = open_dir_no_follow(root)
        for segment in rel.parts[:-1]:
            child_fd = open_dir_no_follow(segment, dir_fd=dir_fd)
            os.close(dir_fd)
            dir_fd = child_fd
        info = os.stat(rel.parts[-1], dir_fd=dir_fd, follow_symlinks=False)
        if stat.S_ISREG(info.st_mode):
            return int(info.st_dev), int(info.st_ino)
        return None
    except (FileNotFoundError, OSError, NotImplementedError):
        return None
    finally:
        if dir_fd is not None:
            try:
                os.close(dir_fd)
            except OSError:
                pass
|
|
1428
|
+
|
|
1429
|
+
|
|
1430
|
+
def reject_matching_output_targets(
    root: Path,
    *,
    first_rel: Path,
    second_rel: Path,
    second_option: str,
    reason: str,
) -> None:
    """Raise ``PackError`` when two output paths name the same target.

    Two paths collide when they are equal, when their POSIX forms are equal
    after case folding, or when both resolve to an existing file with the same
    identity as reported by ``existing_output_identity_under_root``.
    The error message names ``second_option`` and carries ``reason``.
    """
    identity_a = existing_output_identity_under_root(root, first_rel)
    identity_b = existing_output_identity_under_root(root, second_rel)

    if first_rel == second_rel:
        lexical_clash = True
    else:
        lexical_clash = first_rel.as_posix().casefold() == second_rel.as_posix().casefold()
    on_disk_clash = identity_a is not None and identity_a == identity_b

    if lexical_clash or on_disk_clash:
        raise PackError(f"invalid {second_option}: {reason}")
|
|
1444
|
+
|
|
1445
|
+
|
|
1446
|
+
def write_text_under_root(root: Path, raw_path: str, content: str, option_name: str) -> str:
    """Write ``content`` as UTF-8 text to ``raw_path`` beneath ``root``.

    The path is first validated with ``lexical_rel`` and ``display_rel_path``.
    Parent directories are then opened one component at a time via
    ``open_dir_no_follow`` and the file is opened relative to the last
    directory descriptor.

    Returns the display form of the path produced by ``display_rel_path``.

    Raises ``PackError`` (labelled with ``option_name``) when the path is
    rejected lexically, is flagged as redacted, a component is missing, the
    opened target is not a regular file, or any other ``OSError`` occurs.
    """
    rel, reason = lexical_rel(raw_path)
    if rel is None:
        raise PackError(f"invalid {option_name}: {reason}")
    display, redacted = display_rel_path(rel.as_posix())
    if redacted:
        raise PackError(f"invalid {option_name}: redacted_path")
    parent_parts = rel.parts[:-1]
    filename = rel.parts[-1]
    # Descriptors are tracked so the ``finally`` clause can release whatever is still open.
    current_fd: int | None = None
    file_fd = -1
    try:
        current_fd = open_dir_no_follow(root)
        for part in parent_parts:
            next_fd = open_dir_no_follow(part, dir_fd=current_fd)
            os.close(current_fd)
            current_fd = next_fd
        # Create or truncate; optional flags are only added where the platform defines them.
        flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
        if hasattr(os, "O_NOFOLLOW"):
            flags |= os.O_NOFOLLOW
        if hasattr(os, "O_CLOEXEC"):
            flags |= os.O_CLOEXEC
        if hasattr(os, "O_NONBLOCK"):
            flags |= os.O_NONBLOCK
        file_fd = os.open(filename, flags, 0o600, dir_fd=current_fd)
        st = os.fstat(file_fd)
        # Check the opened descriptor itself rather than the path name.
        if not stat.S_ISREG(st.st_mode):
            raise PackError(f"invalid {option_name}: unsafe_path")
        with os.fdopen(file_fd, "w", encoding="utf-8") as handle:
            # The file object now owns the descriptor; reset the marker so
            # the ``finally`` clause does not close it a second time.
            file_fd = -1
            handle.write(content)
    except PackError:
        raise
    except FileNotFoundError as exc:
        raise PackError(f"invalid {option_name}: missing") from exc
    except OSError as exc:
        raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    finally:
        # Best-effort cleanup: close failures are deliberately ignored.
        if file_fd >= 0:
            try:
                os.close(file_fd)
            except OSError:
                pass
        if current_fd is not None:
            try:
                os.close(current_fd)
            except OSError:
                pass
    return display
|
|
1495
|
+
|
|
1496
|
+
|
|
1497
|
+
def manifest_to_source_specs(manifest: dict[str, Any]) -> list[SourceSpec]:
    """Convert a manifest mapping into a list of ``SourceSpec`` objects.

    Raises ``PackError`` when the manifest version differs from ``VERSION``,
    when ``sources`` is not a list, when an entry is not a mapping, or when an
    entry has no ``path`` key. An entry whose ``lines`` value is rejected by
    ``parse_line_range`` is kept with the placeholder ``LineRange(-1, -1)``.
    """
    declared_version = manifest.get("version", VERSION)
    if declared_version != VERSION:
        raise PackError(f"unsupported manifest version: {declared_version}")

    entries = manifest.get("sources")
    if not isinstance(entries, list):
        raise PackError("manifest sources must be a list")

    result: list[SourceSpec] = []
    for position, entry in enumerate(entries):
        if not isinstance(entry, dict):
            raise PackError("manifest sources must be objects")
        if "path" not in entry:
            raise PackError("manifest source missing path")
        try:
            line_range = parse_line_range(entry.get("lines"))
        except PackError:
            # Keep the entry but mark its range with a placeholder.
            line_range = LineRange(-1, -1)
        spec = SourceSpec(
            path=str(entry["path"]),
            priority=bounded_int(entry.get("priority"), 0, -1_000_000, 1_000_000),
            lines=line_range,
            label=cap_label(entry.get("label")),
            input_index=position,
            origin="auto",
        )
        result.append(spec)
    return result
|
|
1523
|
+
|
|
1524
|
+
|
|
1525
|
+
def build_suggest_manifest(sources: list[dict[str, Any]]) -> dict[str, Any]:
    """Build a manifest mapping from selected source entries.

    Each output entry carries ``path`` and ``priority`` and, when present on
    the input, ``label`` and ``lines``. The result has the keys ``version``
    (set to ``VERSION``) and ``sources``.
    """
    optional_keys = ("label", "lines")
    entries: list[dict[str, Any]] = []
    for item in sources:
        entry: dict[str, Any] = {"path": item["path"], "priority": item["priority"]}
        entry.update({key: item[key] for key in optional_keys if key in item})
        entries.append(entry)
    return {"version": VERSION, "sources": entries}
|
|
1535
|
+
|
|
1536
|
+
|
|
1537
|
+
def suggest_build_hint(root_arg: str, manifest_path: str | None, budget: int) -> tuple[str | None, str | None]:
    """Return ``(command, omitted_reason)`` for a follow-up build invocation.

    When ``safe_root_arg_for_retrieval`` rejects ``root_arg`` the command is
    ``None`` and the reason is ``"unsafe_root_path"``. Otherwise the command is
    a shell-quoted ``context-guard-pack build`` line, prefixed with a ``cd``
    into the root unless the root is ``"."`` or empty.
    """
    safe_root = safe_root_arg_for_retrieval(root_arg)
    if safe_root is None:
        return None, "unsafe_root_path"

    argv = [
        "context-guard-pack",
        "build",
        "--root",
        ".",
        "--manifest",
        manifest_path or "<manifest.json>",
        "--budget-bytes",
        str(budget),
        "--json",
    ]
    command = " ".join(map(shlex.quote, argv))
    if safe_root not in {".", ""}:
        command = f"cd {shlex.quote(safe_root)} && {command}"
    return command, None
|
|
1547
|
+
|
|
1548
|
+
|
|
1549
|
+
def suggest_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tuple[dict[str, Any], int]:
    """Rank candidate sources for a context pack and return a suggestion payload.

    Candidates are gathered from explicit files, a diff, command output, test
    output and the query terms. They are sorted by descending score, then
    de-duplicated, then admitted while they fit in the byte budget, stopping
    once ``top`` sources are selected. Only the first selected source may be
    trimmed to a partial line range to fit.

    Returns ``(payload, 0)``. The payload lists selected sources, omitted
    sources, the manifest, the optional written manifest path and a build hint.

    Raises ``PackError`` when none of query, files, diff, output or test output
    is supplied. ``write_manifest_under_root`` is called when
    ``args.manifest_out`` is set.
    """
    query_text, _query_redactions = sanitize_text(args.query or "")
    query = " ".join(query_text.split())
    query_terms = suggest_tokens(query)
    context_lines = bounded_int(args.context_lines, DEFAULT_SUGGEST_CONTEXT_LINES, 0, MAX_SUGGEST_CONTEXT_LINES)
    top = bounded_int(args.top, DEFAULT_SUGGEST_TOP, 1, MAX_SUGGEST_TOP)
    budget = bounded_int(args.budget_bytes, DEFAULT_BUDGET_BYTES, MIN_BUDGET_BYTES, MAX_BUDGET_BYTES)
    candidates: list[SuggestCandidate] = []
    omitted: list[dict[str, Any]] = []
    file_inputs = split_suggest_files(args.files)
    has_signal = bool(query or file_inputs or args.diff or args.output or args.test_output)
    if not has_signal:
        raise PackError("provide --query, --files, --diff, --output, or --test-output")

    for raw_path in file_inputs:
        add_suggest_candidate(
            candidates,
            path=raw_path,
            score=9_000 + suggest_score_path(raw_path, query_terms),
            reason="explicit file request",
            label=f"file:{safe_raw_path_label(raw_path)}",
        )
    if args.diff:
        candidates.extend(collect_diff_candidates(root, args.diff, query_terms, context_lines))
    output_candidates, output_omitted = collect_output_candidates(root, args.output, query_terms, context_lines, origin="output")
    test_candidates, test_omitted = collect_output_candidates(root, args.test_output, query_terms, context_lines, origin="test-output")
    candidates.extend(output_candidates)
    candidates.extend(test_candidates)
    omitted.extend(output_omitted)
    omitted.extend(test_omitted)
    candidates.extend(collect_query_candidates(root, query_terms, context_lines))

    # Highest score first; remaining keys make the order deterministic.
    candidates.sort(key=lambda item: (-item.score, item.input_index, item.path, item.lines.identity() if item.lines else "0:0"))
    seen: set[tuple[str, str]] = set()        # identities as requested (lexical path + requested lines)
    final_seen: set[tuple[str, str]] = set()  # identities after normalization (display path + selected range)
    selected: list[dict[str, Any]] = []
    manifest_seed: list[dict[str, Any]] = []
    # The running total starts with the size of the pack header text.
    current_bytes = byte_len("# Context Pack\n\nGenerated by context-guard-pack. Token counts are estimated proxies; byte counts are observed.\n\n")
    for candidate in candidates:
        rel, reason = lexical_rel(candidate.path)
        identity_path = rel.as_posix() if rel is not None else safe_raw_path_label(candidate.path)
        identity_lines = candidate.lines.identity() if candidate.lines else "all"
        identity = (identity_path, identity_lines)
        if rel is not None and identity in seen:
            display, redacted = display_rel_path(rel.as_posix())
            duplicate_item = {
                "path": display,
                "status": "omitted",
                "reason": "duplicate_source",
                "suggest_reason": candidate.reason,
                "priority": candidate.score,
                "retrieval_omitted_reason": "redacted_path" if redacted else None,
            }
            omitted.append({key: value for key, value in duplicate_item.items() if value is not None})
            continue
        if rel is not None:
            seen.add(identity)
        source, omitted_item = normalize_suggest_source(root, candidate)
        if omitted_item is not None:
            omitted_item["priority"] = candidate.score
            omitted_item["suggest_reason"] = candidate.reason
            omitted.append({key: value for key, value in omitted_item.items() if value is not None})
            continue
        assert source is not None
        final_identity = (source.display_path, source_selected_range(source).identity() if source.requested_lines is not None else "all")
        if final_identity in final_seen:
            omitted.append({
                "path": source.display_path,
                "status": "omitted",
                "reason": "duplicate_source",
                "suggest_reason": candidate.reason,
                "priority": candidate.score,
            })
            continue
        final_seen.add(final_identity)
        source_bytes = resolved_block_bytes(source, root_arg=root_arg)
        remaining = budget - current_bytes
        if source_bytes > remaining:
            # Only the very first selection may be trimmed to a partial range.
            if not selected and remaining > 0:
                partial_lines, _partial_block, partial_range = fit_partial_lines(source, remaining, root_arg=root_arg)
                if partial_range is not None and partial_lines:
                    partial_spec = SourceSpec(
                        path=candidate.path,
                        priority=candidate.score,
                        lines=partial_range,
                        label=candidate.label,
                        input_index=candidate.input_index,
                        origin="suggest",
                    )
                    source, omitted_item = resolve_source(root, partial_spec)
                    if omitted_item is not None:
                        omitted_item["priority"] = candidate.score
                        omitted_item["suggest_reason"] = candidate.reason
                        # Drop None-valued keys, as the other omission paths in this function do.
                        omitted.append({key: value for key, value in omitted_item.items() if value is not None})
                        continue
                    assert source is not None
                    source_bytes = resolved_block_bytes(source, root_arg=root_arg)
                else:
                    omitted.append({"path": source.display_path, "status": "omitted", "reason": "budget_exhausted", "priority": candidate.score})
                    continue
            else:
                omitted.append({"path": source.display_path, "status": "omitted", "reason": "budget_exhausted", "priority": candidate.score})
                continue
        source_payload = suggested_source_payload(source, candidate, root_arg=root_arg)
        selected.append(source_payload)
        manifest_seed.append(manifest_source_for_candidate(source, priority=candidate.score, label=candidate.label))
        current_bytes += source_bytes
        if len(selected) >= top:
            break

    manifest = build_suggest_manifest(manifest_seed)
    # With nothing selected the header bytes are not reported either.
    estimated_pack_bytes = current_bytes if selected else 0
    manifest_path: str | None = None
    if args.manifest_out:
        manifest_path = write_manifest_under_root(root, args.manifest_out, manifest)
    build_hint, build_hint_omitted_reason = suggest_build_hint(root_arg, manifest_path, budget)
    payload: dict[str, Any] = {
        "tool": TOOL_NAME,
        "schema_version": SUGGEST_SCHEMA_VERSION,
        "version": VERSION,
        "mode": "suggest",
        "root": display_root(root),
        "query": query,
        "budget_bytes": budget,
        "estimated_pack_bytes": estimated_pack_bytes,
        "token_proxy": {
            "measurement": "estimated",
            "method": f"chars_div_{TOKEN_PROXY_CHARS_PER_TOKEN}",
            "estimated_pack": estimated_pack_bytes // TOKEN_PROXY_CHARS_PER_TOKEN,
        },
        "sources": selected,
        "omitted_sources": sorted(omitted, key=lambda item: (str(item.get("path", "")), str(item.get("reason", "")), int(item.get("priority", 0) or 0))),
        "manifest": manifest,
        "manifest_path": manifest_path,
        "build_hint": build_hint,
        "caveats": [
            "Deterministic local heuristics only; no model, network, embedding, or provider-cost estimate is used.",
            "Byte and token values are pack-size proxies, not billing claims.",
        ],
    }
    if build_hint_omitted_reason:
        payload["build_hint_omitted_reason"] = build_hint_omitted_reason
    return payload, 0
|
|
1692
|
+
|
|
1693
|
+
|
|
1694
|
+
def line_range_identity(value: object) -> str:
    """Return a comparable string for a line-range value.

    ``None`` maps to ``"all"``, a mapping maps to ``"<start>:<end>"`` built
    from its ``start`` and ``end`` entries, and anything else is passed
    through ``str``.
    """
    if value is None:
        return "all"
    if not isinstance(value, dict):
        return str(value)
    start, end = value.get("start"), value.get("end")
    return f"{start}:{end}"
|
|
1700
|
+
|
|
1701
|
+
|
|
1702
|
+
def copy_explain_fields(item: dict[str, Any], fields: tuple[str, ...]) -> dict[str, Any]:
    """Return deep copies of the listed fields that are present and not ``None``."""
    return {
        name: copy.deepcopy(item[name])
        for name in fields
        if name in item and item[name] is not None
    }
|
|
1708
|
+
|
|
1709
|
+
|
|
1710
|
+
def build_source_matches_exact(suggest_item: dict[str, Any], build_item: dict[str, Any]) -> bool:
    """Tell whether a build entry corresponds exactly to a suggested entry.

    Both ``path`` and ``priority`` must be equal. The suggested line identity
    must then equal the build entry's requested or included line identity, or
    be ``"all"``.
    """
    for key in ("path", "priority"):
        if build_item.get(key) != suggest_item.get(key):
            return False
    wanted = line_range_identity(suggest_item.get("lines"))
    acceptable = {
        line_range_identity(build_item.get("requested_lines")),
        line_range_identity(build_item.get("included_lines")),
        "all",
    }
    return wanted in acceptable
|
|
1719
|
+
|
|
1720
|
+
|
|
1721
|
+
def find_exact_build_source_for_explain(
    suggest_item: dict[str, Any],
    build_sources: list[dict[str, Any]],
    used_indexes: set[int],
) -> dict[str, Any] | None:
    """Return the first unused build entry that matches ``suggest_item`` exactly.

    The index of the returned entry is added to ``used_indexes`` so it is not
    handed out again. Returns ``None`` when no unused entry matches.
    """
    for position, entry in enumerate(build_sources):
        already_claimed = position in used_indexes
        if not already_claimed and build_source_matches_exact(suggest_item, entry):
            used_indexes.add(position)
            return entry
    return None
|
|
1733
|
+
|
|
1734
|
+
|
|
1735
|
+
def find_fallback_build_source_for_explain(
    suggest_item: dict[str, Any],
    build_sources: list[dict[str, Any]],
    used_indexes: set[int],
) -> dict[str, Any] | None:
    """Return the first unused build entry whose ``path`` equals the suggested one.

    The index of the returned entry is added to ``used_indexes``. Returns
    ``None`` when no unused entry shares the path.
    """
    wanted_path = suggest_item.get("path")
    for position, entry in enumerate(build_sources):
        if position not in used_indexes and entry.get("path") == wanted_path:
            used_indexes.add(position)
            return entry
    return None
|
|
1747
|
+
|
|
1748
|
+
|
|
1749
|
+
def explain_omission_key(item: dict[str, Any]) -> tuple[str, str, str, str, str]:
    """Build a tuple key from an omission record.

    The key holds the stringified ``phase``, ``path``, ``reason`` and
    ``suggest_reason`` fields (missing fields become ``""``) followed by the
    JSON form of ``requested_lines``, falling back to ``lines`` and then to an
    empty string when ``requested_lines`` is absent.
    """
    if "requested_lines" in item:
        line_spec = item["requested_lines"]
    else:
        line_spec = item.get("lines", "")
    text_fields = [str(item.get(name, "")) for name in ("phase", "path", "reason", "suggest_reason")]
    phase, path, reason, suggest_reason = text_fields
    return (phase, path, reason, suggest_reason, json.dumps(line_spec, ensure_ascii=False, sort_keys=True))
|
|
1757
|
+
|
|
1758
|
+
|
|
1759
|
+
def sanitize_explain_text(value: str, *, limit: int = MAX_LABEL_CHARS) -> str:
    """Return ``value`` passed through ``sanitize_text`` and ``cap_label``.

    ``limit`` is forwarded to ``cap_label``; a falsy result from ``cap_label``
    is replaced with an empty string.
    """
    cleaned = sanitize_text(str(value))[0]
    capped = cap_label(cleaned, default="", limit=limit)
    return capped if capped else ""
|
|
1762
|
+
|
|
1763
|
+
|
|
1764
|
+
def is_repo_map_text_path(path: str) -> bool:
    """Tell whether ``path`` names a file treated as text for the repo map.

    A small set of well-known extensionless file names is accepted
    case-insensitively; otherwise the lower-cased suffix must be listed in
    ``REPO_MAP_TEXT_EXTENSIONS``.
    """
    candidate = Path(path)
    well_known = {"readme", "license", "dockerfile", "makefile"}
    if candidate.name.lower() in well_known:
        return True
    return candidate.suffix.lower() in REPO_MAP_TEXT_EXTENSIONS
|
|
1769
|
+
|
|
1770
|
+
|
|
1771
|
+
def read_repo_map_text(root: Path, rel_path: str) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
    """Read one repository file for the repo map.

    Returns ``(record, None)`` on success or ``(None, omission)`` when the
    path is rejected by ``lexical_rel``, is not a supported text type, cannot
    be opened via ``open_regular_under_root``, or fails while being read.
    Text larger than ``MAX_REPO_MAP_BYTES_PER_FILE`` bytes is cut to that
    size. Secret-risk counts are taken from the text before it is passed
    through ``sanitize_text``; the record stores only the sanitized text.
    """
    relative, lexical_reason = lexical_rel(rel_path)
    if relative is None:
        return None, {"path": repo_map_safe_raw_path_label(rel_path), "reason": lexical_reason}

    shown_path, path_was_redacted = repo_map_display_rel_path(relative.as_posix())
    if not is_repo_map_text_path(shown_path):
        return None, {"path": shown_path, "reason": "unsupported_file_type"}

    def omission(why: Any) -> dict[str, Any]:
        # Shared shape for failures that occur after the display path is known.
        return {
            "path": shown_path,
            "reason": why,
            "retrieval_omitted_reason": "redacted_path" if path_was_redacted else None,
        }

    stream, open_failure = open_regular_under_root(root, relative)
    if stream is None:
        return None, omission(open_failure)
    try:
        with stream:
            # One extra unit is requested so an over-cap file can be detected.
            raw_text = stream.read(MAX_REPO_MAP_BYTES_PER_FILE + 1)
    except (OSError, UnicodeError):
        return None, omission("unsafe_path")

    over_cap = byte_len(raw_text) > MAX_REPO_MAP_BYTES_PER_FILE
    if over_cap:
        clipped = raw_text.encode("utf-8", errors="replace")[:MAX_REPO_MAP_BYTES_PER_FILE]
        # ``errors="ignore"`` drops a multi-byte character split by the cut.
        raw_text = clipped.decode("utf-8", errors="ignore")

    risk = secret_risk_counts(raw_text)
    clean_text, redacted_line_info = sanitize_text(raw_text)
    record = {
        "path": shown_path,
        "raw_path": relative.as_posix(),
        "redacted_path": path_was_redacted,
        "text": clean_text,
        "bytes": byte_len(clean_text),
        "bytes_capped": over_cap,
        "line_count": len(clean_text.splitlines()) or (1 if clean_text else 0),
        "redacted_lines": redacted_line_info,
        "secret_risk_counts": risk,
    }
    return record, None
|
|
1802
|
+
|
|
1803
|
+
|
|
1804
|
+
def repo_map_records(root: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]:
    """Collect repo-map records for the files listed by ``git_ls_files``.

    Returns ``(records, omitted, caps)``. At most ``MAX_REPO_MAP_FILES`` paths
    are read. Omissions whose reason is ``"unsupported_file_type"`` are not
    reported, and ``None``-valued keys are stripped from the rest. ``caps``
    records the limits in force and whether the file and byte caps were hit.
    """
    tracked = git_ls_files(root)
    records: list[dict[str, Any]] = []
    omitted: list[dict[str, Any]] = []

    for rel_path in tracked[:MAX_REPO_MAP_FILES]:
        record, omission_item = read_repo_map_text(root, rel_path)
        if record is not None:
            records.append(record)
            continue
        if omission_item is None or omission_item.get("reason") == "unsupported_file_type":
            continue
        omitted.append({key: value for key, value in omission_item.items() if value is not None})

    byte_capped_total = sum(1 for item in records if item.get("bytes_capped"))
    caps = {
        "max_files": MAX_REPO_MAP_FILES,
        "files_capped": len(tracked) > MAX_REPO_MAP_FILES,
        "max_bytes_per_file": MAX_REPO_MAP_BYTES_PER_FILE,
        "bytes_per_file_capped_count": byte_capped_total,
        "max_tree_entries": MAX_REPO_MAP_TREE_ENTRIES,
        "max_signature_entries": MAX_REPO_MAP_SIGNATURE_ENTRIES,
        "max_graph_rank_entries": MAX_REPO_MAP_GRAPH_RANK_ENTRIES,
        "max_retrieval_hints": MAX_REPO_MAP_RETRIEVAL_HINTS,
        "max_secret_risk_files": MAX_REPO_MAP_SECRET_RISK_FILES,
    }
    return records, omitted, caps
|
|
1827
|
+
|
|
1828
|
+
|
|
1829
|
+
def secret_risk_counts(text: str) -> dict[str, int]:
    """Count matches of each ``SECRET_RISK_PATTERNS`` entry in ``text``.

    Only pattern names with at least one match appear in the result; the
    matched values themselves are not returned.
    """
    tallies = ((name, len(pattern.findall(text))) for name, pattern in SECRET_RISK_PATTERNS)
    return {name: hits for name, hits in tallies if hits}
|
|
1836
|
+
|
|
1837
|
+
|
|
1838
|
+
def build_secret_scan(records: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-file secret-risk counts into a scan summary.

    Records without a non-empty ``secret_risk_counts`` mapping are ignored.
    Files are ordered by descending total count, then by path, and the list
    is cut to ``MAX_REPO_MAP_SECRET_RISK_FILES`` entries; the number of files
    dropped by that cut is reported separately.
    """
    totals: dict[str, int] = {}
    risky_files: list[dict[str, Any]] = []

    for record in records:
        raw_counts = record.get("secret_risk_counts")
        counts = dict(raw_counts) if isinstance(raw_counts, dict) else {}
        if not counts:
            continue
        for pattern_name, hits in counts.items():
            totals[pattern_name] = totals.get(pattern_name, 0) + hits
        risky_files.append(
            {
                "path": record["path"],
                "counts": counts,
                "redacted_path": bool(record.get("redacted_path")),
            }
        )

    risky_files.sort(key=lambda entry: (-sum(entry["counts"].values()), entry["path"]))
    dropped = max(0, len(risky_files) - MAX_REPO_MAP_SECRET_RISK_FILES)
    return {
        "risk_counts": dict(sorted(totals.items())),
        "files_with_risks": risky_files[:MAX_REPO_MAP_SECRET_RISK_FILES],
        "files_omitted_by_cap": dropped,
        "caveat": "Counts are local best-effort secret-pattern risk signals; raw matched values are never emitted.",
    }
|
|
1859
|
+
|
|
1860
|
+
|
|
1861
|
+
def build_token_tree(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build a size-ordered list of file and directory entries.

    Each record becomes a ``file`` entry. Every ancestor directory found in
    its ``/``-separated path accumulates the file's bytes and a file count and
    becomes a ``directory`` entry. Entries are ordered by descending bytes,
    then by path, and cut to ``MAX_REPO_MAP_TREE_ENTRIES``.
    """
    per_directory: dict[str, dict[str, int]] = {}
    files: list[dict[str, Any]] = []

    for record in records:
        file_path = str(record["path"])
        size = int(record.get("bytes", 0) or 0)
        files.append(
            {
                "kind": "file",
                "path": file_path,
                "bytes": size,
                "token_proxy": token_proxy(str(record.get("text", ""))),
                "line_count": int(record.get("line_count", 0) or 0),
                "bytes_capped": bool(record.get("bytes_capped")),
            }
        )
        # Walk every ancestor directory; a path without "/" has none.
        ancestor = ""
        for segment in file_path.split("/")[:-1]:
            ancestor = f"{ancestor}/{segment}" if ancestor else segment
            totals = per_directory.setdefault(ancestor, {"bytes": 0, "file_count": 0})
            totals["bytes"] += size
            totals["file_count"] += 1

    directories: list[dict[str, Any]] = []
    for dir_path, totals in per_directory.items():
        directories.append(
            {
                "kind": "directory",
                "path": dir_path,
                "bytes": totals["bytes"],
                "token_proxy": max(0, round(totals["bytes"] / TOKEN_PROXY_CHARS_PER_TOKEN)),
                "file_count": totals["file_count"],
            }
        )

    combined = directories + files
    combined.sort(key=lambda entry: (-int(entry.get("bytes", 0) or 0), str(entry.get("path", ""))))
    return combined[:MAX_REPO_MAP_TREE_ENTRIES]
|
|
1896
|
+
|
|
1897
|
+
|
|
1898
|
+
def signature_range(line_number: int, total_lines: int) -> LineRange:
    """Return a ``LineRange`` starting at ``line_number`` and spanning up to 25 lines.

    The start is raised to at least 1, and the end is limited to
    ``total_lines`` (itself treated as at least 1).
    """
    first = max(1, line_number)
    last_available = max(1, total_lines)
    last = min(last_available, first + 24)
    return LineRange(first, last)
|
|
1900
|
+
|
|
1901
|
+
|
|
1902
|
+
def signature_entry(record: dict[str, Any], *, kind: str, name: str, raw_signature: str, line_number: int) -> dict[str, Any]:
    """Build one signature entry for the repo map.

    ``name`` and ``raw_signature`` are passed through
    ``sanitize_explain_text`` with limits of 80 and 180 respectively. The
    ``lines`` value is the dict form of ``signature_range`` for
    ``line_number``, bounded by the record's ``line_count`` (1 when missing
    or zero).
    """
    line_total = int(record.get("line_count", 0) or 1)
    span = signature_range(line_number, line_total)
    entry: dict[str, Any] = {"path": record["path"], "kind": kind}
    entry["name"] = sanitize_explain_text(name, limit=80)
    entry["signature"] = sanitize_explain_text(raw_signature, limit=180)
    entry["line"] = line_number
    entry["lines"] = span.as_dict()
    return entry
|
|
1913
|
+
|
|
1914
|
+
|
|
1915
|
+
def python_signatures(record: dict[str, Any], text: str) -> list[dict[str, Any]]:
    """Extract class, method and function signatures from Python source.

    Only top-level classes and functions, plus functions directly inside a
    top-level class, are reported. Returns an empty list when ``ast.parse``
    raises ``SyntaxError``, ``ValueError`` or ``RecursionError``.
    """
    try:
        tree = ast.parse(text)
    except (SyntaxError, ValueError, RecursionError):
        return []

    source_lines = text.splitlines()
    function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)

    def declared_line(node: Any, keyword: str) -> str:
        # Use the real source line when available, else a synthetic stub.
        if 0 < node.lineno <= len(source_lines):
            return source_lines[node.lineno - 1].strip()
        return f"{keyword} {node.name}"

    def entry_for(node: Any, kind: str, keyword: str) -> dict[str, Any]:
        return signature_entry(
            record,
            kind=kind,
            name=node.name,
            raw_signature=declared_line(node, keyword),
            line_number=node.lineno,
        )

    found: list[dict[str, Any]] = []
    for node in tree.body:
        if isinstance(node, ast.ClassDef):
            found.append(entry_for(node, "class", "class"))
            found.extend(
                entry_for(member, "method", "def")
                for member in node.body
                if isinstance(member, function_nodes)
            )
        elif isinstance(node, function_nodes):
            found.append(entry_for(node, "function", "def"))
    return found
|
|
1934
|
+
|
|
1935
|
+
|
|
1936
|
+
def regex_signatures(record: dict[str, Any], text: str) -> list[dict[str, Any]]:
    """Extract signatures line by line using regular expressions.

    For ``.md`` / ``.mdx`` paths, ATX headings (one to six ``#``) are
    reported with kind ``heading``. For other paths, lines matching
    ``SIGNATURE_LINE_RE`` are reported as ``class`` when the line contains
    ``class <name>`` and as ``function`` otherwise.
    """
    found: list[dict[str, Any]] = []
    is_markdown = Path(str(record.get("path", ""))).suffix.lower() in {".md", ".mdx"}

    for line_number, raw in enumerate(text.splitlines(), start=1):
        stripped = raw.strip()
        if is_markdown:
            # NOTE(review): Markdown lines are assumed never to reach the
            # generic signature regex; confirm against the upstream file.
            heading = re.match(r"^(#{1,6})\s+(.+)$", stripped)
            if heading:
                found.append(
                    signature_entry(record, kind="heading", name=heading.group(2), raw_signature=stripped, line_number=line_number)
                )
            continue

        match = SIGNATURE_LINE_RE.match(raw)
        if match:
            # The first non-empty capture group is taken as the name.
            name = next((group for group in match.groups() if group), "signature")
            is_class = re.search(r"\bclass\s+" + re.escape(name), raw)
            kind = "class" if is_class else "function"
            found.append(signature_entry(record, kind=kind, name=name, raw_signature=stripped, line_number=line_number))
    return found
|
|
1953
|
+
|
|
1954
|
+
|
|
1955
|
+
def extract_signatures(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect signatures from all records, sorted and capped.

    ``.py`` records are parsed with ``python_signatures`` first; when that
    yields nothing, and for every other suffix, ``regex_signatures`` is used.
    The result is ordered by path, line and name and cut to
    ``MAX_REPO_MAP_SIGNATURE_ENTRIES`` entries.
    """
    collected: list[dict[str, Any]] = []
    for record in records:
        text = str(record.get("text", ""))
        is_python = Path(str(record.get("path", ""))).suffix.lower() == ".py"
        from_ast = python_signatures(record, text) if is_python else []
        if from_ast:
            collected.extend(from_ast)
        else:
            collected.extend(regex_signatures(record, text))

    def sort_key(entry: dict[str, Any]) -> tuple[str, int, str]:
        return (str(entry.get("path", "")), int(entry.get("line", 0) or 0), str(entry.get("name", "")))

    collected.sort(key=sort_key)
    return collected[:MAX_REPO_MAP_SIGNATURE_ENTRIES]
|
|
1968
|
+
|
|
1969
|
+
|
|
1970
|
+
def normalize_repo_map_candidate(path: str) -> str:
    """Normalize a candidate path to POSIX form without leading slashes.

    Backslashes become forward slashes, the path is collapsed with
    ``posixpath.normpath``, and a result of ``"."`` becomes an empty string.
    """
    collapsed = posixpath.normpath(path.replace("\\", "/"))
    return "" if collapsed == "." else collapsed.lstrip("/")
|
|
1975
|
+
|
|
1976
|
+
|
|
1977
|
+
def resolve_import_target(raw_target: str, source_path: str, known_paths: set[str]) -> str | None:
    """Map an import target string to a known repository path, if any.

    Targets starting with ``.`` are resolved against the directory of
    ``source_path``: ``./`` and ``../`` forms as file paths, other dotted
    forms by walking up one directory per leading dot beyond the first.
    Other targets have dots replaced by slashes. A fixed list of suffixes is
    tried in order, and the first normalized candidate found in
    ``known_paths`` is returned; otherwise ``None``.
    """
    target = raw_target.strip()
    if not target:
        return None

    suffixes = (".py", ".ts", ".tsx", ".js", ".jsx", "/index.ts", "/index.js")
    source_dir = Path(source_path).parent.as_posix()

    if not target.startswith("."):
        stem = target.replace(".", "/")
        candidates = [f"{stem}{suffix}" for suffix in suffixes]
    else:
        if target.startswith(("./", "../")):
            stem = normalize_repo_map_candidate(posixpath.join(source_dir, target))
        else:
            dot_count = len(target) - len(target.lstrip("."))
            tail = target[dot_count:].replace(".", "/")
            anchor = source_dir
            # One leading dot means "this directory"; each extra dot climbs one level.
            for _ in range(max(0, dot_count - 1)):
                anchor = posixpath.dirname(anchor)
            if tail:
                stem = normalize_repo_map_candidate(posixpath.join(anchor, tail))
            else:
                stem = normalize_repo_map_candidate(anchor)
        # Relative targets also try the bare path itself.
        candidates = [stem] + [f"{stem}{suffix}" for suffix in suffixes]

    for candidate in candidates:
        resolved = normalize_repo_map_candidate(candidate)
        if resolved in known_paths:
            return resolved
    return None
|
|
2002
|
+
|
|
2003
|
+
|
|
2004
|
+
def python_from_import_targets(module_name: str, imported_names: str) -> list[str]:
    """Return candidate import targets for a ``from <module> import <names>`` line.

    When ``module_name`` contains anything besides dots, the module itself is
    the only target. When it consists solely of dots, each imported
    identifier is also appended to the dots as a further target. Aliases are
    stripped, and entries that are not plain identifiers (such as ``*``) are
    skipped.
    """
    targets = [module_name]
    if module_name.strip("."):
        return targets
    # Parentheses from a grouped import carry no meaning here.
    flattened = imported_names.replace("(", " ").replace(")", " ")
    for raw_name in flattened.split(","):
        # Split on any whitespace around ``as`` so tab-separated aliases are
        # handled the same way as space-separated ones.
        name = re.split(r"\s+as\s+", raw_name.strip(), maxsplit=1)[0].strip()
        if not re.fullmatch(r"[A-Za-z_]\w*", name):
            continue
        targets.append(f"{module_name}{name}")
    return targets
|
|
2014
|
+
|
|
2015
|
+
|
|
2016
|
+
def collect_import_edges(records: list[dict[str, Any]]) -> list[dict[str, str]]:
    """Collect unique import edges between known record paths.

    Each line of each record's text is checked against
    ``PY_FROM_IMPORT_LINE_RE`` first and, failing that, scanned with
    ``IMPORT_PATH_RE``. Targets are resolved with ``resolve_import_target``;
    unresolved targets, self-references and repeated edges are skipped.
    Collection stops once ``MAX_REPO_MAP_FILES`` edges have been gathered.
    """
    known_paths = {str(record.get("path", "")) for record in records}
    edges: list[dict[str, str]] = []
    recorded: set[tuple[str, str]] = set()

    for record in records:
        origin = str(record.get("path", ""))
        for line in str(record.get("text", "")).splitlines():
            from_import = PY_FROM_IMPORT_LINE_RE.match(line)
            if from_import:
                raw_targets = python_from_import_targets(from_import.group("module"), from_import.group("names"))
            else:
                raw_targets = []
                for found in IMPORT_PATH_RE.finditer(line):
                    # The first non-empty named group holds the import path.
                    raw_targets.append(next((value for value in found.groupdict().values() if value), ""))

            for raw_target in raw_targets:
                destination = resolve_import_target(raw_target, origin, known_paths)
                if destination is None or destination == origin:
                    continue
                pair = (origin, destination)
                if pair in recorded:
                    continue
                recorded.add(pair)
                edges.append({"from": origin, "to": destination})
                if len(edges) >= MAX_REPO_MAP_FILES:
                    return edges
    return edges
|
|
2040
|
+
|
|
2041
|
+
|
|
2042
|
+
def repo_map_seed_paths(args: argparse.Namespace, suggest_payload: dict[str, Any], build_payload: dict[str, Any]) -> set[str]:
    """Gather the set of paths treated as seeds for graph ranking.

    Seeds come from the ``files`` argument (only paths accepted by
    ``lexical_rel`` and not flagged as redacted), from the suggest payload's
    ``sources`` and from the build payload's ``included_sources``.
    """
    seeds: set[str] = set()

    for raw in split_suggest_files(getattr(args, "files", None)):
        relative, _why = lexical_rel(raw)
        if relative is None:
            continue
        shown, is_redacted = repo_map_display_rel_path(relative.as_posix())
        if not is_redacted:
            seeds.add(shown)

    payload_lists = (
        suggest_payload.get("sources", []),
        build_payload.get("included_sources", []),
    )
    for entries in payload_lists:
        for entry in entries:
            if isinstance(entry, dict) and isinstance(entry.get("path"), str):
                seeds.add(entry["path"])
    return seeds
|
|
2057
|
+
|
|
2058
|
+
|
|
2059
|
+
def build_graph_rank(
    records: list[dict[str, Any]],
    signatures: list[dict[str, Any]],
    edges: list[dict[str, str]],
    *,
    query_terms: set[str],
    seed_paths: set[str],
    secret_scan: dict[str, Any],
) -> list[dict[str, Any]]:
    """Score every scanned record and return the highest-ranked entries.

    Each record's score is the sum of these components:

    * ``seed``: 1000 when the path is in ``seed_paths``.
    * ``query_path``: whatever ``suggest_score_path`` returns for the path.
    * ``query_content``: 25 per occurrence of a query term in the lowercased
      text, capped at 500.
    * ``signature``: 80 when the path has at least one signature entry.
    * ``graph_degree``: 25 per edge endpoint that references the path.
    * ``secret_risk_penalty``: -25 when the secret scan lists the path.

    Records whose total is zero or negative are left out. The result is
    ordered by descending score, then by path, and truncated to
    ``MAX_REPO_MAP_GRAPH_RANK_ENTRIES`` entries.
    """
    paths_with_signatures = {str(entry.get("path", "")) for entry in signatures}
    risky_paths = {
        str(entry.get("path", ""))
        for entry in secret_scan.get("files_with_risks", [])
        if isinstance(entry, dict)
    }

    # Undirected degree: both ends of an edge count once for that edge.
    link_counts: dict[str, int] = {}
    for link in edges:
        for endpoint in (link["from"], link["to"]):
            link_counts[endpoint] = link_counts.get(endpoint, 0) + 1

    def _ordering(entry: dict[str, Any]) -> tuple[int, str]:
        return (-int(entry["score"]), str(entry["path"]))

    scored: list[dict[str, Any]] = []
    for record in records:
        rel_path = str(record.get("path", ""))
        lowered = str(record.get("text", "")).lower()
        components = {
            "seed": 1000 if rel_path in seed_paths else 0,
            "query_path": suggest_score_path(rel_path, query_terms),
            "query_content": min(500, 25 * sum(lowered.count(term) for term in query_terms)),
            "signature": 80 if rel_path in paths_with_signatures else 0,
            "graph_degree": 25 * link_counts.get(rel_path, 0),
            "secret_risk_penalty": -25 if rel_path in risky_paths else 0,
        }
        total = sum(components.values())
        if total <= 0:
            continue
        entry: dict[str, Any] = {
            "path": rel_path,
            "score": total,
            "components": components,
            "explain_only": True,
            "line_count": int(record.get("line_count", 0) or 0),
        }
        scored.append(entry)

    ordered = sorted(scored, key=_ordering)
    return ordered[:MAX_REPO_MAP_GRAPH_RANK_ENTRIES]
|
|
2098
|
+
|
|
2099
|
+
|
|
2100
|
+
def repo_map_retrieval_for(root_arg: str, display_path: str, lines: LineRange, *, redacted_path: bool) -> tuple[str | None, str | None]:
    """Return a retrieval command for a line range, or the reason none is given.

    Returns:
        A ``(command, omitted_reason)`` pair in which exactly one element is
        ``None``:

        * ``(None, "redacted_path")`` when ``redacted_path`` is truthy;
        * ``(None, "unsafe_root_path")`` when
          ``safe_repo_map_root_arg_for_retrieval`` returns ``None`` for
          ``root_arg``;
        * otherwise ``(retrieval_cli(...), None)``.
    """
    if not redacted_path:
        vetted_root = safe_repo_map_root_arg_for_retrieval(root_arg)
        if vetted_root is not None:
            return retrieval_cli(vetted_root, display_path, lines), None
        return None, "unsafe_root_path"
    return None, "redacted_path"
|
|
2107
|
+
|
|
2108
|
+
|
|
2109
|
+
def repo_map_retrieval(
    record_by_path: dict[str, dict[str, Any]],
    signatures: list[dict[str, Any]],
    graph_rank: list[dict[str, Any]],
    *,
    root_arg: str,
) -> list[dict[str, Any]]:
    """Build the capped list of retrieval hints for the repo map.

    Hints are emitted first for signature entries (using their own line
    range) and then for graph-rank entries (using lines 1 through at most
    80 of the file). A hint is only emitted for paths present in
    ``record_by_path``, and each ``(path, line range, source)`` combination
    appears at most once. The list never exceeds
    ``MAX_REPO_MAP_RETRIEVAL_HINTS`` entries.

    Each hint carries ``path``, ``source`` and ``lines``, plus either
    ``slice_cli`` or ``retrieval_omitted_reason``. When a symbol name is
    known, a retrieval command exists and the file suffix is in
    ``SYMBOL_HINT_EXTENSIONS``, a ``symbol_cli`` command is added too.
    """
    hints: list[dict[str, Any]] = []
    emitted: set[tuple[str, str, str]] = set()

    def _emit(path: str, line_range: LineRange, source: str, name: str | None = None) -> None:
        known = record_by_path.get(path)
        if known is None:
            return
        command, omitted_reason = repo_map_retrieval_for(
            root_arg,
            path,
            line_range,
            redacted_path=bool(known.get("redacted_path")),
        )
        identity = (path, line_range.identity(), source)
        if identity in emitted:
            return
        emitted.add(identity)

        hint: dict[str, Any] = {"path": path, "source": source, "lines": line_range.as_dict()}
        if command:
            hint["slice_cli"] = command
        elif omitted_reason:
            hint["retrieval_omitted_reason"] = omitted_reason
        wants_symbol_hint = bool(name) and bool(command)
        if wants_symbol_hint and Path(path).suffix.lower() in SYMBOL_HINT_EXTENSIONS:
            hint["symbol_cli"] = " ".join(map(shlex.quote, ["context-guard-read-symbol", "--json", path, name]))
        hints.append(hint)

    for signature in signatures:
        span = signature.get("lines")
        if not isinstance(span, dict):
            continue
        try:
            parsed_range = LineRange(int(span.get("start")), int(span.get("end")))
        except (TypeError, ValueError):
            # Missing or non-numeric bounds: skip this signature entirely.
            continue
        _emit(str(signature.get("path", "")), parsed_range, "signature", str(signature.get("name", "")) or None)
        if len(hints) >= MAX_REPO_MAP_RETRIEVAL_HINTS:
            return hints[:MAX_REPO_MAP_RETRIEVAL_HINTS]

    for ranked in graph_rank:
        ranked_path = str(ranked.get("path", ""))
        known_record = record_by_path.get(ranked_path)
        if known_record is None:
            continue
        # A missing or zero line count is treated as a one-line file.
        line_total = int(known_record.get("line_count", 0) or 1)
        _emit(ranked_path, LineRange(1, min(line_total, 80)), "graph_rank")
        if len(hints) >= MAX_REPO_MAP_RETRIEVAL_HINTS:
            return hints[:MAX_REPO_MAP_RETRIEVAL_HINTS]

    return hints[:MAX_REPO_MAP_RETRIEVAL_HINTS]
|
|
2157
|
+
|
|
2158
|
+
|
|
2159
|
+
def build_repo_map_payload(
    root: Path,
    args: argparse.Namespace,
    suggest_payload: dict[str, Any],
    build_payload: dict[str, Any],
    *,
    root_arg: str,
) -> dict[str, Any]:
    """Assemble the repo-map section of the explain output.

    Scans ``root`` through ``repo_map_records`` and derives from those
    records the signature index, secret scan, import edges, graph ranking,
    retrieval hints and token tree. The result bundles them with a summary,
    the caps reported by the scan and a fixed block of safety flags and
    caveats.

    The ``graph.edges`` list is truncated to
    ``MAX_REPO_MAP_GRAPH_RANK_ENTRIES`` (the surplus is reported in
    ``edges_omitted_by_cap``), and ``omitted_files`` is truncated to
    ``MAX_REPO_MAP_TREE_ENTRIES``.
    """
    records, omitted, caps = repo_map_records(root)

    record_by_path: dict[str, dict[str, Any]] = {}
    for record in records:
        record_by_path[str(record["path"])] = record

    signatures = extract_signatures(records)
    secret_scan = build_secret_scan(records)
    edges = collect_import_edges(records)
    query_terms = suggest_tokens(str(suggest_payload.get("query", "")))
    seed_paths = repo_map_seed_paths(args, suggest_payload, build_payload)
    graph_rank = build_graph_rank(
        records,
        signatures,
        edges,
        query_terms=query_terms,
        seed_paths=seed_paths,
        secret_scan=secret_scan,
    )
    retrieval = repo_map_retrieval(record_by_path, signatures, graph_rank, root_arg=root_arg)
    tree = build_token_tree(records)
    total_bytes = sum(int(record.get("bytes", 0) or 0) for record in records)

    summary = {
        "files_scanned": len(records),
        "files_capped": bool(caps["files_capped"]),
        "bytes_per_file_capped_count": int(caps["bytes_per_file_capped_count"]),
        "tree_bytes": total_bytes,
        "tree_token_proxy": sum(int(node.get("token_proxy", 0) or 0) for node in tree),
        "signature_files": len({str(entry.get("path", "")) for entry in signatures}),
        "signature_count": len(signatures),
        "secret_risk_files": len(secret_scan.get("files_with_risks", [])),
        "graph_edges": len(edges),
    }
    graph = {
        "edges": edges[:MAX_REPO_MAP_GRAPH_RANK_ENTRIES],
        "edges_omitted_by_cap": max(0, len(edges) - MAX_REPO_MAP_GRAPH_RANK_ENTRIES),
    }
    # Static declarations about how the map was produced; nothing here is computed.
    safety = {
        "deterministic_local_only": True,
        "no_network": True,
        "no_model_or_embedding": True,
        "explain_only": True,
        "redacted_before_output": True,
        "tree_sitter": {"status": "unavailable_without_optional_dependency", "fallback": "python_ast_and_regex_signatures"},
        "caveats": [
            "Repo-map bytes are local sampled UTF-8 bytes and estimated chars_div_4 token proxies, not provider-token or savings claims.",
            "Graph ranking is deterministic explain metadata only; it does not change pack selection in this stage.",
        ],
    }

    return {
        "schema_version": REPO_MAP_SCHEMA_VERSION,
        "summary": summary,
        "caps": caps,
        "token_tree": tree,
        "secret_scan": secret_scan,
        "signature_index": signatures,
        "graph": graph,
        "graph_rank": graph_rank,
        "retrieval": retrieval,
        "omitted_files": omitted[:MAX_REPO_MAP_TREE_ENTRIES],
        "safety": safety,
    }
|
|
2221
|
+
|
|
2222
|
+
|
|
2223
|
+
def build_auto_explain_payload(
    args: argparse.Namespace,
    suggest_payload: dict[str, Any],
    build_payload: dict[str, Any],
    payload: dict[str, Any],
    *,
    root: Path | None = None,
    root_arg: str = ".",
) -> dict[str, Any]:
    """Build the ``explain`` metadata that accompanies an ``auto`` run.

    The result has these sections:

    * ``summary``: source counts, byte totals and flags for written outputs.
    * ``inputs``: the request as seen on ``args`` plus the written paths.
    * ``selection``: one entry per suggested source, annotated with the
      status of the build source it was matched to (``"not_built"`` when
      there is no match).
    * ``omissions``: de-duplicated, sorted omitted sources from both the
      suggest and the build phase, each tagged with its ``phase``.
    * ``budget`` and ``safety``: byte accounting and fixed caveats.
    * ``repo_map``: only present when ``root`` is given.

    Matching runs in two passes: every suggested source first gets a chance
    at an exact build match, and only afterwards do the remaining ones try
    the fallback matcher. Both matchers receive the same
    ``used_build_indexes`` set (presumably to stop a build source being
    claimed twice; the matchers are defined elsewhere).
    """
    build_sources = [entry for entry in build_payload.get("included_sources", []) if isinstance(entry, dict)]
    used_build_indexes: set[int] = set()
    suggest_sources = [entry for entry in suggest_payload.get("sources", []) if isinstance(entry, dict)]

    # Pass 1: exact matches for all suggestions before any fallback is tried.
    exact_matches: dict[int, dict[str, Any]] = {}
    for position, suggested in enumerate(suggest_sources):
        exact = find_exact_build_source_for_explain(suggested, build_sources, used_build_indexes)
        if exact is not None:
            exact_matches[position] = exact

    # Pass 2: build the selection entries, using the fallback matcher where needed.
    selection_fields = ("path", "score", "priority", "reason", "label", "lines", "bytes", "retrieval_cli", "retrieval_omitted_reason")
    selection: list[dict[str, Any]] = []
    for position, suggested in enumerate(suggest_sources):
        entry = copy_explain_fields(suggested, selection_fields)
        matched = exact_matches.get(position)
        if matched is None:
            matched = find_fallback_build_source_for_explain(suggested, build_sources, used_build_indexes)
        if matched is None:
            entry["build_status"] = "not_built"
        else:
            entry["build_status"] = matched.get("status", "included")
            for line_key in ("requested_lines", "included_lines"):
                if line_key in matched:
                    entry[line_key] = copy.deepcopy(matched[line_key])
            if "bytes" in matched:
                entry["build_bytes"] = matched["bytes"]
        selection.append(entry)

    omission_fields = (
        "path",
        "status",
        "reason",
        "suggest_reason",
        "priority",
        "label",
        "requested_lines",
        "included_lines",
        "lines",
        "total_lines",
        "retrieval_cli",
        "retrieval_omitted_reason",
        "input_index",
    )
    collected_omissions: list[dict[str, Any]] = []
    seen_omissions: set[tuple[str, str, str, str, str]] = set()
    for phase, phase_payload in (("suggest", suggest_payload), ("build", build_payload)):
        for raw_omission in phase_payload.get("omitted_sources", []):
            if not isinstance(raw_omission, dict):
                continue
            omission = copy_explain_fields(raw_omission, omission_fields)
            omission["phase"] = phase
            identity = explain_omission_key(omission)
            if identity in seen_omissions:
                continue
            seen_omissions.add(identity)
            collected_omissions.append(omission)
    omissions = sorted(collected_omissions, key=explain_omission_key)

    raw_build_counts = build_payload.get("sources")
    build_source_counts = raw_build_counts if isinstance(raw_build_counts, dict) else {}
    raw_auto_counts = payload.get("sources")
    auto_source_counts = raw_auto_counts if isinstance(raw_auto_counts, dict) else {}
    raw_artifact = build_payload.get("artifact")
    artifact = raw_artifact if isinstance(raw_artifact, dict) else {}

    pack_bytes = int(payload.get("pack_bytes", build_payload.get("pack_bytes", 0)) or 0)
    budget_bytes = int(payload.get("budget_bytes", build_payload.get("budget_bytes", 0)) or 0)
    budget_omitted_count = len([entry for entry in omissions if entry.get("reason") == "budget_exhausted"])
    explicit_files = split_suggest_files(args.files)
    query = str(suggest_payload.get("query", ""))
    diff_label = cap_label(args.diff) if getattr(args, "diff", None) else None

    # Counts prefer the top-level auto payload and fall back to the build payload.
    summary = {
        "suggested": int(auto_source_counts.get("suggested", len(selection)) or 0),
        "included": int(auto_source_counts.get("included", build_source_counts.get("included", 0)) or 0),
        "partial": int(auto_source_counts.get("partial", build_source_counts.get("partial", 0)) or 0),
        "omitted": int(auto_source_counts.get("omitted", build_source_counts.get("omitted", 0)) or 0),
        "suggest_omitted": sum(1 for entry in suggest_payload.get("omitted_sources", []) if isinstance(entry, dict)),
        "explain_omissions": len(omissions),
        "pack_bytes": pack_bytes,
        "budget_bytes": budget_bytes,
        "manifest_written": bool(payload.get("manifest_path")),
        "pack_written": bool(payload.get("pack_path")),
        "artifact_stored": bool(artifact.get("stored")),
        "artifact_capped": bool(artifact.get("capped")),
    }
    inputs = {
        "query": query,
        "query_present": bool(query),
        "diff": diff_label,
        "diff_present": bool(diff_label),
        "explicit_file_count": len(explicit_files),
        "output_count": len(args.output or []),
        "test_output_count": len(args.test_output or []),
        "top": bounded_int(args.top, DEFAULT_SUGGEST_TOP, 1, MAX_SUGGEST_TOP),
        "context_lines": bounded_int(args.context_lines, DEFAULT_SUGGEST_CONTEXT_LINES, 0, MAX_SUGGEST_CONTEXT_LINES),
        "no_artifact": bool(args.no_artifact),
        "manifest_path": payload.get("manifest_path"),
        "pack_path": payload.get("pack_path"),
    }
    budget = {
        "pack_bytes": pack_bytes,
        "budget_bytes": budget_bytes,
        "remaining_bytes": budget_bytes - pack_bytes,
        "partial_count": int(build_source_counts.get("partial", 0) or 0),
        "budget_omitted_count": budget_omitted_count,
        "token_proxy": copy.deepcopy(payload.get("token_proxy", {})),
        "measurement": "observed_bytes_estimated_tokens",
        "caveat": "Byte counts are observed pack bytes; token counts are estimated chars_div_4 proxies, not provider-token savings.",
    }
    safety = {
        "redaction": copy.deepcopy(build_payload.get("redaction", {})),
        "caveats": copy.deepcopy(payload.get("caveats", [])),
        "deterministic_local_only": True,
        "raw_output_embedded": False,
        "raw_test_output_embedded": False,
    }

    explain = {
        "schema_version": AUTO_EXPLAIN_SCHEMA_VERSION,
        "summary": summary,
        "inputs": inputs,
        "selection": selection,
        "omissions": omissions,
        "budget": budget,
        "safety": safety,
    }
    if root is not None:
        explain["repo_map"] = build_repo_map_payload(root, args, suggest_payload, build_payload, root_arg=root_arg)
    return explain
|
|
2361
|
+
|
|
2362
|
+
|
|
2363
|
+
def auto_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tuple[dict[str, Any], int]:
    """Suggest a manifest and build the budgeted pack in a single step.

    Order of operations:

    1. Check ``--manifest-out`` and ``--pack-out`` against each other, then
       validate each one under ``root``.
    2. Run ``suggest_pack`` on a shallow copy of ``args`` with
       ``manifest_out`` cleared, then ``build_pack`` with artifact storage
       turned off.
    3. Unless ``--no-artifact`` is set, check that the receipt path derived
       from the pack id does not collide with either output path.
    4. Write the pack, then the manifest, then (unless ``--no-artifact``)
       the receipt via ``store_receipt``.
    5. Assemble the combined payload, adding ``explain`` metadata when
       ``args.explain`` is set.

    Returns:
        ``(payload, rc)`` where ``rc`` is the code returned by
        ``suggest_pack``.
    """
    manifest_rel = None
    if args.manifest_out:
        manifest_rel = output_rel_for_collision_check(args.manifest_out, "--manifest-out")
    pack_rel = None
    if args.pack_out:
        pack_rel = output_rel_for_collision_check(args.pack_out, "--pack-out")
    if not (manifest_rel is None or pack_rel is None):
        reject_matching_output_targets(
            root,
            first_rel=manifest_rel,
            second_rel=pack_rel,
            second_option="--pack-out",
            reason="same_as_manifest_out",
        )
    for option_value, option_name in ((args.manifest_out, "--manifest-out"), (args.pack_out, "--pack-out")):
        if option_value:
            validate_output_path_under_root(root, option_value, option_name)

    # The suggest stage must not write the manifest itself; that happens below.
    suggest_args = copy.copy(args)
    suggest_args.manifest_out = None
    suggest_payload, rc = suggest_pack(root, suggest_args, root_arg=root_arg)
    manifest = suggest_payload["manifest"]
    specs = manifest_to_source_specs(manifest)
    budget = bounded_int(args.budget_bytes, DEFAULT_BUDGET_BYTES, MIN_BUDGET_BYTES, MAX_BUDGET_BYTES)
    build_payload = build_pack(root, specs, budget_bytes=budget, root_arg=root_arg, store_artifact=False)

    if not args.no_artifact:
        # The receipt path is only known once the pack id exists, so this
        # collision check has to come after the build.
        receipt_rel = Path(PACK_DIR) / f"{build_payload['pack_id']}.json"
        for other_rel, other_option in ((manifest_rel, "--manifest-out"), (pack_rel, "--pack-out")):
            if other_rel is None:
                continue
            reject_matching_output_targets(
                root,
                first_rel=receipt_rel,
                second_rel=other_rel,
                second_option=other_option,
                reason="same_as_artifact_receipt",
            )

    manifest_path: str | None = None
    pack_path: str | None = None
    if args.pack_out:
        pack_path = write_text_under_root(root, args.pack_out, str(build_payload["pack"]), "--pack-out")
    if args.manifest_out:
        manifest_path = write_manifest_under_root(root, args.manifest_out, manifest)
    if not args.no_artifact:
        build_payload["artifact"] = store_receipt(root, build_payload)

    # Refresh the suggest payload so it reflects what this run actually wrote.
    build_hint, build_hint_omitted_reason = suggest_build_hint(root_arg, manifest_path, budget)
    suggest_payload["manifest_path"] = manifest_path
    suggest_payload["build_hint"] = build_hint
    suggest_payload.pop("build_hint_omitted_reason", None)
    if build_hint_omitted_reason:
        suggest_payload["build_hint_omitted_reason"] = build_hint_omitted_reason

    source_counts = {
        "suggested": len(suggest_payload.get("sources", [])),
        "included": build_payload.get("sources", {}).get("included", 0),
        "partial": build_payload.get("sources", {}).get("partial", 0),
        "omitted": build_payload.get("sources", {}).get("omitted", 0),
    }
    payload: dict[str, Any] = {
        "tool": TOOL_NAME,
        "schema_version": AUTO_SCHEMA_VERSION,
        "version": VERSION,
        "mode": "auto",
        "root": display_root(root),
        "query": suggest_payload.get("query", ""),
        "budget_bytes": budget,
        "manifest": manifest,
        "manifest_path": manifest_path,
        "pack_path": pack_path,
        "suggest": suggest_payload,
        "build": build_payload,
        "sources": source_counts,
        "pack_bytes": build_payload.get("pack_bytes", 0),
        "token_proxy": build_payload.get("token_proxy", {}),
        "caveats": [
            "Deterministic local heuristics only; no model, network, embedding, or provider-cost estimate is used.",
            "Byte and token values are pack-size proxies, not billing claims.",
        ],
    }
    if build_hint_omitted_reason:
        payload["build_hint_omitted_reason"] = build_hint_omitted_reason
    if args.explain:
        payload["explain"] = build_auto_explain_payload(args, suggest_payload, build_payload, payload, root=root, root_arg=root_arg)
    return payload, rc
|
|
2448
|
+
|
|
2449
|
+
|
|
2450
|
+
def print_suggest_text(payload: dict[str, Any]) -> None:
    """Print the human-readable report for a ``suggest`` payload.

    Output is a header line with the source count and estimated/budget
    bytes, one bullet per source (with a ``:start:end`` suffix when the
    source has a ``lines`` dict), the manifest path if one is set, and then
    either the build hint or the reason the hint was omitted.

    Raises:
        KeyError: if ``sources``, ``estimated_pack_bytes`` or
            ``budget_bytes`` is missing from ``payload``, or a source lacks
            ``path``, ``priority`` or ``reason``.
    """
    sources = payload["sources"]
    header = (
        f"context-guard-pack suggest: {len(sources)} source(s), "
        f"estimated {payload['estimated_pack_bytes']}/{payload['budget_bytes']} bytes"
    )
    print(header)

    for entry in sources:
        span = entry.get("lines")
        if isinstance(span, dict):
            suffix = f":{span['start']}:{span['end']}"
        else:
            suffix = ""
        print(f"- {entry['path']}{suffix} priority={entry['priority']} reason={entry['reason']}")

    manifest_path = payload.get("manifest_path")
    if manifest_path:
        print(f"manifest: {manifest_path}")

    # The omitted reason is only shown when there is no hint to print.
    build_hint = payload.get("build_hint")
    if build_hint:
        print(f"build: {build_hint}")
    else:
        omitted_reason = payload.get("build_hint_omitted_reason")
        if omitted_reason:
            print(f"build hint omitted: {omitted_reason}")
|
|
2465
|
+
|
|
2466
|
+
|
|
2467
|
+
def print_auto_text(payload: dict[str, Any]) -> None:
    """Print the human-readable report for an ``auto`` payload.

    Output is a header line, then, when ``payload["explain"]`` is a dict, an
    explain summary, up to five selection bullets and a tally of omission
    reasons. The manifest path follows if one is set. If a pack path is set
    it is printed; otherwise a blank line is printed and the pack text from
    ``payload["build"]["pack"]`` is written to stdout.

    Non-dict selection and omission entries are skipped. The five-entry
    limit is applied before that filtering.
    """
    suggested_total = payload["sources"]["suggested"]
    print(
        f"context-guard-pack auto: {suggested_total} suggested source(s), "
        f"pack {payload['pack_bytes']}/{payload['budget_bytes']} bytes"
    )

    explain = payload.get("explain")
    if isinstance(explain, dict):
        summary = explain.get("summary")
        if not isinstance(summary, dict):
            summary = {}
        budget = explain.get("budget")
        if not isinstance(budget, dict):
            budget = {}
        segments = [
            "explain:",
            f"selected={summary.get('suggested', 0)}",
            f"included={summary.get('included', 0)}",
            f"partial={summary.get('partial', 0)}",
            f"omitted={summary.get('omitted', 0)}",
            f"budget={budget.get('pack_bytes', payload.get('pack_bytes', 0))}/{budget.get('budget_bytes', payload.get('budget_bytes', 0))}",
            "heuristic=local",
        ]
        print(" ".join(segments))

        chosen = explain.get("selection")
        if not isinstance(chosen, list):
            chosen = []
        for entry in chosen[:5]:
            if not isinstance(entry, dict):
                continue
            # Prefer the lines actually included by the build over the suggested ones.
            span = entry.get("included_lines") or entry.get("lines")
            suffix = f":{span.get('start')}:{span.get('end')}" if isinstance(span, dict) else ""
            print(
                f"- {entry.get('path')}{suffix} "
                f"status={entry.get('build_status', 'unknown')} "
                f"score={entry.get('score', entry.get('priority', 0))} "
                f"reason={entry.get('reason', 'local heuristic')}"
            )

        omissions = explain.get("omissions")
        if not isinstance(omissions, list):
            omissions = []
        if omissions:
            tally: dict[str, int] = {}
            for entry in omissions:
                if isinstance(entry, dict):
                    label = str(entry.get("reason", "unknown"))
                    tally[label] = tally.get(label, 0) + 1
            rendered = ", ".join(f"{label}={count}" for label, count in sorted(tally.items()))
            print(f"omitted reasons: {rendered}")

    manifest_path = payload.get("manifest_path")
    if manifest_path:
        print(f"manifest: {manifest_path}")

    pack_path = payload.get("pack_path")
    if pack_path:
        print(f"pack: {pack_path}")
        return
    # No pack file was written, so emit the pack itself after a blank separator.
    print()
    sys.stdout.write(str(payload["build"]["pack"]))
|
|
2516
|
+
|
|
2517
|
+
|
|
869
2518
|
def build_parser() -> argparse.ArgumentParser:
|
|
870
2519
|
parser = argparse.ArgumentParser(description="Build budgeted local context packs with exact retrieval hints.")
|
|
871
2520
|
sub = parser.add_subparsers(dest="command", required=True)
|
|
@@ -881,6 +2530,33 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
881
2530
|
slice_cmd.add_argument("--path", required=True, help="relative file path under root")
|
|
882
2531
|
slice_cmd.add_argument("--lines", required=True, help="inclusive 1-indexed START:END")
|
|
883
2532
|
slice_cmd.add_argument("--json", action="store_true", help="emit JSON payload")
|
|
2533
|
+
suggest = sub.add_parser("suggest", help="suggest a build-compatible context pack manifest from local signals")
|
|
2534
|
+
suggest.add_argument("--root", default=".", help="project root; must not be a symlink")
|
|
2535
|
+
suggest.add_argument("--query", default="", help="task or question to match against local files")
|
|
2536
|
+
suggest.add_argument("--diff", help="git diff range, or staged/worktree, to seed changed-file ranges")
|
|
2537
|
+
suggest.add_argument("--files", "--file", dest="files", action="append", help="explicit relative file path(s), comma-separated or repeated")
|
|
2538
|
+
suggest.add_argument("--output", action="append", help="relative path to sanitized command output text under root")
|
|
2539
|
+
suggest.add_argument("--test-output", action="append", help="relative path to sanitized test output text under root")
|
|
2540
|
+
suggest.add_argument("--budget-bytes", type=int, default=DEFAULT_BUDGET_BYTES)
|
|
2541
|
+
suggest.add_argument("--top", type=int, default=DEFAULT_SUGGEST_TOP, help="maximum suggested sources")
|
|
2542
|
+
suggest.add_argument("--context-lines", type=int, default=DEFAULT_SUGGEST_CONTEXT_LINES, help="line context around diff/output hits")
|
|
2543
|
+
suggest.add_argument("--manifest-out", help="write the suggested build manifest to this relative path under root")
|
|
2544
|
+
suggest.add_argument("--json", action="store_true", help="emit JSON payload")
|
|
2545
|
+
auto = sub.add_parser("auto", help="suggest a context pack manifest and build the budgeted pack in one local step")
|
|
2546
|
+
auto.add_argument("--root", default=".", help="project root; must not be a symlink")
|
|
2547
|
+
auto.add_argument("--query", default="", help="task or question to match against local files")
|
|
2548
|
+
auto.add_argument("--diff", help="git diff range, or staged/worktree, to seed changed-file ranges")
|
|
2549
|
+
auto.add_argument("--files", "--file", dest="files", action="append", help="explicit relative file path(s), comma-separated or repeated")
|
|
2550
|
+
auto.add_argument("--output", action="append", help="relative path to sanitized command output text under root")
|
|
2551
|
+
auto.add_argument("--test-output", action="append", help="relative path to sanitized test output text under root")
|
|
2552
|
+
auto.add_argument("--budget-bytes", type=int, default=DEFAULT_BUDGET_BYTES)
|
|
2553
|
+
auto.add_argument("--top", type=int, default=DEFAULT_SUGGEST_TOP, help="maximum suggested sources")
|
|
2554
|
+
auto.add_argument("--context-lines", type=int, default=DEFAULT_SUGGEST_CONTEXT_LINES, help="line context around diff/output hits")
|
|
2555
|
+
auto.add_argument("--manifest-out", help="write the suggested build manifest to this relative path under root")
|
|
2556
|
+
auto.add_argument("--pack-out", help="write the built Markdown pack to this relative path under root")
|
|
2557
|
+
auto.add_argument("--json", action="store_true", help="emit JSON payload")
|
|
2558
|
+
auto.add_argument("--no-artifact", action="store_true", help="do not write .context-guard/packs receipt")
|
|
2559
|
+
auto.add_argument("--explain", action="store_true", help="include deterministic local selection/build explanation metadata")
|
|
884
2560
|
return parser
|
|
885
2561
|
|
|
886
2562
|
|
|
@@ -919,6 +2595,22 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
919
2595
|
else:
|
|
920
2596
|
print(f"context-guard-pack: {payload.get('reason')}", file=sys.stderr)
|
|
921
2597
|
return rc
|
|
2598
|
+
if args.command == "suggest":
|
|
2599
|
+
payload, rc = suggest_pack(root, args, root_arg=str(args.root))
|
|
2600
|
+
if args.json:
|
|
2601
|
+
json.dump(payload, sys.stdout, ensure_ascii=False, indent=2, sort_keys=True)
|
|
2602
|
+
sys.stdout.write("\n")
|
|
2603
|
+
else:
|
|
2604
|
+
print_suggest_text(payload)
|
|
2605
|
+
return rc
|
|
2606
|
+
if args.command == "auto":
|
|
2607
|
+
payload, rc = auto_pack(root, args, root_arg=str(args.root))
|
|
2608
|
+
if args.json:
|
|
2609
|
+
json.dump(payload, sys.stdout, ensure_ascii=False, indent=2, sort_keys=True)
|
|
2610
|
+
sys.stdout.write("\n")
|
|
2611
|
+
else:
|
|
2612
|
+
print_auto_text(payload)
|
|
2613
|
+
return rc
|
|
922
2614
|
raise PackError("unknown command")
|
|
923
2615
|
except PackError as exc:
|
|
924
2616
|
print(f"context-guard-pack: {exc}", file=sys.stderr)
|