@ictechgy/context-guard 0.4.9 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +28 -0
  2. package/README.ko.md +59 -31
  3. package/README.md +85 -36
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  8. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  9. package/docs/benchmark-workflow-examples.md +3 -0
  10. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  11. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  12. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  13. package/docs/distribution.md +10 -7
  14. package/docs/experimental-benchmark-fixtures.md +30 -6
  15. package/package.json +4 -6
  16. package/packaging/homebrew/context-guard.rb.template +1 -1
  17. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  18. package/plugins/context-guard/README.ko.md +20 -14
  19. package/plugins/context-guard/README.md +26 -17
  20. package/plugins/context-guard/bin/context-guard +147 -25
  21. package/plugins/context-guard/bin/context-guard-artifact +884 -79
  22. package/plugins/context-guard/bin/context-guard-audit +33 -2
  23. package/plugins/context-guard/bin/context-guard-bench +1542 -31
  24. package/plugins/context-guard/bin/context-guard-cache-score +665 -0
  25. package/plugins/context-guard/bin/context-guard-compress +146 -1
  26. package/plugins/context-guard/bin/context-guard-cost +790 -6
  27. package/plugins/context-guard/bin/context-guard-experiments +463 -26
  28. package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
  29. package/plugins/context-guard/bin/context-guard-filter +163 -7
  30. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  31. package/plugins/context-guard/bin/context-guard-pack +892 -49
  32. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  33. package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
  34. package/plugins/context-guard/bin/context-guard-setup +165 -31
  35. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  36. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  37. package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
  38. package/plugins/context-guard/bin/context-guard-trim-output +288 -41
  39. package/plugins/context-guard/brief/README.md +5 -5
  40. package/plugins/context-guard/lib/context_guard_commands.py +230 -0
  41. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  42. package/context-guard-kit/README.md +0 -91
  43. package/context-guard-kit/benchmark_runner.py +0 -2401
  44. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  45. package/context-guard-kit/context_compress.py +0 -695
  46. package/context-guard-kit/context_escrow.py +0 -935
  47. package/context-guard-kit/context_filter.py +0 -637
  48. package/context-guard-kit/context_guard_cli.py +0 -325
  49. package/context-guard-kit/context_guard_diet.py +0 -1711
  50. package/context-guard-kit/context_pack.py +0 -2713
  51. package/context-guard-kit/cost_guard.py +0 -2349
  52. package/context-guard-kit/experimental_registry.py +0 -4348
  53. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  54. package/context-guard-kit/guard_large_read.py +0 -690
  55. package/context-guard-kit/hook_secret_patterns.py +0 -43
  56. package/context-guard-kit/read_symbol.py +0 -483
  57. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  58. package/context-guard-kit/sanitize_output.py +0 -725
  59. package/context-guard-kit/settings.example.json +0 -67
  60. package/context-guard-kit/setup_wizard.py +0 -2515
  61. package/context-guard-kit/statusline.sh +0 -362
  62. package/context-guard-kit/statusline_merged.sh +0 -157
  63. package/context-guard-kit/tool_schema_pruner.py +0 -837
  64. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -1,2713 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Build a deterministic, budgeted local context pack from prioritized files.
3
-
4
- The packer is local-only and intentionally conservative. It assembles selected
5
- file slices into a Markdown body whose rendered UTF-8 byte length is bounded by
6
- ``--budget-bytes``. It redacts before building the pack/receipt, records why
7
- lower-priority sources were omitted, and emits exact local slice commands for
8
- retrieval when the path is safe to display.
9
- """
10
- from __future__ import annotations
11
-
12
- import argparse
13
- import ast
14
- import copy
15
- import hashlib
16
- import importlib.machinery
17
- import importlib.util
18
- import json
19
- import os
20
- import posixpath
21
- from pathlib import Path
22
- import re
23
- import shlex
24
- import stat
25
- import subprocess
26
- import sys
27
- import threading
28
- import time
29
- from dataclasses import dataclass
30
- from typing import Any
31
-
32
# Tool identity and the manifest version accepted by read_manifest().
TOOL_NAME = "context-guard-pack"
VERSION = 1
# Byte budget bounds for the rendered pack (see the module docstring).
DEFAULT_BUDGET_BYTES = 12_000
MIN_BUDGET_BYTES = 0
MAX_BUDGET_BYTES = 2_000_000
# Reported as "cap_bytes" by artifact_failure().
MAX_RECEIPT_BYTES = 64_000
# read_manifest() rejects manifests larger than this many bytes.
MAX_MANIFEST_BYTES = 1_000_000
# Default truncation limit used by cap_label().
MAX_LABEL_CHARS = 160
MAX_REASON_CHARS = 120
# Divisor used by token_proxy() to estimate tokens from characters.
TOKEN_PROXY_CHARS_PER_TOKEN = 4
# Schema identifiers and limits for the suggest / auto / repo-map features.
# NOTE(review): their consumers are outside this part of the file.
SUGGEST_SCHEMA_VERSION = "contextguard.pack-suggest.v1"
AUTO_SCHEMA_VERSION = "contextguard.pack-auto.v1"
AUTO_EXPLAIN_SCHEMA_VERSION = "contextguard.pack-auto-explain.v1"
REPO_MAP_SCHEMA_VERSION = "contextguard.pack-repo-map.v1"
DEFAULT_SUGGEST_TOP = 8
MAX_SUGGEST_TOP = 50
DEFAULT_SUGGEST_CONTEXT_LINES = 20
MAX_SUGGEST_CONTEXT_LINES = 120
SUGGEST_WHOLE_FILE_MAX_LINES = 120
MAX_SUGGEST_INPUT_BYTES = 256_000
MAX_QUERY_SCAN_FILES = 2_000
MAX_QUERY_SCAN_BYTES_PER_FILE = 200_000
MAX_REPO_MAP_FILES = 1_000
MAX_REPO_MAP_BYTES_PER_FILE = 120_000
MAX_REPO_MAP_TREE_ENTRIES = 30
MAX_REPO_MAP_SIGNATURE_ENTRIES = 40
MAX_REPO_MAP_GRAPH_RANK_ENTRIES = 30
MAX_REPO_MAP_RETRIEVAL_HINTS = 30
MAX_REPO_MAP_SECRET_RISK_FILES = 20
# Receipt directory, relative to the project root (see ensure_private_pack_dir()).
PACK_DIR = ".context-guard/packs"
# Replacement text for a path component that matches SECRET_CONTENT_RE.
REDACTED_PATH_COMPONENT = "[REDACTED-PATH-COMPONENT]"
# C0 control characters, DEL, and C1 control characters.
CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f-\x9f]")
# Secret-shaped content: private-key headers, provider token prefixes,
# Authorization headers, and generic "key = value" / "key: value" assignments.
# The leading (?is) makes the whole pattern case-insensitive and dot-all.
SECRET_CONTENT_RE = re.compile(
    r"(?is)("
    r"-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----|"
    r"AKIA[0-9A-Z]{16}|"
    r"ASIA[0-9A-Z]{16}|"
    r"gh[pousr]_[A-Za-z0-9_]{20,}|"
    r"github_pat_[A-Za-z0-9_]{20,}|"
    r"glpat-[A-Za-z0-9_-]{12,}|"
    r"xox[abprs]-[A-Za-z0-9-]{10,}|"
    r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|"
    r"sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|"
    r"(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{16,}|"
    r"npm_[A-Za-z0-9]{20,}|"
    r"AIza[0-9A-Za-z_\-]{20,}|"
    r"(?i:Authorization)\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+|"
    r"(?<![A-Za-z0-9])(?:api[_-]?key|token|secret|password|client[_-]?secret)\s*[:=]\s*[^\s]+"
    r")"
)
# Extra path-level evidence: "SG.x.y" tokens, eyJ-prefixed three-segment
# tokens (JWT-shaped), Bearer/Basic credentials, and URL userinfo ("scheme://user:pass@").
SECRET_PATH_COMPONENT_RE = re.compile(
    r"(?i)("
    r"SG\.[A-Za-z0-9_-]{16,256}\.[A-Za-z0-9_-]{16,512}|"
    r"eyJ[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}\.[A-Za-z0-9_-]{8,}|"
    r"\b(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]{12,}|"
    r"[a-z][a-z0-9+.-]{0,31}:/+(?:[^/\s:@]{0,256}:[^/\s@]{0,2048}|[^/\s@]{1,2048})@"
    r")"
)
# (label, pattern) pairs classifying the kind of secret risk found in content.
SECRET_RISK_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = (
    ("private_key_block", re.compile(r"(?is)-----BEGIN [A-Z0-9 ]*PRIVATE KEY-----")),
    ("github_token", re.compile(r"gh[pousr]_[A-Za-z0-9_]{20,}|github_pat_[A-Za-z0-9_]{20,}|glpat-[A-Za-z0-9_-]{12,}")),
    ("provider_api_key", re.compile(r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}|sk-[A-Za-z0-9][A-Za-z0-9_-]{20,}|AIza[0-9A-Za-z_\-]{20,}")),
    ("authorization_header", re.compile(r"(?i)Authorization\s*:\s*(?:Bearer|Basic)\s+[A-Za-z0-9._~+/=-]+")),
    ("generic_secret_assignment", re.compile(r"(?i)(?:api[_-]?key|token|secret|password|client[_-]?secret)\s*[:=]\s*[^\s]+")),
)
# File extensions treated as text for the repo map (presumably; consumer not visible here).
REPO_MAP_TEXT_EXTENSIONS = {
    ".py", ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
    ".go", ".rs", ".java", ".kt", ".kts", ".swift", ".c", ".cc", ".cpp", ".h", ".hpp",
    ".md", ".mdx", ".txt", ".json", ".yaml", ".yml", ".toml", ".sh", ".css", ".html",
}
SYMBOL_HINT_EXTENSIONS = {".py", ".js", ".jsx", ".ts", ".tsx", ".go", ".rs"}
# One capture group per declaration form: JS function, class, JS arrow
# assignment, Go func, Rust fn.
SIGNATURE_LINE_RE = re.compile(
    r"^\s*(?:export\s+)?(?:(?:async\s+)?function\s+([A-Za-z_$][\w$]*)\s*\(|class\s+([A-Za-z_$][\w$]*)|"
    r"(?:const|let|var)\s+([A-Za-z_$][\w$]*)\s*=\s*(?:async\s*)?(?:\([^)]*\)|[A-Za-z_$][\w$]*)\s*=>|"
    r"func\s+(?:\([^)]*\)\s*)?([A-Za-z_]\w*)\s*\(|(?:pub\s+)?(?:async\s+)?fn\s+([A-Za-z_]\w*)\s*\()"
)
# Named groups distinguish JS "from '...'" / "import '...'" from Python
# "from x import" / "import x" forms.
IMPORT_PATH_RE = re.compile(
    r"(?:from\s+['\"](?P<jsfrom>[^'\"]+)['\"]|"
    r"import(?:\s+[^;\n'\"]+?\s+from)?\s+['\"](?P<jsimport>[^'\"]+)['\"]|"
    r"from\s+(?P<pyfrom>\.*[A-Za-z_][\w.]*|\.+)\s+import|"
    r"import\s+(?P<pyimport>[A-Za-z_][\w.]*))"
)
# Python "from <module> import <names>" line, capturing the module and the name list.
PY_FROM_IMPORT_LINE_RE = re.compile(r"^\s*from\s+(?P<module>\.*[A-Za-z_][\w.]*|\.+)\s+import\s+(?P<names>[^\n#;]+)")
115
-
116
-
117
@dataclass(frozen=True)
class LineRange:
    """Immutable 1-based, end-inclusive span of file lines."""

    start: int
    end: int

    def as_dict(self) -> dict[str, int]:
        """Return the span as a ``{"start": ..., "end": ...}`` mapping."""
        return dict(start=self.start, end=self.end)

    def identity(self) -> str:
        """Return the span in ``start:end`` text form."""
        return ":".join((str(self.start), str(self.end)))
127
-
128
-
129
@dataclass
class SourceSpec:
    """One requested source file, as parsed from ``--source`` or a manifest."""

    # Path as given by the user; validated later by lexical_rel().
    path: str
    # Clamped to [-1_000_000, 1_000_000] by the parsers in this module.
    priority: int = 0
    # None means the whole file; LineRange(-1, -1) marks an unparsable range.
    lines: LineRange | None = None
    label: str | None = None
    input_index: int = 0
    # "cli" or "manifest", set by the parser that created the spec.
    origin: str = "cli"
137
-
138
-
139
@dataclass
class ResolvedSource:
    """A source that was opened, sanitized, and sliced by resolve_source()."""

    spec: SourceSpec
    # root joined with the lexically validated relative path.
    abs_path: Path
    # Path with secret-looking components replaced.
    display_path: str
    # True when any component of display_path was redacted.
    redacted_path: bool
    requested_lines: LineRange | None
    # Sanitized lines inside the requested range (line endings kept).
    selected_lines: list[str]
    # Line count of the whole sanitized file.
    total_lines: int
    # Number of lines the sanitizer reported as redacted.
    redacted_lines: int
149
-
150
-
151
@dataclass
class SuggestCandidate:
    """A scored candidate source.

    NOTE(review): presumably produced by the suggest feature; the producer is
    not visible in this part of the file.
    """

    path: str
    score: int
    reason: str
    lines: LineRange | None = None
    label: str | None = None
    input_index: int = 0
159
-
160
-
161
class PackError(ValueError):
    """Raised for invalid pack inputs such as bad line ranges, manifests, roots, or sources."""

    pass
163
-
164
-
165
class FallbackLineSanitizer:
    """Regex-only line redactor used when no sanitizer script can be loaded.

    ``redactions`` counts how many sanitized lines contained at least one
    match of ``SECRET_CONTENT_RE``.
    """

    def __init__(self, *, show_paths: bool = False) -> None:
        self.show_paths = show_paths
        self.redactions = 0

    def sanitize(self, raw_line: str) -> tuple[str, bool]:
        """Redact secret-looking spans in ``raw_line``.

        Returns ``(sanitized_line, did_redact)``. For ``key=value`` and
        ``key: value`` matches the key is kept and only the value is replaced.
        """

        def repl(match: re.Match[str]) -> str:
            text = match.group(0)
            # Split on the FIRST separator only. Values may themselves contain
            # "=" (base64 padding in "Authorization: Basic ...==") or ":";
            # splitting on a later "=" would echo most of the secret back.
            sep = re.search(r"[:=]", text)
            if sep is None:
                return "[REDACTED]"
            key = text[: sep.start()]
            # Keep the prefix only when it looks like a credential key;
            # otherwise redact the whole match.
            if not re.search(r"(?i)(api|token|secret|password|authorization)", key):
                return "[REDACTED]"
            if sep.group(0) == "=":
                return key + "=[REDACTED]"
            return key + ": [REDACTED]"

        line, count = SECRET_CONTENT_RE.subn(repl, raw_line)
        if count:
            self.redactions += 1
        return line, bool(count)
185
-
186
-
187
# Process-static cache: CLI invocations should not re-import the sanitizer for
# every file, while each sanitize_text() call still gets a fresh stateful
# sanitizer instance.
_LINE_SANITIZER_FACTORY_CACHE: Any | None = None
_LINE_SANITIZER_FACTORY_LOCK = threading.Lock()


def load_line_sanitizer_factory() -> Any:
    """Return the sanitizer class, loading it once and caching it.

    Looks beside this script for a sanitizer file and imports its
    ``LineSanitizer``. Falls back to ``FallbackLineSanitizer`` only when none
    of the candidate files exists.

    Raises:
        RuntimeError: a candidate file exists but could not be loaded.
    """
    global _LINE_SANITIZER_FACTORY_CACHE
    # Unlocked fast path once the cache is populated.
    if _LINE_SANITIZER_FACTORY_CACHE is not None:
        return _LINE_SANITIZER_FACTORY_CACHE
    with _LINE_SANITIZER_FACTORY_LOCK:
        # Re-check under the lock: another thread may have filled the cache.
        if _LINE_SANITIZER_FACTORY_CACHE is not None:
            return _LINE_SANITIZER_FACTORY_CACHE
        script_dir = Path(__file__).resolve().parent
        for name in ("sanitize_output.py", "context-guard-sanitize-output", "claude-sanitize-output"):
            candidate = script_dir / name
            if not candidate.exists():
                continue
            try:
                # SourceFileLoader is used directly because two of the
                # candidate names have no ".py" suffix.
                loader = importlib.machinery.SourceFileLoader(f"_context_guard_pack_sanitize_{os.getpid()}", str(candidate))
                spec = importlib.util.spec_from_loader(loader.name, loader)
                if spec is None:
                    raise RuntimeError("import spec unavailable")
                module = importlib.util.module_from_spec(spec)
                loader.exec_module(module)
                _LINE_SANITIZER_FACTORY_CACHE = module.LineSanitizer
                return _LINE_SANITIZER_FACTORY_CACHE
            except Exception as exc:
                # A present-but-broken sanitizer is an error, not a reason to
                # fall back to the weaker built-in one.
                raise RuntimeError(f"could not load sanitizer {candidate}: {exc}") from exc
        _LINE_SANITIZER_FACTORY_CACHE = FallbackLineSanitizer
        return _LINE_SANITIZER_FACTORY_CACHE
219
-
220
-
221
def load_line_sanitizer(show_paths: bool = False) -> object:
    """Create a fresh sanitizer instance from the cached sanitizer class."""
    return load_line_sanitizer_factory()(show_paths=show_paths)
224
-
225
-
226
def sanitize_text(text: str, *, show_paths: bool = False) -> tuple[str, int]:
    """Sanitize ``text`` line by line.

    Returns the sanitized text (line endings preserved) and the number of
    lines on which the sanitizer reported a redaction.
    """
    scrubber = load_line_sanitizer(show_paths)
    pieces: list[str] = []
    hit_lines = 0
    for raw in text.splitlines(True):
        clean, changed = scrubber.sanitize(raw)  # type: ignore[attr-defined]
        pieces.append(clean)
        hit_lines += 1 if changed else 0
    return "".join(pieces), hit_lines
236
-
237
-
238
def byte_len(text: str) -> int:
    """Return the UTF-8 encoded size of ``text``; unencodable characters count as their replacement."""
    encoded = bytes(text, "utf-8", "replace")
    return len(encoded)
240
-
241
-
242
def token_proxy(text: str) -> int:
    """Estimate a token count from character length; non-empty text counts as at least 1."""
    if text:
        estimate = round(len(text) / TOKEN_PROXY_CHARS_PER_TOKEN)
        return estimate if estimate > 1 else 1
    return 0
246
-
247
-
248
def sha256_text(text: str) -> str:
    """Return the hex SHA-256 digest of ``text`` encoded as UTF-8 (errors replaced)."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8", errors="replace"))
    return digest.hexdigest()
250
-
251
-
252
def path_hash(path: Path) -> str:
    """Return the first 12 hex characters of the SHA-256 of ``str(path)``."""
    encoded = str(path).encode("utf-8", "replace")
    full_digest = hashlib.sha256(encoded).hexdigest()
    return full_digest[:12]
254
-
255
-
256
def sanitize_path_component(component: str) -> tuple[str, bool]:
    """Return ``(component, False)``, or the redaction placeholder and ``True`` if it looks like a secret."""
    hit = SECRET_CONTENT_RE.search(component)
    return (component, False) if hit is None else (REDACTED_PATH_COMPONENT, True)
260
-
261
-
262
def display_root(root: Path) -> str:
    """Return a display label for ``root``: its name (or "project") plus a short path hash."""
    shown, was_redacted = sanitize_path_component(root.name or "project")
    label = "project" if was_redacted else shown
    return f"{label}#path:{path_hash(root)}"
267
-
268
-
269
def display_rel_path(rel: str) -> tuple[str, bool]:
    """Return ``rel`` with "/" separators and secret-looking components redacted.

    The second element is True when at least one component was redacted.
    Empty components are dropped.
    """
    cleaned: list[str] = []
    any_redacted = False
    for segment in rel.replace("\\", "/").split("/"):
        if segment:
            safe, did = sanitize_path_component(segment)
            cleaned.append(safe)
            if did:
                any_redacted = True
    return "/".join(cleaned), any_redacted
280
-
281
-
282
def repo_map_path_has_sensitive_evidence(value: str) -> bool:
    """Return True if ``value`` contains control characters or path-level secret patterns."""
    if CONTROL_CHAR_RE.search(value):
        return True
    return SECRET_PATH_COMPONENT_RE.search(value) is not None
284
-
285
-
286
def repo_map_display_rel_path(rel: str) -> tuple[str, bool]:
    """Like ``display_rel_path``, but replace a sensitive path entirely with a hashed placeholder."""
    normalized = rel.replace("\\", "/")
    if not repo_map_path_has_sensitive_evidence(normalized):
        return display_rel_path(normalized)
    digest = sha256_text(normalized)[:12]
    return f"redacted-path#path:{digest}", True
291
-
292
-
293
def repo_map_safe_raw_path_label(raw: str) -> str:
    """Like ``safe_raw_path_label``, but replace a sensitive path entirely with a hashed placeholder."""
    normalized = raw.replace("\\", "/")
    if not repo_map_path_has_sensitive_evidence(normalized):
        return safe_raw_path_label(normalized)
    digest = sha256_text(normalized)[:12]
    return f"redacted-path#path:{digest}"
298
-
299
-
300
def parse_line_range(value: object) -> LineRange | None:
    """Parse a line-range request into a ``LineRange``.

    Accepts ``None`` or ``""`` (no range, returns ``None``), a mapping with
    ``start`` and ``end`` entries, or a ``"start:end"`` string.

    Raises:
        PackError: ``"invalid_lines"`` when the value has the wrong type,
            cannot be converted to integers, or violates ``1 <= start <= end``.
    """
    if value is None or value == "":
        return None
    if isinstance(value, dict):
        raw_start, raw_end = value.get("start"), value.get("end")
    elif isinstance(value, str):
        raw_start, sep, raw_end = value.partition(":")
        if not sep:
            raise PackError("invalid_lines")
    else:
        raise PackError("invalid_lines")
    try:
        start = int(raw_start)
        end = int(raw_end)
    except (TypeError, ValueError, OverflowError) as exc:
        # OverflowError: JSON "Infinity" parses to float("inf"), and
        # int(inf) raises OverflowError rather than ValueError.
        raise PackError("invalid_lines") from exc
    if start < 1 or end < start:
        raise PackError("invalid_lines")
    return LineRange(start, end)
323
-
324
-
325
def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
    """Convert ``value`` to int and clamp it to ``[minimum, maximum]``.

    Returns ``default`` unclamped when the conversion fails.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        return default
    raised = parsed if parsed >= minimum else minimum
    return raised if raised <= maximum else maximum
331
-
332
-
333
def cap_label(value: object, default: str | None = None, limit: int = MAX_LABEL_CHARS) -> str | None:
    """Normalize a label: collapse whitespace, redact secrets, truncate to ``limit``.

    Returns ``default`` when ``value`` is None or the cleaned text is empty.
    """
    if value is None:
        return default
    collapsed = " ".join(str(value).strip().split())
    scrubbed = SECRET_CONTENT_RE.sub("[REDACTED]", collapsed)
    if not scrubbed:
        return default
    if len(scrubbed) <= limit:
        return scrubbed
    # 15 is the length of the " ...[truncated]" marker.
    keep = max(0, limit - 15)
    return scrubbed[:keep].rstrip() + " ...[truncated]"
343
-
344
-
345
def read_manifest(path: Path) -> list[SourceSpec]:
    """Load a JSON manifest and return its sources as ``SourceSpec`` objects.

    An unparsable ``lines`` entry does not abort the load; it is recorded as
    ``LineRange(-1, -1)``.

    Raises:
        PackError: the file is unreadable, over the size cap, not valid
            UTF-8 JSON, has an unsupported version, or has malformed sources.
    """
    try:
        payload = path.read_bytes()
    except OSError as exc:
        raise PackError(f"could not read manifest: {exc.strerror or exc.__class__.__name__}") from exc
    size = len(payload)
    if size > MAX_MANIFEST_BYTES:
        raise PackError(f"manifest exceeds trusted size cap: {size} > {MAX_MANIFEST_BYTES}")
    try:
        data = json.loads(payload.decode("utf-8"))
    except (UnicodeDecodeError, json.JSONDecodeError) as exc:
        raise PackError(f"invalid manifest JSON: {exc}") from exc
    is_mapping = isinstance(data, dict)
    # A non-object document has version None and fails the version check.
    version = data.get("version", VERSION) if is_mapping else None
    if version != VERSION:
        raise PackError(f"unsupported manifest version: {version}")
    sources = data.get("sources") if is_mapping else None
    if not isinstance(sources, list):
        raise PackError("manifest sources must be a list")
    specs: list[SourceSpec] = []
    for entry in sources:
        if not isinstance(entry, dict):
            raise PackError("manifest sources must be objects")
        if "path" not in entry:
            raise PackError("manifest source missing path")
        try:
            line_range = parse_line_range(entry.get("lines"))
        except PackError:
            line_range = LineRange(-1, -1)
        specs.append(
            SourceSpec(
                path=str(entry.get("path", "")),
                priority=bounded_int(entry.get("priority"), 0, -1_000_000, 1_000_000),
                lines=line_range,
                label=cap_label(entry.get("label")),
                origin="manifest",
            )
        )
    return specs
380
-
381
-
382
def parse_source_spec(raw: str) -> SourceSpec:
    """Parse one ``--source`` value into a ``SourceSpec``.

    The value is either a bare path or comma-separated ``key=value`` parts
    (``path``, ``lines``, ``priority``, ``label``). It is treated as a bare
    path when the text before the first comma contains no "=".

    Raises:
        PackError: the value is empty, has a part without "=", or has no path.
    """
    raw = raw.strip()
    if not raw:
        raise PackError("empty --source")
    fields: dict[str, str] = {}
    first_chunk = raw.split(",", 1)[0]
    if "=" in first_chunk:
        for chunk in raw.split(","):
            if not chunk:
                continue
            if "=" not in chunk:
                raise PackError(f"invalid --source part: {chunk}")
            name, _, val = chunk.partition("=")
            fields[name.strip()] = val.strip()
    else:
        fields["path"] = raw
    if not fields.get("path"):
        raise PackError("--source missing path")
    try:
        line_range = parse_line_range(fields.get("lines"))
    except PackError:
        # Unparsable ranges are kept as LineRange(-1, -1).
        line_range = LineRange(-1, -1)
    return SourceSpec(
        path=fields["path"],
        priority=bounded_int(fields.get("priority"), 0, -1_000_000, 1_000_000),
        lines=line_range,
        label=cap_label(fields.get("label")),
        origin="cli",
    )
410
-
411
-
412
def normalize_root(raw_root: Path) -> Path:
    """Expand and resolve the project root.

    Raises:
        PackError: the root is a symlink, cannot be resolved, or is not a directory.
    """
    candidate = raw_root.expanduser()
    try:
        if candidate.is_symlink():
            raise PackError("root must not be a symlink")
        resolved = candidate.resolve()
    except OSError as exc:
        raise PackError(f"could not resolve root: {exc.strerror or exc.__class__.__name__}") from exc
    if resolved.is_dir():
        return resolved
    raise PackError("root must be a directory")
423
-
424
-
425
def omission(spec: SourceSpec, reason: str, *, path: str | None = None, redacted_path: bool = False) -> dict[str, Any]:
    """Build the receipt entry for a source that was left out of the pack.

    When ``path`` is not given, a sanitized label derived from ``spec.path``
    is used.
    """
    shown = safe_raw_path_label(spec.path) if path is None else path
    record: dict[str, Any] = {
        "path": shown,
        "status": "omitted",
        "priority": spec.priority,
        "reason": reason,
        "input_index": spec.input_index,
    }
    if spec.label:
        record["label"] = spec.label
    wanted = spec.lines
    # A start <= 0 marks an unparsable range and is not reported.
    if wanted and wanted.start > 0:
        record["requested_lines"] = wanted.as_dict()
    if redacted_path:
        record["retrieval_omitted_reason"] = "redacted_path"
    return record
440
-
441
-
442
def safe_raw_path_label(raw: str) -> str:
    """Return a display label for an unvalidated path.

    Empty and "." components are dropped and secret-looking components are
    redacted. Returns "path" when nothing remains.
    """
    segments = raw.replace("\\", "/").split("/")
    kept = [sanitize_path_component(seg)[0] for seg in segments if seg not in {"", "."}]
    return "/".join(kept) or "path"
451
-
452
-
453
def lexical_rel(raw_path: str) -> tuple[Path | None, str]:
    """Validate ``raw_path`` lexically as a relative path inside the root.

    Returns ``(path, "")`` on success, or ``(None, "outside_root")`` for
    absolute paths, paths with a ".." component, and paths that reduce to nothing.
    """
    rejected = (None, "outside_root")
    candidate = Path(raw_path)
    if candidate.is_absolute():
        return rejected
    segments = candidate.parts
    if not segments:
        return rejected
    for seg in segments:
        if seg == ".." or seg == "":
            return rejected
    cleaned = Path(*[seg for seg in segments if seg != "."])
    if not cleaned.parts:
        return rejected
    return cleaned, ""
464
-
465
-
466
def open_dir_no_follow(path: Path | str, *, dir_fd: int | None = None) -> int:
    """Open a directory read-only and return its file descriptor.

    O_DIRECTORY, O_NOFOLLOW, and O_CLOEXEC are applied where the platform
    defines them. The caller owns the returned descriptor.

    Raises:
        PackError: the opened object is not a directory.
        OSError: the open or fstat failed.
    """
    flags = os.O_RDONLY
    for optional in ("O_DIRECTORY", "O_NOFOLLOW", "O_CLOEXEC"):
        flags |= getattr(os, optional, 0)
    fd = os.open(path, flags) if dir_fd is None else os.open(path, flags, dir_fd=dir_fd)
    try:
        if not stat.S_ISDIR(os.fstat(fd).st_mode):
            raise PackError("not a directory")
    except Exception:
        # Close the descriptor before re-raising so it does not leak.
        os.close(fd)
        raise
    return fd
486
-
487
-
488
def file_open_flags() -> int:
    """Return read-only open flags plus O_NOFOLLOW, O_CLOEXEC, O_NONBLOCK, and O_NOCTTY where defined."""
    return (
        os.O_RDONLY
        | getattr(os, "O_NOFOLLOW", 0)
        | getattr(os, "O_CLOEXEC", 0)
        | getattr(os, "O_NONBLOCK", 0)
        | getattr(os, "O_NOCTTY", 0)
    )
493
-
494
-
495
def stat_leaf_no_follow(name: str, *, dir_fd: int) -> os.stat_result | None:
    """Stat ``name`` relative to ``dir_fd`` without following symlinks.

    Returns None when ``os.stat`` lacks ``dir_fd`` or ``follow_symlinks``
    support on this platform.
    """
    can_dir_fd = os.stat in getattr(os, "supports_dir_fd", set())
    can_no_follow = os.stat in getattr(os, "supports_follow_symlinks", set())
    if can_dir_fd and can_no_follow:
        return os.stat(name, dir_fd=dir_fd, follow_symlinks=False)
    return None
501
-
502
-
503
def open_regular_under_root(root: Path, rel: Path) -> tuple[Any | None, str]:
    """Open ``rel`` beneath ``root`` as a text file, one component at a time.

    Each intermediate directory is opened relative to the previous
    directory's descriptor via open_dir_no_follow(); the final component
    must be a regular file.

    Returns:
        ``(handle, "")`` on success, where ``handle`` is a UTF-8 text file
        object (decode errors replaced, newlines untranslated). Otherwise
        ``(None, reason)`` with reason "outside_root", "missing",
        "unsafe_path", or "empty_source".
    """
    current_fd: int | None = None
    try:
        current_fd = open_dir_no_follow(root)
        for index, part in enumerate(rel.parts):
            if part in {"", ".", ".."}:
                return None, "outside_root"
            is_final = index == len(rel.parts) - 1
            if not is_final:
                try:
                    next_fd = open_dir_no_follow(part, dir_fd=current_fd)
                except FileNotFoundError:
                    return None, "missing"
                except NotADirectoryError:
                    return None, "missing"
                except OSError:
                    return None, "unsafe_path"
                # Hand the walk over to the child directory descriptor.
                os.close(current_fd)
                current_fd = next_fd
                continue
            try:
                # None when the platform cannot do a dir_fd no-follow stat.
                pre_st = stat_leaf_no_follow(part, dir_fd=current_fd)
            except FileNotFoundError:
                return None, "missing"
            except NotADirectoryError:
                return None, "missing"
            except OSError:
                return None, "unsafe_path"
            if pre_st is not None:
                if stat.S_ISLNK(pre_st.st_mode):
                    return None, "unsafe_path"
                if not stat.S_ISREG(pre_st.st_mode):
                    return None, "empty_source"
            flags = file_open_flags()
            file_fd = -1
            try:
                file_fd = os.open(part, flags, dir_fd=current_fd)
                # Re-check the type on the opened descriptor itself.
                st = os.fstat(file_fd)
                if not stat.S_ISREG(st.st_mode):
                    os.close(file_fd)
                    file_fd = -1
                    return None, "empty_source"
                handle = os.fdopen(file_fd, "r", encoding="utf-8", errors="replace", newline="")
                # The file object now owns the descriptor; skip the finally-close.
                file_fd = -1
                return handle, ""
            except FileNotFoundError:
                return None, "missing"
            except IsADirectoryError:
                return None, "empty_source"
            except NotADirectoryError:
                return None, "missing"
            except OSError:
                return None, "unsafe_path"
            finally:
                if file_fd >= 0:
                    try:
                        os.close(file_fd)
                    except OSError:
                        pass
    except OSError:
        return None, "unsafe_path"
    finally:
        if current_fd is not None:
            try:
                os.close(current_fd)
            except OSError:
                pass
    # Reached only when rel has no components.
    return None, "unsafe_path"
571
-
572
-
573
def resolve_source(root: Path, spec: SourceSpec) -> tuple[ResolvedSource | None, dict[str, Any] | None]:
    """Read, sanitize, and slice the file described by ``spec``.

    Returns ``(resolved, None)`` on success or ``(None, omission_record)``
    when the source is invalid, unreadable, or yields no lines.
    """
    if spec.lines is not None and spec.lines.start < 1:
        return None, omission(spec, "invalid_lines")
    rel, reason = lexical_rel(spec.path)
    if rel is None:
        return None, omission(spec, reason)
    display, redacted_path = display_rel_path(rel.as_posix())

    def skipped(why: str) -> tuple[None, dict[str, Any]]:
        # Omission record that carries the sanitized display path.
        return None, omission(spec, why, path=display, redacted_path=redacted_path)

    handle, reason = open_regular_under_root(root, rel)
    if handle is None:
        return skipped(reason)
    try:
        with handle:
            raw_text = handle.read()
    except OSError:
        return skipped("unsafe_path")
    # Sanitize the whole file before slicing.
    sanitized, redacted_lines = sanitize_text(raw_text)
    all_lines = sanitized.splitlines(True)
    total_lines = len(all_lines)
    if total_lines == 0:
        return skipped("empty_source")
    requested = spec.lines or LineRange(1, total_lines)
    if requested.start > total_lines:
        return skipped("empty_source")
    last = min(requested.end, total_lines)
    selected = all_lines[requested.start - 1:last]
    if not selected:
        return skipped("empty_source")
    resolved = ResolvedSource(
        spec=spec,
        abs_path=root / rel,
        display_path=display,
        redacted_path=redacted_path,
        requested_lines=requested,
        selected_lines=selected,
        total_lines=total_lines,
        redacted_lines=redacted_lines,
    )
    return resolved, None
610
-
611
-
612
def retrieval_cli(root_arg: str, display_path: str, lines: LineRange) -> str:
    """Return the shell-quoted ``context-guard-pack slice`` command for the given lines."""
    argv = [
        "context-guard-pack",
        "slice",
        "--root",
        shlex.quote(root_arg),
        "--path",
        shlex.quote(display_path),
        "--lines",
        f"{lines.start}:{lines.end}",
        "--json",
    ]
    return " ".join(argv)
617
-
618
-
619
def safe_root_arg_for_retrieval(root_arg: str) -> str | None:
    """Return ``root_arg`` as text if it is safe to print in a retrieval command, else None.

    It is unsafe when it contains control characters, secret-looking content,
    or any path component that would be redacted.
    """
    text = str(root_arg)
    detectors = (CONTROL_CHAR_RE, SECRET_CONTENT_RE, SECRET_PATH_COMPONENT_RE)
    if any(rx.search(text) for rx in detectors):
        return None
    segments = (seg for seg in text.replace("\\", "/").split("/") if seg)
    if any(sanitize_path_component(seg)[1] for seg in segments):
        return None
    return text
630
-
631
-
632
def safe_repo_map_root_arg_for_retrieval(root_arg: str) -> str | None:
    """Repo-map variant of ``safe_root_arg_for_retrieval`` with the path-evidence check applied first."""
    text = str(root_arg)
    if not repo_map_path_has_sensitive_evidence(text):
        return safe_root_arg_for_retrieval(text)
    return None
637
-
638
-
639
def retrieval_for(root_arg: str, display_path: str, lines: LineRange, *, redacted_path: bool) -> tuple[str | None, str | None]:
    """Return ``(command, None)``, or ``(None, reason)`` when no command may be shown.

    The reason is "redacted_path" or "unsafe_root_path".
    """
    if not redacted_path:
        safe_root = safe_root_arg_for_retrieval(root_arg)
        if safe_root is not None:
            return retrieval_cli(safe_root, display_path, lines), None
        return None, "unsafe_root_path"
    return None, "redacted_path"
646
-
647
-
648
def render_block(source: ResolvedSource, lines: list[str], *, root_arg: str, status: str, included: LineRange) -> str:
    """Render one source as a Markdown section: a metadata header and a fenced text block."""
    title = source.spec.label or source.display_path
    requested = source.requested_lines or LineRange(1, source.total_lines)
    retrieval, why_not = retrieval_for(root_arg, source.display_path, included, redacted_path=source.redacted_path)
    header = [
        f"## {title}",
        f"Source: `{source.display_path}`",
        f"Priority: {source.spec.priority}",
        f"Status: {status}",
        f"Included lines: {included.start}:{included.end}",
        f"Requested lines: {requested.start}:{requested.end}",
    ]
    if retrieval:
        header.append(f"Retrieval: `{retrieval}`")
    elif why_not:
        header.append(f"Retrieval omitted: {why_not}")
    body = "".join(lines)
    # Put the closing fence on its own line.
    if lines and not lines[-1].endswith("\n"):
        body += "\n"
    return "\n".join(header) + "\n\n```text\n" + body + "```\n\n"
665
-
666
-
667
def source_metadata(source: ResolvedSource, *, status: str, lines: list[str], included: LineRange, root_arg: str) -> dict[str, Any]:
    """Build the receipt entry for a source that was included, fully or partially."""
    requested = source.requested_lines or LineRange(1, source.total_lines)
    spec = source.spec
    record: dict[str, Any] = {
        "path": source.display_path,
        "status": status,
        "priority": spec.priority,
        "input_index": spec.input_index,
        "requested_lines": requested.as_dict(),
        "included_lines": included.as_dict(),
        "bytes": byte_len("".join(lines)),
    }
    if spec.label:
        record["label"] = spec.label
    command, why_not = retrieval_for(root_arg, source.display_path, included, redacted_path=source.redacted_path)
    if command:
        record["retrieval_cli"] = command
    elif why_not:
        record["retrieval_omitted_reason"] = why_not
    if status == "partial":
        record["reason"] = "budget_exhausted"
    return record
688
-
689
-
690
def budget_omission(source: ResolvedSource, *, root_arg: str) -> dict[str, Any]:
    """Build the receipt entry for a resolved source dropped because the budget ran out."""
    requested = source.requested_lines or LineRange(1, source.total_lines)
    record = omission(source.spec, "budget_exhausted", path=source.display_path, redacted_path=source.redacted_path)
    record["requested_lines"] = requested.as_dict()
    record["total_lines"] = source.total_lines
    command, why_not = retrieval_for(root_arg, source.display_path, requested, redacted_path=source.redacted_path)
    if command:
        record["retrieval_cli"] = command
        # A usable command replaces any omission reason set by omission().
        record.pop("retrieval_omitted_reason", None)
    elif why_not:
        record["retrieval_omitted_reason"] = why_not
    return record
702
-
703
-
704
def fit_partial_lines(source: ResolvedSource, remaining: int, *, root_arg: str) -> tuple[list[str], str | None, LineRange | None]:
    """Take as many leading lines of ``source`` as fit in ``remaining`` bytes.

    The size checked is that of the fully rendered "partial" block, header
    included. Returns ``(lines, rendered_block, included_range)``, or
    ``([], None, None)`` when not even one line fits.
    """
    if remaining <= 0:
        return [], None, None
    first = source.requested_lines.start if source.requested_lines else 1
    taken: list[str] = []
    best_block: str | None = None
    best_range: LineRange | None = None
    for line in source.selected_lines:
        trial = taken + [line]
        trial_range = LineRange(first, first + len(trial) - 1)
        rendered = render_block(source, trial, root_arg=root_arg, status="partial", included=trial_range)
        if byte_len(rendered) > remaining:
            break
        # Keep the last rendering that fit; it is what gets returned.
        taken, best_block, best_range = trial, rendered, trial_range
    if not taken:
        return [], None, None
    return taken, best_block, best_range
720
-
721
-
722
def metadata_size(data: dict[str, Any]) -> int:
    """Return the UTF-8 size of ``data`` as indented, key-sorted JSON, plus one byte."""
    rendered = json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True)
    encoded = rendered.encode("utf-8", errors="replace")
    return len(encoded) + 1
724
-
725
-
726
def artifact_failure(error: str, *, bytes_count: int = 0, capped: bool = False) -> dict[str, Any]:
    """Build the record describing a receipt artifact that was not stored."""
    record: dict[str, Any] = dict(stored=False, path=None)
    record["bytes"] = bytes_count
    record["capped"] = capped
    record["error"] = error
    record["cap_bytes"] = MAX_RECEIPT_BYTES
    return record
735
-
736
-
737
def ensure_private_pack_dir(root: Path) -> tuple[Path | None, int | None, str | None]:
    """Create/verify the receipt directory by walking from a no-follow root fd.

    Returns:
        ``(path, dir_fd, None)`` on success, where ``dir_fd`` is an open
        descriptor for the directory that the caller must close. On failure,
        ``(None, None, error)`` with error "artifact_dir_unavailable" or
        "unsafe_artifact_dir".
    """
    current_fd: int | None = None
    try:
        current_fd = open_dir_no_follow(root)
        for part in (".context-guard", "packs"):
            # Open-or-create loop: retry the open after creating the directory.
            while True:
                try:
                    next_fd = open_dir_no_follow(part, dir_fd=current_fd)
                    break
                except FileNotFoundError:
                    try:
                        os.mkdir(part, 0o700, dir_fd=current_fd)
                    except FileExistsError:
                        # Something created it in the meantime; retry the open.
                        continue
                    except (OSError, NotImplementedError):
                        return None, None, "artifact_dir_unavailable"
                except NotADirectoryError:
                    return None, None, "unsafe_artifact_dir"
                except (OSError, NotImplementedError):
                    return None, None, "unsafe_artifact_dir"
            try:
                # Best effort: force owner-only permissions on the directory.
                os.fchmod(next_fd, 0o700)
            except (AttributeError, OSError):
                pass
            os.close(current_fd)
            current_fd = next_fd
        # Transfer ownership of the final descriptor to the caller so the
        # finally block does not close it.
        dir_fd = current_fd
        current_fd = None
        return root / PACK_DIR, dir_fd, None
    except OSError:
        return None, None, "unsafe_artifact_dir"
    finally:
        if current_fd is not None:
            try:
                os.close(current_fd)
            except OSError:
                pass
775
-
776
-
777
def atomic_write_ops_supported() -> bool:
    """Report whether ``os.open``, ``os.rename`` and ``os.unlink`` all accept ``dir_fd``."""
    required = (os.open, os.rename, os.unlink)
    return all(func in os.supports_dir_fd for func in required)
783
-
784
-
785
def fsync_dir_fd(dir_fd: int) -> None:
    """Call ``os.fsync`` on the directory descriptor; any ``OSError`` propagates."""
    os.fsync(dir_fd)
787
-
788
-
789
def validate_existing_output_target_at(dir_fd: int, filename: str, option_name: str) -> None:
    """Reject an existing output target that is not a writable regular file.

    ``filename`` is opened write-only relative to ``dir_fd`` (adding
    ``O_NOFOLLOW``, ``O_CLOEXEC`` and ``O_NONBLOCK`` where the platform defines
    them). A missing file is acceptable and returns silently.

    Raises:
        PackError: if the target is a directory or other non-regular file, or
            if opening it fails for any reason other than not existing.
    """
    flags = os.O_WRONLY
    for optional in ("O_NOFOLLOW", "O_CLOEXEC", "O_NONBLOCK"):
        flags |= getattr(os, optional, 0)

    probe_fd = -1
    try:
        probe_fd = os.open(filename, flags, dir_fd=dir_fd)
        if not stat.S_ISREG(os.fstat(probe_fd).st_mode):
            raise PackError(f"invalid {option_name}: unsafe_path")
    except FileNotFoundError:
        return
    except IsADirectoryError as exc:
        raise PackError(f"invalid {option_name}: unsafe_path") from exc
    except OSError as exc:
        raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    finally:
        if probe_fd >= 0:
            try:
                os.close(probe_fd)
            except OSError:
                pass
815
-
816
-
817
def write_text_atomic_at(dir_fd: int, filename: str, content: str, *, mode: int, option_name: str) -> None:
    """Write *content* to ``filename`` inside ``dir_fd`` via a temp file and rename.

    The content is written to an exclusively created temp file, fsynced, and
    renamed over the target within the same directory fd. The temp file is
    removed if the rename did not happen.

    Raises:
        PackError: for an unsafe filename, missing ``dir_fd`` support, or an
            unacceptable existing target.
        OSError: if creating, writing, syncing, or renaming fails.
    """
    if filename in {"", ".", ".."} or "/" in filename:
        raise PackError(f"invalid {option_name}: unsafe_path")
    if not atomic_write_ops_supported():
        raise PackError(f"invalid {option_name}: atomic_write_unsupported")
    validate_existing_output_target_at(dir_fd, filename, option_name)

    seed = f"{filename}:{os.getpid()}:{time.time_ns()}"
    digest = hashlib.sha256(seed.encode("utf-8", "replace")).hexdigest()[:16]
    temp_name = f".context-guard-pack-{digest}.tmp"

    flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL
    for optional in ("O_NOFOLLOW", "O_CLOEXEC"):
        flags |= getattr(os, optional, 0)

    raw_fd = -1
    needs_cleanup = False
    try:
        raw_fd = os.open(temp_name, flags, mode, dir_fd=dir_fd)
        needs_cleanup = True
        with os.fdopen(raw_fd, "w", encoding="utf-8", newline="") as stream:
            # The file object now owns the descriptor; do not close it again in ``finally``.
            raw_fd = -1
            stream.write(content)
            stream.flush()
            os.fsync(stream.fileno())
        fsync_dir_fd(dir_fd)
        os.rename(temp_name, filename, src_dir_fd=dir_fd, dst_dir_fd=dir_fd)
        needs_cleanup = False
        try:
            os.chmod(filename, mode, dir_fd=dir_fd, follow_symlinks=False)
        except (OSError, TypeError, NotImplementedError):
            # Best effort: the temp file was already created with ``mode``.
            pass
        fsync_dir_fd(dir_fd)
    finally:
        if raw_fd >= 0:
            try:
                os.close(raw_fd)
            except OSError:
                pass
        if needs_cleanup:
            try:
                os.unlink(temp_name, dir_fd=dir_fd)
            except OSError:
                pass
859
-
860
-
861
def write_private_json_at(dir_fd: int, filename: str, data: dict[str, Any]) -> None:
    """Serialize *data* as indented, key-sorted JSON and write it with mode ``0o600``.

    Raises:
        PackError: ``unsafe_artifact_path`` when ``filename`` is empty, ``.``,
            ``..`` or contains a slash; otherwise whatever the atomic writer raises.
    """
    if filename in {"", ".", ".."} or "/" in filename:
        raise PackError("unsafe_artifact_path")
    body = json.dumps(data, ensure_ascii=False, indent=2, sort_keys=True)
    write_text_atomic_at(dir_fd, filename, body + "\n", mode=0o600, option_name="artifact receipt")
866
-
867
-
868
def finalize_receipt_size(receipt: dict[str, Any]) -> int:
    """Store the receipt's own serialized size in ``receipt["artifact"]["bytes"]``.

    Writing the size can change the number of digits and therefore the size,
    so the value is recomputed until it stops changing, for at most four
    rounds. Returns the size measured after the last write.
    """
    artifact = receipt.setdefault("artifact", {})
    current = metadata_size(receipt)
    rounds_left = 4
    while rounds_left > 0:
        rounds_left -= 1
        artifact["bytes"] = current
        measured = metadata_size(receipt)
        if measured == current:
            return current
        current = measured
    # Did not converge: record the last measurement and report the resulting size.
    artifact["bytes"] = current
    return metadata_size(receipt)
879
-
880
-
881
def shrink_receipt_for_write(data: dict[str, Any]) -> tuple[dict[str, Any], bool]:
    """Return a deep copy of *data* reduced to fit ``MAX_RECEIPT_BYTES``, plus a capped flag.

    Reduction is staged and stops as soon as the receipt fits:
    1. trim omitted sources (drop previews, cap labels and reasons);
    2. trim included sources (drop previews, cap labels);
    3. drop the pack body and set ``pack_omitted_from_receipt``.

    The result may still exceed the cap after stage 3; this function does not
    re-check the size at that point.
    """
    receipt = copy.deepcopy(data)

    def fits() -> bool:
        return metadata_size(receipt) <= MAX_RECEIPT_BYTES

    if fits():
        return receipt, False

    artifact = receipt.setdefault("artifact", {})
    artifact["capped"] = True
    artifact["cap_bytes"] = MAX_RECEIPT_BYTES

    for entry in receipt.get("omitted_sources", []):
        if not isinstance(entry, dict):
            continue
        entry.pop("preview", None)
        if "label" in entry:
            entry["label"] = cap_label(entry.get("label"), limit=80)
        if "reason" in entry:
            entry["reason"] = cap_label(entry.get("reason"), default=str(entry.get("reason")), limit=MAX_REASON_CHARS)
    if fits():
        return receipt, True

    for entry in receipt.get("included_sources", []):
        if not isinstance(entry, dict):
            continue
        entry.pop("preview", None)
        if "label" in entry:
            entry["label"] = cap_label(entry.get("label"), limit=80)
    if fits():
        return receipt, True

    # The stdout payload remains authoritative for the full pack body. Receipts may omit it to stay readable.
    receipt["pack_omitted_from_receipt"] = True
    receipt.pop("pack", None)
    return receipt, True
909
-
910
-
911
def store_receipt(root: Path, result: dict[str, Any]) -> dict[str, Any]:
    """Persist a size-capped receipt for *result* and return the ``artifact`` summary.

    The receipt is shrunk to the size cap, annotated with its storage path and
    final byte size, and written as ``<pack_id>.json`` in the private pack
    directory. Directory, size-cap, and write failures are reported through
    ``artifact_failure`` rather than raised.
    """
    out_dir, dir_fd, dir_error = ensure_private_pack_dir(root)
    if out_dir is None or dir_fd is None:
        return artifact_failure(dir_error or "unsafe_artifact_dir")

    size = 0
    capped = False
    try:
        receipt, capped = shrink_receipt_for_write(result)
        size = metadata_size(receipt)
        if size > MAX_RECEIPT_BYTES:
            return artifact_failure("receipt_metadata_too_large", bytes_count=size, capped=True)

        pack_id = str(result["pack_id"])
        artifact = receipt.setdefault("artifact", {})
        artifact["stored"] = True
        artifact["path"] = f"{PACK_DIR}/{pack_id}.json"
        artifact["capped"] = capped

        # Adding the artifact fields changes the size, so measure again before writing.
        size = finalize_receipt_size(receipt)
        if size > MAX_RECEIPT_BYTES:
            return artifact_failure("receipt_metadata_too_large", bytes_count=size, capped=True)
        write_private_json_at(dir_fd, f"{pack_id}.json", receipt)
    except (OSError, PackError, NotImplementedError):
        return artifact_failure("artifact_write_failed", bytes_count=size, capped=capped)
    finally:
        try:
            os.close(dir_fd)
        except OSError:
            pass

    summary: dict[str, Any] = {"stored": True, "path": f"{PACK_DIR}/{pack_id}.json"}
    summary["bytes"] = size
    summary["capped"] = capped
    summary["cap_bytes"] = MAX_RECEIPT_BYTES
    return summary
945
-
946
-
947
def build_pack(root: Path, specs: list[SourceSpec], *, budget_bytes: int, root_arg: str, store_artifact: bool) -> dict[str, Any]:
    """Assemble a context pack from *specs* within ``budget_bytes``.

    Phases:
    1. Validate, de-duplicate, and resolve each spec, recording an omission
       for every spec that cannot be used.
    2. Order resolved sources by descending priority, then input order, then path.
    3. Append whole blocks while they fit; otherwise append the longest
       fitting prefix, or record a budget omission.
    4. Derive ``pack_id`` from the canonical inputs and the pack hash, build
       the result payload, and store a receipt when ``store_artifact`` is set.
    """
    seen_identities: set[tuple[str, str]] = set()
    resolved: list[ResolvedSource] = []
    omitted: list[dict[str, Any]] = []
    canonical_specs: list[dict[str, Any]] = []

    for spec in specs:
        rel, reason = lexical_rel(spec.path)
        if spec.lines is not None and spec.lines.start < 1:
            invalid = omission(spec, "invalid_lines")
            omitted.append(invalid)
            canonical_specs.append({"path": invalid.get("path"), "priority": spec.priority, "lines": "invalid", "status": "invalid_lines"})
            continue

        if rel is None:
            identity_lines = "invalid"
        elif spec.lines is not None and spec.lines.start > 0:
            identity_lines = spec.lines.identity()
        else:
            identity_lines = "all"
        identity = (rel.as_posix() if rel is not None else spec.path, identity_lines)

        if rel is not None:
            if identity in seen_identities:
                display, redacted = display_rel_path(rel.as_posix())
                omitted.append(omission(spec, "duplicate_source", path=display, redacted_path=redacted))
                canonical_specs.append({"path": display, "priority": spec.priority, "lines": identity_lines, "status": "duplicate_source"})
                continue
            seen_identities.add(identity)

        source, unresolved = resolve_source(root, spec)
        if unresolved is not None:
            omitted.append(unresolved)
            canonical_specs.append({"path": unresolved.get("path"), "priority": spec.priority, "lines": identity_lines, "status": unresolved.get("reason")})
            continue
        assert source is not None
        resolved.append(source)
        canonical_specs.append({"path": source.display_path, "priority": spec.priority, "lines": identity_lines, "status": "candidate"})

    resolved.sort(key=lambda item: (-item.spec.priority, item.spec.input_index, item.display_path))

    header = "# Context Pack\n\nGenerated by context-guard-pack. Token counts are estimated proxies; byte counts are observed.\n\n"
    parts: list[str] = []
    included: list[dict[str, Any]] = []
    used_bytes = 0
    header_bytes = byte_len(header)
    # The header is dropped entirely when it alone would exceed the budget.
    if header_bytes <= budget_bytes:
        parts.append(header)
        used_bytes += header_bytes

    for source in resolved:
        first_line = source.requested_lines.start if source.requested_lines else 1
        whole_range = LineRange(first_line, first_line + len(source.selected_lines) - 1)
        whole_block = render_block(source, source.selected_lines, root_arg=root_arg, status="included", included=whole_range)
        whole_bytes = byte_len(whole_block)
        remaining = budget_bytes - used_bytes
        if whole_bytes <= remaining:
            parts.append(whole_block)
            used_bytes += whole_bytes
            included.append(source_metadata(source, status="included", lines=source.selected_lines, included=whole_range, root_arg=root_arg))
            continue

        partial_lines, partial_block, partial_range = fit_partial_lines(source, remaining, root_arg=root_arg)
        if partial_block is None or partial_range is None:
            omitted.append(budget_omission(source, root_arg=root_arg))
            continue
        parts.append(partial_block)
        used_bytes += byte_len(partial_block)
        included.append(source_metadata(source, status="partial", lines=partial_lines, included=partial_range, root_arg=root_arg))

    pack = "".join(parts)
    pack_bytes = used_bytes
    redacted_lines = sum(source.redacted_lines for source in resolved)
    partial_count = sum(1 for item in included if item.get("status") == "partial")
    omitted_sorted = sorted(omitted, key=lambda item: (item.get("input_index", 0), str(item.get("path", "")), str(item.get("reason", ""))))

    # The pack id is a hash of the canonical inputs and the pack hash only.
    canonical = {
        "version": VERSION,
        "root": display_root(root),
        "budget_bytes": budget_bytes,
        "sources": canonical_specs,
        "pack_sha256": sha256_text(pack),
        "omission_summary": sorted({str(item.get("reason")) for item in omitted_sorted}),
    }
    canonical_json = json.dumps(canonical, ensure_ascii=False, sort_keys=True, separators=(",", ":"))
    pack_id = hashlib.sha256(canonical_json.encode("utf-8")).hexdigest()[:20]

    result: dict[str, Any] = {
        "tool": TOOL_NAME,
        "version": VERSION,
        "pack_id": pack_id,
        "root": display_root(root),
        "budget_bytes": budget_bytes,
        "pack_bytes": pack_bytes,
        "pack": pack,
        "token_proxy": {"measurement": "estimated", "method": f"chars_div_{TOKEN_PROXY_CHARS_PER_TOKEN}", "pack": token_proxy(pack)},
        "sources": {"total": len(specs), "included": len(included) - partial_count, "partial": partial_count, "omitted": len(omitted_sorted)},
        "included_sources": included,
        "omitted_sources": omitted_sorted,
        "redaction": {"redacted_lines": redacted_lines, "redacted_before_pack": True},
        "artifact": {"stored": False, "path": None, "bytes": 0, "capped": False, "cap_bytes": MAX_RECEIPT_BYTES},
        "created_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    }
    if store_artifact:
        result["artifact"] = store_receipt(root, result)
    return result
1042
-
1043
-
1044
def parse_all_sources(args: argparse.Namespace) -> list[SourceSpec]:
    """Collect source specs from ``args.manifest`` then ``args.source``, numbering them in order.

    Each spec's ``input_index`` is set to its position in the combined list.
    """
    collected: list[SourceSpec] = []
    if args.manifest:
        collected.extend(read_manifest(Path(args.manifest)))
    collected.extend(parse_source_spec(raw) for raw in (args.source or []))
    for position, spec in enumerate(collected):
        spec.input_index = position
    return collected
1053
-
1054
-
1055
def slice_source(root: Path, *, raw_path: str, lines: LineRange) -> tuple[dict[str, Any], int]:
    """Resolve one line range of one file and return ``(payload, exit_code)``.

    On success the payload has status ``"ok"`` with the selected content and
    the exit code is 0. If the source cannot be resolved, the payload has
    status ``"error"`` with the omission reason and the exit code is 1.
    """
    source, problem = resolve_source(root, SourceSpec(path=raw_path, lines=lines))
    if problem is not None:
        error_payload = {"tool": TOOL_NAME, "status": "error", "reason": problem.get("reason"), "path": problem.get("path")}
        return error_payload, 1
    assert source is not None

    content = "".join(source.selected_lines)
    query = {
        "type": "lines",
        "start": lines.start,
        # Report the end clamped to the file length rather than the raw request.
        "end": min(lines.end, source.total_lines),
        "returned_lines": len(source.selected_lines),
    }
    ok_payload = {
        "tool": TOOL_NAME,
        "version": VERSION,
        "status": "ok",
        "path": source.display_path,
        "query": query,
        "content": content,
        "bytes": byte_len(content),
        "redaction": {"redacted_lines": source.redacted_lines, "redacted_before_pack": True},
    }
    return ok_payload, 0
1074
-
1075
-
1076
def suggest_tokens(text: str) -> set[str]:
    """Return the set of lowercase search terms in *text*.

    Spans matching ``SECRET_CONTENT_RE`` are blanked out first. A term starts
    with ``[a-z0-9_]``, continues with ``[a-z0-9_.-]``, and is at least two
    characters long.
    """
    scrubbed = SECRET_CONTENT_RE.sub(" ", text.lower())
    terms: set[str] = set()
    for token in re.findall(r"[a-z0-9_][a-z0-9_.-]{1,}", scrubbed):
        if len(token) >= 2:
            terms.add(token)
    return terms
1079
-
1080
-
1081
def suggest_score_path(path: str, query_terms: set[str]) -> int:
    """Score *path* at 120 points per query term found as a substring of the lowercased path."""
    folded = path.lower()
    return 120 * sum(1 for term in query_terms if term in folded)
1088
-
1089
-
1090
def suggest_reason(*parts: str) -> str:
    """Join the non-empty *parts* with ``"; "`` and cap the result to ``MAX_REASON_CHARS``.

    Falls back to ``"local heuristic"`` when the capped label is empty.
    """
    joined = "; ".join(filter(None, parts))
    capped = cap_label(joined, default="local heuristic", limit=MAX_REASON_CHARS)
    return capped or "local heuristic"
1092
-
1093
-
1094
def split_suggest_files(values: list[str] | None) -> list[str]:
    """Flatten comma-separated option values into a list of stripped, non-empty entries."""
    return [
        piece.strip()
        for value in (values or [])
        for piece in str(value).split(",")
        if piece.strip()
    ]
1102
-
1103
-
1104
def line_window(line_number: int, total_lines: int | None, context_lines: int) -> LineRange:
    """Return the range ``line_number ± context_lines``.

    The start is never below 1 and the end is never below the start. When
    ``total_lines`` is given, the end is also clamped to ``max(1, total_lines)``.
    """
    start = max(1, line_number - context_lines)
    end = max(start, line_number + context_lines)
    if total_lines is not None:
        end = min(end, max(1, total_lines))
    return LineRange(start, end)
1111
-
1112
-
1113
def merge_line_window(existing: LineRange | None, line_number: int, context_lines: int) -> LineRange:
    """Widen *existing* so it also covers the context window around ``line_number``.

    With no existing range, the new window is returned on its own. The window
    is computed without a known file length.
    """
    fresh = line_window(line_number, None, context_lines)
    if existing is None:
        return fresh
    low = min(existing.start, fresh.start)
    high = max(existing.end, fresh.end)
    return LineRange(low, high)
1118
-
1119
-
1120
def add_suggest_candidate(
    candidates: list[SuggestCandidate],
    *,
    path: str,
    score: int,
    reason: str,
    lines: LineRange | None = None,
    label: str | None = None,
) -> None:
    """Append a new ``SuggestCandidate`` to *candidates* in place.

    The reason and label are length-capped, and ``input_index`` is the
    candidate's position in the list at the time it is added.
    """
    entry = SuggestCandidate(
        path=path,
        score=score,
        reason=suggest_reason(reason),
        lines=lines,
        label=cap_label(label),
        input_index=len(candidates),
    )
    candidates.append(entry)
1139
-
1140
-
1141
def run_git_diff(root: Path, diff_ref: str) -> str:
    """Run ``git diff`` in *root* for ``diff_ref`` and return sanitized, length-capped output.

    ``staged``/``cached`` aliases select the index diff, ``worktree`` aliases
    select the plain working-tree diff, and anything else is passed to git as
    a single argument after rejecting values that start with ``-``.

    Raises:
        PackError: for an empty or option-like ref, or when git cannot be run,
            times out, or exits non-zero.
    """
    ref = diff_ref.strip()
    if not ref:
        raise PackError("empty --diff")

    command = ["git", "-C", str(root), "diff", "--no-ext-diff", "--no-textconv", "--unified=3"]
    staged_aliases = {"staged", "--staged", "cached", "--cached"}
    worktree_aliases = {"worktree", "unstaged", "working-tree"}
    if ref in staged_aliases:
        command.append("--cached")
    elif ref not in worktree_aliases:
        # Refuse option-looking refs so user input cannot inject git flags.
        if ref.startswith("-"):
            raise PackError("invalid --diff: revision must not start with '-'")
        command.append(ref)

    try:
        proc = subprocess.run(command, text=True, errors="replace", capture_output=True, timeout=10, check=False)
    except (OSError, UnicodeError, subprocess.TimeoutExpired) as exc:
        raise PackError(f"could not read diff: {exc.__class__.__name__}") from exc

    if proc.returncode != 0:
        failure_text = sanitize_text(proc.stderr or proc.stdout or "git diff failed")[0]
        detail = failure_text.strip().splitlines()
        message = detail[0] if detail else "git diff failed"
        raise PackError(f"could not read diff: {cap_label(message, default='git diff failed', limit=160)}")
    return sanitize_text(proc.stdout[:MAX_SUGGEST_INPUT_BYTES])[0]
1163
-
1164
-
1165
def collect_diff_candidates(root: Path, diff_ref: str, query_terms: set[str], context_lines: int) -> list[SuggestCandidate]:
    """Turn each hunk of the git diff for ``diff_ref`` into a suggestion candidate.

    The file path comes from the ``diff --git a/... b/...`` header. Each hunk
    header supplies the new-side start and count, and the candidate window is
    that span widened by ``context_lines`` on both sides (never below line 1).
    Candidates score 7000 plus the path score for ``query_terms``.
    """
    file_header_re = re.compile(r"^diff --git a/(.+?) b/(.+)$")
    hunk_re = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
    found: list[SuggestCandidate] = []
    active_path: str | None = None

    for row in run_git_diff(root, diff_ref).splitlines():
        if row.startswith("diff --git "):
            # Reset first so an unparseable header suppresses that file's hunks.
            active_path = None
            header = file_header_re.match(row)
            if header:
                left, right = header.groups()
                active_path = right if right != "/dev/null" else left
            continue
        if active_path is None:
            continue
        hunk = hunk_re.match(row)
        if not hunk:
            continue
        new_start = int(hunk.group(1))
        new_count = int(hunk.group(2) or "1")
        last_line = max(new_start, new_start + max(1, new_count) - 1)
        window_start = max(1, new_start - context_lines)
        window = LineRange(window_start, max(window_start, last_line + context_lines))
        add_suggest_candidate(
            found,
            path=active_path,
            score=7_000 + suggest_score_path(active_path, query_terms),
            reason="changed diff hunk",
            lines=window,
            label=f"diff:{safe_raw_path_label(active_path)}",
        )
    return found
1197
-
1198
-
1199
# Matches a relative file path ending in a known source/text extension,
# optionally followed by ":<line>". Groups: "path" and "line".
#
# The extension alternation is ordered, so without a trailing boundary a short
# alternative wins over a longer one sharing its prefix ("package.json:12"
# matched as "package.js" and lost its line number; ".tsx" -> ".ts",
# ".cpp" -> ".c", ".hpp" -> ".h"). The negative lookahead after the extension
# forces the whole extension to be consumed, and also rejects look-alikes
# such as "notes.md5".
OUTPUT_PATH_RE = re.compile(
    r"(?<![A-Za-z0-9_./-])"
    r"(?P<path>(?:\.\/)?(?:[A-Za-z0-9_.-]+/)*[A-Za-z0-9_.-]+\."
    r"(?:py|js|jsx|ts|tsx|mjs|cjs|md|json|yml|yaml|toml|sh|css|html|txt|rb|go|rs|java|kt|swift|c|cc|cpp|h|hpp)"
    r"(?![A-Za-z0-9_]))"
    r"(?::(?P<line>\d+))?"
)
1205
-
1206
-
1207
def read_text_input_under_root(root: Path, raw_path: str) -> tuple[str | None, dict[str, Any] | None]:
    """Read a text input file under *root*, capped in size and sanitized.

    Returns ``(text, None)`` on success or ``(None, omission)`` when the path
    is lexically invalid, redacted, cannot be opened as a regular file under
    the root, or cannot be read or decoded.

    The cap is applied to the UTF-8 encoding of the text, so the returned
    text never encodes to more than ``MAX_SUGGEST_INPUT_BYTES`` before
    sanitization. (Slicing the decoded string by that number would count
    characters and let multibyte input exceed the byte cap.)
    """
    rel, reason = lexical_rel(raw_path)
    display = safe_raw_path_label(raw_path)
    if rel is None:
        return None, {"path": display, "status": "omitted", "reason": reason}
    display, redacted = display_rel_path(rel.as_posix())
    if redacted:
        return None, {"path": display, "status": "omitted", "reason": "redacted_path", "retrieval_omitted_reason": "redacted_path"}
    handle, reason = open_regular_under_root(root, rel)
    if handle is None:
        return None, {"path": display, "status": "omitted", "reason": reason}
    try:
        with handle:
            # One extra unit so oversized input is detectable after the read.
            text = handle.read(MAX_SUGGEST_INPUT_BYTES + 1)
    except (OSError, UnicodeError):
        return None, {"path": display, "status": "omitted", "reason": "unsafe_path"}
    encoded = text.encode("utf-8", errors="replace")
    if len(encoded) > MAX_SUGGEST_INPUT_BYTES:
        # Cut on the byte boundary; "ignore" drops a multibyte character split by the cut.
        text = encoded[:MAX_SUGGEST_INPUT_BYTES].decode("utf-8", errors="ignore")
    sanitized, _redacted = sanitize_text(text)
    return sanitized, None
1227
-
1228
-
1229
def collect_output_candidates(
    root: Path,
    raw_paths: list[str] | None,
    query_terms: set[str],
    context_lines: int,
    *,
    origin: str,
) -> tuple[list[SuggestCandidate], list[dict[str, Any]]]:
    """Scan text inputs for file references and turn them into candidates.

    Every ``path`` or ``path:line`` match of ``OUTPUT_PATH_RE`` in each
    readable input becomes one candidate per distinct path. Line references
    to the same path are merged into a single window; a path seen only
    without a line number has no line range. Inputs that cannot be read are
    returned as omissions tagged with ``origin``.
    """
    found: list[SuggestCandidate] = []
    skipped: list[dict[str, Any]] = []

    for raw in raw_paths or []:
        text, problem = read_text_input_under_root(root, raw)
        if problem is not None:
            problem["origin"] = origin
            skipped.append(problem)
            continue
        assert text is not None

        windows: dict[str, LineRange | None] = {}
        for hit in OUTPUT_PATH_RE.finditer(text):
            referenced = hit.group("path")
            if referenced.startswith("./"):
                referenced = referenced[2:]
            line_text = hit.group("line")
            if not line_text:
                windows.setdefault(referenced, None)
                continue
            try:
                line_number = int(line_text)
            except ValueError:
                line_number = 1
            windows[referenced] = merge_line_window(windows.get(referenced), line_number, context_lines)

        for referenced, window in sorted(windows.items()):
            add_suggest_candidate(
                found,
                path=referenced,
                score=5_000 + suggest_score_path(referenced, query_terms),
                reason=f"{origin} referenced path",
                lines=window,
                label=f"{origin}:{safe_raw_path_label(referenced)}",
            )
    return found, skipped
1271
-
1272
-
1273
def git_ls_files(root: Path) -> list[str]:
    """List up to ``MAX_QUERY_SCAN_FILES`` repo-relative file paths under *root*.

    Uses ``git ls-files -z`` when it succeeds. Otherwise falls back to
    walking the tree with ``os.walk``, skipping common tool/build directories
    and any directory whose name starts with ``.pytest``.

    The git output is capped at ``MAX_QUERY_SCAN_FILES * 512`` bytes. If the
    cap cuts through a path, that trailing fragment is discarded rather than
    returned as if it were a real file name.
    """
    try:
        proc = subprocess.run(
            ["git", "-C", str(root), "ls-files", "-z"],
            text=False,
            capture_output=True,
            timeout=10,
            check=False,
        )
    except (OSError, subprocess.TimeoutExpired):
        proc = None
    if proc is not None and proc.returncode == 0:
        cap = MAX_QUERY_SCAN_FILES * 512
        raw = proc.stdout
        if len(raw) > cap:
            raw = raw[:cap]
            # Keep only complete NUL-terminated entries.
            last_nul = raw.rfind(b"\0")
            raw = raw[: last_nul + 1] if last_nul >= 0 else b""
        return [part.decode("utf-8", "replace") for part in raw.split(b"\0") if part][:MAX_QUERY_SCAN_FILES]

    out: list[str] = []
    skip_dirs = {".git", ".omx", ".context-guard", "node_modules", "dist", "build", "__pycache__"}
    for current, dirs, files in os.walk(root):
        # Prune in place so os.walk does not descend into skipped directories.
        dirs[:] = [name for name in dirs if name not in skip_dirs and not name.startswith(".pytest")]
        current_path = Path(current)
        for name in files:
            out.append((current_path / name).relative_to(root).as_posix())
            if len(out) >= MAX_QUERY_SCAN_FILES:
                return out
    return out
1298
-
1299
-
1300
def collect_query_candidates(root: Path, query_terms: set[str], context_lines: int) -> list[SuggestCandidate]:
    """Suggest files whose path or content contains any of *query_terms*.

    Each listed file earns a path score plus 250 points per (line, term) hit.
    Scanning a file stops at the per-file byte cap, or after
    ``SUGGEST_WHOLE_FILE_MAX_LINES`` lines if nothing has matched yet. A file
    with content hits gets a window around its first matching line; a file
    matched only by path has no line range.
    """
    if not query_terms:
        return []

    found: list[SuggestCandidate] = []
    for listed in git_ls_files(root):
        rel, rel_problem = lexical_rel(listed)
        if rel is None or rel_problem:
            continue
        display, redacted = display_rel_path(rel.as_posix())
        if redacted:
            continue
        path_score = suggest_score_path(display, query_terms)
        handle, _open_reason = open_regular_under_root(root, rel)
        if handle is None:
            continue

        first_hit: int | None = None
        content_score = 0
        try:
            with handle:
                consumed = 0
                for number, text in enumerate(handle, start=1):
                    consumed += byte_len(text)
                    if consumed > MAX_QUERY_SCAN_BYTES_PER_FILE:
                        break
                    nothing_matched = content_score == 0 and path_score == 0
                    if number > SUGGEST_WHOLE_FILE_MAX_LINES and nothing_matched:
                        break
                    folded = text.lower()
                    hits = sum(1 for term in query_terms if term in folded)
                    if not hits:
                        continue
                    content_score += 250 * hits
                    if first_hit is None:
                        first_hit = number
        except (OSError, UnicodeError):
            # Unreadable or undecodable files are skipped silently.
            continue

        if path_score == 0 and content_score == 0:
            continue
        if first_hit is None:
            window = None
            why = "query matched file path"
        else:
            window = line_window(first_hit, None, context_lines)
            why = "query matched file content"
        add_suggest_candidate(
            found,
            path=display,
            score=3_000 + path_score + content_score,
            reason=why,
            lines=window,
            label=f"query:{display}",
        )
    return found
1352
-
1353
-
1354
def source_selected_range(source: ResolvedSource) -> LineRange:
    """Return the line range covered by ``source.selected_lines``.

    The range starts at the requested start line (or 1) and always spans at
    least one line, even when the selection is empty.
    """
    first_line = source.requested_lines.start if source.requested_lines else 1
    line_count = max(len(source.selected_lines), 1)
    return LineRange(first_line, first_line + line_count - 1)
1357
-
1358
-
1359
def resolved_block_bytes(source: ResolvedSource, *, root_arg: str) -> int:
    """Return the byte size of *source* rendered as a fully included block."""
    rendered = render_block(
        source,
        source.selected_lines,
        root_arg=root_arg,
        status="included",
        included=source_selected_range(source),
    )
    return byte_len(rendered)
1362
-
1363
-
1364
def manifest_source_for_candidate(source: ResolvedSource, *, priority: int, label: str | None) -> dict[str, Any]:
    """Build the manifest entry for *source*.

    ``label`` is included only when truthy, and ``lines`` only when the
    source was resolved with an explicit line request.
    """
    entry: dict[str, Any] = {"path": source.display_path, "priority": priority}
    if label:
        entry["label"] = label
    if source.requested_lines is not None:
        entry["lines"] = source_selected_range(source).as_dict()
    return entry
1371
-
1372
-
1373
def suggested_source_payload(source: ResolvedSource, candidate: SuggestCandidate, *, root_arg: str) -> dict[str, Any]:
    """Describe one suggested source for output.

    The payload carries the path, the score (also reported as ``priority``),
    the reason, the selected line range, and the content size in bytes. It
    also carries the candidate label when set, and either a retrieval command
    or the reason one was omitted.
    """
    selected = source_selected_range(source)
    payload: dict[str, Any] = {
        "path": source.display_path,
        "priority": candidate.score,
        "score": candidate.score,
        "reason": candidate.reason,
        "lines": selected.as_dict(),
        "bytes": byte_len("".join(source.selected_lines)),
    }
    if candidate.label:
        payload["label"] = candidate.label

    command, why_omitted = retrieval_for(root_arg, source.display_path, selected, redacted_path=source.redacted_path)
    if command:
        payload["retrieval_cli"] = command
    elif why_omitted:
        payload["retrieval_omitted_reason"] = why_omitted
    return payload
1391
-
1392
-
1393
def normalize_suggest_source(root: Path, candidate: SuggestCandidate) -> tuple[ResolvedSource | None, dict[str, Any] | None]:
    """Resolve *candidate* into a source, or return the omission explaining why not.

    Returns ``(source, None)`` on success or ``(None, omission)`` otherwise.
    Omissions are annotated with the candidate's ``suggest_reason``. A
    whole-file candidate longer than ``SUGGEST_WHOLE_FILE_MAX_LINES`` is
    re-resolved as just its first ``SUGGEST_WHOLE_FILE_MAX_LINES`` lines.
    """

    def spec_with(lines: LineRange | None) -> SourceSpec:
        return SourceSpec(
            path=candidate.path,
            priority=candidate.score,
            lines=lines,
            label=candidate.label,
            input_index=candidate.input_index,
            origin="suggest",
        )

    spec = spec_with(candidate.lines)
    source, problem = resolve_source(root, spec)
    if problem is not None:
        problem["reason"] = problem.get("reason") or candidate.reason
        problem["suggest_reason"] = candidate.reason
        return None, problem
    assert source is not None

    if source.redacted_path:
        return None, omission(spec, "redacted_path", path=source.display_path, redacted_path=True)

    if spec.lines is None and source.total_lines > SUGGEST_WHOLE_FILE_MAX_LINES:
        head_only = spec_with(LineRange(1, min(SUGGEST_WHOLE_FILE_MAX_LINES, source.total_lines)))
        source, problem = resolve_source(root, head_only)
        if problem is not None:
            problem["suggest_reason"] = candidate.reason
            return None, problem
        assert source is not None
    return source, None
1425
-
1426
-
1427
def write_manifest_under_root(root: Path, raw_path: str, manifest: dict[str, Any]) -> str:
    """Write *manifest* as indented, key-sorted JSON via ``write_text_under_root``.

    Returns that call's result. Errors are reported against the
    ``--manifest-out`` option name.
    """
    serialized = json.dumps(manifest, ensure_ascii=False, indent=2, sort_keys=True)
    return write_text_under_root(root, raw_path, serialized + "\n", "--manifest-out")
1430
-
1431
-
1432
def validate_output_path_under_root(root: Path, raw_path: str, option_name: str) -> str:
    """Preflight an output path under *root* without writing any content.

    Parent directories are walked from the root with no-follow opens. An
    existing target must open write-only as a regular file. For a missing
    target, a throwaway temp file is created and removed in the parent
    directory to check that a file can be created there.

    Returns the display form of the path.

    Raises:
        PackError: if the path is lexically invalid or redacted, a parent is
            missing or cannot be opened, or the target is not usable.
    """
    rel, reason = lexical_rel(raw_path)
    if rel is None:
        raise PackError(f"invalid {option_name}: {reason}")
    display, redacted = display_rel_path(rel.as_posix())
    if redacted:
        raise PackError(f"invalid {option_name}: redacted_path")

    parent_parts = rel.parts[:-1]
    filename = rel.parts[-1]

    # Flags added to every open below when the platform defines them.
    optional_flags = 0
    for flag_name in ("O_NOFOLLOW", "O_CLOEXEC", "O_NONBLOCK"):
        optional_flags |= getattr(os, flag_name, 0)

    walk_fd: int | None = None
    target_fd = -1
    try:
        walk_fd = open_dir_no_follow(root)
        for part in parent_parts:
            child_fd = open_dir_no_follow(part, dir_fd=walk_fd)
            os.close(walk_fd)
            walk_fd = child_fd

        try:
            target_fd = os.open(filename, os.O_WRONLY | optional_flags, dir_fd=walk_fd)
            if not stat.S_ISREG(os.fstat(target_fd).st_mode):
                raise PackError(f"invalid {option_name}: unsafe_path")
        except FileNotFoundError:
            # Target does not exist yet: check that the directory accepts new files.
            probe_fd = -1
            probe_name = f".context-guard-pack-preflight-{os.getpid()}-{hashlib.sha256(raw_path.encode('utf-8', 'replace')).hexdigest()[:10]}"
            try:
                create_flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL | optional_flags
                probe_fd = os.open(probe_name, create_flags, 0o600, dir_fd=walk_fd)
            except OSError as exc:
                raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
            finally:
                if probe_fd >= 0:
                    try:
                        os.close(probe_fd)
                    except OSError:
                        pass
                    try:
                        os.unlink(probe_name, dir_fd=walk_fd)
                    except OSError:
                        pass
        except IsADirectoryError as exc:
            raise PackError(f"invalid {option_name}: unsafe_path") from exc
        except OSError as exc:
            raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    except PackError:
        raise
    except FileNotFoundError as exc:
        raise PackError(f"invalid {option_name}: missing") from exc
    except OSError as exc:
        raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    finally:
        if target_fd >= 0:
            try:
                os.close(target_fd)
            except OSError:
                pass
        if walk_fd is not None:
            try:
                os.close(walk_fd)
            except OSError:
                pass
    return display
1507
-
1508
-
1509
def output_rel_for_collision_check(raw_path: str, option_name: str) -> Path:
    """Return the lexically normalized relative path for *raw_path*.

    Raises:
        PackError: if the path is lexically invalid or its display form is redacted.
    """
    rel, reason = lexical_rel(raw_path)
    if rel is None:
        raise PackError(f"invalid {option_name}: {reason}")
    is_redacted = display_rel_path(rel.as_posix())[1]
    if is_redacted:
        raise PackError(f"invalid {option_name}: redacted_path")
    return rel
1517
-
1518
-
1519
def existing_output_identity_under_root(root: Path, rel: Path) -> tuple[int, int] | None:
    """Return ``(st_dev, st_ino)`` for an existing regular file at *rel* under *root*.

    Parent directories are walked with no-follow opens and the final
    component is stat'ed without following symlinks. Returns ``None`` when
    the file is missing, is not a regular file, or cannot be inspected.
    """
    walk_fd: int | None = None
    try:
        walk_fd = open_dir_no_follow(root)
        for part in rel.parts[:-1]:
            child_fd = open_dir_no_follow(part, dir_fd=walk_fd)
            os.close(walk_fd)
            walk_fd = child_fd
        info = os.stat(rel.parts[-1], dir_fd=walk_fd, follow_symlinks=False)
        if stat.S_ISREG(info.st_mode):
            return int(info.st_dev), int(info.st_ino)
        return None
    except (OSError, NotImplementedError):
        # FileNotFoundError is an OSError, so a missing file also lands here.
        return None
    finally:
        if walk_fd is not None:
            try:
                os.close(walk_fd)
            except OSError:
                pass
1539
-
1540
-
1541
def reject_matching_output_targets(
    root: Path,
    *,
    first_rel: Path,
    second_rel: Path,
    second_option: str,
    reason: str,
) -> None:
    """Raise ``PackError`` when two output options resolve to the same file.

    Targets collide when their relative paths are equal (exactly or after
    case-folding), or when both already exist and share a device/inode pair.
    """
    first_identity = existing_output_identity_under_root(root, first_rel)
    second_identity = existing_output_identity_under_root(root, second_rel)

    if first_rel == second_rel:
        collides = True
    elif first_rel.as_posix().casefold() == second_rel.as_posix().casefold():
        collides = True
    else:
        collides = first_identity is not None and first_identity == second_identity

    if collides:
        raise PackError(f"invalid {second_option}: {reason}")
1555
-
1556
-
1557
def write_text_under_root(root: Path, raw_path: str, content: str, option_name: str) -> str:
    """Write ``content`` to ``raw_path`` below ``root`` and return its display path.

    The path is validated with ``lexical_rel`` and ``display_rel_path``; parent
    directories are opened one at a time via ``open_dir_no_follow`` and the
    file is written with ``write_text_atomic_at`` using mode ``0o600``.

    Raises ``PackError`` for rejected or redacted paths and for any
    ``OSError`` encountered during the walk or the write.
    """
    relative, failure = lexical_rel(raw_path)
    if relative is None:
        raise PackError(f"invalid {option_name}: {failure}")
    shown, is_redacted = display_rel_path(relative.as_posix())
    if is_redacted:
        raise PackError(f"invalid {option_name}: redacted_path")
    directories = relative.parts[:-1]
    leaf = relative.parts[-1]
    dir_fd: int | None = None
    try:
        dir_fd = open_dir_no_follow(root)
        for component in directories:
            child_fd = open_dir_no_follow(component, dir_fd=dir_fd)
            os.close(dir_fd)
            dir_fd = child_fd
        write_text_atomic_at(dir_fd, leaf, content, mode=0o600, option_name=option_name)
    except PackError:
        # Already in the caller-facing form; propagate unchanged.
        raise
    except FileNotFoundError as exc:
        raise PackError(f"invalid {option_name}: missing") from exc
    except OSError as exc:
        raise PackError(f"invalid {option_name}: {exc.strerror or exc.__class__.__name__}") from exc
    finally:
        if dir_fd is not None:
            try:
                os.close(dir_fd)
            except OSError:
                pass
    return shown
1587
-
1588
-
1589
def manifest_to_source_specs(manifest: dict[str, Any]) -> list[SourceSpec]:
    """Convert a manifest dict into a list of ``SourceSpec`` objects.

    Raises ``PackError`` when the manifest version differs from ``VERSION``,
    when ``sources`` is not a list, or when an entry is not a dict or lacks a
    ``path``. An entry whose ``lines`` value fails ``parse_line_range`` gets
    the placeholder ``LineRange(-1, -1)`` instead of aborting.
    """
    declared_version = manifest.get("version", VERSION)
    if declared_version != VERSION:
        raise PackError(f"unsupported manifest version: {declared_version}")
    entries = manifest.get("sources")
    if not isinstance(entries, list):
        raise PackError("manifest sources must be a list")
    collected: list[SourceSpec] = []
    for position, entry in enumerate(entries):
        if not isinstance(entry, dict):
            raise PackError("manifest sources must be objects")
        if "path" not in entry:
            raise PackError("manifest source missing path")
        try:
            requested = parse_line_range(entry.get("lines"))
        except PackError:
            requested = LineRange(-1, -1)
        spec = SourceSpec(
            path=str(entry.get("path", "")),
            priority=bounded_int(entry.get("priority"), 0, -1_000_000, 1_000_000),
            lines=requested,
            label=cap_label(entry.get("label")),
            input_index=position,
            origin="auto",
        )
        collected.append(spec)
    return collected
1615
-
1616
-
1617
def build_suggest_manifest(sources: list[dict[str, Any]]) -> dict[str, Any]:
    """Build a manifest dict from suggested sources.

    Each entry keeps ``path`` and ``priority`` and, when present on the input
    item, ``label`` and ``lines``.
    """
    entries: list[dict[str, Any]] = []
    for item in sources:
        entry: dict[str, Any] = {"path": item["path"], "priority": item["priority"]}
        for optional_key in ("label", "lines"):
            if optional_key in item:
                entry[optional_key] = item[optional_key]
        entries.append(entry)
    return {"version": VERSION, "sources": entries}
1627
-
1628
-
1629
def suggest_build_hint(root_arg: str, manifest_path: str | None, budget: int) -> tuple[str | None, str | None]:
    """Return ``(command, omitted_reason)`` for a follow-up build invocation.

    When ``safe_root_arg_for_retrieval`` rejects ``root_arg`` the result is
    ``(None, "unsafe_root_path")``. Otherwise the shell-quoted command is
    returned, prefixed with ``cd <root> &&`` unless the safe root is ``"."``
    or empty.
    """
    safe_root = safe_root_arg_for_retrieval(root_arg)
    if safe_root is None:
        return None, "unsafe_root_path"
    argv = [
        "context-guard-pack",
        "build",
        "--root",
        ".",
        "--manifest",
        manifest_path or "<manifest.json>",
        "--budget-bytes",
        str(budget),
        "--json",
    ]
    command = " ".join(map(shlex.quote, argv))
    if safe_root not in {".", ""}:
        return f"cd {shlex.quote(safe_root)} && {command}", None
    return command, None
1639
-
1640
-
1641
def suggest_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tuple[dict[str, Any], int]:
    """Build the ``suggest`` payload: ranked source candidates within a byte budget.

    Candidates are gathered from explicit files, a diff, command/test output
    and query matches, sorted by descending score, de-duplicated, and
    selected until either the byte budget or the ``top`` count is reached.

    Returns ``(payload, 0)``. Raises ``PackError`` when none of the query,
    files, diff, output or test-output inputs is provided.
    """
    # Sanitize the query and collapse all whitespace runs to single spaces.
    query_text, _query_redactions = sanitize_text(args.query or "")
    query = " ".join(query_text.split())
    query_terms = suggest_tokens(query)
    # Clamp user-supplied numeric options into their allowed ranges.
    context_lines = bounded_int(args.context_lines, DEFAULT_SUGGEST_CONTEXT_LINES, 0, MAX_SUGGEST_CONTEXT_LINES)
    top = bounded_int(args.top, DEFAULT_SUGGEST_TOP, 1, MAX_SUGGEST_TOP)
    budget = bounded_int(args.budget_bytes, DEFAULT_BUDGET_BYTES, MIN_BUDGET_BYTES, MAX_BUDGET_BYTES)
    candidates: list[SuggestCandidate] = []
    omitted: list[dict[str, Any]] = []
    file_inputs = split_suggest_files(args.files)
    has_signal = bool(query or file_inputs or args.diff or args.output or args.test_output)
    if not has_signal:
        raise PackError("provide --query, --files, --diff, --output, or --test-output")

    # Explicitly requested files start from a 9_000 base score plus the path score.
    for raw_path in file_inputs:
        add_suggest_candidate(
            candidates,
            path=raw_path,
            score=9_000 + suggest_score_path(raw_path, query_terms),
            reason="explicit file request",
            label=f"file:{safe_raw_path_label(raw_path)}",
        )
    if args.diff:
        candidates.extend(collect_diff_candidates(root, args.diff, query_terms, context_lines))
    output_candidates, output_omitted = collect_output_candidates(root, args.output, query_terms, context_lines, origin="output")
    test_candidates, test_omitted = collect_output_candidates(root, args.test_output, query_terms, context_lines, origin="test-output")
    candidates.extend(output_candidates)
    candidates.extend(test_candidates)
    omitted.extend(output_omitted)
    omitted.extend(test_omitted)
    candidates.extend(collect_query_candidates(root, query_terms, context_lines))

    # Highest score first; ties broken by input order, path, then line range.
    candidates.sort(key=lambda item: (-item.score, item.input_index, item.path, item.lines.identity() if item.lines else "0:0"))
    # `seen` de-duplicates on the lexical path + requested range;
    # `final_seen` de-duplicates again on the resolved display path + range.
    seen: set[tuple[str, str]] = set()
    final_seen: set[tuple[str, str]] = set()
    selected: list[dict[str, Any]] = []
    manifest_seed: list[dict[str, Any]] = []
    # The running byte total starts at the size of this fixed header text.
    current_bytes = byte_len("# Context Pack\n\nGenerated by context-guard-pack. Token counts are estimated proxies; byte counts are observed.\n\n")
    for candidate in candidates:
        rel, reason = lexical_rel(candidate.path)
        identity_path = rel.as_posix() if rel is not None else safe_raw_path_label(candidate.path)
        identity_lines = candidate.lines.identity() if candidate.lines else "all"
        identity = (identity_path, identity_lines)
        if rel is not None and identity in seen:
            display, redacted = display_rel_path(rel.as_posix())
            duplicate_item = {
                "path": display,
                "status": "omitted",
                "reason": "duplicate_source",
                "suggest_reason": candidate.reason,
                "priority": candidate.score,
                "retrieval_omitted_reason": "redacted_path" if redacted else None,
            }
            # Drop None-valued keys before recording the omission.
            omitted.append({key: value for key, value in duplicate_item.items() if value is not None})
            continue
        if rel is not None:
            seen.add(identity)
        source, omitted_item = normalize_suggest_source(root, candidate)
        if omitted_item is not None:
            omitted_item["priority"] = candidate.score
            omitted_item["suggest_reason"] = candidate.reason
            omitted.append({key: value for key, value in omitted_item.items() if value is not None})
            continue
        assert source is not None
        final_identity = (source.display_path, source_selected_range(source).identity() if source.requested_lines is not None else "all")
        if final_identity in final_seen:
            omitted.append({
                "path": source.display_path,
                "status": "omitted",
                "reason": "duplicate_source",
                "suggest_reason": candidate.reason,
                "priority": candidate.score,
            })
            continue
        final_seen.add(final_identity)
        source_bytes = resolved_block_bytes(source, root_arg=root_arg)
        remaining = budget - current_bytes
        if source_bytes > remaining:
            # Only the first selected source may be trimmed to fit; once
            # something is selected, oversize sources are omitted outright.
            if not selected and remaining > 0:
                partial_lines, _partial_block, partial_range = fit_partial_lines(source, remaining, root_arg=root_arg)
                if partial_range is not None and partial_lines:
                    partial_spec = SourceSpec(
                        path=candidate.path,
                        priority=candidate.score,
                        lines=partial_range,
                        label=candidate.label,
                        input_index=candidate.input_index,
                        origin="suggest",
                    )
                    # Re-resolve with the reduced range so the payload reflects it.
                    source, omitted_item = resolve_source(root, partial_spec)
                    if omitted_item is not None:
                        omitted_item["priority"] = candidate.score
                        omitted_item["suggest_reason"] = candidate.reason
                        # NOTE(review): unlike the earlier omission paths, None-valued
                        # keys are not filtered out here — confirm this is intended.
                        omitted.append(omitted_item)
                        continue
                    assert source is not None
                    source_bytes = resolved_block_bytes(source, root_arg=root_arg)
                else:
                    omitted.append({"path": source.display_path, "status": "omitted", "reason": "budget_exhausted", "priority": candidate.score})
                    continue
            else:
                omitted.append({"path": source.display_path, "status": "omitted", "reason": "budget_exhausted", "priority": candidate.score})
                continue
        payload = suggested_source_payload(source, candidate, root_arg=root_arg)
        selected.append(payload)
        manifest_seed.append(manifest_source_for_candidate(source, priority=candidate.score, label=candidate.label))
        current_bytes += source_bytes
        # Stop once the requested number of sources has been selected.
        if len(selected) >= top:
            break

    manifest = build_suggest_manifest(manifest_seed)
    # With nothing selected, report 0 rather than the header-only size.
    estimated_pack_bytes = current_bytes if selected else 0
    manifest_path: str | None = None
    if args.manifest_out:
        manifest_path = write_manifest_under_root(root, args.manifest_out, manifest)
    build_hint, build_hint_omitted_reason = suggest_build_hint(root_arg, manifest_path, budget)
    # NOTE: `payload` is rebound here from the per-source payload to the final result dict.
    payload: dict[str, Any] = {
        "tool": TOOL_NAME,
        "schema_version": SUGGEST_SCHEMA_VERSION,
        "version": VERSION,
        "mode": "suggest",
        "root": display_root(root),
        "query": query,
        "budget_bytes": budget,
        "estimated_pack_bytes": estimated_pack_bytes,
        "token_proxy": {
            "measurement": "estimated",
            "method": f"chars_div_{TOKEN_PROXY_CHARS_PER_TOKEN}",
            "estimated_pack": estimated_pack_bytes // TOKEN_PROXY_CHARS_PER_TOKEN,
        },
        "sources": selected,
        # Omissions are sorted by path, reason, then priority.
        "omitted_sources": sorted(omitted, key=lambda item: (str(item.get("path", "")), str(item.get("reason", "")), int(item.get("priority", 0) or 0))),
        "manifest": manifest,
        "manifest_path": manifest_path,
        "build_hint": build_hint,
        "caveats": [
            "Deterministic local heuristics only; no model, network, embedding, or provider-cost estimate is used.",
            "Byte and token values are pack-size proxies, not billing claims.",
        ],
    }
    if build_hint_omitted_reason:
        payload["build_hint_omitted_reason"] = build_hint_omitted_reason
    return payload, 0
1784
-
1785
-
1786
def line_range_identity(value: object) -> str:
    """Return a comparable string identity for a line-range value.

    A dict becomes ``"<start>:<end>"`` (missing keys render as ``None``),
    ``None`` becomes ``"all"``, and anything else is passed through ``str``.
    """
    if isinstance(value, dict):
        start, end = value.get("start"), value.get("end")
        return f"{start}:{end}"
    return "all" if value is None else str(value)
1792
-
1793
-
1794
def copy_explain_fields(item: dict[str, Any], fields: tuple[str, ...]) -> dict[str, Any]:
    """Return deep copies of the requested fields that are present and not ``None``."""
    return {
        name: copy.deepcopy(item[name])
        for name in fields
        if name in item and item[name] is not None
    }
1800
-
1801
-
1802
def build_source_matches_exact(suggest_item: dict[str, Any], build_item: dict[str, Any]) -> bool:
    """Report whether a build source corresponds exactly to a suggested source.

    ``path`` and ``priority`` must be equal. The suggested line identity must
    then equal the build item's requested or included line identity, or be
    ``"all"``.
    """
    for key in ("path", "priority"):
        if build_item.get(key) != suggest_item.get(key):
            return False
    wanted = line_range_identity(suggest_item.get("lines"))
    acceptable = {
        line_range_identity(build_item.get("requested_lines")),
        line_range_identity(build_item.get("included_lines")),
        "all",
    }
    return wanted in acceptable
1811
-
1812
-
1813
def find_exact_build_source_for_explain(
    suggest_item: dict[str, Any],
    build_sources: list[dict[str, Any]],
    used_indexes: set[int],
) -> dict[str, Any] | None:
    """Return the first unused build source that exactly matches ``suggest_item``.

    The index of the returned source is added to ``used_indexes``; ``None`` is
    returned when nothing matches.
    """
    for position, candidate in enumerate(build_sources):
        if position not in used_indexes and build_source_matches_exact(suggest_item, candidate):
            used_indexes.add(position)
            return candidate
    return None
1825
-
1826
-
1827
def find_fallback_build_source_for_explain(
    suggest_item: dict[str, Any],
    build_sources: list[dict[str, Any]],
    used_indexes: set[int],
) -> dict[str, Any] | None:
    """Return the first unused build source sharing ``suggest_item``'s path.

    The index of the returned source is added to ``used_indexes``; ``None`` is
    returned when no unused source has the same ``path``.
    """
    wanted_path = suggest_item.get("path")
    hit = next(
        (
            (position, candidate)
            for position, candidate in enumerate(build_sources)
            if position not in used_indexes and candidate.get("path") == wanted_path
        ),
        None,
    )
    if hit is None:
        return None
    position, candidate = hit
    used_indexes.add(position)
    return candidate
1839
-
1840
-
1841
def explain_omission_key(item: dict[str, Any]) -> tuple[str, str, str, str, str]:
    """Return a tuple key identifying an omission entry.

    The key is built from phase, path, reason, suggest reason and the
    JSON-serialized ``requested_lines`` (falling back to ``lines``, then
    ``""``, when that key is absent).
    """
    if "requested_lines" in item:
        span = item["requested_lines"]
    else:
        span = item.get("lines", "")
    phase, path, reason, suggest_reason = (
        str(item.get(name, "")) for name in ("phase", "path", "reason", "suggest_reason")
    )
    return (phase, path, reason, suggest_reason, json.dumps(span, ensure_ascii=False, sort_keys=True))
1849
-
1850
-
1851
def sanitize_explain_text(value: str, *, limit: int = MAX_LABEL_CHARS) -> str:
    """Sanitize ``value`` with ``sanitize_text`` and cap it with ``cap_label``.

    Always returns a string; a falsy result from ``cap_label`` becomes ``""``.
    """
    cleaned, _was_redacted = sanitize_text(str(value))
    capped = cap_label(cleaned, default="", limit=limit)
    return capped or ""
1854
-
1855
-
1856
def is_repo_map_text_path(path: str) -> bool:
    """Report whether ``path`` is treated as a text file for the repo map.

    A path qualifies when its lower-cased file name is one of a few well-known
    extensionless names, or its lower-cased suffix is in
    ``REPO_MAP_TEXT_EXTENSIONS``.
    """
    candidate = Path(path)
    well_known_names = {"readme", "license", "dockerfile", "makefile"}
    if candidate.name.lower() in well_known_names:
        return True
    return candidate.suffix.lower() in REPO_MAP_TEXT_EXTENSIONS
1861
-
1862
-
1863
def read_repo_map_text(root: Path, rel_path: str) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
    """Read one repo-map file and return ``(record, omission)``.

    Exactly one element of the result is ``None``. The record carries the
    sanitized text (truncated to ``MAX_REPO_MAP_BYTES_PER_FILE`` bytes), size
    and line counts, and secret-risk counts computed on the unsanitized text.
    """
    relative, failure = lexical_rel(rel_path)
    if relative is None:
        return None, {"path": repo_map_safe_raw_path_label(rel_path), "reason": failure}

    shown, path_is_redacted = repo_map_display_rel_path(relative.as_posix())
    if not is_repo_map_text_path(shown):
        return None, {"path": shown, "reason": "unsupported_file_type"}

    stream, open_failure = open_regular_under_root(root, relative)
    if stream is None:
        return None, {
            "path": shown,
            "reason": open_failure,
            "retrieval_omitted_reason": "redacted_path" if path_is_redacted else None,
        }
    try:
        with stream:
            # Read one unit past the cap so that over-long files can be detected.
            raw_text = stream.read(MAX_REPO_MAP_BYTES_PER_FILE + 1)
    except (OSError, UnicodeError):
        return None, {
            "path": shown,
            "reason": "unsafe_path",
            "retrieval_omitted_reason": "redacted_path" if path_is_redacted else None,
        }

    over_cap = byte_len(raw_text) > MAX_REPO_MAP_BYTES_PER_FILE
    if over_cap:
        encoded = raw_text.encode("utf-8", errors="replace")
        # Truncate on bytes; a multi-byte character cut at the boundary is dropped.
        raw_text = encoded[:MAX_REPO_MAP_BYTES_PER_FILE].decode("utf-8", errors="ignore")
    risks = secret_risk_counts(raw_text)
    clean_text, redacted_line_info = sanitize_text(raw_text)
    record: dict[str, Any] = {
        "path": shown,
        "raw_path": relative.as_posix(),
        "redacted_path": path_is_redacted,
        "text": clean_text,
        "bytes": byte_len(clean_text),
        "bytes_capped": over_cap,
        "line_count": len(clean_text.splitlines()) or (1 if clean_text else 0),
        "redacted_lines": redacted_line_info,
        "secret_risk_counts": risks,
    }
    return record, None
1894
-
1895
-
1896
def repo_map_records(root: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]:
    """Read repo-map records for the files listed by ``git_ls_files``.

    Returns ``(records, omitted, caps)``. At most ``MAX_REPO_MAP_FILES`` paths
    are read. Omissions with reason ``unsupported_file_type`` are not reported.
    """
    listed = git_ls_files(root)
    too_many_paths = len(listed) > MAX_REPO_MAP_FILES
    records: list[dict[str, Any]] = []
    omitted: list[dict[str, Any]] = []
    for rel_path in listed[:MAX_REPO_MAP_FILES]:
        record, omission = read_repo_map_text(root, rel_path)
        if record is not None:
            records.append(record)
            continue
        if omission is None or omission.get("reason") == "unsupported_file_type":
            continue
        omitted.append({key: value for key, value in omission.items() if value is not None})
    capped_file_count = sum(1 for entry in records if entry.get("bytes_capped"))
    caps = {
        "max_files": MAX_REPO_MAP_FILES,
        "files_capped": too_many_paths,
        "max_bytes_per_file": MAX_REPO_MAP_BYTES_PER_FILE,
        "bytes_per_file_capped_count": capped_file_count,
        "max_tree_entries": MAX_REPO_MAP_TREE_ENTRIES,
        "max_signature_entries": MAX_REPO_MAP_SIGNATURE_ENTRIES,
        "max_graph_rank_entries": MAX_REPO_MAP_GRAPH_RANK_ENTRIES,
        "max_retrieval_hints": MAX_REPO_MAP_RETRIEVAL_HINTS,
        "max_secret_risk_files": MAX_REPO_MAP_SECRET_RISK_FILES,
    }
    return records, omitted, caps
1919
-
1920
-
1921
def secret_risk_counts(text: str) -> dict[str, int]:
    """Count matches of each ``SECRET_RISK_PATTERNS`` entry in ``text``.

    Only patterns with at least one match appear in the result.
    """
    return {
        label: hits
        for label, matcher in SECRET_RISK_PATTERNS
        if (hits := len(matcher.findall(text)))
    }
1928
-
1929
-
1930
def build_secret_scan(records: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-file secret-risk counts into a scan summary.

    Files are ordered by descending total count, then by path, and the list
    is truncated to ``MAX_REPO_MAP_SECRET_RISK_FILES`` entries.
    """
    totals: dict[str, int] = {}
    flagged: list[dict[str, Any]] = []
    for record in records:
        raw_counts = record.get("secret_risk_counts")
        counts = dict(raw_counts) if isinstance(raw_counts, dict) else {}
        if not counts:
            continue
        for label, amount in counts.items():
            totals[label] = totals.get(label, 0) + amount
        flagged.append({
            "path": record["path"],
            "counts": counts,
            "redacted_path": bool(record.get("redacted_path")),
        })
    flagged.sort(key=lambda entry: (-sum(entry["counts"].values()), entry["path"]))
    overflow = len(flagged) - MAX_REPO_MAP_SECRET_RISK_FILES
    return {
        "risk_counts": dict(sorted(totals.items())),
        "files_with_risks": flagged[:MAX_REPO_MAP_SECRET_RISK_FILES],
        "files_omitted_by_cap": max(0, overflow),
        "caveat": "Counts are local best-effort secret-pattern risk signals; raw matched values are never emitted.",
    }
1951
-
1952
-
1953
def build_token_tree(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build file and directory size entries for the repo map.

    Each file contributes one ``file`` entry and adds its bytes to every
    ancestor directory prefix. Entries are sorted by descending bytes, then
    path, and truncated to ``MAX_REPO_MAP_TREE_ENTRIES``.
    """
    per_directory: dict[str, dict[str, int]] = {}
    files: list[dict[str, Any]] = []
    for record in records:
        file_path = str(record["path"])
        size = int(record.get("bytes", 0) or 0)
        files.append({
            "kind": "file",
            "path": file_path,
            "bytes": size,
            "token_proxy": token_proxy(str(record.get("text", ""))),
            "line_count": int(record.get("line_count", 0) or 0),
            "bytes_capped": bool(record.get("bytes_capped")),
        })
        # Every component except the last (the file name) names an ancestor.
        ancestors = file_path.split("/")[:-1]
        prefix = ""
        for component in ancestors:
            prefix = f"{prefix}/{component}" if prefix else component
            totals = per_directory.setdefault(prefix, {"bytes": 0, "file_count": 0})
            totals["bytes"] += size
            totals["file_count"] += 1
    directories: list[dict[str, Any]] = []
    for dir_path, totals in per_directory.items():
        directories.append({
            "kind": "directory",
            "path": dir_path,
            "bytes": totals["bytes"],
            "token_proxy": max(0, round(totals["bytes"] / TOKEN_PROXY_CHARS_PER_TOKEN)),
            "file_count": totals["file_count"],
        })
    combined = directories + files
    combined.sort(key=lambda entry: (-int(entry.get("bytes", 0) or 0), str(entry.get("path", ""))))
    return combined[:MAX_REPO_MAP_TREE_ENTRIES]
1988
-
1989
-
1990
def signature_range(line_number: int, total_lines: int) -> LineRange:
    """Return a range from ``line_number`` to at most 24 lines later.

    The start is clamped to at least 1 and the end to at most
    ``max(1, total_lines)``.
    """
    first = max(1, line_number)
    last = min(max(1, total_lines), first + 24)
    return LineRange(first, last)
1992
-
1993
-
1994
def signature_entry(record: dict[str, Any], *, kind: str, name: str, raw_signature: str, line_number: int) -> dict[str, Any]:
    """Build one signature-index entry for ``record``.

    The name and signature are sanitized and capped (80 and 180 characters),
    and ``lines`` comes from ``signature_range``.
    """
    line_total = int(record.get("line_count", 0) or 1)
    span = signature_range(line_number, line_total)
    entry: dict[str, Any] = {
        "path": record["path"],
        "kind": kind,
        "name": sanitize_explain_text(name, limit=80),
        "signature": sanitize_explain_text(raw_signature, limit=180),
        "line": line_number,
        "lines": span.as_dict(),
    }
    return entry
2005
-
2006
-
2007
def python_signatures(record: dict[str, Any], text: str) -> list[dict[str, Any]]:
    """Extract class, method and function signatures from Python ``text``.

    Only top-level classes and functions, and functions directly inside a
    top-level class, are reported. Returns ``[]`` when ``ast.parse`` fails.
    """
    try:
        tree = ast.parse(text)
    except (SyntaxError, ValueError, RecursionError):
        return []
    source_lines = text.splitlines()
    function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)

    def header_text(node: Any, fallback: str) -> str:
        # Use the stripped source line when the node's line number is in range.
        if 0 < node.lineno <= len(source_lines):
            return source_lines[node.lineno - 1].strip()
        return fallback

    found: list[dict[str, Any]] = []
    for node in tree.body:
        if isinstance(node, ast.ClassDef):
            found.append(signature_entry(
                record,
                kind="class",
                name=node.name,
                raw_signature=header_text(node, f"class {node.name}"),
                line_number=node.lineno,
            ))
            for member in node.body:
                if not isinstance(member, function_nodes):
                    continue
                found.append(signature_entry(
                    record,
                    kind="method",
                    name=member.name,
                    raw_signature=header_text(member, f"def {member.name}"),
                    line_number=member.lineno,
                ))
        elif isinstance(node, function_nodes):
            found.append(signature_entry(
                record,
                kind="function",
                name=node.name,
                raw_signature=header_text(node, f"def {node.name}"),
                line_number=node.lineno,
            ))
    return found
2026
-
2027
-
2028
def regex_signatures(record: dict[str, Any], text: str) -> list[dict[str, Any]]:
    """Extract signatures from ``text`` line by line using regular expressions.

    For ``.md``/``.mdx`` paths a heading line yields a ``heading`` entry.
    Other lines are matched against ``SIGNATURE_LINE_RE`` and classified as
    ``class`` or ``function``.
    """
    found: list[dict[str, Any]] = []
    is_markdown = Path(str(record.get("path", ""))).suffix.lower() in {".md", ".mdx"}
    for line_number, raw_line in enumerate(text.splitlines(), start=1):
        trimmed = raw_line.strip()
        if is_markdown:
            heading = re.match(r"^(#{1,6})\s+(.+)$", trimmed)
            if heading:
                found.append(signature_entry(
                    record,
                    kind="heading",
                    name=heading.group(2),
                    raw_signature=trimmed,
                    line_number=line_number,
                ))
                continue
        matched = SIGNATURE_LINE_RE.match(raw_line)
        if not matched:
            continue
        # The first non-empty capture group is the name.
        name = next((group for group in matched.groups() if group), "signature")
        if re.search(r"\bclass\s+" + re.escape(name), raw_line):
            kind = "class"
        else:
            kind = "function"
        found.append(signature_entry(record, kind=kind, name=name, raw_signature=trimmed, line_number=line_number))
    return found
2045
-
2046
-
2047
def extract_signatures(records: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Collect signatures from all records, sorted and capped.

    ``.py`` records use ``python_signatures`` and fall back to
    ``regex_signatures`` when that yields nothing. All other records use
    ``regex_signatures``. The result is sorted by path, line and name and
    truncated to ``MAX_REPO_MAP_SIGNATURE_ENTRIES``.
    """
    collected: list[dict[str, Any]] = []
    for record in records:
        text = str(record.get("text", ""))
        is_python = Path(str(record.get("path", ""))).suffix.lower() == ".py"
        from_ast = python_signatures(record, text) if is_python else []
        # `or` evaluates the regex fallback only when the AST pass found nothing.
        collected.extend(from_ast or regex_signatures(record, text))
    collected.sort(
        key=lambda entry: (
            str(entry.get("path", "")),
            int(entry.get("line", 0) or 0),
            str(entry.get("name", "")),
        )
    )
    return collected[:MAX_REPO_MAP_SIGNATURE_ENTRIES]
2060
-
2061
-
2062
def normalize_repo_map_candidate(path: str) -> str:
    """Normalize a candidate path to POSIX form without leading slashes.

    Backslashes become ``/``, the result is passed through
    ``posixpath.normpath``, and a bare ``"."`` becomes ``""``.
    """
    cleaned = posixpath.normpath(path.replace("\\", "/"))
    return "" if cleaned == "." else cleaned.lstrip("/")
2067
-
2068
-
2069
def resolve_import_target(raw_target: str, source_path: str, known_paths: set[str]) -> str | None:
    """Map an import target to a known repo path, or return ``None``.

    Targets starting with ``.`` are resolved relative to the directory of
    ``source_path``, either as ``./``/``../`` paths or as dotted relative
    modules. Other targets are treated as dotted module names. The first
    candidate (with common source suffixes appended) found in ``known_paths``
    is returned.
    """
    target = raw_target.strip()
    if not target:
        return None
    suffixes = (".py", ".ts", ".tsx", ".js", ".jsx", "/index.ts", "/index.js")
    source_dir = Path(source_path).parent.as_posix()
    if not target.startswith("."):
        stem = target.replace(".", "/")
        candidates = [f"{stem}{suffix}" for suffix in suffixes]
    else:
        if target.startswith(("./", "../")):
            stem = normalize_repo_map_candidate(posixpath.join(source_dir, target))
        else:
            dot_count = len(target) - len(target.lstrip("."))
            module_tail = target[dot_count:].replace(".", "/")
            anchor = source_dir
            # Each leading dot after the first climbs one directory.
            for _ in range(max(0, dot_count - 1)):
                anchor = posixpath.dirname(anchor)
            if module_tail:
                stem = normalize_repo_map_candidate(posixpath.join(anchor, module_tail))
            else:
                stem = normalize_repo_map_candidate(anchor)
        # Relative targets also try the bare stem before the suffixed forms.
        candidates = [stem] + [f"{stem}{suffix}" for suffix in suffixes]
    for candidate in candidates:
        normalized = normalize_repo_map_candidate(candidate)
        if normalized in known_paths:
            return normalized
    return None
2094
-
2095
-
2096
def python_from_import_targets(module_name: str, imported_names: str) -> list[str]:
    """Return import targets for a ``from <module> import <names>`` line.

    When ``module_name`` has any non-dot characters, only the module itself
    is returned. For a dots-only module, each imported identifier is also
    returned with the dots prefixed.
    """
    if module_name.strip("."):
        return [module_name]
    flattened = imported_names.replace("(", " ").replace(")", " ")
    # Drop any " as alias" part and surrounding whitespace from each name.
    bare_names = (chunk.strip().split(" as ", 1)[0].strip() for chunk in flattened.split(","))
    return [module_name] + [
        f"{module_name}{name}"
        for name in bare_names
        if re.fullmatch(r"[A-Za-z_]\w*", name)
    ]
2106
-
2107
-
2108
def collect_import_edges(records: list[dict[str, Any]]) -> list[dict[str, str]]:
    """Collect unique ``{"from", "to"}`` import edges between known records.

    Each text line is checked against ``PY_FROM_IMPORT_LINE_RE`` first and
    otherwise scanned with ``IMPORT_PATH_RE``. Unresolvable targets and
    self-edges are skipped, and collection stops once ``MAX_REPO_MAP_FILES``
    edges exist.
    """
    known_paths = {str(record.get("path", "")) for record in records}
    edges: list[dict[str, str]] = []
    recorded: set[tuple[str, str]] = set()
    for record in records:
        origin = str(record.get("path", ""))
        for line in str(record.get("text", "")).splitlines():
            from_import = PY_FROM_IMPORT_LINE_RE.match(line)
            if from_import is None:
                # Take the first non-empty named group of every match on the line.
                raw_targets = [
                    next((value for value in found.groupdict().values() if value), "")
                    for found in IMPORT_PATH_RE.finditer(line)
                ]
            else:
                raw_targets = python_from_import_targets(from_import.group("module"), from_import.group("names"))
            for raw_target in raw_targets:
                resolved = resolve_import_target(raw_target, origin, known_paths)
                if resolved is None or resolved == origin:
                    continue
                pair = (origin, resolved)
                if pair in recorded:
                    continue
                recorded.add(pair)
                edges.append({"from": origin, "to": resolved})
                if len(edges) >= MAX_REPO_MAP_FILES:
                    return edges
    return edges
2132
-
2133
-
2134
def repo_map_seed_paths(args: argparse.Namespace, suggest_payload: dict[str, Any], build_payload: dict[str, Any]) -> set[str]:
    """Return the set of paths treated as seeds for graph ranking.

    Seeds are the non-redacted display paths of ``args.files`` plus every
    string ``path`` in the suggest payload's ``sources`` and the build
    payload's ``included_sources``.
    """
    seeds: set[str] = set()
    for raw in split_suggest_files(getattr(args, "files", None)):
        relative, _failure = lexical_rel(raw)
        if relative is None:
            continue
        shown, is_redacted = repo_map_display_rel_path(relative.as_posix())
        if not is_redacted:
            seeds.add(shown)
    payload_lists = ((suggest_payload, "sources"), (build_payload, "included_sources"))
    for payload, list_key in payload_lists:
        for entry in payload.get(list_key, []):
            if isinstance(entry, dict) and isinstance(entry.get("path"), str):
                seeds.add(entry["path"])
    return seeds
2149
-
2150
-
2151
def build_graph_rank(
    records: list[dict[str, Any]],
    signatures: list[dict[str, Any]],
    edges: list[dict[str, str]],
    *,
    query_terms: set[str],
    seed_paths: set[str],
    secret_scan: dict[str, Any],
) -> list[dict[str, Any]]:
    """Score records for the explain-only graph ranking.

    A record's score is the sum of its seed, query-path, query-content,
    signature, graph-degree and secret-risk-penalty components. Records with
    a non-positive score are dropped. The rest are sorted by descending score
    and path and truncated to ``MAX_REPO_MAP_GRAPH_RANK_ENTRIES``.
    """
    paths_with_signatures = {str(entry.get("path", "")) for entry in signatures}
    risky_paths = {
        str(entry.get("path", ""))
        for entry in secret_scan.get("files_with_risks", [])
        if isinstance(entry, dict)
    }
    # Degree counts both ends of every import edge.
    degree: dict[str, int] = {}
    for edge in edges:
        for endpoint in (edge["from"], edge["to"]):
            degree[endpoint] = degree.get(endpoint, 0) + 1
    ranked: list[dict[str, Any]] = []
    for record in records:
        path = str(record.get("path", ""))
        lowered = str(record.get("text", "")).lower()
        term_hits = sum(lowered.count(term) for term in query_terms)
        components = {
            "seed": 1000 if path in seed_paths else 0,
            "query_path": suggest_score_path(path, query_terms),
            "query_content": min(500, 25 * term_hits),
            "signature": 80 if path in paths_with_signatures else 0,
            "graph_degree": 25 * degree.get(path, 0),
            "secret_risk_penalty": -25 if path in risky_paths else 0,
        }
        total = sum(components.values())
        if total > 0:
            ranked.append({
                "path": path,
                "score": total,
                "components": components,
                "explain_only": True,
                "line_count": int(record.get("line_count", 0) or 0),
            })
    ranked.sort(key=lambda entry: (-int(entry["score"]), str(entry["path"])))
    return ranked[:MAX_REPO_MAP_GRAPH_RANK_ENTRIES]
2190
-
2191
-
2192
def repo_map_retrieval_for(root_arg: str, display_path: str, lines: LineRange, *, redacted_path: bool) -> tuple[str | None, str | None]:
    """Return ``(retrieval_command, omitted_reason)`` for one repo-map slice.

    No command is produced for a redacted path (``"redacted_path"``) or when
    ``safe_repo_map_root_arg_for_retrieval`` rejects ``root_arg``
    (``"unsafe_root_path"``).
    """
    if redacted_path:
        return None, "redacted_path"
    safe_root = safe_repo_map_root_arg_for_retrieval(root_arg)
    if safe_root is not None:
        return retrieval_cli(safe_root, display_path, lines), None
    return None, "unsafe_root_path"
2199
-
2200
-
2201
def repo_map_retrieval(
    record_by_path: dict[str, dict[str, Any]],
    signatures: list[dict[str, Any]],
    graph_rank: list[dict[str, Any]],
    *,
    root_arg: str,
) -> list[dict[str, Any]]:
    """Build retrieval hints from signatures first, then graph-rank entries.

    Hints are unique per (path, line range, source) and capped at
    ``MAX_REPO_MAP_RETRIEVAL_HINTS``. Graph-rank hints cover the first lines
    of a file, up to line 80.
    """
    hints: list[dict[str, Any]] = []
    emitted: set[tuple[str, str, str]] = set()

    def add(path: str, line_range: LineRange, source: str, name: str | None = None) -> None:
        # Record one hint unless the path is unknown or the hint already exists.
        record = record_by_path.get(path)
        if record is None:
            return
        command, omitted_reason = repo_map_retrieval_for(
            root_arg,
            path,
            line_range,
            redacted_path=bool(record.get("redacted_path")),
        )
        dedupe_key = (path, line_range.identity(), source)
        if dedupe_key in emitted:
            return
        emitted.add(dedupe_key)
        hint: dict[str, Any] = {"path": path, "source": source, "lines": line_range.as_dict()}
        if command:
            hint["slice_cli"] = command
        elif omitted_reason:
            hint["retrieval_omitted_reason"] = omitted_reason
        if name and command and Path(path).suffix.lower() in SYMBOL_HINT_EXTENSIONS:
            symbol_argv = ["context-guard-read-symbol", "--json", path, name]
            hint["symbol_cli"] = " ".join(shlex.quote(part) for part in symbol_argv)
        hints.append(hint)

    for signature in signatures:
        span = signature.get("lines")
        if isinstance(span, dict):
            try:
                window = LineRange(int(span.get("start")), int(span.get("end")))
            except (TypeError, ValueError):
                continue
            add(str(signature.get("path", "")), window, "signature", str(signature.get("name", "")) or None)
        # The cap is checked on every iteration, including non-dict `lines`.
        if len(hints) >= MAX_REPO_MAP_RETRIEVAL_HINTS:
            return hints[:MAX_REPO_MAP_RETRIEVAL_HINTS]
    for ranked in graph_rank:
        ranked_path = str(ranked.get("path", ""))
        record = record_by_path.get(ranked_path)
        if record is None:
            continue
        line_total = int(record.get("line_count", 0) or 1)
        add(ranked_path, LineRange(1, min(line_total, 80)), "graph_rank")
        if len(hints) >= MAX_REPO_MAP_RETRIEVAL_HINTS:
            break
    return hints[:MAX_REPO_MAP_RETRIEVAL_HINTS]
2249
-
2250
-
2251
def build_repo_map_payload(
    root: Path,
    args: argparse.Namespace,
    suggest_payload: dict[str, Any],
    build_payload: dict[str, Any],
    *,
    root_arg: str,
) -> dict[str, Any]:
    """Assemble the repo-map section of the explain output.

    The records returned by ``repo_map_records(root)`` feed every derived
    view: the signature index, the secret scan, the import edges, the graph
    ranking, the retrieval hints and the token tree.

    Args:
        root: Project root handed to ``repo_map_records``.
        args: Parsed CLI namespace, forwarded to ``repo_map_seed_paths``.
        suggest_payload: Suggest-phase payload; its ``query`` string is
            tokenized into the ranking query terms.
        build_payload: Build-phase payload, forwarded to
            ``repo_map_seed_paths``.
        root_arg: Root argument forwarded to ``repo_map_retrieval``.

    Returns:
        A dict holding the schema version, summary counters, caps, token
        tree, secret scan, signature index, capped graph edges, graph rank,
        retrieval hints, capped omitted files and static safety metadata.
    """
    scanned, skipped, caps = repo_map_records(root)
    by_path = {str(entry["path"]): entry for entry in scanned}
    signature_index = extract_signatures(scanned)
    secrets_report = build_secret_scan(scanned)
    import_edges = collect_import_edges(scanned)
    terms = suggest_tokens(str(suggest_payload.get("query", "")))
    ranking = build_graph_rank(
        scanned,
        signature_index,
        import_edges,
        query_terms=terms,
        seed_paths=repo_map_seed_paths(args, suggest_payload, build_payload),
        secret_scan=secrets_report,
    )
    hints = repo_map_retrieval(by_path, signature_index, ranking, root_arg=root_arg)
    token_tree = build_token_tree(scanned)
    tree_bytes = sum(int(entry.get("bytes", 0) or 0) for entry in scanned)

    summary = {
        "files_scanned": len(scanned),
        "files_capped": bool(caps["files_capped"]),
        "bytes_per_file_capped_count": int(caps["bytes_per_file_capped_count"]),
        "tree_bytes": tree_bytes,
        "tree_token_proxy": sum(int(node.get("token_proxy", 0) or 0) for node in token_tree),
        "signature_files": len({str(sig.get("path", "")) for sig in signature_index}),
        "signature_count": len(signature_index),
        "secret_risk_files": len(secrets_report.get("files_with_risks", [])),
        "graph_edges": len(import_edges),
    }

    # The emitted edge list is truncated with the graph-rank entry cap; the
    # number of edges dropped by that truncation is reported next to it.
    edge_cap = MAX_REPO_MAP_GRAPH_RANK_ENTRIES
    graph = {
        "edges": import_edges[:edge_cap],
        "edges_omitted_by_cap": max(0, len(import_edges) - edge_cap),
    }

    safety = {
        "deterministic_local_only": True,
        "no_network": True,
        "no_model_or_embedding": True,
        "explain_only": True,
        "redacted_before_output": True,
        "tree_sitter": {
            "status": "unavailable_without_optional_dependency",
            "fallback": "python_ast_and_regex_signatures",
        },
        "caveats": [
            "Repo-map bytes are local sampled UTF-8 bytes and estimated chars_div_4 token proxies, not provider-token or savings claims.",
            "Graph ranking is deterministic explain metadata only; it does not change pack selection in this stage.",
        ],
    }

    return {
        "schema_version": REPO_MAP_SCHEMA_VERSION,
        "summary": summary,
        "caps": caps,
        "token_tree": token_tree,
        "secret_scan": secrets_report,
        "signature_index": signature_index,
        "graph": graph,
        "graph_rank": ranking,
        "retrieval": hints,
        "omitted_files": skipped[:MAX_REPO_MAP_TREE_ENTRIES],
        "safety": safety,
    }
2313
-
2314
-
2315
def build_auto_explain_payload(
    args: argparse.Namespace,
    suggest_payload: dict[str, Any],
    build_payload: dict[str, Any],
    payload: dict[str, Any],
    *,
    root: Path | None = None,
    root_arg: str = ".",
) -> dict[str, Any]:
    """Build the explain metadata describing one suggest-plus-build run.

    Args:
        args: Parsed CLI namespace; ``files``, ``diff``, ``output``,
            ``test_output``, ``top``, ``context_lines`` and ``no_artifact``
            are read for the ``inputs`` section.
        suggest_payload: Suggest-phase payload (``sources``,
            ``omitted_sources``, ``query``).
        build_payload: Build-phase payload (``included_sources``,
            ``omitted_sources``, ``sources``, ``artifact``, ``redaction``).
        payload: The combined payload being explained (``sources``,
            ``pack_bytes``, ``budget_bytes``, output paths, ``token_proxy``,
            ``caveats``).
        root: When given, a ``repo_map`` section is added via
            ``build_repo_map_payload``.
        root_arg: Root argument forwarded to ``build_repo_map_payload``.

    Returns:
        A dict with ``schema_version``, ``summary``, ``inputs``,
        ``selection``, ``omissions``, ``budget`` and ``safety`` sections,
        plus ``repo_map`` when ``root`` is not ``None``.
    """

    def as_dict(value: Any) -> dict[str, Any]:
        # Non-dict sections are treated as empty so lookups stay uniform.
        return value if isinstance(value, dict) else {}

    built = [row for row in build_payload.get("included_sources", []) if isinstance(row, dict)]
    suggested = [row for row in suggest_payload.get("sources", []) if isinstance(row, dict)]
    claimed: set[int] = set()

    # Pass 1: exact matches are resolved for every suggested source before
    # any fallback lookup runs; both lookups share the same `claimed` set.
    exact_by_position: dict[int, dict[str, Any]] = {}
    for position, candidate in enumerate(suggested):
        match = find_exact_build_source_for_explain(candidate, built, claimed)
        if match is not None:
            exact_by_position[position] = match

    # Pass 2: emit one selection entry per suggested source, using the
    # fallback lookup only when pass 1 found nothing for that position.
    selection_fields = (
        "path",
        "score",
        "priority",
        "reason",
        "label",
        "lines",
        "bytes",
        "retrieval_cli",
        "retrieval_omitted_reason",
    )
    selection: list[dict[str, Any]] = []
    for position, candidate in enumerate(suggested):
        row = copy_explain_fields(candidate, selection_fields)
        match = exact_by_position.get(position)
        if match is None:
            match = find_fallback_build_source_for_explain(candidate, built, claimed)
        if match is None:
            row["build_status"] = "not_built"
            selection.append(row)
            continue
        row["build_status"] = match.get("status", "included")
        for line_key in ("requested_lines", "included_lines"):
            if line_key in match:
                row[line_key] = copy.deepcopy(match[line_key])
        if "bytes" in match:
            row["build_bytes"] = match["bytes"]
        selection.append(row)

    # Omissions from both phases are merged, de-duplicated on
    # explain_omission_key, and sorted by that same key.
    omission_fields = (
        "path",
        "status",
        "reason",
        "suggest_reason",
        "priority",
        "label",
        "requested_lines",
        "included_lines",
        "lines",
        "total_lines",
        "retrieval_cli",
        "retrieval_omitted_reason",
        "input_index",
    )
    omissions: list[dict[str, Any]] = []
    seen_keys: set[tuple[str, str, str, str, str]] = set()
    for phase, phase_payload in (("suggest", suggest_payload), ("build", build_payload)):
        for raw in phase_payload.get("omitted_sources", []):
            if not isinstance(raw, dict):
                continue
            row = copy_explain_fields(raw, omission_fields)
            row["phase"] = phase
            dedupe_key = explain_omission_key(row)
            if dedupe_key not in seen_keys:
                seen_keys.add(dedupe_key)
                omissions.append(row)
    omissions.sort(key=explain_omission_key)

    build_counts = as_dict(build_payload.get("sources"))
    auto_counts = as_dict(payload.get("sources"))
    artifact = as_dict(build_payload.get("artifact"))
    pack_bytes = int(payload.get("pack_bytes", build_payload.get("pack_bytes", 0)) or 0)
    budget_bytes = int(payload.get("budget_bytes", build_payload.get("budget_bytes", 0)) or 0)
    budget_omitted = len([row for row in omissions if row.get("reason") == "budget_exhausted"])
    explicit_files = split_suggest_files(args.files)
    query = str(suggest_payload.get("query", ""))
    diff_label = None
    if getattr(args, "diff", None):
        diff_label = cap_label(args.diff)

    summary = {
        "suggested": int(auto_counts.get("suggested", len(selection)) or 0),
        "included": int(auto_counts.get("included", build_counts.get("included", 0)) or 0),
        "partial": int(auto_counts.get("partial", build_counts.get("partial", 0)) or 0),
        "omitted": int(auto_counts.get("omitted", build_counts.get("omitted", 0)) or 0),
        "suggest_omitted": sum(
            1 for row in suggest_payload.get("omitted_sources", []) if isinstance(row, dict)
        ),
        "explain_omissions": len(omissions),
        "pack_bytes": pack_bytes,
        "budget_bytes": budget_bytes,
        "manifest_written": bool(payload.get("manifest_path")),
        "pack_written": bool(payload.get("pack_path")),
        "artifact_stored": bool(artifact.get("stored")),
        "artifact_capped": bool(artifact.get("capped")),
    }
    inputs = {
        "query": query,
        "query_present": bool(query),
        "diff": diff_label,
        "diff_present": bool(diff_label),
        "explicit_file_count": len(explicit_files),
        "output_count": len(args.output or []),
        "test_output_count": len(args.test_output or []),
        "top": bounded_int(args.top, DEFAULT_SUGGEST_TOP, 1, MAX_SUGGEST_TOP),
        "context_lines": bounded_int(
            args.context_lines, DEFAULT_SUGGEST_CONTEXT_LINES, 0, MAX_SUGGEST_CONTEXT_LINES
        ),
        "no_artifact": bool(args.no_artifact),
        "manifest_path": payload.get("manifest_path"),
        "pack_path": payload.get("pack_path"),
    }
    budget = {
        "pack_bytes": pack_bytes,
        "budget_bytes": budget_bytes,
        "remaining_bytes": budget_bytes - pack_bytes,
        "partial_count": int(build_counts.get("partial", 0) or 0),
        "budget_omitted_count": budget_omitted,
        "token_proxy": copy.deepcopy(payload.get("token_proxy", {})),
        "measurement": "observed_bytes_estimated_tokens",
        "caveat": "Byte counts are observed pack bytes; token counts are estimated chars_div_4 proxies, not provider-token savings.",
    }
    safety = {
        "redaction": copy.deepcopy(build_payload.get("redaction", {})),
        "caveats": copy.deepcopy(payload.get("caveats", [])),
        "deterministic_local_only": True,
        "raw_output_embedded": False,
        "raw_test_output_embedded": False,
    }

    explain: dict[str, Any] = {
        "schema_version": AUTO_EXPLAIN_SCHEMA_VERSION,
        "summary": summary,
        "inputs": inputs,
        "selection": selection,
        "omissions": omissions,
        "budget": budget,
        "safety": safety,
    }
    if root is not None:
        explain["repo_map"] = build_repo_map_payload(
            root, args, suggest_payload, build_payload, root_arg=root_arg
        )
    return explain
2453
-
2454
-
2455
def auto_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tuple[dict[str, Any], int]:
    """Suggest a manifest and build the budgeted pack in one step.

    Output targets are checked for collisions and validated before the
    suggest phase runs. The pack is built without storing a receipt; the
    pack file, the manifest file and the receipt are then written in that
    order.

    Args:
        root: Project root used for validation, suggestion, build and writes.
        args: Parsed ``auto`` namespace (``manifest_out``, ``pack_out``,
            ``budget_bytes``, ``no_artifact``, ``explain`` plus the options
            consumed by ``suggest_pack``).
        root_arg: Root argument as given by the caller, forwarded to the
            suggest/build helpers and to the build hint.

    Returns:
        ``(payload, rc)`` where ``rc`` is the code returned by
        ``suggest_pack``.
    """
    manifest_rel = None
    if args.manifest_out:
        manifest_rel = output_rel_for_collision_check(args.manifest_out, "--manifest-out")
    pack_rel = None
    if args.pack_out:
        pack_rel = output_rel_for_collision_check(args.pack_out, "--pack-out")
    if not (manifest_rel is None or pack_rel is None):
        reject_matching_output_targets(
            root,
            first_rel=manifest_rel,
            second_rel=pack_rel,
            second_option="--pack-out",
            reason="same_as_manifest_out",
        )
    if args.manifest_out:
        validate_output_path_under_root(root, args.manifest_out, "--manifest-out")
    if args.pack_out:
        validate_output_path_under_root(root, args.pack_out, "--pack-out")

    # The suggest phase runs on a shallow copy with manifest_out cleared;
    # the manifest is written further down via write_manifest_under_root.
    suggest_args = copy.copy(args)
    suggest_args.manifest_out = None
    suggest_payload, rc = suggest_pack(root, suggest_args, root_arg=root_arg)
    manifest = suggest_payload["manifest"]
    budget = bounded_int(args.budget_bytes, DEFAULT_BUDGET_BYTES, MIN_BUDGET_BYTES, MAX_BUDGET_BYTES)
    build_payload = build_pack(
        root,
        manifest_to_source_specs(manifest),
        budget_bytes=budget,
        root_arg=root_arg,
        store_artifact=False,
    )

    if not args.no_artifact:
        # The receipt file name depends on the pack id, so these collision
        # checks can only happen after the build.
        receipt_rel = Path(PACK_DIR) / f"{build_payload['pack_id']}.json"
        for target_rel, option in ((manifest_rel, "--manifest-out"), (pack_rel, "--pack-out")):
            if target_rel is None:
                continue
            reject_matching_output_targets(
                root,
                first_rel=receipt_rel,
                second_rel=target_rel,
                second_option=option,
                reason="same_as_artifact_receipt",
            )

    pack_path: str | None = None
    manifest_path: str | None = None
    if args.pack_out:
        pack_path = write_text_under_root(root, args.pack_out, str(build_payload["pack"]), "--pack-out")
    if args.manifest_out:
        manifest_path = write_manifest_under_root(root, args.manifest_out, manifest)
    if not args.no_artifact:
        build_payload["artifact"] = store_receipt(root, build_payload)

    build_hint, hint_omitted_reason = suggest_build_hint(root_arg, manifest_path, budget)
    suggest_payload["manifest_path"] = manifest_path
    suggest_payload["build_hint"] = build_hint
    # Drop any omission reason left by the suggest phase, then record the
    # one computed for the manifest path actually written (if any).
    suggest_payload.pop("build_hint_omitted_reason", None)
    if hint_omitted_reason:
        suggest_payload["build_hint_omitted_reason"] = hint_omitted_reason

    built_counts = build_payload.get("sources", {})
    payload: dict[str, Any] = {
        "tool": TOOL_NAME,
        "schema_version": AUTO_SCHEMA_VERSION,
        "version": VERSION,
        "mode": "auto",
        "root": display_root(root),
        "query": suggest_payload.get("query", ""),
        "budget_bytes": budget,
        "manifest": manifest,
        "manifest_path": manifest_path,
        "pack_path": pack_path,
        "suggest": suggest_payload,
        "build": build_payload,
        "sources": {
            "suggested": len(suggest_payload.get("sources", [])),
            "included": built_counts.get("included", 0),
            "partial": built_counts.get("partial", 0),
            "omitted": built_counts.get("omitted", 0),
        },
        "pack_bytes": build_payload.get("pack_bytes", 0),
        "token_proxy": build_payload.get("token_proxy", {}),
        "caveats": [
            "Deterministic local heuristics only; no model, network, embedding, or provider-cost estimate is used.",
            "Byte and token values are pack-size proxies, not billing claims.",
        ],
    }
    if hint_omitted_reason:
        payload["build_hint_omitted_reason"] = hint_omitted_reason
    if args.explain:
        payload["explain"] = build_auto_explain_payload(
            args, suggest_payload, build_payload, payload, root=root, root_arg=root_arg
        )
    return payload, rc
2540
-
2541
-
2542
def print_suggest_text(payload: dict[str, Any]) -> None:
    """Print the plain-text rendering of a suggest payload to stdout.

    Emits a header with the source count and estimated/budget bytes, one
    line per source (with ``:start:end`` appended when ``lines`` is a dict),
    the manifest path when set, and then either the build hint or the reason
    the hint was omitted.

    Args:
        payload: Suggest payload; ``sources``, ``estimated_pack_bytes`` and
            ``budget_bytes`` are required, the remaining keys are optional.
    """
    sources = payload["sources"]
    header = (
        f"context-guard-pack suggest: {len(sources)} source(s), "
        f"estimated {payload['estimated_pack_bytes']}/{payload['budget_bytes']} bytes"
    )
    print(header)

    for source in sources:
        span = source.get("lines")
        suffix = ""
        if isinstance(span, dict):
            suffix = f":{span['start']}:{span['end']}"
        print(f"- {source['path']}{suffix} priority={source['priority']} reason={source['reason']}")

    manifest_path = payload.get("manifest_path")
    if manifest_path:
        print(f"manifest: {manifest_path}")

    build_hint = payload.get("build_hint")
    if build_hint:
        print(f"build: {build_hint}")
        return
    omitted_reason = payload.get("build_hint_omitted_reason")
    if omitted_reason:
        print(f"build hint omitted: {omitted_reason}")
2557
-
2558
-
2559
def print_auto_text(payload: dict[str, Any]) -> None:
    """Print the plain-text rendering of an auto payload to stdout.

    Always prints a header line. When ``payload["explain"]`` is a dict, also
    prints an explain summary, up to five selection rows and, if there are
    omissions, a per-reason count. The manifest path is printed when set.
    Finally either the pack path is printed, or - when no pack file was
    written - a blank line followed by the pack text itself.

    Args:
        payload: Auto payload; ``sources["suggested"]``, ``pack_bytes`` and
            ``budget_bytes`` are required, as is ``build["pack"]`` whenever
            ``pack_path`` is empty.
    """
    counts = payload["sources"]
    print(
        f"context-guard-pack auto: {counts['suggested']} suggested source(s), "
        f"pack {payload['pack_bytes']}/{payload['budget_bytes']} bytes"
    )

    explain = payload.get("explain")
    if isinstance(explain, dict):
        summary = explain.get("summary")
        if not isinstance(summary, dict):
            summary = {}
        budget = explain.get("budget")
        if not isinstance(budget, dict):
            budget = {}
        used = budget.get("pack_bytes", payload.get("pack_bytes", 0))
        limit = budget.get("budget_bytes", payload.get("budget_bytes", 0))
        headline = " ".join(
            [
                "explain:",
                f"selected={summary.get('suggested', 0)}",
                f"included={summary.get('included', 0)}",
                f"partial={summary.get('partial', 0)}",
                f"omitted={summary.get('omitted', 0)}",
                f"budget={used}/{limit}",
                "heuristic=local",
            ]
        )
        print(headline)

        chosen = explain.get("selection")
        if not isinstance(chosen, list):
            chosen = []
        # Only the first five selection rows are shown in text mode.
        for row in chosen[:5]:
            if not isinstance(row, dict):
                continue
            span = row.get("included_lines") or row.get("lines")
            suffix = f":{span.get('start')}:{span.get('end')}" if isinstance(span, dict) else ""
            status = row.get("build_status", "unknown")
            score = row.get("score", row.get("priority", 0))
            why = row.get("reason", "local heuristic")
            print(f"- {row.get('path')}{suffix} status={status} score={score} reason={why}")

        omissions = explain.get("omissions")
        if not isinstance(omissions, list):
            omissions = []
        if omissions:
            tally: dict[str, int] = {}
            for row in omissions:
                if isinstance(row, dict):
                    label = str(row.get("reason", "unknown"))
                    tally.setdefault(label, 0)
                    tally[label] += 1
            rendered = ", ".join(f"{label}={tally[label]}" for label in sorted(tally))
            print(f"omitted reasons: {rendered}")

    manifest_path = payload.get("manifest_path")
    if manifest_path:
        print(f"manifest: {manifest_path}")

    pack_path = payload.get("pack_path")
    if pack_path:
        print(f"pack: {pack_path}")
        return
    # No pack file was written: emit the pack text itself after a blank line.
    print()
    sys.stdout.write(str(payload["build"]["pack"]))
2608
-
2609
-
2610
def build_parser() -> argparse.ArgumentParser:
    """Create the CLI parser with the build, slice, suggest and auto commands.

    Returns:
        An ``argparse.ArgumentParser`` whose required ``command`` subparser
        selects one of the four subcommands.
    """
    root_help = "project root; must not be a symlink"
    json_help = "emit JSON payload"
    no_artifact_help = "do not write .context-guard/packs receipt"

    def add_signal_options(command: argparse.ArgumentParser) -> None:
        # Options that `suggest` and `auto` declare identically, registered
        # in the same order for both commands.
        command.add_argument("--root", default=".", help=root_help)
        command.add_argument("--query", default="", help="task or question to match against local files")
        command.add_argument("--diff", help="git diff range, or staged/worktree, to seed changed-file ranges")
        command.add_argument(
            "--files",
            "--file",
            dest="files",
            action="append",
            help="explicit relative file path(s), comma-separated or repeated",
        )
        command.add_argument(
            "--output",
            action="append",
            help="relative path to sanitized command output text under root",
        )
        command.add_argument(
            "--test-output",
            action="append",
            help="relative path to sanitized test output text under root",
        )
        command.add_argument("--budget-bytes", type=int, default=DEFAULT_BUDGET_BYTES)
        command.add_argument("--top", type=int, default=DEFAULT_SUGGEST_TOP, help="maximum suggested sources")
        command.add_argument(
            "--context-lines",
            type=int,
            default=DEFAULT_SUGGEST_CONTEXT_LINES,
            help="line context around diff/output hits",
        )
        command.add_argument(
            "--manifest-out",
            help="write the suggested build manifest to this relative path under root",
        )

    parser = argparse.ArgumentParser(
        description="Build budgeted local context packs with exact retrieval hints."
    )
    commands = parser.add_subparsers(dest="command", required=True)

    build_cmd = commands.add_parser("build", help="assemble a prioritized context pack")
    build_cmd.add_argument("--root", default=".", help=root_help)
    build_cmd.add_argument("--manifest", help="JSON manifest with version/sources")
    build_cmd.add_argument(
        "--source",
        action="append",
        help="source spec: path=REL[,priority=N][,lines=A:B][,label=TEXT]",
    )
    build_cmd.add_argument("--budget-bytes", type=int, default=DEFAULT_BUDGET_BYTES)
    build_cmd.add_argument("--json", action="store_true", help=json_help)
    build_cmd.add_argument("--no-artifact", action="store_true", help=no_artifact_help)

    slice_cmd = commands.add_parser("slice", help="retrieve an exact sanitized file slice")
    slice_cmd.add_argument("--root", default=".", help=root_help)
    slice_cmd.add_argument("--path", required=True, help="relative file path under root")
    slice_cmd.add_argument("--lines", required=True, help="inclusive 1-indexed START:END")
    slice_cmd.add_argument("--json", action="store_true", help=json_help)

    suggest_cmd = commands.add_parser(
        "suggest",
        help="suggest a build-compatible context pack manifest from local signals",
    )
    add_signal_options(suggest_cmd)
    suggest_cmd.add_argument("--json", action="store_true", help=json_help)

    auto_cmd = commands.add_parser(
        "auto",
        help="suggest a context pack manifest and build the budgeted pack in one local step",
    )
    add_signal_options(auto_cmd)
    auto_cmd.add_argument("--pack-out", help="write the built Markdown pack to this relative path under root")
    auto_cmd.add_argument("--json", action="store_true", help=json_help)
    auto_cmd.add_argument("--no-artifact", action="store_true", help=no_artifact_help)
    auto_cmd.add_argument(
        "--explain",
        action="store_true",
        help="include deterministic local selection/build explanation metadata",
    )
    return parser
2653
-
2654
-
2655
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and run the selected subcommand.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` lets argparse
            read the process arguments.

    Returns:
        ``0`` after ``build``; the code returned by the helper for ``slice``,
        ``suggest`` and ``auto``; ``2`` when a ``PackError`` is raised, after
        its message has been printed to stderr.
    """

    def emit_json(document: Any) -> None:
        # Shared JSON rendering for every --json code path.
        json.dump(document, sys.stdout, ensure_ascii=False, indent=2, sort_keys=True)
        sys.stdout.write("\n")

    args = build_parser().parse_args(argv)
    try:
        root = normalize_root(Path(args.root))
        command = args.command

        if command == "build":
            specs = parse_all_sources(args)
            if not specs:
                raise PackError("provide --manifest or --source")
            budget = bounded_int(args.budget_bytes, DEFAULT_BUDGET_BYTES, MIN_BUDGET_BYTES, MAX_BUDGET_BYTES)
            result = build_pack(
                root,
                specs,
                budget_bytes=budget,
                root_arg=str(args.root),
                store_artifact=not args.no_artifact,
            )
            if args.json:
                emit_json(result)
                return 0
            # Text mode: pack body on stdout, one-line summary on stderr.
            sys.stdout.write(str(result["pack"]))
            counts = result["sources"]
            print(
                f"[context-guard-pack] pack_id={result['pack_id']} bytes={result['pack_bytes']}/{result['budget_bytes']} "
                f"included={counts['included']} partial={counts['partial']} omitted={counts['omitted']}",
                file=sys.stderr,
            )
            return 0

        if command == "slice":
            line_range = parse_line_range(args.lines)
            if line_range is None:
                raise PackError("invalid_lines")
            payload, rc = slice_source(root, raw_path=args.path, lines=line_range)
            if args.json:
                emit_json(payload)
            elif rc != 0:
                print(f"context-guard-pack: {payload.get('reason')}", file=sys.stderr)
            else:
                sys.stdout.write(str(payload.get("content", "")))
            return rc

        if command == "suggest":
            payload, rc = suggest_pack(root, args, root_arg=str(args.root))
            if args.json:
                emit_json(payload)
            else:
                print_suggest_text(payload)
            return rc

        if command == "auto":
            payload, rc = auto_pack(root, args, root_arg=str(args.root))
            if args.json:
                emit_json(payload)
            else:
                print_auto_text(payload)
            return rc

        raise PackError("unknown command")
    except PackError as exc:
        print(f"context-guard-pack: {exc}", file=sys.stderr)
        return 2
2710
-
2711
-
2712
if __name__ == "__main__":
    # Script entry point: main()'s return code becomes the process exit status.
    sys.exit(main())