@ictechgy/context-guard 0.4.9 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/CHANGELOG.md +28 -0
  2. package/README.ko.md +59 -31
  3. package/README.md +85 -36
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  8. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  9. package/docs/benchmark-workflow-examples.md +3 -0
  10. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  11. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  12. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  13. package/docs/distribution.md +10 -7
  14. package/docs/experimental-benchmark-fixtures.md +30 -6
  15. package/package.json +4 -6
  16. package/packaging/homebrew/context-guard.rb.template +1 -1
  17. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  18. package/plugins/context-guard/README.ko.md +20 -14
  19. package/plugins/context-guard/README.md +26 -17
  20. package/plugins/context-guard/bin/context-guard +147 -25
  21. package/plugins/context-guard/bin/context-guard-artifact +884 -79
  22. package/plugins/context-guard/bin/context-guard-audit +33 -2
  23. package/plugins/context-guard/bin/context-guard-bench +1542 -31
  24. package/plugins/context-guard/bin/context-guard-cache-score +665 -0
  25. package/plugins/context-guard/bin/context-guard-compress +146 -1
  26. package/plugins/context-guard/bin/context-guard-cost +790 -6
  27. package/plugins/context-guard/bin/context-guard-experiments +463 -26
  28. package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
  29. package/plugins/context-guard/bin/context-guard-filter +163 -7
  30. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  31. package/plugins/context-guard/bin/context-guard-pack +892 -49
  32. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  33. package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
  34. package/plugins/context-guard/bin/context-guard-setup +165 -31
  35. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  36. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  37. package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
  38. package/plugins/context-guard/bin/context-guard-trim-output +288 -41
  39. package/plugins/context-guard/brief/README.md +5 -5
  40. package/plugins/context-guard/lib/context_guard_commands.py +230 -0
  41. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  42. package/context-guard-kit/README.md +0 -91
  43. package/context-guard-kit/benchmark_runner.py +0 -2401
  44. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  45. package/context-guard-kit/context_compress.py +0 -695
  46. package/context-guard-kit/context_escrow.py +0 -935
  47. package/context-guard-kit/context_filter.py +0 -637
  48. package/context-guard-kit/context_guard_cli.py +0 -325
  49. package/context-guard-kit/context_guard_diet.py +0 -1711
  50. package/context-guard-kit/context_pack.py +0 -2713
  51. package/context-guard-kit/cost_guard.py +0 -2349
  52. package/context-guard-kit/experimental_registry.py +0 -4348
  53. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  54. package/context-guard-kit/guard_large_read.py +0 -690
  55. package/context-guard-kit/hook_secret_patterns.py +0 -43
  56. package/context-guard-kit/read_symbol.py +0 -483
  57. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  58. package/context-guard-kit/sanitize_output.py +0 -725
  59. package/context-guard-kit/settings.example.json +0 -67
  60. package/context-guard-kit/setup_wizard.py +0 -2515
  61. package/context-guard-kit/statusline.sh +0 -362
  62. package/context-guard-kit/statusline_merged.sh +0 -157
  63. package/context-guard-kit/tool_schema_pruner.py +0 -837
  64. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -43,6 +43,8 @@ SUGGEST_SCHEMA_VERSION = "contextguard.pack-suggest.v1"
43
43
  AUTO_SCHEMA_VERSION = "contextguard.pack-auto.v1"
44
44
  AUTO_EXPLAIN_SCHEMA_VERSION = "contextguard.pack-auto-explain.v1"
45
45
  REPO_MAP_SCHEMA_VERSION = "contextguard.pack-repo-map.v1"
46
+ ADAPTIVE_K_SCHEMA_VERSION = "contextguard.pack-adaptive-k.v1"
47
+ SYMBOL_MEMORY_SCHEMA_VERSION = "contextguard.pack-symbol-memory.v1"
46
48
  DEFAULT_SUGGEST_TOP = 8
47
49
  MAX_SUGGEST_TOP = 50
48
50
  DEFAULT_SUGGEST_CONTEXT_LINES = 20
@@ -51,15 +53,30 @@ SUGGEST_WHOLE_FILE_MAX_LINES = 120
51
53
  MAX_SUGGEST_INPUT_BYTES = 256_000
52
54
  MAX_QUERY_SCAN_FILES = 2_000
53
55
  MAX_QUERY_SCAN_BYTES_PER_FILE = 200_000
56
+ MAX_GIT_LS_FILES_OUTPUT_BYTES = MAX_QUERY_SCAN_FILES * 512
57
+ GIT_LS_FILES_READ_CHUNK_BYTES = 64 * 1024
54
58
  MAX_REPO_MAP_FILES = 1_000
59
+ MAX_REPO_MAP_SCAN_FILES = 160
55
60
  MAX_REPO_MAP_BYTES_PER_FILE = 120_000
56
61
  MAX_REPO_MAP_TREE_ENTRIES = 30
57
62
  MAX_REPO_MAP_SIGNATURE_ENTRIES = 40
58
63
  MAX_REPO_MAP_GRAPH_RANK_ENTRIES = 30
59
64
  MAX_REPO_MAP_RETRIEVAL_HINTS = 30
60
65
  MAX_REPO_MAP_SECRET_RISK_FILES = 20
66
+ MAX_ADAPTIVE_K_SCORE_SAMPLES = 200
67
+ MAX_ADAPTIVE_K_SELECTED_EVIDENCE = 12
68
+ MAX_ADAPTIVE_K_OMITTED_EVIDENCE = 12
69
+ MAX_ADAPTIVE_K_REASON_COUNTS = 12
70
+ MAX_ADAPTIVE_K_VERIFICATION_HINTS = 12
71
+ ADAPTIVE_K_POLICIES = ("balanced", "recall", "precision")
72
+ MAX_SYMBOL_MEMORY_ITEMS = 12
73
+ MAX_SYMBOL_MEMORY_GRAPH_ITEMS = 12
61
74
  PACK_DIR = ".context-guard/packs"
62
75
  REDACTED_PATH_COMPONENT = "[REDACTED-PATH-COMPONENT]"
76
+ ALLOWED_FIRST_ABSOLUTE_SYMLINKS = {
77
+ "tmp": Path("/private/tmp"),
78
+ "var": Path("/private/var"),
79
+ }
63
80
  CONTROL_CHAR_RE = re.compile(r"[\x00-\x1f\x7f-\x9f]")
64
81
  SECRET_CONTENT_RE = re.compile(
65
82
  r"(?is)("
@@ -235,6 +252,30 @@ def sanitize_text(text: str, *, show_paths: bool = False) -> tuple[str, int]:
235
252
  return "".join(out), redacted
236
253
 
237
254
 
255
def sanitize_source_lines(handle: Any, requested: LineRange | None) -> tuple[list[str], int, int]:
    """Stream *handle* through the line sanitizer, keeping only the requested window.

    Every line of the stream is sanitized and counted, including lines outside
    the window, so the returned totals describe the whole stream. Only lines
    whose 1-based number falls inside ``requested`` are retained; when
    ``requested`` is ``None`` every sanitized line is retained.

    Returns ``(kept_lines, total_line_count, redacted_line_count)``.
    """
    line_sanitizer = load_line_sanitizer()
    keep_everything = requested is None
    if keep_everything:
        first, last = 1, 0
    else:
        first, last = requested.start, requested.end
    kept: list[str] = []
    redaction_hits = 0
    line_no = 0
    for raw_line in handle:
        line_no += 1
        clean, was_redacted = line_sanitizer.sanitize(raw_line)  # type: ignore[attr-defined]
        if was_redacted:
            redaction_hits += 1
        if keep_everything or first <= line_no <= last:
            kept.append(clean)
    return kept, line_no, redaction_hits
277
+
278
+
238
279
def byte_len(text: str) -> int:
    """Return the size of *text* in bytes once encoded as UTF-8.

    Characters that cannot be encoded are substituted instead of raising.
    """
    encoded = text.encode("utf-8", errors="replace")
    return len(encoded)
240
281
 
@@ -330,6 +371,16 @@ def bounded_int(value: object, default: int, minimum: int, maximum: int) -> int:
330
371
  return min(max(number, minimum), maximum)
331
372
 
332
373
 
374
def adaptive_k_threshold(value: object) -> float:
    """Parse an adaptive-k threshold and require it to lie within [0.0, 1.0].

    Raises ``argparse.ArgumentTypeError`` when ``value`` cannot be converted
    to a float or when the converted number is outside the closed interval.
    """
    try:
        parsed = float(value)
    except (TypeError, ValueError, OverflowError) as exc:
        raise argparse.ArgumentTypeError("adaptive-k threshold must be a number between 0.0 and 1.0") from exc
    # Positive range test: NaN fails every comparison and is therefore rejected.
    if 0.0 <= parsed <= 1.0:
        return parsed
    raise argparse.ArgumentTypeError("adaptive-k threshold must be between 0.0 and 1.0")
382
+
383
+
333
384
  def cap_label(value: object, default: str | None = None, limit: int = MAX_LABEL_CHARS) -> str | None:
334
385
  if value is None:
335
386
  return default
@@ -342,13 +393,150 @@ def cap_label(value: object, default: str | None = None, limit: int = MAX_LABEL_
342
393
  return text
343
394
 
344
395
 
345
- def read_manifest(path: Path) -> list[SourceSpec]:
396
def normalized_link_target(anchor: Path, raw_target: str) -> Path:
    """Resolve a symlink's raw target lexically against *anchor*.

    Relative targets are joined onto ``anchor``; the result is collapsed with
    ``os.path.normpath`` and is never looked up on the filesystem.
    """
    candidate = Path(raw_target)
    combined = candidate if candidate.is_absolute() else anchor / candidate
    return Path(os.path.normpath(str(combined)))
401
+
402
+
403
def normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite an allow-listed top-level symlink alias to its canonical directory.

    Only the first component below the root is considered. The path is
    rewritten when that component is listed in
    ``ALLOWED_FIRST_ABSOLUTE_SYMLINKS``, exists as a symlink, and its target
    normalizes to the expected directory; in every other case, including any
    ``OSError`` while inspecting the link, ``path`` is returned unchanged.
    """
    components = path.parts
    if len(components) < 2 or not path.is_absolute():
        return path
    top_level = components[1]
    canonical = ALLOWED_FIRST_ABSOLUTE_SYMLINKS.get(top_level)
    if canonical is None:
        return path
    root = Path(path.anchor)
    alias = root / top_level
    try:
        is_symlink = stat.S_ISLNK(os.lstat(alias).st_mode)
        # The link is only read once it is known to be a symlink.
        points_at_canonical = is_symlink and normalized_link_target(root, os.readlink(alias)) == canonical
    except OSError:
        return path
    if not points_at_canonical:
        return path
    return canonical.joinpath(*components[2:])
421
+
422
+
423
def manifest_safe_read_supported() -> bool:
    """Report whether ``os`` offers ``O_NOFOLLOW`` and ``dir_fd``-relative ``os.open``."""
    if not hasattr(os, "O_NOFOLLOW"):
        return False
    dir_fd_capable = getattr(os, "supports_dir_fd", set())
    return os.open in dir_fd_capable
425
+
426
+
427
def manifest_directory_open_flags(*, follow_final: bool = False) -> int:
    """Build the ``os.open`` flags used to open a directory component.

    ``O_DIRECTORY`` and ``O_CLOEXEC`` are included when the platform defines
    them. ``O_NOFOLLOW`` is added unless ``follow_final`` is true.
    """
    mask = os.O_RDONLY
    for optional in ("O_DIRECTORY", "O_CLOEXEC"):
        mask |= getattr(os, optional, 0)
    return mask if follow_final else mask | os.O_NOFOLLOW
436
+
437
+
438
def manifest_file_open_flags() -> int:
    """Build the ``os.open`` flags used to open the manifest file itself.

    Always read-only with ``O_NOFOLLOW``; ``O_CLOEXEC``, ``O_NONBLOCK`` and
    ``O_NOCTTY`` are included when the platform defines them.
    """
    cloexec = getattr(os, "O_CLOEXEC", 0)
    nonblock = getattr(os, "O_NONBLOCK", 0)
    noctty = getattr(os, "O_NOCTTY", 0)
    return os.O_RDONLY | os.O_NOFOLLOW | cloexec | nonblock | noctty
443
+
444
+
445
def manifest_leaf_name(path: Path) -> str:
    """Return the final component of *path*.

    Raises ``PackError`` when that component is empty, ``"."`` or ``".."``.
    """
    leaf = path.name
    if leaf and leaf not in (".", ".."):
        return leaf
    raise PackError("manifest path must name a regular file")
450
+
451
+
452
def open_manifest_parent_no_follow(path: Path) -> int:
    """Open the directory containing *path*, descending one component at a time.

    Each intermediate component is opened relative to the previously opened
    directory descriptor using the no-follow directory flags, so a symlinked
    component makes the open fail instead of being traversed.

    Returns a directory file descriptor that the caller must close.

    Raises ``PackError`` when the platform lacks the required primitives, when
    the path contains ``..``, or when a component is not a directory.
    ``OSError`` raised by ``os.open``/``os.fstat`` propagates to the caller.
    """
    if not manifest_safe_read_supported():
        raise PackError("safe manifest reads require O_NOFOLLOW and dir_fd support")
    path = path.expanduser()
    # Reject ".." before normpath below has a chance to collapse it away.
    if any(part == ".." for part in path.parts):
        raise PackError("manifest path must not contain parent traversal")
    if path.is_absolute():
        path = normalize_allowed_first_absolute_symlink(Path(os.path.normpath(str(path))))
        # The root itself is opened without O_NOFOLLOW (follow_final=True).
        current_fd = os.open(path.anchor or os.sep, manifest_directory_open_flags(follow_final=True))
        # Drop the anchor (first part) and the leaf (last part).
        parts = path.parts[1:-1]
    else:
        path = Path(os.path.normpath(str(path)))
        current_fd = os.open(".", manifest_directory_open_flags())
        # Drop only the leaf; relative paths have no anchor component.
        parts = path.parts[:-1]
    try:
        for part in parts:
            if part in {"", "."}:
                continue
            if part == "..":
                raise PackError("manifest path must not contain parent traversal")
            next_fd = -1
            try:
                next_fd = os.open(part, manifest_directory_open_flags(), dir_fd=current_fd)
                if not stat.S_ISDIR(os.fstat(next_fd).st_mode):
                    raise PackError("manifest path must not traverse non-directory components")
            except (OSError, PackError):
                # Release the half-opened child before re-raising; the parent
                # descriptor is released by the outer ``finally``.
                if next_fd >= 0:
                    try:
                        os.close(next_fd)
                    except OSError:
                        pass
                raise
            # Step down: the child replaces the parent as the current directory.
            os.close(current_fd)
            current_fd = next_fd
        # Hand ownership to the caller; resetting current_fd to -1 keeps the
        # ``finally`` clause from closing the descriptor being returned.
        owned_fd = current_fd
        current_fd = -1
        return owned_fd
    finally:
        if current_fd >= 0:
            try:
                os.close(current_fd)
            except OSError:
                pass
495
+
496
+
497
def read_manifest_bytes_no_follow(path: Path) -> bytes:
    """Read the manifest at *path* through no-follow opens and return its bytes.

    The parent directory is opened with ``open_manifest_parent_no_follow`` and
    the leaf is then opened relative to it with ``manifest_file_open_flags``.

    Raises ``PackError`` when the leaf is not a regular file, when the content
    exceeds ``MAX_MANIFEST_BYTES``, or when any ``OSError`` occurs while
    opening or reading.
    """
    # -1 marks "not opened" so the ``finally`` clause knows what to release.
    parent_fd = -1
    fd = -1
    try:
        leaf = manifest_leaf_name(path.expanduser())
        parent_fd = open_manifest_parent_no_follow(path)
        fd = os.open(leaf, manifest_file_open_flags(), dir_fd=parent_fd)
        st = os.fstat(fd)
        if not stat.S_ISREG(st.st_mode):
            raise PackError("manifest must be a regular file")
        if st.st_size > MAX_MANIFEST_BYTES:
            raise PackError(f"manifest exceeds trusted size cap: {st.st_size} > {MAX_MANIFEST_BYTES}")
        chunks: list[bytes] = []
        # Read at most one byte past the cap: enough to detect content larger
        # than the size reported by fstat without reading it all.
        remaining = MAX_MANIFEST_BYTES + 1
        while remaining > 0:
            chunk = os.read(fd, min(64 * 1024, remaining))
            if not chunk:
                break
            chunks.append(chunk)
            remaining -= len(chunk)
        raw = b"".join(chunks)
        if len(raw) > MAX_MANIFEST_BYTES:
            raise PackError(f"manifest exceeds trusted size cap: {len(raw)} > {MAX_MANIFEST_BYTES}")
        return raw
    except PackError:
        # Pass PackError through untouched so it is not re-wrapped below.
        raise
    except OSError as exc:
        raise PackError(f"could not read manifest: {exc.strerror or exc.__class__.__name__}") from exc
    finally:
        # Best-effort cleanup; errors from close are deliberately ignored.
        if fd >= 0:
            try:
                os.close(fd)
            except OSError:
                pass
        if parent_fd >= 0:
            try:
                os.close(parent_fd)
            except OSError:
                pass
536
+
537
+
538
+ def read_manifest(path: Path) -> list[SourceSpec]:
539
+ raw = read_manifest_bytes_no_follow(path)
352
540
  try:
353
541
  data = json.loads(raw.decode("utf-8"))
354
542
  except (UnicodeDecodeError, json.JSONDecodeError) as exc:
@@ -582,19 +770,15 @@ def resolve_source(root: Path, spec: SourceSpec) -> tuple[ResolvedSource | None,
582
770
  return None, omission(spec, reason, path=display, redacted_path=redacted_path)
583
771
  try:
584
772
  with handle:
585
- raw_text = handle.read()
773
+ requested = spec.lines
774
+ selected, total_lines, redacted_lines = sanitize_source_lines(handle, requested)
586
775
  except OSError:
587
776
  return None, omission(spec, "unsafe_path", path=display, redacted_path=redacted_path)
588
- sanitized, redacted_lines = sanitize_text(raw_text)
589
- all_lines = sanitized.splitlines(True)
590
- if not all_lines:
777
+ if total_lines <= 0:
591
778
  return None, omission(spec, "empty_source", path=display, redacted_path=redacted_path)
592
- total_lines = len(all_lines)
593
- requested = spec.lines or LineRange(1, total_lines)
779
+ requested = requested or LineRange(1, total_lines)
594
780
  if requested.start > total_lines:
595
781
  return None, omission(spec, "empty_source", path=display, redacted_path=redacted_path)
596
- end = min(requested.end, total_lines)
597
- selected = all_lines[requested.start - 1:end]
598
782
  if not selected:
599
783
  return None, omission(spec, "empty_source", path=display, redacted_path=redacted_path)
600
784
  return ResolvedSource(
@@ -645,7 +829,11 @@ def retrieval_for(root_arg: str, display_path: str, lines: LineRange, *, redacte
645
829
  return retrieval_cli(safe_root, display_path, lines), None
646
830
 
647
831
 
648
- def render_block(source: ResolvedSource, lines: list[str], *, root_arg: str, status: str, included: LineRange) -> str:
832
# Fence text emitted between a block's header and its body (BLOCK_OPEN) and
# after the body (BLOCK_CLOSE). Both render_block and render_block_byte_len
# use these constants.
BLOCK_OPEN = "\n\n```text\n"
BLOCK_CLOSE = "```\n\n"
834
+
835
+
836
+ def render_block_header(source: ResolvedSource, *, root_arg: str, status: str, included: LineRange) -> str:
649
837
  title = source.spec.label or source.display_path
650
838
  requested = source.requested_lines or LineRange(1, source.total_lines)
651
839
  retrieval, retrieval_omitted_reason = retrieval_for(root_arg, source.display_path, included, redacted_path=source.redacted_path)
@@ -661,7 +849,11 @@ def render_block(source: ResolvedSource, lines: list[str], *, root_arg: str, sta
661
849
  header.append(f"Retrieval: `{retrieval}`")
662
850
  elif retrieval_omitted_reason:
663
851
  header.append(f"Retrieval omitted: {retrieval_omitted_reason}")
664
- return "\n".join(header) + "\n\n```text\n" + "".join(lines) + ("" if not lines or lines[-1].endswith("\n") else "\n") + "```\n\n"
852
+ return "\n".join(header)
853
+
854
+
855
def render_block(source: ResolvedSource, lines: list[str], *, root_arg: str, status: str, included: LineRange) -> str:
    """Render a source block: header, opening fence, body lines, closing fence.

    A newline is appended after the body when the last line does not already
    end with one, so the closing fence always starts on its own line.
    """
    heading = render_block_header(source, root_arg=root_arg, status=status, included=included)
    body = "".join(lines)
    needs_newline = bool(lines) and not lines[-1].endswith("\n")
    terminator = "\n" if needs_newline else ""
    return heading + BLOCK_OPEN + body + terminator + BLOCK_CLOSE
665
857
 
666
858
 
667
859
  def source_metadata(source: ResolvedSource, *, status: str, lines: list[str], included: LineRange, root_arg: str) -> dict[str, Any]:
@@ -701,21 +893,63 @@ def budget_omission(source: ResolvedSource, *, root_arg: str) -> dict[str, Any]:
701
893
  return item
702
894
 
703
895
 
704
- def fit_partial_lines(source: ResolvedSource, remaining: int, *, root_arg: str) -> tuple[list[str], str | None, LineRange | None]:
896
def included_range_for_line_count(source: ResolvedSource, line_count: int) -> LineRange:
    """Return the line range covering the first *line_count* selected lines.

    The range starts at the source's requested start line, or at line 1 when
    no explicit range was requested.
    """
    window = source.requested_lines
    first = window.start if window else 1
    return LineRange(first, first + line_count - 1)
899
+
900
+
901
def line_byte_prefixes(lines: list[str]) -> list[int]:
    """Return cumulative byte totals for *lines*.

    Entry ``i`` holds the combined ``byte_len`` of the first ``i`` lines, so
    the list has ``len(lines) + 1`` entries and begins with ``0``.
    """
    prefixes = [0]
    for line in lines:
        prefixes.append(prefixes[-1] + byte_len(line))
    return prefixes
908
+
909
+
910
def render_block_byte_len(
    source: ResolvedSource,
    line_count: int,
    line_prefixes: list[int],
    *,
    root_arg: str,
    status: str,
    included: LineRange,
) -> int:
    """Compute the byte size of a block of the first *line_count* lines without joining the body.

    ``line_prefixes`` supplies the cumulative body size. One extra byte is
    counted when the last included line lacks a trailing newline, which is
    the case in which ``render_block`` appends one.
    """
    payload = line_prefixes[line_count]
    if line_count > 0:
        last_line = source.selected_lines[line_count - 1]
        if not last_line.endswith("\n"):
            payload += 1
    heading = render_block_header(source, root_arg=root_arg, status=status, included=included)
    fence = byte_len(BLOCK_OPEN) + byte_len(BLOCK_CLOSE)
    return byte_len(heading) + payload + fence
923
+
924
+
925
def fit_partial_lines(
    source: ResolvedSource,
    remaining: int,
    *,
    root_arg: str,
    line_prefixes: list[int] | None = None,
) -> tuple[list[str], str | None, LineRange | None]:
    """Find the longest prefix of the selected lines whose rendered block fits in *remaining* bytes.

    The line count is located by binary search over
    ``render_block_byte_len``; ``line_prefixes`` may be supplied to reuse
    already computed cumulative sizes.

    Returns ``(lines, rendered_block, included_range)``, or
    ``([], None, None)`` when nothing fits or there is nothing to include.
    """
    if remaining <= 0 or not source.selected_lines:
        return [], None, None
    lines = source.selected_lines
    prefixes = line_byte_prefixes(lines) if line_prefixes is None else line_prefixes

    def fits(count: int) -> bool:
        window = included_range_for_line_count(source, count)
        size = render_block_byte_len(source, count, prefixes, root_arg=root_arg, status="partial", included=window)
        return size <= remaining

    fit_count = 0
    lo, hi = 1, len(lines)
    while lo <= hi:
        probe = (lo + hi) // 2
        if fits(probe):
            fit_count = probe
            lo = probe + 1
        else:
            hi = probe - 1
    if fit_count <= 0:
        return [], None, None
    chosen = lines[:fit_count]
    window = included_range_for_line_count(source, fit_count)
    return chosen, render_block(source, chosen, root_arg=root_arg, status="partial", included=window), window
720
954
 
721
955
 
@@ -988,17 +1222,17 @@ def build_pack(root: Path, specs: list[SourceSpec], *, budget_bytes: int, root_a
988
1222
  parts.append(header)
989
1223
  current_pack_bytes += header_bytes
990
1224
  for source in resolved:
991
- start_line = source.requested_lines.start if source.requested_lines else 1
992
- included_range = LineRange(start_line, start_line + len(source.selected_lines) - 1)
993
- full_block = render_block(source, source.selected_lines, root_arg=root_arg, status="included", included=included_range)
994
- full_block_bytes = byte_len(full_block)
1225
+ line_prefixes = line_byte_prefixes(source.selected_lines)
1226
+ included_range = included_range_for_line_count(source, len(source.selected_lines))
1227
+ full_block_bytes = render_block_byte_len(source, len(source.selected_lines), line_prefixes, root_arg=root_arg, status="included", included=included_range)
995
1228
  remaining = budget_bytes - current_pack_bytes
996
1229
  if full_block_bytes <= remaining:
1230
+ full_block = render_block(source, source.selected_lines, root_arg=root_arg, status="included", included=included_range)
997
1231
  parts.append(full_block)
998
1232
  current_pack_bytes += full_block_bytes
999
1233
  included.append(source_metadata(source, status="included", lines=source.selected_lines, included=included_range, root_arg=root_arg))
1000
1234
  continue
1001
- partial_lines, partial_block, partial_range = fit_partial_lines(source, remaining, root_arg=root_arg)
1235
+ partial_lines, partial_block, partial_range = fit_partial_lines(source, remaining, root_arg=root_arg, line_prefixes=line_prefixes)
1002
1236
  if partial_block is not None and partial_range is not None:
1003
1237
  parts.append(partial_block)
1004
1238
  current_pack_bytes += byte_len(partial_block)
@@ -1271,19 +1505,81 @@ def collect_output_candidates(
1271
1505
 
1272
1506
 
1273
1507
  def git_ls_files(root: Path) -> list[str]:
1508
+ def read_stdout_capped(proc: subprocess.Popen[bytes], limit: int, timeout_seconds: float) -> tuple[bytes, bool]:
1509
+ if proc.stdout is None:
1510
+ return b"", False
1511
+ chunks: list[bytes] = []
1512
+ total = 0
1513
+ capped = False
1514
+ timed_out = False
1515
+
1516
+ def reader() -> None:
1517
+ nonlocal total, capped
1518
+ try:
1519
+ while total <= limit:
1520
+ chunk = proc.stdout.read(min(GIT_LS_FILES_READ_CHUNK_BYTES, limit + 1 - total))
1521
+ if not chunk:
1522
+ break
1523
+ chunks.append(chunk)
1524
+ total += len(chunk)
1525
+ if total > limit:
1526
+ capped = True
1527
+ break
1528
+ finally:
1529
+ if capped and proc.poll() is None:
1530
+ try:
1531
+ proc.terminate()
1532
+ except OSError:
1533
+ pass
1534
+ try:
1535
+ proc.stdout.close()
1536
+ except OSError:
1537
+ pass
1538
+
1539
+ thread = threading.Thread(target=reader, daemon=True)
1540
+ thread.start()
1541
+ thread.join(timeout_seconds)
1542
+ if thread.is_alive() and proc.poll() is None:
1543
+ timed_out = True
1544
+ try:
1545
+ proc.kill()
1546
+ except OSError:
1547
+ pass
1548
+ try:
1549
+ proc.wait(timeout=2)
1550
+ except subprocess.TimeoutExpired:
1551
+ try:
1552
+ proc.kill()
1553
+ except OSError:
1554
+ pass
1555
+ try:
1556
+ proc.wait(timeout=2)
1557
+ except subprocess.TimeoutExpired:
1558
+ pass
1559
+ thread.join(0.2)
1560
+ raw_output = b"".join(chunks)[:limit]
1561
+ complete = proc.returncode == 0 and not capped and not timed_out and raw_output.endswith(b"\0")
1562
+ return raw_output, complete
1563
+
1564
+ raw = b""
1565
+ git_returncode: int | None = None
1274
1566
  try:
1275
- proc = subprocess.run(
1567
+ proc = subprocess.Popen(
1276
1568
  ["git", "-C", str(root), "ls-files", "-z"],
1569
+ stdout=subprocess.PIPE,
1570
+ stderr=subprocess.DEVNULL,
1277
1571
  text=False,
1278
- capture_output=True,
1279
- timeout=10,
1280
- check=False,
1281
1572
  )
1573
+ raw, _git_complete = read_stdout_capped(proc, MAX_GIT_LS_FILES_OUTPUT_BYTES, 10)
1574
+ git_returncode = proc.returncode
1282
1575
  except (OSError, subprocess.TimeoutExpired):
1283
1576
  proc = None
1284
- if proc is not None and proc.returncode == 0:
1285
- raw = proc.stdout[: MAX_QUERY_SCAN_FILES * 512]
1577
+ if raw:
1578
+ if not raw.endswith(b"\0"):
1579
+ raw = raw.rsplit(b"\0", 1)[0] if b"\0" in raw else b""
1286
1580
  return [part.decode("utf-8", "replace") for part in raw.split(b"\0") if part][:MAX_QUERY_SCAN_FILES]
1581
+ if git_returncode == 0 or (git_returncode is not None and git_returncode < 0):
1582
+ return []
1287
1583
  out: list[str] = []
1288
1584
  skip_dirs = {".git", ".omx", ".context-guard", "node_modules", "dist", "build", "__pycache__"}
1289
1585
  for current, dirs, files in os.walk(root):
@@ -1358,7 +1654,8 @@ def source_selected_range(source: ResolvedSource) -> LineRange:
1358
1654
 
1359
1655
def resolved_block_bytes(source: ResolvedSource, *, root_arg: str) -> int:
    """Return the byte size of the fully included block for *source*."""
    window = source_selected_range(source)
    lines = source.selected_lines
    return render_block_byte_len(
        source,
        len(lines),
        line_byte_prefixes(lines),
        root_arg=root_arg,
        status="included",
        included=window,
    )
1362
1659
 
1363
1660
 
1364
1661
  def manifest_source_for_candidate(source: ResolvedSource, *, priority: int, label: str | None) -> dict[str, Any]:
@@ -1638,6 +1935,328 @@ def suggest_build_hint(root_arg: str, manifest_path: str | None, budget: int) ->
1638
1935
  return f"cd {shlex.quote(safe_root)} && {command}", None
1639
1936
 
1640
1937
 
1938
def percentile_int(values: list[int], numerator: int, denominator: int) -> int:
    """Pick the element at the ``numerator / denominator`` position of *values*.

    The position is ``(len(values) - 1) * numerator // denominator`` clamped
    to a valid index. Returns ``0`` for an empty list and the first element
    when ``denominator`` is not positive. The list is indexed as given, so
    callers presumably pass it already ordered (not verified here).
    """
    if not values:
        return 0
    if denominator <= 0:
        return values[0]
    last = len(values) - 1
    position = last * numerator // denominator
    position = max(0, position)
    position = min(last, position)
    return values[position]
1945
+
1946
+
1947
def score_gap_advice(scores: list[int], requested_top: int) -> tuple[int, dict[str, Any], list[str]]:
    """Suggest a cut-off rank from the largest drop between adjacent scores.

    Returns ``(elbow_k, max_gap_details, reason_codes)``. The largest drop is
    treated as an elbow when it is at least ``max(250, top_score // 5)``;
    otherwise the advice falls back to
    ``min(MAX_SUGGEST_TOP, len(scores))``.
    """
    if not scores:
        return 0, {"after_rank": 0, "delta": 0, "ratio": 0.0}, ["no_candidates"]
    if len(scores) == 1:
        return 1, {"after_rank": 1, "delta": 0, "ratio": 0.0}, ["single_candidate"]
    # Drop between each score and its successor; increases count as 0.
    drops = [max(0, upper - lower) for upper, lower in zip(scores, scores[1:])]
    largest = max(drops)
    cut = drops.index(largest)
    leader = max(1, scores[0])
    details = {"after_rank": cut + 1, "delta": largest, "ratio": round(largest / leader, 4)}
    if largest >= max(250, leader // 5):
        elbow = cut + 1
        label = "score_elbow" if elbow <= requested_top else "score_elbow_after_requested_top"
        return max(1, elbow), details, [label]
    fallback = min(MAX_SUGGEST_TOP, len(scores))
    return max(1, fallback), details, ["no_strong_score_elbow"]
1964
+
1965
+
1966
def clamp_proxy(value: float) -> float:
    """Round *value* to four decimal places and clamp it into ``[0.0, 1.0]``."""
    rounded = round(value, 4)
    floored = max(0.0, rounded)
    return min(1.0, floored)
1968
+
1969
+
1970
def adaptive_policy_recommended_k(
    *,
    policy: str,
    requested_top: int,
    score_elbow_k: int,
    budget_fit_k: int,
    candidate_count: int,
) -> int:
    """Combine the policy preference with the budget and candidate limits.

    ``"recall"`` takes the larger of the requested top and the score elbow,
    ``"precision"`` the smaller, and any other policy uses the score elbow.
    The result never exceeds ``budget_fit_k``, the candidate count or
    ``MAX_SUGGEST_TOP``. Returns ``0`` when there are no candidates or
    nothing fits the budget.
    """
    ceiling = min(max(0, candidate_count), MAX_SUGGEST_TOP)
    if ceiling == 0 or budget_fit_k <= 0:
        return 0
    preference_by_policy = {
        "recall": max(requested_top, score_elbow_k),
        "precision": min(score_elbow_k, requested_top),
    }
    wanted = preference_by_policy.get(policy, score_elbow_k)
    # budget_fit_k is known to be positive at this point.
    return min(max(0, wanted), budget_fit_k, ceiling)
1988
+
1989
+
1990
def adaptive_path_label(value: object) -> str:
    """Return a display label for a path value.

    Values matching the control-character or secret patterns are replaced by
    a ``redacted-path#path:<hash prefix>`` label. Values that ``lexical_rel``
    rejects are labelled through ``safe_raw_path_label``. Everything else is
    shown through ``display_rel_path``.
    """
    text = "" if value is None else str(value)
    sensitive_patterns = (CONTROL_CHAR_RE, SECRET_CONTENT_RE, SECRET_PATH_COMPONENT_RE)
    if any(pattern.search(text) for pattern in sensitive_patterns):
        return f"redacted-path#path:{sha256_text(text)[:12]}"
    relative, _why = lexical_rel(text)
    if relative is None:
        return safe_raw_path_label(text)
    shown, _was_redacted = display_rel_path(relative.as_posix())
    return shown
1999
+
2000
+
2001
+ def actionable_adaptive_path(value: object) -> tuple[str | None, str | None]:
2002
+ raw = "" if value is None else str(value)
2003
+ if not raw:
2004
+ return None, "missing_path"
2005
+ if REDACTED_PATH_COMPONENT in raw or "[REDACTED" in raw:
2006
+ return None, "redacted_path"
2007
+ if CONTROL_CHAR_RE.search(raw) or SECRET_CONTENT_RE.search(raw) or SECRET_PATH_COMPONENT_RE.search(raw):
2008
+ return None, "unsafe_path"
2009
+ rel, reason = lexical_rel(raw)
2010
+ if rel is None:
2011
+ return None, reason or "unsafe_path"
2012
+ return rel.as_posix(), None
2013
+
2014
+
2015
+ def adaptive_lines(value: object) -> dict[str, int] | None:
2016
+ if not isinstance(value, dict):
2017
+ return None
2018
+ try:
2019
+ start = int(value.get("start"))
2020
+ end = int(value.get("end"))
2021
+ except (TypeError, ValueError, OverflowError):
2022
+ return None
2023
+ if start < 1 or end < start:
2024
+ return None
2025
+ return {"start": start, "end": end}
2026
+
2027
+
2028
def adaptive_retrieval_hint(item: dict[str, Any]) -> dict[str, Any]:
    """Build a structured slice hint for *item*, or explain why none is available.

    A hint is available only when the path is actionable, a valid line range
    is present under ``lines``, ``included_lines`` or ``requested_lines``,
    and the item carries a truthy ``retrieval_cli``. Otherwise the result has
    ``available`` set to ``False`` and a ``reason``; the item's own
    ``retrieval_omitted_reason`` takes precedence for path and CLI failures.
    """
    path, path_problem = actionable_adaptive_path(item.get("path"))
    span = adaptive_lines(item.get("lines") or item.get("included_lines") or item.get("requested_lines"))
    stated_reason = item.get("retrieval_omitted_reason")

    failure: str | None = None
    if path_problem:
        failure = str(stated_reason or path_problem)
    elif span is None:
        failure = "missing_lines"
    elif not item.get("retrieval_cli"):
        failure = str(stated_reason or "missing_retrieval_hint")

    if failure is not None:
        return {"type": "slice", "available": False, "reason": failure}
    return {"type": "slice", "available": True, "path": path, "lines": span}
2039
+
2040
+
2041
def adaptive_selected_evidence(selected: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Summarize the leading selected items as ranked evidence records.

    At most ``MAX_ADAPTIVE_K_SELECTED_EVIDENCE`` items are reported. Each
    record carries rank, path label, non-negative score (falling back to
    ``priority``), capped reason, retrieval hint and, when valid, ``lines``.
    """

    def describe(rank: int, item: dict[str, Any]) -> dict[str, Any]:
        record: dict[str, Any] = {
            "rank": rank,
            "path": adaptive_path_label(item.get("path")),
            "score": max(0, int(item.get("score", item.get("priority", 0)) or 0)),
            "reason": cap_label(item.get("reason"), default="local heuristic", limit=MAX_REASON_CHARS),
            "retrieval_hint": adaptive_retrieval_hint(item),
        }
        span = adaptive_lines(item.get("lines"))
        if span is not None:
            record["lines"] = span
        return record

    leading = selected[:MAX_ADAPTIVE_K_SELECTED_EVIDENCE]
    return [describe(rank, item) for rank, item in enumerate(leading, start=1)]
2056
+
2057
+
2058
def adaptive_omitted_evidence(omitted: list[dict[str, Any]]) -> dict[str, Any]:
    """Summarize omitted items: per-reason counts plus a capped list of sources.

    Every omitted item contributes to the reason tally. Only the first
    ``MAX_ADAPTIVE_K_OMITTED_EVIDENCE`` items are described individually.
    Reason counts are ordered by descending count, then by reason, and capped
    at ``MAX_ADAPTIVE_K_REASON_COUNTS``.
    """

    def describe(item: dict[str, Any], reason: str) -> dict[str, Any]:
        record: dict[str, Any] = {
            "path": adaptive_path_label(item.get("path")),
            "reason": reason,
            "priority": max(0, int(item.get("priority", 0) or 0)),
        }
        span = adaptive_lines(item.get("requested_lines") or item.get("lines"))
        if span is not None:
            record["lines"] = span
        hint = adaptive_retrieval_hint(item)
        # Attach the hint when usable, or when its failure reason is path-related.
        if hint.get("available") or hint.get("reason") in {"redacted_path", "unsafe_root_path", "unsafe_path"}:
            record["retrieval_hint"] = hint
        return record

    tally: dict[str, int] = {}
    sources: list[dict[str, Any]] = []
    for item in omitted:
        reason = cap_label(item.get("reason"), default="unknown", limit=MAX_REASON_CHARS) or "unknown"
        tally[reason] = tally.get(reason, 0) + 1
        if len(sources) < MAX_ADAPTIVE_K_OMITTED_EVIDENCE:
            sources.append(describe(item, reason))

    ranked = sorted(tally.items(), key=lambda entry: (-entry[1], entry[0]))
    counts = [{"reason": reason, "count": count} for reason, count in ranked[:MAX_ADAPTIVE_K_REASON_COUNTS]]
    return {
        "omitted_count": len(omitted),
        "sources_capped": len(omitted) > len(sources),
        "sources": sources,
        "reason_counts": counts,
    }
2088
+
2089
+
2090
def adaptive_source_verification(selected: list[dict[str, Any]]) -> dict[str, Any]:
    """Report retrieval hints for verifying the leading selected items.

    At most ``MAX_ADAPTIVE_K_VERIFICATION_HINTS`` items are examined. The
    result counts how many of those hints are available and flags whether
    the hint list was capped relative to ``selected``.
    """

    def describe(rank: int, item: dict[str, Any]) -> dict[str, Any]:
        hint = adaptive_retrieval_hint(item)
        return {
            "rank": rank,
            "path": adaptive_path_label(item.get("path")),
            "retrieval_hint": hint,
        }

    considered = selected[:MAX_ADAPTIVE_K_VERIFICATION_HINTS]
    hints = [describe(rank, item) for rank, item in enumerate(considered, start=1)]
    usable = sum(1 for record in hints if record["retrieval_hint"].get("available"))
    return {
        "requires_exact_source_before_edits": True,
        "format": "structured_relative_slice_hints",
        "selected_count": len(selected),
        "hint_count": len(hints),
        "hints_capped": len(selected) > len(hints),
        "available_hint_count": usable,
        "omitted_hint_count": len(hints) - usable,
        "hints": hints,
    }
2113
+
2114
+
2115
def build_adaptive_k_advisory(
    *,
    candidates: list[SuggestCandidate],
    selected: list[dict[str, Any]],
    omitted: list[dict[str, Any]],
    requested_top: int,
    budget_bytes: int,
    estimated_pack_bytes: int,
    policy: str = "balanced",
    min_recall_proxy: float = 0.0,
    min_precision_proxy: float = 0.0,
) -> dict[str, Any]:
    """Assemble the advisory-only adaptive top-k payload.

    The payload is computed from the current selection; the hard-coded
    ``apply: False`` and ``changes_manifest_or_pack: False`` fields mark it as
    informational only.

    Args:
        candidates: Ranked candidates; only the first
            ``MAX_ADAPTIVE_K_SCORE_SAMPLES`` scores are analysed.
        selected: Sources currently selected for the pack.
        omitted: Omission records; ``budget_exhausted`` reasons are counted.
        requested_top: The top-k value that was requested.
        budget_bytes: Byte budget of the pack.
        estimated_pack_bytes: Estimated byte size of the current pack.
        policy: Recommendation policy; names outside ``ADAPTIVE_K_POLICIES``
            fall back to ``"balanced"``.
        min_recall_proxy: Minimum recall proxy for the regression gate.
        min_precision_proxy: Minimum precision proxy for the regression gate.

    Returns:
        The advisory dict (recommendation, score distribution, budget fit,
        regression gates, proxies, evidence and claim boundary).
    """
    measurement_basis = "current_selected_sources_not_policy_applied_rebuild"
    policy_name = policy if policy in ADAPTIVE_K_POLICIES else "balanced"
    candidate_total = len(candidates)
    chosen_total = len(selected)

    # Score statistics are taken over a capped prefix of the candidate list.
    sample = candidates[:MAX_ADAPTIVE_K_SCORE_SAMPLES]
    sample_capped = candidate_total > len(sample)
    sample_scores = [max(0, int(candidate.score)) for candidate in sample]
    elbow_k, gap_details, codes = score_gap_advice(sample_scores, requested_top)

    chosen_mass = sum(
        max(0, int(source.get("score", source.get("priority", 0)) or 0))
        for source in selected
    )
    sample_mass = sum(sample_scores)

    exhausted_total = sum(1 for omission in omitted if omission.get("reason") == "budget_exhausted")
    is_budget_limited = bool(exhausted_total or estimated_pack_bytes > budget_bytes)
    headroom_bytes = budget_bytes - estimated_pack_bytes
    mean_chosen_bytes = int(estimated_pack_bytes / chosen_total) if chosen_total else 0

    if is_budget_limited:
        codes.append("budget_limited")
    if sample_capped:
        codes.append("candidate_sample_capped")
    if chosen_total < min(requested_top, candidate_total):
        codes.append("selected_below_requested_top")

    if chosen_total == 0:
        fit_k = 0
        if candidates:
            codes.append("no_budget_fit" if is_budget_limited else "no_selected_sources")
    elif is_budget_limited:
        fit_k = chosen_total
    else:
        # Extrapolate how many more average-sized sources fit in the headroom.
        extra_sources = max(0, headroom_bytes // max(1, mean_chosen_bytes))
        fit_k = min(MAX_SUGGEST_TOP, candidate_total, chosen_total + extra_sources)
        if fit_k > requested_top:
            codes.append("budget_headroom_expand")

    if candidates:
        recommended = adaptive_policy_recommended_k(
            policy=policy_name,
            requested_top=requested_top,
            score_elbow_k=elbow_k,
            budget_fit_k=fit_k,
            candidate_count=candidate_total,
        )
    else:
        recommended = 0

    ascending = sorted(sample_scores)
    best_score = ascending[-1] if ascending else 0
    lowest_score = ascending[0] if ascending else 0

    recall = clamp_proxy(chosen_mass / sample_mass) if sample_mass else 0.0
    if chosen_total:
        precision = clamp_proxy((chosen_mass / max(1, chosen_total)) / max(1, best_score))
    else:
        precision = 0.0
    recall_ok = recall >= min_recall_proxy
    precision_ok = precision >= min_precision_proxy
    gates = "pass" if recall_ok and precision_ok else "failed"

    return {
        "schema_version": ADAPTIVE_K_SCHEMA_VERSION,
        "mode": "advisory",
        "requested_top": requested_top,
        "recommended_k": recommended,
        "policy": {
            "name": policy_name,
            "available_policies": list(ADAPTIVE_K_POLICIES),
            "changes_manifest_or_pack": False,
            "measurement_basis": measurement_basis,
            "status": "evaluated",
        },
        "recommendation": {
            "apply": False,
            "reason_codes": sorted(set(codes)),
            "next_step": "rerun with --top recommended_k if you accept this local proxy advisory",
        },
        "score_distribution": {
            "candidate_count": candidate_total,
            "analyzed_candidate_count": len(sample),
            "sample_capped": sample_capped,
            "top_score": best_score,
            "p50_score": percentile_int(ascending, 1, 2),
            "p90_score": percentile_int(ascending, 9, 10),
            "min_score": lowest_score,
            "max_gap_details": gap_details,
            "score_elbow_k": elbow_k,
        },
        "budget_fit": {
            "budget_bytes": budget_bytes,
            "estimated_pack_bytes": estimated_pack_bytes,
            "remaining_bytes": headroom_bytes,
            "selected_count": chosen_total,
            "budget_omitted_count": exhausted_total,
            "budget_limited": is_budget_limited,
            "average_selected_bytes": mean_chosen_bytes,
            "budget_fit_k": fit_k,
        },
        "regression_gates": {
            "status": gates,
            "measurement_basis": measurement_basis,
            "comparison": "observed_greater_than_or_equal_threshold",
            "recall_proxy": {
                "observed": recall,
                "minimum": min_recall_proxy,
                "passed": recall_ok,
            },
            "precision_proxy": {
                "observed": precision,
                "minimum": min_precision_proxy,
                "passed": precision_ok,
            },
        },
        "recall_precision_proxy": {
            "measurement": "local_score_mass_proxy",
            "range": "clamped_0_1",
            "measurement_basis": measurement_basis,
            "selected_score_mass": chosen_mass,
            "analyzed_score_mass": sample_mass,
            "recall_proxy": recall,
            "precision_proxy": precision,
            "selected_count": chosen_total,
            "candidate_count": candidate_total,
        },
        "selected_evidence": {
            "selected_count": chosen_total,
            "items_capped": chosen_total > MAX_ADAPTIVE_K_SELECTED_EVIDENCE,
            "items": adaptive_selected_evidence(selected),
        },
        "omitted_evidence": adaptive_omitted_evidence(omitted),
        "source_verification": adaptive_source_verification(selected),
        "claim_boundary": {
            "deterministic_local_only": True,
            "no_model_network_or_embedding": True,
            "token_counts_are_estimated_proxies": True,
            "provider_token_or_cost_savings_claim_allowed": False,
            "advisory_does_not_change_manifest_or_pack": True,
            "selectable_policy_changes_manifest_or_pack": False,
        },
    }
2258
+
2259
+
1641
2260
  def suggest_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tuple[dict[str, Any], int]:
1642
2261
  query_text, _query_redactions = sanitize_text(args.query or "")
1643
2262
  query = " ".join(query_text.split())
@@ -1713,11 +2332,19 @@ def suggest_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tupl
1713
2332
  })
1714
2333
  continue
1715
2334
  final_seen.add(final_identity)
1716
- source_bytes = resolved_block_bytes(source, root_arg=root_arg)
2335
+ line_prefixes = line_byte_prefixes(source.selected_lines)
2336
+ source_bytes = render_block_byte_len(
2337
+ source,
2338
+ len(source.selected_lines),
2339
+ line_prefixes,
2340
+ root_arg=root_arg,
2341
+ status="included",
2342
+ included=source_selected_range(source),
2343
+ )
1717
2344
  remaining = budget - current_bytes
1718
2345
  if source_bytes > remaining:
1719
2346
  if not selected and remaining > 0:
1720
- partial_lines, _partial_block, partial_range = fit_partial_lines(source, remaining, root_arg=root_arg)
2347
+ partial_lines, _partial_block, partial_range = fit_partial_lines(source, remaining, root_arg=root_arg, line_prefixes=line_prefixes)
1721
2348
  if partial_range is not None and partial_lines:
1722
2349
  partial_spec = SourceSpec(
1723
2350
  path=candidate.path,
@@ -1734,7 +2361,15 @@ def suggest_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tupl
1734
2361
  omitted.append(omitted_item)
1735
2362
  continue
1736
2363
  assert source is not None
1737
- source_bytes = resolved_block_bytes(source, root_arg=root_arg)
2364
+ partial_prefixes = line_byte_prefixes(source.selected_lines)
2365
+ source_bytes = render_block_byte_len(
2366
+ source,
2367
+ len(source.selected_lines),
2368
+ partial_prefixes,
2369
+ root_arg=root_arg,
2370
+ status="included",
2371
+ included=source_selected_range(source),
2372
+ )
1738
2373
  else:
1739
2374
  omitted.append({"path": source.display_path, "status": "omitted", "reason": "budget_exhausted", "priority": candidate.score})
1740
2375
  continue
@@ -1780,6 +2415,18 @@ def suggest_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tupl
1780
2415
  }
1781
2416
  if build_hint_omitted_reason:
1782
2417
  payload["build_hint_omitted_reason"] = build_hint_omitted_reason
2418
+ if getattr(args, "adaptive_k", False):
2419
+ payload["adaptive_k"] = build_adaptive_k_advisory(
2420
+ candidates=candidates,
2421
+ selected=selected,
2422
+ omitted=omitted,
2423
+ requested_top=top,
2424
+ budget_bytes=budget,
2425
+ estimated_pack_bytes=estimated_pack_bytes,
2426
+ policy=getattr(args, "adaptive_k_policy", "balanced"),
2427
+ min_recall_proxy=float(getattr(args, "adaptive_k_min_recall_proxy", 0.0) or 0.0),
2428
+ min_precision_proxy=float(getattr(args, "adaptive_k_min_precision_proxy", 0.0) or 0.0),
2429
+ )
1783
2430
  return payload, 0
1784
2431
 
1785
2432
 
@@ -1893,20 +2540,53 @@ def read_repo_map_text(root: Path, rel_path: str) -> tuple[dict[str, Any] | None
1893
2540
  }, None
1894
2541
 
1895
2542
 
1896
- def repo_map_records(root: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]:
2543
def repo_map_path_scan_priority(rel_path: str, *, seed_paths: set[str], query_terms: set[str], input_index: int) -> tuple[int, int, str]:
    """Return the sort key that decides which repo paths are scanned first.

    The key is ``(-score, input_index, display)``, so sorting ascending puts
    the highest-scoring paths first and breaks ties by original input order.

    Args:
        rel_path: Path as listed for the repository.
        seed_paths: Display paths that receive the largest bonus.
        query_terms: Terms passed to ``suggest_score_path`` for the path score.
        input_index: Position of ``rel_path`` in the input listing.
    """
    lexical, problem = lexical_rel(rel_path)
    label = repo_map_safe_raw_path_label(rel_path)
    is_redacted = False
    if lexical is not None and not problem:
        label, is_redacted = repo_map_display_rel_path(lexical.as_posix())

    # A redacted label never matches a seed path.
    seed_bonus = 1_000_000 if (not is_redacted and label in seed_paths) else 0
    text_bonus = 10_000 if is_repo_map_text_path(label) else 0
    query_bonus = suggest_score_path(label, query_terms)
    readme_names = {"readme", "readme.md", "readme.mdx"}
    readme_bonus = 250 if Path(label).name.lower() in readme_names else 0

    total = seed_bonus + text_bonus + query_bonus + readme_bonus
    return (-total, input_index, label)
2558
+
2559
+
2560
def repo_map_scan_paths(paths: list[str], *, seed_paths: set[str], query_terms: set[str]) -> list[str]:
    """Pick the paths to scan, best scan priority first.

    Only the first ``MAX_REPO_MAP_FILES`` input paths are considered; they are
    ordered by ``repo_map_path_scan_priority`` and the first
    ``MAX_REPO_MAP_SCAN_FILES`` of that ordering are returned.
    """
    decorated = [
        (
            repo_map_path_scan_priority(
                path,
                seed_paths=seed_paths,
                query_terms=query_terms,
                input_index=position,
            ),
            path,
        )
        for position, path in enumerate(paths[:MAX_REPO_MAP_FILES])
    ]
    # Sort on the priority tuple only; it embeds the input index, so the
    # ordering is total and deterministic.
    decorated.sort(key=lambda pair: pair[0])
    return [path for _priority, path in decorated[:MAX_REPO_MAP_SCAN_FILES]]
2566
+
2567
+
2568
+ def repo_map_records(root: Path, *, seed_paths: set[str], query_terms: set[str]) -> tuple[list[dict[str, Any]], list[dict[str, Any]], dict[str, Any]]:
1897
2569
  paths = git_ls_files(root)
2570
+ candidate_paths = paths[:MAX_REPO_MAP_FILES]
1898
2571
  path_cap_reached = len(paths) > MAX_REPO_MAP_FILES
2572
+ scan_paths = repo_map_scan_paths(candidate_paths, seed_paths=seed_paths, query_terms=query_terms)
2573
+ scan_cap_reached = len(candidate_paths) > len(scan_paths)
1899
2574
  records: list[dict[str, Any]] = []
1900
2575
  omitted: list[dict[str, Any]] = []
1901
- for rel_path in paths[:MAX_REPO_MAP_FILES]:
2576
+ for rel_path in scan_paths:
1902
2577
  record, omission_item = read_repo_map_text(root, rel_path)
1903
2578
  if record is not None:
1904
2579
  records.append(record)
1905
2580
  elif omission_item is not None and omission_item.get("reason") != "unsupported_file_type":
1906
2581
  omitted.append({key: value for key, value in omission_item.items() if value is not None})
1907
2582
  caps = {
1908
- "max_files": MAX_REPO_MAP_FILES,
1909
- "files_capped": path_cap_reached,
2583
+ "max_files": MAX_REPO_MAP_SCAN_FILES,
2584
+ "files_capped": path_cap_reached or scan_cap_reached,
2585
+ "max_candidate_files": MAX_REPO_MAP_FILES,
2586
+ "candidate_files": len(candidate_paths),
2587
+ "candidate_files_capped": path_cap_reached,
2588
+ "scan_files": len(scan_paths),
2589
+ "scan_files_capped": scan_cap_reached,
1910
2590
  "max_bytes_per_file": MAX_REPO_MAP_BYTES_PER_FILE,
1911
2591
  "bytes_per_file_capped_count": sum(1 for item in records if item.get("bytes_capped")),
1912
2592
  "max_tree_entries": MAX_REPO_MAP_TREE_ENTRIES,
@@ -2256,18 +2936,19 @@ def build_repo_map_payload(
2256
2936
  *,
2257
2937
  root_arg: str,
2258
2938
  ) -> dict[str, Any]:
2259
- records, omitted, caps = repo_map_records(root)
2939
+ query_terms = suggest_tokens(str(suggest_payload.get("query", "")))
2940
+ seed_paths = repo_map_seed_paths(args, suggest_payload, build_payload)
2941
+ records, omitted, caps = repo_map_records(root, seed_paths=seed_paths, query_terms=query_terms)
2260
2942
  record_by_path = {str(record["path"]): record for record in records}
2261
2943
  signatures = extract_signatures(records)
2262
2944
  secret_scan = build_secret_scan(records)
2263
2945
  edges = collect_import_edges(records)
2264
- query_terms = suggest_tokens(str(suggest_payload.get("query", "")))
2265
2946
  graph_rank = build_graph_rank(
2266
2947
  records,
2267
2948
  signatures,
2268
2949
  edges,
2269
2950
  query_terms=query_terms,
2270
- seed_paths=repo_map_seed_paths(args, suggest_payload, build_payload),
2951
+ seed_paths=seed_paths,
2271
2952
  secret_scan=secret_scan,
2272
2953
  )
2273
2954
  retrieval = repo_map_retrieval(record_by_path, signatures, graph_rank, root_arg=root_arg)
@@ -2312,6 +2993,90 @@ def build_repo_map_payload(
2312
2993
  }
2313
2994
 
2314
2995
 
2996
def line_identity_from_dict(value: object) -> str:
    """Return a ``"start:end"`` identity string for a line-range dict.

    Any non-dict value yields ``"all"``. Missing ``start``/``end`` keys are
    rendered as ``None`` in the result.
    """
    if isinstance(value, dict):
        first = value.get("start")
        last = value.get("end")
        return f"{first}:{last}"
    return "all"
3000
+
3001
+
3002
def build_symbol_memory_payload(repo_map: dict[str, Any]) -> dict[str, Any]:
    """Derive the advisory symbol-memory payload from a repo-map payload.

    Signature-index entries become ``symbols`` (capped by
    ``MAX_SYMBOL_MEMORY_ITEMS``) and graph-rank entries become
    ``graph_context`` (capped by ``MAX_SYMBOL_MEMORY_GRAPH_ITEMS``). A symbol
    copies the retrieval commands of the retrieval entry that has the same
    path and line identity, when there is one.

    Args:
        repo_map: Repo-map payload. The ``retrieval``, ``signature_index`` and
            ``graph_rank`` sections are used only when they are lists, and
            ``summary`` only when it is a dict; anything else (including an
            explicit ``None``) is treated as empty.

    Returns:
        The symbol-memory dict with summary counts, symbols, graph context,
        source-verification counters and the claim boundary.
    """

    def list_section(name: str) -> list[Any]:
        # ``dict.get(name, [])`` only falls back when the key is absent, so a
        # present-but-None section must be normalised explicitly.
        section = repo_map.get(name)
        return section if isinstance(section, list) else []

    retrieval_items = list_section("retrieval")
    summary = repo_map.get("summary")
    if not isinstance(summary, dict):
        summary = {}

    # Index retrieval hints by (path, line identity) for the symbol lookup.
    retrieval_by_path_lines: dict[tuple[str, str], dict[str, Any]] = {}
    for item in retrieval_items:
        if not isinstance(item, dict):
            continue
        path = str(item.get("path", ""))
        retrieval_by_path_lines[(path, line_identity_from_dict(item.get("lines")))] = item

    symbols: list[dict[str, Any]] = []
    for signature in list_section("signature_index"):
        if not isinstance(signature, dict):
            continue
        path = str(signature.get("path", ""))
        lines = copy.deepcopy(signature.get("lines"))
        matched = retrieval_by_path_lines.get((path, line_identity_from_dict(lines)))
        symbol: dict[str, Any] = {
            "path": path,
            "kind": signature.get("kind"),
            "name": signature.get("name"),
            "signature": signature.get("signature"),
            "line": signature.get("line"),
            "lines": lines,
            "source": "repo_map.signature_index",
            "exact_source_verification_required": True,
        }
        if isinstance(matched, dict):
            for key in ("slice_cli", "symbol_cli", "retrieval_omitted_reason"):
                if matched.get(key):
                    symbol[key] = matched[key]
        # Fields the signature entry did not provide are left out entirely.
        symbols.append({key: value for key, value in symbol.items() if value is not None})
        if len(symbols) >= MAX_SYMBOL_MEMORY_ITEMS:
            break

    graph_context: list[dict[str, Any]] = []
    for item in list_section("graph_rank"):
        if not isinstance(item, dict):
            continue
        graph_context.append({
            "path": item.get("path"),
            "score": item.get("score"),
            "components": copy.deepcopy(item.get("components", {})),
            "line_count": item.get("line_count"),
            "exact_source_verification_required": True,
        })
        if len(graph_context) >= MAX_SYMBOL_MEMORY_GRAPH_ITEMS:
            break

    symbols_with_hint = sum(1 for item in symbols if item.get("slice_cli") or item.get("symbol_cli"))
    return {
        "schema_version": SYMBOL_MEMORY_SCHEMA_VERSION,
        "mode": "advisory",
        "source": "contextguard.pack-repo-map.v1",
        "summary": {
            "symbols": len(symbols),
            "graph_context": len(graph_context),
            "files_scanned": int(summary.get("files_scanned", 0) or 0),
            "graph_edges": int(summary.get("graph_edges", 0) or 0),
            "retrieval_hints": len(retrieval_items),
        },
        "symbols": symbols,
        "graph_context": graph_context,
        "source_verification": {
            "requires_exact_source_before_edits": True,
            "verified_by": ["slice_cli", "symbol_cli"],
            "retrieval_hint_count": len(retrieval_items),
            "missing_retrieval_hint_count": max(0, len(symbols) - symbols_with_hint),
        },
        "claim_boundary": {
            "deterministic_local_only": True,
            "no_network_model_embedding_lsp_or_tree_sitter_dependency": True,
            "advisory_does_not_change_manifest_pack_or_receipt": True,
            "graph_rank_is_explain_only": True,
            "provider_token_or_cost_savings_claim_allowed": False,
        },
    }
3078
+
3079
+
2315
3080
  def build_auto_explain_payload(
2316
3081
  args: argparse.Namespace,
2317
3082
  suggest_payload: dict[str, Any],
@@ -2320,6 +3085,7 @@ def build_auto_explain_payload(
2320
3085
  *,
2321
3086
  root: Path | None = None,
2322
3087
  root_arg: str = ".",
3088
+ repo_map_payload: dict[str, Any] | None = None,
2323
3089
  ) -> dict[str, Any]:
2324
3090
  build_sources = [
2325
3091
  item
@@ -2447,7 +3213,9 @@ def build_auto_explain_payload(
2447
3213
  "raw_test_output_embedded": False,
2448
3214
  },
2449
3215
  }
2450
- if root is not None:
3216
+ if repo_map_payload is not None:
3217
+ explain["repo_map"] = copy.deepcopy(repo_map_payload)
3218
+ elif root is not None:
2451
3219
  explain["repo_map"] = build_repo_map_payload(root, args, suggest_payload, build_payload, root_arg=root_arg)
2452
3220
  return explain
2453
3221
 
@@ -2534,11 +3302,74 @@ def auto_pack(root: Path, args: argparse.Namespace, *, root_arg: str) -> tuple[d
2534
3302
  }
2535
3303
  if build_hint_omitted_reason:
2536
3304
  payload["build_hint_omitted_reason"] = build_hint_omitted_reason
3305
+ if getattr(args, "adaptive_k", False) and isinstance(suggest_payload.get("adaptive_k"), dict):
3306
+ payload["adaptive_k"] = copy.deepcopy(suggest_payload["adaptive_k"])
3307
+ repo_map_payload: dict[str, Any] | None = None
3308
+ if getattr(args, "symbol_memory", False) or args.explain:
3309
+ repo_map_payload = build_repo_map_payload(root, args, suggest_payload, build_payload, root_arg=root_arg)
3310
+ if getattr(args, "symbol_memory", False) and isinstance(repo_map_payload, dict):
3311
+ payload["symbol_memory"] = build_symbol_memory_payload(repo_map_payload)
2537
3312
  if args.explain:
2538
- payload["explain"] = build_auto_explain_payload(args, suggest_payload, build_payload, payload, root=root, root_arg=root_arg)
3313
+ payload["explain"] = build_auto_explain_payload(
3314
+ args,
3315
+ suggest_payload,
3316
+ build_payload,
3317
+ payload,
3318
+ root=root,
3319
+ root_arg=root_arg,
3320
+ repo_map_payload=repo_map_payload,
3321
+ )
2539
3322
  return payload, rc
2540
3323
 
2541
3324
 
3325
def print_adaptive_k_text(payload: dict[str, Any]) -> None:
    """Print the one-line ``adaptive-k:`` summary for a payload.

    Nothing is printed unless ``payload["adaptive_k"]`` is a dict. Sub-sections
    that are missing or not dicts are treated as empty, so the documented
    defaults are printed instead. At most five reason codes are shown.
    """
    advisory = payload.get("adaptive_k")
    if not isinstance(advisory, dict):
        return

    def section(name: str) -> dict[str, Any]:
        candidate = advisory.get(name)
        return candidate if isinstance(candidate, dict) else {}

    recommendation = section("recommendation")
    distribution = section("score_distribution")
    budget = section("budget_fit")
    policy = section("policy")
    gates = section("regression_gates")

    codes = recommendation.get("reason_codes", [])
    if isinstance(codes, list):
        reasons = ",".join(str(code) for code in codes[:5])
    else:
        reasons = str(codes)

    fields = [
        f"recommended={advisory.get('recommended_k', 0)}/{advisory.get('requested_top', 0)}",
        f"policy={policy.get('name', 'balanced')}",
        f"gates={gates.get('status', 'pass')}",
        f"candidates={distribution.get('candidate_count', 0)}",
        f"budget_limited={budget.get('budget_limited', False)}",
        "apply=false",
        f"reasons={reasons or 'none'}",
    ]
    print("adaptive-k: " + " ".join(fields))
3356
+
3357
+
3358
def print_symbol_memory_text(payload: dict[str, Any]) -> None:
    """Print the one-line ``symbol-memory:`` summary for a payload.

    Nothing is printed unless ``payload["symbol_memory"]`` is a dict. A missing
    or non-dict ``summary`` / ``source_verification`` section is treated as
    empty, giving zero counts and ``verify_before_edits=true``.
    """
    memory = payload.get("symbol_memory")
    if not isinstance(memory, dict):
        return

    counts = memory.get("summary")
    if not isinstance(counts, dict):
        counts = {}
    checks = memory.get("source_verification")
    if not isinstance(checks, dict):
        checks = {}

    # Lower-cased so the flag prints as true/false rather than True/False.
    must_verify = str(checks.get("requires_exact_source_before_edits", True)).lower()
    fields = [
        f"symbols={counts.get('symbols', 0)}",
        f"graph_context={counts.get('graph_context', 0)}",
        f"retrieval_hints={counts.get('retrieval_hints', 0)}",
        f"verify_before_edits={must_verify}",
    ]
    print("symbol-memory: " + " ".join(fields))
3371
+
3372
+
2542
3373
  def print_suggest_text(payload: dict[str, Any]) -> None:
2543
3374
  print(
2544
3375
  f"context-guard-pack suggest: {len(payload['sources'])} source(s), "
@@ -2554,6 +3385,7 @@ def print_suggest_text(payload: dict[str, Any]) -> None:
2554
3385
  print(f"build: {payload['build_hint']}")
2555
3386
  elif payload.get("build_hint_omitted_reason"):
2556
3387
  print(f"build hint omitted: {payload['build_hint_omitted_reason']}")
3388
+ print_adaptive_k_text(payload)
2557
3389
 
2558
3390
 
2559
3391
  def print_auto_text(payload: dict[str, Any]) -> None:
@@ -2598,6 +3430,8 @@ def print_auto_text(payload: dict[str, Any]) -> None:
2598
3430
  reason_counts[reason] = reason_counts.get(reason, 0) + 1
2599
3431
  reason_text = ", ".join(f"{reason}={count}" for reason, count in sorted(reason_counts.items()))
2600
3432
  print(f"omitted reasons: {reason_text}")
3433
+ print_adaptive_k_text(payload)
3434
+ print_symbol_memory_text(payload)
2601
3435
  if payload.get("manifest_path"):
2602
3436
  print(f"manifest: {payload['manifest_path']}")
2603
3437
  if payload.get("pack_path"):
@@ -2633,6 +3467,10 @@ def build_parser() -> argparse.ArgumentParser:
2633
3467
  suggest.add_argument("--top", type=int, default=DEFAULT_SUGGEST_TOP, help="maximum suggested sources")
2634
3468
  suggest.add_argument("--context-lines", type=int, default=DEFAULT_SUGGEST_CONTEXT_LINES, help="line context around diff/output hits")
2635
3469
  suggest.add_argument("--manifest-out", help="write the suggested build manifest to this relative path under root")
3470
+ suggest.add_argument("--adaptive-k", action="store_true", help="include local score/budget top-k advisory metadata without changing the manifest")
3471
+ suggest.add_argument("--adaptive-k-policy", choices=ADAPTIVE_K_POLICIES, default="balanced", help="local adaptive-k recommendation policy used when --adaptive-k is set")
3472
+ suggest.add_argument("--adaptive-k-min-recall-proxy", type=adaptive_k_threshold, default=0.0, help="metadata-only minimum recall proxy gate for --adaptive-k")
3473
+ suggest.add_argument("--adaptive-k-min-precision-proxy", type=adaptive_k_threshold, default=0.0, help="metadata-only minimum precision proxy gate for --adaptive-k")
2636
3474
  suggest.add_argument("--json", action="store_true", help="emit JSON payload")
2637
3475
  auto = sub.add_parser("auto", help="suggest a context pack manifest and build the budgeted pack in one local step")
2638
3476
  auto.add_argument("--root", default=".", help="project root; must not be a symlink")
@@ -2649,6 +3487,11 @@ def build_parser() -> argparse.ArgumentParser:
2649
3487
  auto.add_argument("--json", action="store_true", help="emit JSON payload")
2650
3488
  auto.add_argument("--no-artifact", action="store_true", help="do not write .context-guard/packs receipt")
2651
3489
  auto.add_argument("--explain", action="store_true", help="include deterministic local selection/build explanation metadata")
3490
+ auto.add_argument("--adaptive-k", action="store_true", help="include local score/budget top-k advisory metadata without changing the manifest or pack")
3491
+ auto.add_argument("--adaptive-k-policy", choices=ADAPTIVE_K_POLICIES, default="balanced", help="local adaptive-k recommendation policy used when --adaptive-k is set")
3492
+ auto.add_argument("--adaptive-k-min-recall-proxy", type=adaptive_k_threshold, default=0.0, help="metadata-only minimum recall proxy gate for --adaptive-k")
3493
+ auto.add_argument("--adaptive-k-min-precision-proxy", type=adaptive_k_threshold, default=0.0, help="metadata-only minimum precision proxy gate for --adaptive-k")
3494
+ auto.add_argument("--symbol-memory", action="store_true", help="include repo-map derived symbol/graph advisory metadata with exact source verification hints")
2652
3495
  return parser
2653
3496
 
2654
3497