@de-otio/repo-aegis-core 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191) hide show
  1. package/dist/age.d.ts +32 -0
  2. package/dist/age.d.ts.map +1 -0
  3. package/dist/age.js +98 -0
  4. package/dist/age.js.map +1 -0
  5. package/dist/audit-log.d.ts +50 -0
  6. package/dist/audit-log.d.ts.map +1 -0
  7. package/dist/audit-log.js +183 -0
  8. package/dist/audit-log.js.map +1 -0
  9. package/dist/audit-log.test.d.ts +2 -0
  10. package/dist/audit-log.test.d.ts.map +1 -0
  11. package/dist/audit-log.test.js +181 -0
  12. package/dist/audit-log.test.js.map +1 -0
  13. package/dist/deny-set.d.ts +43 -0
  14. package/dist/deny-set.d.ts.map +1 -0
  15. package/dist/deny-set.js +165 -0
  16. package/dist/deny-set.js.map +1 -0
  17. package/dist/deny-set.test.d.ts +2 -0
  18. package/dist/deny-set.test.d.ts.map +1 -0
  19. package/dist/deny-set.test.js +155 -0
  20. package/dist/deny-set.test.js.map +1 -0
  21. package/dist/exceptions.d.ts +96 -0
  22. package/dist/exceptions.d.ts.map +1 -0
  23. package/dist/exceptions.js +143 -0
  24. package/dist/exceptions.js.map +1 -0
  25. package/dist/exit-codes.d.ts +4 -0
  26. package/dist/exit-codes.d.ts.map +1 -0
  27. package/dist/exit-codes.js +6 -0
  28. package/dist/exit-codes.js.map +1 -0
  29. package/dist/first-touch.d.ts +57 -0
  30. package/dist/first-touch.d.ts.map +1 -0
  31. package/dist/first-touch.js +112 -0
  32. package/dist/first-touch.js.map +1 -0
  33. package/dist/import-graph.test.d.ts +2 -0
  34. package/dist/import-graph.test.d.ts.map +1 -0
  35. package/dist/import-graph.test.js +210 -0
  36. package/dist/import-graph.test.js.map +1 -0
  37. package/dist/index.d.ts +37 -0
  38. package/dist/index.d.ts.map +1 -0
  39. package/dist/index.js +68 -0
  40. package/dist/index.js.map +1 -0
  41. package/dist/lock.d.ts +22 -0
  42. package/dist/lock.d.ts.map +1 -0
  43. package/dist/lock.js +86 -0
  44. package/dist/lock.js.map +1 -0
  45. package/dist/lock.test.d.ts +2 -0
  46. package/dist/lock.test.d.ts.map +1 -0
  47. package/dist/lock.test.js +125 -0
  48. package/dist/lock.test.js.map +1 -0
  49. package/dist/paths.d.ts +22 -0
  50. package/dist/paths.d.ts.map +1 -0
  51. package/dist/paths.js +46 -0
  52. package/dist/paths.js.map +1 -0
  53. package/dist/paths.test.d.ts +2 -0
  54. package/dist/paths.test.d.ts.map +1 -0
  55. package/dist/paths.test.js +78 -0
  56. package/dist/paths.test.js.map +1 -0
  57. package/dist/redaction.d.ts +29 -0
  58. package/dist/redaction.d.ts.map +1 -0
  59. package/dist/redaction.js +48 -0
  60. package/dist/redaction.js.map +1 -0
  61. package/dist/redaction.test.d.ts +2 -0
  62. package/dist/redaction.test.d.ts.map +1 -0
  63. package/dist/redaction.test.js +67 -0
  64. package/dist/redaction.test.js.map +1 -0
  65. package/dist/regex-safety.d.ts +87 -0
  66. package/dist/regex-safety.d.ts.map +1 -0
  67. package/dist/regex-safety.js +322 -0
  68. package/dist/regex-safety.js.map +1 -0
  69. package/dist/regex-safety.test.d.ts +2 -0
  70. package/dist/regex-safety.test.d.ts.map +1 -0
  71. package/dist/regex-safety.test.js +149 -0
  72. package/dist/regex-safety.test.js.map +1 -0
  73. package/dist/registry-mutate.d.ts +35 -0
  74. package/dist/registry-mutate.d.ts.map +1 -0
  75. package/dist/registry-mutate.js +149 -0
  76. package/dist/registry-mutate.js.map +1 -0
  77. package/dist/registry-mutate.test.d.ts +2 -0
  78. package/dist/registry-mutate.test.d.ts.map +1 -0
  79. package/dist/registry-mutate.test.js +96 -0
  80. package/dist/registry-mutate.test.js.map +1 -0
  81. package/dist/registry.d.ts +64 -0
  82. package/dist/registry.d.ts.map +1 -0
  83. package/dist/registry.js +120 -0
  84. package/dist/registry.js.map +1 -0
  85. package/dist/registry.test.d.ts +2 -0
  86. package/dist/registry.test.d.ts.map +1 -0
  87. package/dist/registry.test.js +316 -0
  88. package/dist/registry.test.js.map +1 -0
  89. package/dist/remote-url.d.ts +18 -0
  90. package/dist/remote-url.d.ts.map +1 -0
  91. package/dist/remote-url.js +66 -0
  92. package/dist/remote-url.js.map +1 -0
  93. package/dist/remote-url.test.d.ts +2 -0
  94. package/dist/remote-url.test.d.ts.map +1 -0
  95. package/dist/remote-url.test.js +116 -0
  96. package/dist/remote-url.test.js.map +1 -0
  97. package/dist/render.d.ts +54 -0
  98. package/dist/render.d.ts.map +1 -0
  99. package/dist/render.js +182 -0
  100. package/dist/render.js.map +1 -0
  101. package/dist/render.test.d.ts +2 -0
  102. package/dist/render.test.d.ts.map +1 -0
  103. package/dist/render.test.js +152 -0
  104. package/dist/render.test.js.map +1 -0
  105. package/dist/repo.d.ts +40 -0
  106. package/dist/repo.d.ts.map +1 -0
  107. package/dist/repo.js +214 -0
  108. package/dist/repo.js.map +1 -0
  109. package/dist/repo.test.d.ts +2 -0
  110. package/dist/repo.test.d.ts.map +1 -0
  111. package/dist/repo.test.js +234 -0
  112. package/dist/repo.test.js.map +1 -0
  113. package/dist/scan.d.ts +103 -0
  114. package/dist/scan.d.ts.map +1 -0
  115. package/dist/scan.js +436 -0
  116. package/dist/scan.js.map +1 -0
  117. package/dist/scan.test.d.ts +2 -0
  118. package/dist/scan.test.d.ts.map +1 -0
  119. package/dist/scan.test.js +437 -0
  120. package/dist/scan.test.js.map +1 -0
  121. package/dist/schemas.d.ts +50 -0
  122. package/dist/schemas.d.ts.map +1 -0
  123. package/dist/schemas.js +190 -0
  124. package/dist/schemas.js.map +1 -0
  125. package/dist/secret-markers.d.ts +34 -0
  126. package/dist/secret-markers.d.ts.map +1 -0
  127. package/dist/secret-markers.js +118 -0
  128. package/dist/secret-markers.js.map +1 -0
  129. package/dist/secret-markers.test.d.ts +2 -0
  130. package/dist/secret-markers.test.d.ts.map +1 -0
  131. package/dist/secret-markers.test.js +154 -0
  132. package/dist/secret-markers.test.js.map +1 -0
  133. package/dist/trust-boundary.d.ts +33 -0
  134. package/dist/trust-boundary.d.ts.map +1 -0
  135. package/dist/trust-boundary.js +77 -0
  136. package/dist/trust-boundary.js.map +1 -0
  137. package/dist/trust-boundary.test.d.ts +2 -0
  138. package/dist/trust-boundary.test.d.ts.map +1 -0
  139. package/dist/trust-boundary.test.js +170 -0
  140. package/dist/trust-boundary.test.js.map +1 -0
  141. package/dist/types.d.ts +47 -0
  142. package/dist/types.d.ts.map +1 -0
  143. package/dist/types.js +8 -0
  144. package/dist/types.js.map +1 -0
  145. package/dist/working-tree.d.ts +38 -0
  146. package/dist/working-tree.d.ts.map +1 -0
  147. package/dist/working-tree.js +133 -0
  148. package/dist/working-tree.js.map +1 -0
  149. package/dist/working-tree.test.d.ts +2 -0
  150. package/dist/working-tree.test.d.ts.map +1 -0
  151. package/dist/working-tree.test.js +162 -0
  152. package/dist/working-tree.test.js.map +1 -0
  153. package/package.json +40 -0
  154. package/src/age.ts +113 -0
  155. package/src/audit-log.test.ts +222 -0
  156. package/src/audit-log.ts +215 -0
  157. package/src/deny-set.test.ts +208 -0
  158. package/src/deny-set.ts +231 -0
  159. package/src/exceptions.ts +134 -0
  160. package/src/exit-codes.ts +5 -0
  161. package/src/first-touch.ts +172 -0
  162. package/src/import-graph.test.ts +239 -0
  163. package/src/index.ts +191 -0
  164. package/src/lock.test.ts +151 -0
  165. package/src/lock.ts +88 -0
  166. package/src/paths.test.ts +94 -0
  167. package/src/paths.ts +55 -0
  168. package/src/redaction.test.ts +81 -0
  169. package/src/redaction.ts +49 -0
  170. package/src/regex-safety.test.ts +194 -0
  171. package/src/regex-safety.ts +349 -0
  172. package/src/registry-mutate.test.ts +134 -0
  173. package/src/registry-mutate.ts +185 -0
  174. package/src/registry.test.ts +460 -0
  175. package/src/registry.ts +178 -0
  176. package/src/remote-url.test.ts +121 -0
  177. package/src/remote-url.ts +78 -0
  178. package/src/render.test.ts +206 -0
  179. package/src/render.ts +215 -0
  180. package/src/repo.test.ts +275 -0
  181. package/src/repo.ts +245 -0
  182. package/src/scan.test.ts +580 -0
  183. package/src/scan.ts +531 -0
  184. package/src/schemas.ts +207 -0
  185. package/src/secret-markers.test.ts +183 -0
  186. package/src/secret-markers.ts +145 -0
  187. package/src/trust-boundary.test.ts +198 -0
  188. package/src/trust-boundary.ts +98 -0
  189. package/src/types.ts +55 -0
  190. package/src/working-tree.test.ts +193 -0
  191. package/src/working-tree.ts +130 -0
package/src/scan.ts ADDED
@@ -0,0 +1,531 @@
1
+ // SPDX-License-Identifier: GPL-3.0-or-later
2
+ // Copyright (C) 2026 Richard Myers and contributors.
3
+ import { execFileSync, spawnSync } from "node:child_process";
4
+ import {
5
+ closeSync,
6
+ mkdtempSync,
7
+ openSync,
8
+ readFileSync,
9
+ readSync,
10
+ rmSync,
11
+ existsSync,
12
+ statSync,
13
+ realpathSync,
14
+ } from "node:fs";
15
+ import { tmpdir } from "node:os";
16
+ import { isAbsolute, join, relative } from "node:path";
17
+ import type { DenySet } from "./deny-set.js";
18
+ import type { RepoConfig } from "./repo.js";
19
+ import { redactMatch, revealMatch, type RedactionMode } from "./redaction.js";
20
+ import { OutsideWorkingTreeError } from "./exceptions.js";
21
+
22
/** Size ceiling (1 MiB) applied by `scanFile` when the caller gives no `maxFileBytes`. */
const DEFAULT_MAX_FILE_BYTES = 1 << 20;

/**
 * Number of bytes requested per read while walking the temp file that
 * holds `git diff` output (64 KiB). Small enough to keep allocations
 * modest, large enough that syscall overhead does not dominate. Lines
 * that straddle two reads are stitched back together by the reader.
 */
const DIFF_STREAM_CHUNK_BYTES = 64 << 10;
28
+
29
/** A single deny-set match reported by the scanners in this module. */
export interface ScanHit {
  /** Path the text came from; only set when the scanner was handed one. */
  path?: string;
  /** 1-indexed line number of the match within the scanned text. */
  line: number;
  /** 1-indexed column at which the match begins on that line. */
  column: number;
  /** The matched text after it has been passed through the redaction helpers. */
  matchPreview: string;
  /**
   * Stem of the marker file (an engagement id, or `_always`) that the
   * matching pattern was loaded from. Populated when the deny set
   * carries `patternSources`; left out otherwise so deny sets without
   * attribution (synthetic test fixtures, older callers) keep working.
   */
  engagement?: string;
}
42
+
43
/** Record of a file that a scanner declined to scan, and why. */
export interface SkippedFile {
  /** Path of the file that was skipped. */
  path: string;
  /** Why the file was skipped. */
  reason: "binary" | "too-large" | "unreadable";
  /** File size in bytes, when the scanner had it available. */
  bytes?: number;
}
48
+
49
/** Knobs shared by every scanner in this module. All fields are optional. */
export interface ScanOptions {
  /** When truthy, previews are produced by `revealMatch` instead of `redactMatch`. */
  revealMatches?: boolean;
  /** Redaction mode handed to `redactMatch`; `"preview"` is used when omitted. */
  redactionMode?: RedactionMode;
  /** Size ceiling in bytes for `scanFile`; larger files are skipped as `too-large`. */
  maxFileBytes?: number;
  /** When true, treat lines containing `repo-aegis: allow` as suppressed. Default: true. */
  respectAllowComments?: boolean;
}
56
+
57
/**
 * Matches the suppression token `repo-aegis: allow`, case-insensitively
 * and with any amount of whitespace (including none) after the colon.
 * Free text may follow the token, e.g.
 * `// repo-aegis: allow — synthetic test fixture`.
 *
 * The token is deliberately long-winded: a bare `allow` would be far too
 * easy to trigger from unrelated comments. The trailing word boundary
 * means `repo-aegis: allowed` does NOT count.
 */
export const ALLOW_COMMENT = /repo-aegis:\s*allow\b/i;
65
+
66
/**
 * Turn a matched literal into the string that is safe to report.
 *
 * @param literal - The raw text (or pattern) to present.
 * @param opts - `revealMatches` selects `revealMatch`; otherwise the
 *   literal goes through `redactMatch` using `redactionMode`, falling
 *   back to `"preview"` when no mode is given.
 * @returns The formatted preview string.
 */
function formatMatch(literal: string, opts: ScanOptions): string {
  return opts.revealMatches
    ? revealMatch(literal)
    : redactMatch(literal, opts.redactionMode ?? "preview");
}
70
+
71
/**
 * Work out which deny-set pattern is responsible for `matched` and
 * return the corresponding entry of `patternSources`.
 *
 * Patterns are tried in declaration order, each compiled
 * case-insensitively and tested against the matched text; the first one
 * that matches wins. A pattern that fails to compile is skipped rather
 * than aborting the lookup.
 *
 * @param matched - The text captured by the combined regex.
 * @param denySet - Deny set whose `patterns` / `patternSources` are consulted.
 * @returns The source label of the first matching pattern, or
 *   `undefined` when the deny set has no `patternSources`, when its
 *   length disagrees with `patterns`, or when no pattern matches.
 */
function attributeMatch(matched: string, denySet: DenySet): string | undefined {
  const { patterns, patternSources } = denySet;
  if (!patternSources || patternSources.length !== patterns.length) {
    return undefined;
  }
  const winner = patterns.findIndex((pattern: string) => {
    try {
      return new RegExp(pattern, "i").test(matched);
    } catch {
      // Malformed pattern that slipped past validation: treat as "no match".
      return false;
    }
  });
  return winner === -1 ? undefined : patternSources[winner];
}
96
+
97
/**
 * Scan a block of text line by line against the deny set's combined
 * regex. This is the general-purpose primitive of the module.
 *
 * At most one hit is reported per line (the first match on that line).
 * Lines carrying the `repo-aegis: allow` token are skipped unless
 * `opts.respectAllowComments` is explicitly `false`.
 *
 * @param text - The text to scan; split on `\n`.
 * @param denySet - Deny set to match against. An empty `combinedRegex`
 *   yields no hits.
 * @param path - Optional path recorded on every hit.
 * @param opts - Redaction and allow-comment options.
 * @returns Hits with 1-indexed `line` and `column`.
 */
export function scanText(
  text: string,
  denySet: DenySet,
  path?: string,
  opts: ScanOptions = {},
): ScanHit[] {
  const found: ScanHit[] = [];
  if (!denySet.combinedRegex) return found;

  const matcher = new RegExp(denySet.combinedRegex, "i");
  const honourAllow = opts.respectAllowComments !== false;

  text.split("\n").forEach((row, rowIndex) => {
    const match = matcher.exec(row);
    if (match === null || !match[0]) return;
    if (honourAllow && ALLOW_COMMENT.test(row)) return;

    const engagement = attributeMatch(match[0], denySet);
    found.push({
      ...(path === undefined ? {} : { path }),
      line: rowIndex + 1,
      column: match.index + 1,
      matchPreview: formatMatch(match[0], opts),
      ...(engagement === undefined ? {} : { engagement }),
    });
  });

  return found;
}
130
+
131
/**
 * Scan a single file from disk.
 *
 * The path is canonicalised with `realpathSync` so symlinks are resolved
 * before any containment check. When `workingTree` is supplied, a file
 * whose real path lies outside that (also canonicalised) directory is
 * rejected with `OutsideWorkingTreeError`. Without `workingTree` no
 * containment check is performed.
 *
 * @param path - File to scan.
 * @param denySet - Deny set to match against.
 * @param opts - Scan options; `maxFileBytes` overrides the default size ceiling.
 * @param workingTree - Optional directory the file must live under.
 * @returns The hits, plus a `skipped` entry when the file is missing,
 *   unreadable, larger than the size ceiling, or looks binary.
 * @throws OutsideWorkingTreeError when `workingTree` is given and the
 *   file resolves outside it.
 * @throws Whatever `realpathSync(workingTree)` throws when the working
 *   tree itself cannot be resolved.
 */
export function scanFile(
  path: string,
  denySet: DenySet,
  opts: ScanOptions = {},
  workingTree?: string,
): { hits: ScanHit[]; skipped: SkippedFile[] } {
  const skipped: SkippedFile[] = [];
  if (!existsSync(path)) {
    skipped.push({ path, reason: "unreadable" });
    return { hits: [], skipped };
  }
  let real: string;
  try {
    real = realpathSync(path);
  } catch {
    skipped.push({ path, reason: "unreadable" });
    return { hits: [], skipped };
  }
  if (workingTree) {
    const wtReal = realpathSync(workingTree);
    const rel = relative(wtReal, real);
    // `..` followed by the platform separator ("../" or "..\"), derived
    // through join() so no extra path import is needed. A plain
    // startsWith("..") would also reject in-tree entries whose *name*
    // merely begins with two dots, such as `..cache/file`.
    const parentPrefix = join("..", "x").slice(0, -1);
    const escapesTree =
      rel === ".." || rel.startsWith(parentPrefix) || isAbsolute(rel);
    if (escapesTree) {
      throw new OutsideWorkingTreeError(real, wtReal);
    }
  }
  // The file can disappear between realpathSync and here; report that as
  // an unreadable file instead of letting the stat error escape.
  let size: number;
  try {
    size = statSync(real).size;
  } catch {
    skipped.push({ path: real, reason: "unreadable" });
    return { hits: [], skipped };
  }
  const max = opts.maxFileBytes ?? DEFAULT_MAX_FILE_BYTES;
  if (size > max) {
    skipped.push({ path: real, reason: "too-large", bytes: size });
    return { hits: [], skipped };
  }
  let buf: Buffer;
  try {
    buf = readFileSync(real);
  } catch {
    skipped.push({ path: real, reason: "unreadable" });
    return { hits: [], skipped };
  }
  if (looksBinary(buf)) {
    skipped.push({ path: real, reason: "binary", bytes: size });
    return { hits: [], skipped };
  }
  const text = buf.toString("utf8");
  return { hits: scanText(text, denySet, real, opts), skipped };
}
181
+
182
/**
 * Run `git diff <args>` and scan the lines it adds.
 *
 * git's stdout is wired straight to a file inside a fresh temp
 * directory, so this process never holds the whole diff in memory; the
 * file is then handed to `scanDiffFile`, which reads it back in
 * fixed-size chunks. The temp directory is removed on every exit path.
 *
 * The unified-diff handling is hand-rolled (it replaced an earlier
 * `parse-diff`-based `extractAdditions`) so the diff can be streamed.
 * It is meant to mirror parse-diff for:
 *   - `diff --git`, `--- a/<x>`, `+++ b/<x>` headers (not content)
 *   - `@@ ... @@` hunk headers (enter "in-chunk" state)
 *   - `+`-prefixed lines inside a hunk (added content, leading `+` stripped)
 *   - `-` and ` ` lines (removed / context; ignored)
 *   - `\ No newline at end of file` markers (ignored)
 *   - binary stanzas (no `@@`, so chunk state is never entered)
 *
 * Hit line numbers are 1-indexed positions within the stream of added
 * lines, which matches the numbering the previous
 * `extractAdditions` + `scanText` pipeline produced.
 *
 * @param cwd - Directory in which git is run.
 * @param args - Extra arguments appended after `git diff`.
 * @param denySet - Deny set to match against; empty `combinedRegex` short-circuits.
 * @param opts - Scan options.
 * @returns Hits found in added lines.
 * @throws The spawn error, or an `Error` carrying git's stderr when git
 *   exits with a non-zero status.
 */
function streamScanDiff(
  cwd: string,
  args: readonly string[],
  denySet: DenySet,
  opts: ScanOptions,
): ScanHit[] {
  if (!denySet.combinedRegex) return [];

  const scratchDir = mkdtempSync(join(tmpdir(), "repo-aegis-diff-"));
  const patchFile = join(scratchDir, "diff.patch");
  // Descriptor git writes into; reset to null once closed so the
  // finally block knows whether there is still something to release.
  let sink: number | null = null;
  try {
    sink = openSync(patchFile, "w");
    const result = spawnSync("git", ["diff", ...args], {
      cwd,
      stdio: ["ignore", sink, "pipe"],
    });
    closeSync(sink);
    sink = null;

    if (result.error) throw result.error;
    if (result.status !== 0) {
      let errText = "";
      if (result.stderr instanceof Buffer) {
        errText = result.stderr.toString("utf8");
      } else if (typeof result.stderr === "string") {
        errText = result.stderr;
      }
      throw new Error(`git diff exited ${result.status ?? "?"}: ${errText.trim()}`);
    }
    return scanDiffFile(patchFile, denySet, opts);
  } finally {
    if (sink !== null) {
      try {
        closeSync(sink);
      } catch {
        /* best-effort */
      }
    }
    try {
      rmSync(scratchDir, { recursive: true, force: true });
    } catch {
      /* best-effort cleanup */
    }
  }
}
254
+
255
/**
 * Walk a unified-diff file in fixed-size reads, feeding each complete
 * line to `processDiffLine`. Memory use is bounded by the read-chunk
 * size plus whatever partial line is being carried between reads.
 *
 * Bytes are only decoded to text at newline boundaries. Decoding every
 * read on its own would corrupt any multi-byte UTF-8 character that
 * happens to straddle two reads (it would come out as U+FFFD on both
 * sides), which could hide a match for a non-ASCII pattern. The byte
 * 0x0A never occurs inside a multi-byte UTF-8 sequence, so cutting
 * there is always safe.
 *
 * @param path - Path of the diff file to read.
 * @param denySet - Deny set; its `combinedRegex` is compiled case-insensitively.
 * @param opts - Scan options.
 * @returns Hits found in added lines, numbered by added-line position.
 */
function scanDiffFile(
  path: string,
  denySet: DenySet,
  opts: ScanOptions,
): ScanHit[] {
  const re = new RegExp(denySet.combinedRegex, "i");
  const respectAllow = opts.respectAllowComments !== false;
  const hits: ScanHit[] = [];
  let inChunk = false;
  let virtualLine = 0; // 1-indexed counter of added-content lines emitted

  const fd = openSync(path, "r");
  try {
    const buf = Buffer.alloc(DIFF_STREAM_CHUNK_BYTES);
    // Raw bytes of the partial line left over from previous reads.
    let carry: Buffer = Buffer.alloc(0);
    while (true) {
      const n = readSync(fd, buf, 0, buf.length, null);
      if (n === 0) break;
      const chunk = buf.subarray(0, n);
      const lastNewline = chunk.lastIndexOf(0x0a);
      if (lastNewline === -1) {
        // No line ended in this read; keep accumulating. concat copies,
        // so reusing `buf` on the next read cannot clobber the carry.
        carry = Buffer.concat([carry, chunk]);
        continue;
      }
      const complete = Buffer.concat([
        carry,
        chunk.subarray(0, lastNewline),
      ]).toString("utf8");
      // Copy the tail out of `buf` before the next read overwrites it.
      carry = Buffer.from(chunk.subarray(lastNewline + 1));
      for (const line of complete.split("\n")) {
        ({ inChunk, virtualLine } = processDiffLine(
          line,
          inChunk,
          virtualLine,
          re,
          denySet,
          respectAllow,
          opts,
          hits,
        ));
      }
    }
    // A final line without a trailing newline.
    if (carry.length > 0) {
      processDiffLine(
        carry.toString("utf8"),
        inChunk,
        virtualLine,
        re,
        denySet,
        respectAllow,
        opts,
        hits,
      );
    }
  } finally {
    closeSync(fd);
  }
  return hits;
}
306
+
307
/**
 * Examine one line of unified-diff output.
 *
 * `diff --git` leaves chunk state and `@@` enters it. Outside a chunk
 * every line (including the `--- a/…` / `+++ b/…` file headers) is
 * ignored. Inside a chunk only `+`-prefixed lines are added content:
 * the leading `+` is stripped, the added-line counter is bumped, and the
 * regex is applied. A hit is appended to `hits` unless the line carries
 * the allow token and `respectAllow` is set.
 *
 * File headers are recognised by position (outside a chunk), not by
 * prefix alone: inside a chunk a line rendered as `+++ text` is an added
 * line whose content is `++ text`, and it must be scanned like any other
 * addition.
 *
 * @param line - The raw diff line, without its trailing newline.
 * @param inChunk - Whether the previous line left us inside a hunk.
 * @param virtualLine - Count of added lines seen so far.
 * @param re - Compiled combined deny-set regex.
 * @param denySet - Deny set, used for engagement attribution.
 * @param respectAllow - Whether the allow token suppresses hits.
 * @param opts - Scan options, used when formatting the preview.
 * @param hits - Output array; mutated when a hit is found.
 * @returns The updated `(inChunk, virtualLine)` state.
 */
function processDiffLine(
  line: string,
  inChunk: boolean,
  virtualLine: number,
  re: RegExp,
  denySet: DenySet,
  respectAllow: boolean,
  opts: ScanOptions,
  hits: ScanHit[],
): { inChunk: boolean; virtualLine: number } {
  // A new file section always resets chunk state. Content lines carry a
  // `+`, `-` or space prefix, so this can never be mistaken for content.
  if (line.startsWith("diff --git ")) return { inChunk: false, virtualLine };
  if (line.startsWith("@@")) return { inChunk: true, virtualLine };
  // Everything between `diff --git` and the first `@@` is header
  // material: index lines, mode lines, `--- a/…`, `+++ b/…`, binary notes.
  if (!inChunk) return { inChunk, virtualLine };
  // Inside a chunk: removed lines, context lines and the
  // "\ No newline at end of file" marker all lack a leading `+`.
  if (!line.startsWith("+")) return { inChunk, virtualLine };

  const content = line.slice(1);
  const next = virtualLine + 1;
  const m = content.match(re);
  if (!m || !m[0]) return { inChunk, virtualLine: next };
  if (respectAllow && ALLOW_COMMENT.test(content)) {
    return { inChunk, virtualLine: next };
  }
  const engagement = attributeMatch(m[0], denySet);
  hits.push({
    line: next,
    column: (m.index ?? 0) + 1,
    matchPreview: formatMatch(m[0], opts),
    ...(engagement !== undefined && { engagement }),
  });
  return { inChunk, virtualLine: next };
}
352
+
353
/**
 * Scan what is currently staged in a git repo — the pre-commit hook
 * entry point.
 *
 * Runs `git diff --cached --diff-filter=ACM -U0 --no-color` through
 * `streamScanDiff`, so the diff is spooled to a temp file and read back
 * in small chunks rather than buffered whole.
 *
 * @param repo - Repo descriptor; a non-git repo yields an empty result.
 * @param denySet - Deny set; an empty `combinedRegex` yields an empty result.
 * @param opts - Scan options.
 * @returns Hits in added lines; `skipped` is always empty here.
 */
export function scanStagedDiff(
  repo: RepoConfig,
  denySet: DenySet,
  opts: ScanOptions = {},
): { hits: ScanHit[]; skipped: SkippedFile[] } {
  if (!repo.isGitRepo || !denySet.combinedRegex) {
    return { hits: [], skipped: [] };
  }
  const diffArgs = ["--cached", "--diff-filter=ACM", "-U0", "--no-color"];
  return {
    hits: streamScanDiff(repo.cwd, diffArgs, denySet, opts),
    skipped: [],
  };
}
374
+
375
/**
 * Scan the diff over a git range such as `main..HEAD` or
 * `<remote-sha>..<local-sha>` — the pre-push hook entry point.
 *
 * Runs `git diff <range> --diff-filter=ACM -U0 --no-color` through
 * `streamScanDiff`; only added-line content is examined. Supplying a
 * syntactically valid range is the caller's job: if `git diff` exits
 * non-zero the resulting error propagates.
 *
 * @param repo - Repo descriptor; a non-git repo yields an empty result.
 * @param denySet - Deny set; an empty `combinedRegex` yields an empty result.
 * @param range - Revision range handed to `git diff` as-is.
 * @param opts - Scan options.
 * @returns Hits in added lines; `skipped` is always empty here.
 */
export function scanRange(
  repo: RepoConfig,
  denySet: DenySet,
  range: string,
  opts: ScanOptions = {},
): { hits: ScanHit[]; skipped: SkippedFile[] } {
  if (!repo.isGitRepo || !denySet.combinedRegex) {
    return { hits: [], skipped: [] };
  }
  const diffArgs = [range, "--diff-filter=ACM", "-U0", "--no-color"];
  return {
    hits: streamScanDiff(repo.cwd, diffArgs, denySet, opts),
    skipped: [],
  };
}
399
+
400
/** One (pattern, commit) pairing reported by `scanHistory`. */
export interface HistoryHit {
  /** The deny-set pattern, formatted through the same redaction path as scan hits. */
  pattern: string;
  /** Hash of the commit whose diff matched, as printed by git's `%H`. */
  commitSha: string;
  /** Subject line of that commit, as printed by git's `%s`. */
  commitSummary: string;
}
405
+
406
/** Options for `scanHistory`: everything in `ScanOptions` plus a lower bound. */
export interface ScanHistoryOptions extends ScanOptions {
  /**
   * Lower-bound revspec, e.g. `"main"`, `"v1.0.0"`, `"HEAD~100"`. It is
   * passed to `git log` as `<since>..`, so only commits after that point
   * are scanned. When omitted the full history is scanned (the design's
   * default).
   */
  since?: string;
}
412
+
413
/**
 * Scan git history with a single
 * `git log -G <combined> --regexp-ignore-case -p` invocation, then
 * attribute matches per pattern by walking each commit's diff text.
 * Returns one HistoryHit per (pattern, commit) pair. Set `opts.since`
 * to bound the lower edge of the walk.
 *
 * Patterns are joined with `|` into one regex for `git log -G`, so git
 * is invoked once regardless of pattern count. Attribution happens
 * in-process by re-testing each added or removed line against the
 * individual patterns, case-insensitively. `--regexp-ignore-case` keeps
 * git's commit filter in step with that: without it git would drop
 * commits that only match in a different letter case before attribution
 * ever saw them.
 *
 * Only lines inside a hunk (after an `@@` header) count as content, so
 * the `--- a/…` / `+++ b/…` file headers are never attributed while
 * content lines that themselves begin with `++` or `--` still are.
 *
 * The `pattern` field goes through the same redaction path as scan hits
 * (preview mode unless overridden). Pass `revealMatches: true` to opt
 * into literals (NEVER from a hook).
 *
 * NOTE(review): any failure of the git invocation (bad revspec, output
 * larger than the 256 MiB buffer, regex git rejects) is reported as "no
 * hits". Callers cannot tell that apart from a clean history.
 *
 * @param repo - Repo descriptor; a non-git repo yields no hits.
 * @param denySet - Deny set; an empty `patterns` list yields no hits.
 * @param opts - Scan options plus the optional `since` lower bound.
 * @returns One hit per (pattern, commit), in git log output order.
 */
export function scanHistory(
  repo: RepoConfig,
  denySet: DenySet,
  opts: ScanHistoryOptions = {},
): HistoryHit[] {
  if (!repo.isGitRepo) return [];
  if (denySet.patterns.length === 0) return [];

  // One -G regex covering every pattern; per-pattern attribution follows.
  const combined = denySet.patterns.join("|");
  // A marker prefix gives each commit a boundary line that cannot be
  // confused with diff body lines, which start with `diff --git`, `@@`,
  // `+`, `-` or a space.
  const commitMarker = "__COMMIT__:";
  const args = [
    "log",
    "-G",
    combined,
    "--regexp-ignore-case",
    "-p",
    "--no-color",
    `--format=${commitMarker}%H %s`,
  ];
  if (opts.since) {
    args.push(`${opts.since}..`);
  }
  let stdout = "";
  try {
    stdout = execFileSync("git", args, {
      cwd: repo.cwd,
      encoding: "utf8",
      stdio: ["ignore", "pipe", "ignore"],
      maxBuffer: 256 * 1024 * 1024,
    });
  } catch {
    return [];
  }

  // Compile each pattern once; a pattern that fails to compile is skipped.
  const perPatternRegexes: (RegExp | null)[] = denySet.patterns.map((p: string) => {
    try {
      return new RegExp(p, "i");
    } catch {
      return null;
    }
  });

  const hits: HistoryHit[] = [];
  let curSha = "";
  let curSummary = "";
  // True between an `@@` header and the next `diff --git` / commit marker.
  let inHunk = false;
  // (pattern-index, commit) pairs already reported; several lines of one
  // commit can hit the same pattern.
  const emitted = new Set<string>();

  for (const line of stdout.split("\n")) {
    if (line.startsWith(commitMarker)) {
      const rest = line.slice(commitMarker.length);
      const sp = rest.indexOf(" ");
      curSha = sp >= 0 ? rest.slice(0, sp) : rest;
      curSummary = sp >= 0 ? rest.slice(sp + 1) : "";
      inHunk = false;
      continue;
    }
    if (!curSha) continue;
    if (line.startsWith("diff --git ")) {
      inHunk = false;
      continue;
    }
    if (line.startsWith("@@")) {
      inHunk = true;
      continue;
    }
    if (!inHunk) continue;
    // -G considers both added and removed content, so attribute either.
    const marker = line.charAt(0);
    if (marker !== "+" && marker !== "-") continue;
    const content = line.slice(1);
    for (let i = 0; i < denySet.patterns.length; i++) {
      const re = perPatternRegexes[i];
      if (!re) continue;
      if (!re.test(content)) continue;
      const key = `${i}:${curSha}`;
      if (emitted.has(key)) continue;
      emitted.add(key);
      hits.push({
        pattern: formatMatch(denySet.patterns[i]!, opts),
        commitSha: curSha,
        commitSummary: curSummary,
      });
    }
  }
  return hits;
}
523
+
524
/**
 * Cheap binary sniff: a NUL byte anywhere in the first 8 KiB of the
 * buffer is taken as a strong sign that the content is not text.
 *
 * @param buf - File contents.
 * @returns `true` when a zero byte occurs within the first 8192 bytes.
 */
function looksBinary(buf: Buffer): boolean {
  // subarray clamps the end index, so short buffers are inspected whole.
  const head = buf.subarray(0, 8192);
  return head.includes(0);
}