@de-otio/repo-aegis-core 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/age.d.ts +32 -0
- package/dist/age.d.ts.map +1 -0
- package/dist/age.js +98 -0
- package/dist/age.js.map +1 -0
- package/dist/audit-log.d.ts +50 -0
- package/dist/audit-log.d.ts.map +1 -0
- package/dist/audit-log.js +183 -0
- package/dist/audit-log.js.map +1 -0
- package/dist/audit-log.test.d.ts +2 -0
- package/dist/audit-log.test.d.ts.map +1 -0
- package/dist/audit-log.test.js +181 -0
- package/dist/audit-log.test.js.map +1 -0
- package/dist/deny-set.d.ts +43 -0
- package/dist/deny-set.d.ts.map +1 -0
- package/dist/deny-set.js +165 -0
- package/dist/deny-set.js.map +1 -0
- package/dist/deny-set.test.d.ts +2 -0
- package/dist/deny-set.test.d.ts.map +1 -0
- package/dist/deny-set.test.js +155 -0
- package/dist/deny-set.test.js.map +1 -0
- package/dist/exceptions.d.ts +96 -0
- package/dist/exceptions.d.ts.map +1 -0
- package/dist/exceptions.js +143 -0
- package/dist/exceptions.js.map +1 -0
- package/dist/exit-codes.d.ts +4 -0
- package/dist/exit-codes.d.ts.map +1 -0
- package/dist/exit-codes.js +6 -0
- package/dist/exit-codes.js.map +1 -0
- package/dist/first-touch.d.ts +57 -0
- package/dist/first-touch.d.ts.map +1 -0
- package/dist/first-touch.js +112 -0
- package/dist/first-touch.js.map +1 -0
- package/dist/import-graph.test.d.ts +2 -0
- package/dist/import-graph.test.d.ts.map +1 -0
- package/dist/import-graph.test.js +210 -0
- package/dist/import-graph.test.js.map +1 -0
- package/dist/index.d.ts +37 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +68 -0
- package/dist/index.js.map +1 -0
- package/dist/lock.d.ts +22 -0
- package/dist/lock.d.ts.map +1 -0
- package/dist/lock.js +86 -0
- package/dist/lock.js.map +1 -0
- package/dist/lock.test.d.ts +2 -0
- package/dist/lock.test.d.ts.map +1 -0
- package/dist/lock.test.js +125 -0
- package/dist/lock.test.js.map +1 -0
- package/dist/paths.d.ts +22 -0
- package/dist/paths.d.ts.map +1 -0
- package/dist/paths.js +46 -0
- package/dist/paths.js.map +1 -0
- package/dist/paths.test.d.ts +2 -0
- package/dist/paths.test.d.ts.map +1 -0
- package/dist/paths.test.js +78 -0
- package/dist/paths.test.js.map +1 -0
- package/dist/redaction.d.ts +29 -0
- package/dist/redaction.d.ts.map +1 -0
- package/dist/redaction.js +48 -0
- package/dist/redaction.js.map +1 -0
- package/dist/redaction.test.d.ts +2 -0
- package/dist/redaction.test.d.ts.map +1 -0
- package/dist/redaction.test.js +67 -0
- package/dist/redaction.test.js.map +1 -0
- package/dist/regex-safety.d.ts +87 -0
- package/dist/regex-safety.d.ts.map +1 -0
- package/dist/regex-safety.js +322 -0
- package/dist/regex-safety.js.map +1 -0
- package/dist/regex-safety.test.d.ts +2 -0
- package/dist/regex-safety.test.d.ts.map +1 -0
- package/dist/regex-safety.test.js +149 -0
- package/dist/regex-safety.test.js.map +1 -0
- package/dist/registry-mutate.d.ts +35 -0
- package/dist/registry-mutate.d.ts.map +1 -0
- package/dist/registry-mutate.js +149 -0
- package/dist/registry-mutate.js.map +1 -0
- package/dist/registry-mutate.test.d.ts +2 -0
- package/dist/registry-mutate.test.d.ts.map +1 -0
- package/dist/registry-mutate.test.js +96 -0
- package/dist/registry-mutate.test.js.map +1 -0
- package/dist/registry.d.ts +64 -0
- package/dist/registry.d.ts.map +1 -0
- package/dist/registry.js +120 -0
- package/dist/registry.js.map +1 -0
- package/dist/registry.test.d.ts +2 -0
- package/dist/registry.test.d.ts.map +1 -0
- package/dist/registry.test.js +316 -0
- package/dist/registry.test.js.map +1 -0
- package/dist/remote-url.d.ts +18 -0
- package/dist/remote-url.d.ts.map +1 -0
- package/dist/remote-url.js +66 -0
- package/dist/remote-url.js.map +1 -0
- package/dist/remote-url.test.d.ts +2 -0
- package/dist/remote-url.test.d.ts.map +1 -0
- package/dist/remote-url.test.js +116 -0
- package/dist/remote-url.test.js.map +1 -0
- package/dist/render.d.ts +54 -0
- package/dist/render.d.ts.map +1 -0
- package/dist/render.js +182 -0
- package/dist/render.js.map +1 -0
- package/dist/render.test.d.ts +2 -0
- package/dist/render.test.d.ts.map +1 -0
- package/dist/render.test.js +152 -0
- package/dist/render.test.js.map +1 -0
- package/dist/repo.d.ts +40 -0
- package/dist/repo.d.ts.map +1 -0
- package/dist/repo.js +214 -0
- package/dist/repo.js.map +1 -0
- package/dist/repo.test.d.ts +2 -0
- package/dist/repo.test.d.ts.map +1 -0
- package/dist/repo.test.js +234 -0
- package/dist/repo.test.js.map +1 -0
- package/dist/scan.d.ts +103 -0
- package/dist/scan.d.ts.map +1 -0
- package/dist/scan.js +436 -0
- package/dist/scan.js.map +1 -0
- package/dist/scan.test.d.ts +2 -0
- package/dist/scan.test.d.ts.map +1 -0
- package/dist/scan.test.js +437 -0
- package/dist/scan.test.js.map +1 -0
- package/dist/schemas.d.ts +50 -0
- package/dist/schemas.d.ts.map +1 -0
- package/dist/schemas.js +190 -0
- package/dist/schemas.js.map +1 -0
- package/dist/secret-markers.d.ts +34 -0
- package/dist/secret-markers.d.ts.map +1 -0
- package/dist/secret-markers.js +118 -0
- package/dist/secret-markers.js.map +1 -0
- package/dist/secret-markers.test.d.ts +2 -0
- package/dist/secret-markers.test.d.ts.map +1 -0
- package/dist/secret-markers.test.js +154 -0
- package/dist/secret-markers.test.js.map +1 -0
- package/dist/trust-boundary.d.ts +33 -0
- package/dist/trust-boundary.d.ts.map +1 -0
- package/dist/trust-boundary.js +77 -0
- package/dist/trust-boundary.js.map +1 -0
- package/dist/trust-boundary.test.d.ts +2 -0
- package/dist/trust-boundary.test.d.ts.map +1 -0
- package/dist/trust-boundary.test.js +170 -0
- package/dist/trust-boundary.test.js.map +1 -0
- package/dist/types.d.ts +47 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +8 -0
- package/dist/types.js.map +1 -0
- package/dist/working-tree.d.ts +38 -0
- package/dist/working-tree.d.ts.map +1 -0
- package/dist/working-tree.js +133 -0
- package/dist/working-tree.js.map +1 -0
- package/dist/working-tree.test.d.ts +2 -0
- package/dist/working-tree.test.d.ts.map +1 -0
- package/dist/working-tree.test.js +162 -0
- package/dist/working-tree.test.js.map +1 -0
- package/package.json +40 -0
- package/src/age.ts +113 -0
- package/src/audit-log.test.ts +222 -0
- package/src/audit-log.ts +215 -0
- package/src/deny-set.test.ts +208 -0
- package/src/deny-set.ts +231 -0
- package/src/exceptions.ts +134 -0
- package/src/exit-codes.ts +5 -0
- package/src/first-touch.ts +172 -0
- package/src/import-graph.test.ts +239 -0
- package/src/index.ts +191 -0
- package/src/lock.test.ts +151 -0
- package/src/lock.ts +88 -0
- package/src/paths.test.ts +94 -0
- package/src/paths.ts +55 -0
- package/src/redaction.test.ts +81 -0
- package/src/redaction.ts +49 -0
- package/src/regex-safety.test.ts +194 -0
- package/src/regex-safety.ts +349 -0
- package/src/registry-mutate.test.ts +134 -0
- package/src/registry-mutate.ts +185 -0
- package/src/registry.test.ts +460 -0
- package/src/registry.ts +178 -0
- package/src/remote-url.test.ts +121 -0
- package/src/remote-url.ts +78 -0
- package/src/render.test.ts +206 -0
- package/src/render.ts +215 -0
- package/src/repo.test.ts +275 -0
- package/src/repo.ts +245 -0
- package/src/scan.test.ts +580 -0
- package/src/scan.ts +531 -0
- package/src/schemas.ts +207 -0
- package/src/secret-markers.test.ts +183 -0
- package/src/secret-markers.ts +145 -0
- package/src/trust-boundary.test.ts +198 -0
- package/src/trust-boundary.ts +98 -0
- package/src/types.ts +55 -0
- package/src/working-tree.test.ts +193 -0
- package/src/working-tree.ts +130 -0
package/src/scan.ts
ADDED
|
@@ -0,0 +1,531 @@
|
|
|
1
|
+
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
2
|
+
// Copyright (C) 2026 Richard Myers and contributors.
|
|
3
|
+
import { execFileSync, spawnSync } from "node:child_process";
|
|
4
|
+
import {
|
|
5
|
+
closeSync,
|
|
6
|
+
mkdtempSync,
|
|
7
|
+
openSync,
|
|
8
|
+
readFileSync,
|
|
9
|
+
readSync,
|
|
10
|
+
rmSync,
|
|
11
|
+
existsSync,
|
|
12
|
+
statSync,
|
|
13
|
+
realpathSync,
|
|
14
|
+
} from "node:fs";
|
|
15
|
+
import { tmpdir } from "node:os";
|
|
16
|
+
import { isAbsolute, join, relative } from "node:path";
|
|
17
|
+
import type { DenySet } from "./deny-set.js";
|
|
18
|
+
import type { RepoConfig } from "./repo.js";
|
|
19
|
+
import { redactMatch, revealMatch, type RedactionMode } from "./redaction.js";
|
|
20
|
+
import { OutsideWorkingTreeError } from "./exceptions.js";
|
|
21
|
+
|
|
22
|
+
/** Size ceiling (1 MiB) used by `scanFile` when `maxFileBytes` is not supplied. */
const DEFAULT_MAX_FILE_BYTES = 1 << 20;
|
|
23
|
+
|
|
24
|
+
// Number of bytes requested per read while walking the temp file that
// holds `git diff` output. At 64 KiB the buffer stays small without
// letting syscall overhead dominate; lines cut by a read boundary are
// stitched back together by the reader.
const DIFF_STREAM_CHUNK_BYTES = 1 << 16;
|
|
28
|
+
|
|
29
|
+
/** One deny-set match reported by a scanner. */
export interface ScanHit {
  /** Path the scanned text came from; absent when the scanner had no path to report. */
  path?: string;
  /**
   * 1-indexed line number of the matching line. For diff scans this is the
   * position within the stream of added lines, not a line in any file.
   */
  line: number;
  /** 1-indexed column at which the match starts within the line. */
  column: number;
  /** The matched text after redaction (or as returned by the reveal helper when requested). */
  matchPreview: string;
  /**
   * Stem of the marker file (an engagement id, or `_always`) that supplied
   * the matching pattern. Populated only when the deny set provides
   * `patternSources`; left out for deny sets without attribution data
   * (synthetic test fixtures, older callers).
   */
  engagement?: string;
}
|
|
42
|
+
|
|
43
|
+
/** Record of a file that a scanner declined to scan, and why. */
export interface SkippedFile {
  /** Path of the file that was skipped. */
  path: string;
  /**
   * Why the file was not scanned: it looked binary, it exceeded the size
   * limit, or it could not be found or read.
   */
  reason: "binary" | "too-large" | "unreadable";
  /** File size in bytes, when it was known at the time of the skip. */
  bytes?: number;
}
|
|
48
|
+
|
|
49
|
+
/** Tuning knobs shared by the scanners in this module. */
export interface ScanOptions {
  /** When true, match previews are produced by `revealMatch` instead of being redacted. */
  revealMatches?: boolean;
  /** Redaction mode applied when `revealMatches` is not set. Default: `"preview"`. */
  redactionMode?: RedactionMode;
  /** Files larger than this many bytes are skipped by `scanFile`. Default: 1 MiB. */
  maxFileBytes?: number;
  /** When true, treat lines containing `repo-aegis: allow` as suppressed. Default: true. */
  respectAllowComments?: boolean;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Matches the inline suppression token `repo-aegis: allow`
 * (case-insensitive, optional whitespace after the colon). Free text may
 * follow the token as a reason, e.g.
 * `// repo-aegis: allow — synthetic test fixture`. The token is
 * deliberately longer than a bare `allow` so that unrelated comments do
 * not suppress hits by accident.
 */
export const ALLOW_COMMENT = /repo-aegis:\s*allow\b/i;
|
|
65
|
+
|
|
66
|
+
/**
 * Turn a matched literal into the text placed in a hit. Uses
 * `revealMatch` when the caller opted into `revealMatches`; otherwise
 * redacts with the requested mode, falling back to `"preview"`.
 */
function formatMatch(literal: string, opts: ScanOptions): string {
  return opts.revealMatches
    ? revealMatch(literal)
    : redactMatch(literal, opts.redactionMode ?? "preview");
}
|
|
70
|
+
|
|
71
|
+
/**
 * Work out which deny-set pattern is responsible for `matched` and
 * return the corresponding entry of `patternSources`.
 *
 * Patterns are tried in declaration order, each compiled
 * case-insensitively and tested against the matched text; the first one
 * that matches wins. A pattern that fails to compile is treated as a
 * non-match rather than an error.
 *
 * @returns The source for the first matching pattern, or `undefined`
 *   when the deny set has no `patternSources`, when its length differs
 *   from `patterns`, or when no pattern matches.
 */
function attributeMatch(matched: string, denySet: DenySet): string | undefined {
  const { patterns, patternSources } = denySet;
  if (!patternSources || patternSources.length !== patterns.length) {
    return undefined;
  }
  const winner = patterns.findIndex((pattern) => {
    try {
      return new RegExp(pattern, "i").test(matched);
    } catch {
      // Malformed pattern slipped past validation; treat as a non-match.
      return false;
    }
  });
  return winner === -1 ? undefined : patternSources[winner];
}
|
|
96
|
+
|
|
97
|
+
/**
 * Scan a body of text line by line against the deny set's combined
 * regex (case-insensitive). This is the general-purpose primitive that
 * the file scanner delegates to once it has text in hand.
 *
 * At most one hit is produced per line (the first match on that line).
 * Lines carrying the `repo-aegis: allow` token are skipped unless
 * `opts.respectAllowComments` is `false`.
 *
 * @param text    Text to scan; split on `\n`.
 * @param denySet Deny set; an empty `combinedRegex` yields no hits.
 * @param path    Optional path copied onto each hit.
 * @param opts    Redaction and suppression options.
 * @returns Hits with 1-indexed line and column numbers.
 */
export function scanText(
  text: string,
  denySet: DenySet,
  path?: string,
  opts: ScanOptions = {},
): ScanHit[] {
  if (!denySet.combinedRegex) return [];

  const matcher = new RegExp(denySet.combinedRegex, "i");
  const honourAllow = opts.respectAllowComments !== false;
  const found: ScanHit[] = [];

  text.split("\n").forEach((line, idx) => {
    const match = matcher.exec(line);
    if (!match || !match[0]) return;
    if (honourAllow && ALLOW_COMMENT.test(line)) return;

    const engagement = attributeMatch(match[0], denySet);
    found.push({
      ...(path !== undefined && { path }),
      line: idx + 1,
      column: match.index + 1,
      matchPreview: formatMatch(match[0], opts),
      ...(engagement !== undefined && { engagement }),
    });
  });

  return found;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Scan a single file from disk.
 *
 * The path is canonicalised with `realpathSync` so that symlinks cannot
 * be used to point the scanner at a different location. When
 * `workingTree` is given, a file whose canonical path lies outside the
 * canonical working tree is rejected.
 *
 * @param path        File to scan.
 * @param denySet     Deny set to match against.
 * @param opts        Scan options; `maxFileBytes` caps the file size (default 1 MiB).
 * @param workingTree Optional root the file must live under.
 * @returns Hits (carrying the canonical path) plus at most one skip
 *   record: `"unreadable"` when the file is missing or cannot be
 *   resolved, stat'ed or read; `"too-large"` when it exceeds the size
 *   cap; `"binary"` when it contains a NUL byte near the start.
 * @throws OutsideWorkingTreeError when the file resolves outside `workingTree`.
 */
export function scanFile(
  path: string,
  denySet: DenySet,
  opts: ScanOptions = {},
  workingTree?: string,
): { hits: ScanHit[]; skipped: SkippedFile[] } {
  const skipped: SkippedFile[] = [];
  if (!existsSync(path)) {
    skipped.push({ path, reason: "unreadable" });
    return { hits: [], skipped };
  }
  let real: string;
  try {
    real = realpathSync(path);
  } catch {
    skipped.push({ path, reason: "unreadable" });
    return { hits: [], skipped };
  }
  if (workingTree) {
    const wtReal = realpathSync(workingTree);
    const rel = relative(wtReal, real);
    // Only a leading `..` path *segment* means "outside the tree". A bare
    // prefix test would also reject in-tree files whose name merely
    // begins with two dots (e.g. `..notes`). Both separators are checked
    // so the test holds for POSIX and Windows path styles.
    const escapesTree =
      rel === ".." ||
      rel.startsWith("../") ||
      rel.startsWith("..\\") ||
      isAbsolute(rel);
    if (escapesTree) {
      throw new OutsideWorkingTreeError(real, wtReal);
    }
  }
  // The file can disappear between the existence check and here; report
  // that as unreadable rather than letting the stat error escape.
  let size: number;
  try {
    size = statSync(real).size;
  } catch {
    skipped.push({ path: real, reason: "unreadable" });
    return { hits: [], skipped };
  }
  const max = opts.maxFileBytes ?? DEFAULT_MAX_FILE_BYTES;
  if (size > max) {
    skipped.push({ path: real, reason: "too-large", bytes: size });
    return { hits: [], skipped };
  }
  let buf: Buffer;
  try {
    buf = readFileSync(real);
  } catch {
    skipped.push({ path: real, reason: "unreadable" });
    return { hits: [], skipped };
  }
  if (looksBinary(buf)) {
    skipped.push({ path: real, reason: "binary", bytes: size });
    return { hits: [], skipped };
  }
  const text = buf.toString("utf8");
  return { hits: scanText(text, denySet, real, opts), skipped };
}
|
|
181
|
+
|
|
182
|
+
/**
 * Run `git diff <args>` and scan the added-line content it produces.
 *
 * git's stdout is redirected straight into a file inside a fresh temp
 * directory, so this process never holds the whole diff in one buffer;
 * the file is then handed to `scanDiffFile`, which reads it back in
 * fixed-size chunks. The temp directory is removed (best-effort) on
 * every exit path.
 *
 * Unified-diff handling is hand-rolled (it replaced an earlier
 * `parse-diff`-based `extractAdditions`) so the diff can be streamed
 * instead of loaded whole. The rules applied are:
 *   - `diff --git`, `--- a/<x>`, `+++ b/<x>` headers: skipped, not content
 *   - `@@ ... @@` hunk headers: switch on the "in-chunk" state
 *   - `+`-prefixed lines inside a chunk: added content (leading `+` stripped)
 *   - `-` and ` ` lines: removed/context, ignored
 *   - `\ No newline at end of file` markers: ignored
 *   - binary-diff stanzas: contain no `@@`, so chunk state is never entered
 *
 * Hit line numbers are 1-indexed over the synthetic stream of added
 * lines, matching the earlier behaviour where additions were joined
 * with `\n` and numbered by split index.
 *
 * @param cwd     Directory in which git is run.
 * @param args    Arguments placed after `git diff`.
 * @param denySet Deny set; an empty `combinedRegex` short-circuits to no hits.
 * @param opts    Scan options forwarded to the line scanner.
 * @throws The spawn error if git could not be started, or an `Error`
 *   carrying git's stderr when it exits with a non-zero status.
 */
function streamScanDiff(
  cwd: string,
  args: readonly string[],
  denySet: DenySet,
  opts: ScanOptions,
): ScanHit[] {
  if (!denySet.combinedRegex) return [];

  const scratchDir = mkdtempSync(join(tmpdir(), "repo-aegis-diff-"));
  const patchFile = join(scratchDir, "diff.patch");
  // Non-null only while the write descriptor is open, so the `finally`
  // block knows whether a close is still owed.
  let writeFd: number | null = null;
  try {
    writeFd = openSync(patchFile, "w");
    const result = spawnSync("git", ["diff", ...args], {
      cwd,
      stdio: ["ignore", writeFd, "pipe"],
    });
    closeSync(writeFd);
    writeFd = null;

    if (result.error) throw result.error;
    if (result.status !== 0) {
      let detail = "";
      if (result.stderr instanceof Buffer) {
        detail = result.stderr.toString("utf8");
      } else if (typeof result.stderr === "string") {
        detail = result.stderr;
      }
      throw new Error(`git diff exited ${result.status ?? "?"}: ${detail.trim()}`);
    }
    return scanDiffFile(patchFile, denySet, opts);
  } finally {
    if (writeFd !== null) {
      try {
        closeSync(writeFd);
      } catch {
        /* best-effort */
      }
    }
    try {
      rmSync(scratchDir, { recursive: true, force: true });
    } catch {
      /* best-effort cleanup */
    }
  }
}
|
|
254
|
+
|
|
255
|
+
/**
 * Walk a unified-diff file in fixed-size reads, handing each complete
 * line to `processDiffLine`. Memory use is bounded by the read-chunk
 * size plus whatever partial line is being carried between reads.
 *
 * The carry-over is kept as raw bytes and text is decoded only up to
 * the last newline of each read. A newline byte never occurs inside a
 * multi-byte UTF-8 sequence, so a character that straddles a read
 * boundary is always decoded whole instead of being turned into
 * replacement characters (which could hide a non-ASCII marker).
 *
 * @param path    Path of the diff file to read.
 * @param denySet Deny set whose `combinedRegex` is applied case-insensitively.
 * @param opts    Scan options (redaction, allow-comment handling).
 * @returns Hits numbered over the stream of added lines.
 */
function scanDiffFile(
  path: string,
  denySet: DenySet,
  opts: ScanOptions,
): ScanHit[] {
  const re = new RegExp(denySet.combinedRegex, "i");
  const respectAllow = opts.respectAllowComments !== false;
  const hits: ScanHit[] = [];
  let inChunk = false;
  let virtualLine = 0; // 1-indexed counter of added-content lines emitted
  const NEWLINE = 0x0a;

  const fd = openSync(path, "r");
  try {
    const buf = Buffer.alloc(DIFF_STREAM_CHUNK_BYTES);
    // Bytes of a partial line left over from the previous read.
    let carry: Buffer = Buffer.alloc(0);
    while (true) {
      const n = readSync(fd, buf, 0, buf.length, null);
      if (n === 0) break;
      const data: Buffer =
        carry.length > 0
          ? Buffer.concat([carry, buf.subarray(0, n)])
          : buf.subarray(0, n);
      const lastNewline = data.lastIndexOf(NEWLINE);
      if (lastNewline === -1) {
        // No complete line yet. Copy, because `data` may be a view onto
        // `buf`, which the next read overwrites.
        carry = Buffer.from(data);
        continue;
      }
      const complete = data.toString("utf8", 0, lastNewline);
      carry = Buffer.from(data.subarray(lastNewline + 1));
      for (const line of complete.split("\n")) {
        ({ inChunk, virtualLine } = processDiffLine(
          line,
          inChunk,
          virtualLine,
          re,
          denySet,
          respectAllow,
          opts,
          hits,
        ));
      }
    }
    // Final line without a trailing newline.
    if (carry.length > 0) {
      processDiffLine(
        carry.toString("utf8"),
        inChunk,
        virtualLine,
        re,
        denySet,
        respectAllow,
        opts,
        hits,
      );
    }
  } finally {
    closeSync(fd);
  }
  return hits;
}
|
|
306
|
+
|
|
307
|
+
/**
 * Examine one line of unified-diff output.
 *
 * `diff --…` file headers switch the "in chunk" state off and `@@` hunk
 * headers switch it on. Outside a chunk every other line (including the
 * `--- a/<x>` / `+++ b/<x>` file headers) is ignored. Inside a chunk
 * only `+`-prefixed lines are added content: the leading `+` is
 * stripped, the added-line counter advances, and the deny-set regex is
 * applied. Removed lines, context lines and the `\ No newline at end of
 * file` marker never start with `+` and are ignored.
 *
 * `+++ ` is deliberately NOT treated as a header while inside a chunk:
 * there it can only be an added line whose own text begins with `++ `,
 * and skipping it would let that line bypass the scan.
 *
 * @param line         Raw diff line, without its trailing newline.
 * @param inChunk      Whether a hunk header has been seen for the current file.
 * @param virtualLine  Count of added-content lines seen so far.
 * @param re           Compiled combined deny-set regex.
 * @param denySet      Deny set, used for engagement attribution.
 * @param respectAllow Whether `repo-aegis: allow` lines are suppressed.
 * @param opts         Scan options used to format the match preview.
 * @param hits         Output array; a hit is appended when the line matches.
 * @returns The updated `(inChunk, virtualLine)` state for the caller.
 */
function processDiffLine(
  line: string,
  inChunk: boolean,
  virtualLine: number,
  re: RegExp,
  denySet: DenySet,
  respectAllow: boolean,
  opts: ScanOptions,
  hits: ScanHit[],
): { inChunk: boolean; virtualLine: number } {
  // File-level headers reset chunk state; they are never content. Lines
  // inside a chunk always begin with `+`, `-`, ` ` or `\`, so a line
  // starting with `diff --` can only be a header.
  if (line.startsWith("diff --")) return { inChunk: false, virtualLine };
  if (line.startsWith("@@")) return { inChunk: true, virtualLine };
  // Everything between a file header and its first hunk (index line,
  // `---` / `+++` headers, binary notices) is metadata.
  if (!inChunk) return { inChunk, virtualLine };
  // Inside a chunk only `+`-prefixed lines are added content.
  if (!line.startsWith("+")) return { inChunk, virtualLine };

  const content = line.slice(1);
  const next = virtualLine + 1;
  const m = content.match(re);
  if (!m || !m[0]) return { inChunk, virtualLine: next };
  if (respectAllow && ALLOW_COMMENT.test(content)) {
    return { inChunk, virtualLine: next };
  }
  const engagement = attributeMatch(m[0], denySet);
  hits.push({
    line: next,
    column: (m.index ?? 0) + 1,
    matchPreview: formatMatch(m[0], opts),
    ...(engagement !== undefined && { engagement }),
  });
  return { inChunk, virtualLine: next };
}
|
|
352
|
+
|
|
353
|
+
/**
 * Scan what is currently staged in a git repository; the pre-commit
 * hook entry point.
 *
 * Runs `git diff --cached --diff-filter=ACM -U0 --no-color` through the
 * streaming diff scanner, so the diff is spooled to a temp file and
 * read back through a small buffer instead of being held in memory
 * whole.
 *
 * @returns Hits from added lines. `skipped` is always empty. Both are
 *   empty when `repo` is not a git repository or the deny set has no
 *   combined regex.
 */
export function scanStagedDiff(
  repo: RepoConfig,
  denySet: DenySet,
  opts: ScanOptions = {},
): { hits: ScanHit[]; skipped: SkippedFile[] } {
  const skipped: SkippedFile[] = [];
  if (!repo.isGitRepo || !denySet.combinedRegex) {
    return { hits: [], skipped };
  }
  const stagedArgs = ["--cached", "--diff-filter=ACM", "-U0", "--no-color"] as const;
  return { hits: streamScanDiff(repo.cwd, stagedArgs, denySet, opts), skipped };
}
|
|
374
|
+
|
|
375
|
+
/**
 * Scan the diff over an arbitrary git range (e.g. `main..HEAD`,
 * `<remote-sha>..<local-sha>`); the pre-push hook entry point.
 *
 * Only added-line content is scanned. The diff is streamed through a
 * temp file, in the same way as the staged-diff scan.
 *
 * The range is passed to `git diff` as a positional argument, so a
 * value beginning with `-` would be parsed by git as an option (for
 * example `--output=<file>`) rather than as a revision. Such values are
 * rejected up front.
 *
 * @param range Revision range handed to `git diff`.
 * @returns Hits from added lines; `skipped` is always empty. Both are
 *   empty when `repo` is not a git repository or the deny set has no
 *   combined regex.
 * @throws Error when `range` starts with `-`, or (propagated from the
 *   diff runner) when `git diff` fails, e.g. for an unknown range.
 */
export function scanRange(
  repo: RepoConfig,
  denySet: DenySet,
  range: string,
  opts: ScanOptions = {},
): { hits: ScanHit[]; skipped: SkippedFile[] } {
  if (!repo.isGitRepo) return { hits: [], skipped: [] };
  if (!denySet.combinedRegex) return { hits: [], skipped: [] };
  if (range.startsWith("-")) {
    throw new Error(`invalid diff range (must not start with "-"): ${range}`);
  }
  const hits = streamScanDiff(
    repo.cwd,
    [range, "--diff-filter=ACM", "-U0", "--no-color"],
    denySet,
    opts,
  );
  return { hits, skipped: [] };
}
|
|
399
|
+
|
|
400
|
+
/** One (pattern, commit) pairing found by the history scan. */
export interface HistoryHit {
  /** The deny-set pattern that matched, redacted unless reveal was requested. */
  pattern: string;
  /** Full hash of the commit whose diff matched (git's `%H`). */
  commitSha: string;
  /** Subject line of that commit (git's `%s`). */
  commitSummary: string;
}
|
|
405
|
+
|
|
406
|
+
/** Options for the history scan: the common scan options plus a lower bound. */
export interface ScanHistoryOptions extends ScanOptions {
  /**
   * Lower-bound revspec, e.g. `"main"`, `"v1.0.0"`, `"HEAD~100"`. It is
   * handed to `git log` as `<since>..`, so only commits reachable from
   * the bound forward are scanned. When omitted, the full history is
   * scanned (the design's default).
   */
  since?: string;
}
|
|
412
|
+
|
|
413
|
+
/**
 * Scan git history with a single
 * `git log --regexp-ignore-case -G <combined> -p` invocation, then
 * attribute matches per pattern by walking each commit's diff text.
 * Returns one HistoryHit per (pattern, commit) pair. Use `opts.since`
 * to bound the lower edge of the history that is searched.
 *
 * Cost scales as O(history-size + patterns × hits). The patterns are
 * joined with `|` into one regex for `-G`, so there is one git
 * invocation regardless of pattern count. `--regexp-ignore-case` makes
 * git's commit filter case-insensitive, in line with the
 * case-insensitive per-pattern attribution done in-process; without it,
 * commits containing a differently-cased occurrence are never seen.
 *
 * Only `+` / `-` lines inside a hunk (after an `@@` header) count as
 * content, so `--- a/<x>` / `+++ b/<x>` file headers are never
 * attributed while real content that happens to start with `--` or `++`
 * still is.
 *
 * The pattern field is redacted by default (preview mode) — same policy
 * as scan hits. Pass `revealMatches: true` to opt into literals (NEVER
 * from a hook).
 *
 * @returns The hits found. An empty array is also returned when `repo`
 *   is not a git repository, the deny set has no patterns, `since`
 *   starts with `-`, or the git invocation fails for any reason.
 */
export function scanHistory(
  repo: RepoConfig,
  denySet: DenySet,
  opts: ScanHistoryOptions = {},
): HistoryHit[] {
  if (!repo.isGitRepo) return [];
  if (denySet.patterns.length === 0) return [];
  // A bound beginning with "-" would be parsed by git as an option, not
  // a revision. Treat it like any other revspec git rejects: no hits.
  if (opts.since?.startsWith("-")) return [];

  // Combine all patterns into a single -G regex. This matches any commit
  // whose diff (added or removed line content) contains at least one
  // pattern; the specific pattern(s) are attributed below.
  const combined = denySet.patterns.join("|");
  // The marker gives a stable boundary between commits: diff body lines
  // start with `diff`, `@@`, `+`, `-`, ` ` or `\`, never with the marker.
  const commitMarker = "__COMMIT__:";
  const args = [
    "log",
    "--regexp-ignore-case",
    "-G",
    combined,
    "-p",
    "--no-color",
    `--format=${commitMarker}%H %s`,
  ];
  if (opts.since) {
    args.push(`${opts.since}..`);
  }
  let stdout = "";
  try {
    stdout = execFileSync("git", args, {
      cwd: repo.cwd,
      encoding: "utf8",
      stdio: ["ignore", "pipe", "ignore"],
      maxBuffer: 256 * 1024 * 1024,
    });
  } catch {
    // NOTE(review): any git failure, including output larger than
    // maxBuffer, is reported as "no hits". Kept as-is because callers
    // rely on this function not throwing.
    return [];
  }

  // Pre-compile per-pattern regexes once for attribution; a pattern that
  // does not compile as a JavaScript regex is skipped.
  const perPatternRegexes: (RegExp | null)[] = denySet.patterns.map((p) => {
    try {
      return new RegExp(p, "i");
    } catch {
      return null;
    }
  });

  const hits: HistoryHit[] = [];
  let curSha = "";
  let curSummary = "";
  // True once an `@@` header has been seen for the current file.
  let inHunk = false;
  // (pattern-index, commit) pairs already emitted; several lines in one
  // commit can hit the same pattern.
  const emitted = new Set<string>();
  for (const line of stdout.split("\n")) {
    if (line.startsWith(commitMarker)) {
      const rest = line.slice(commitMarker.length);
      const sp = rest.indexOf(" ");
      curSha = sp >= 0 ? rest.slice(0, sp) : rest;
      curSummary = sp >= 0 ? rest.slice(sp + 1) : "";
      inHunk = false;
      continue;
    }
    if (!curSha) continue;
    // A new file header ends the previous file's hunks.
    if (line.startsWith("diff ")) {
      inHunk = false;
      continue;
    }
    if (line.startsWith("@@")) {
      inHunk = true;
      continue;
    }
    // Outside a hunk only metadata appears (index line, `---` / `+++`
    // file headers, mode changes, binary notices).
    if (!inHunk) continue;
    if (line.length === 0) continue;
    // -G matches both added and removed line content; attribute either kind.
    const c0 = line.charCodeAt(0);
    // 43 = '+', 45 = '-'
    if (c0 !== 43 && c0 !== 45) continue;
    const content = line.slice(1);
    for (let i = 0; i < denySet.patterns.length; i++) {
      const re = perPatternRegexes[i];
      if (!re) continue;
      if (!re.test(content)) continue;
      const key = `${i}:${curSha}`;
      if (emitted.has(key)) continue;
      emitted.add(key);
      hits.push({
        pattern: formatMatch(denySet.patterns[i]!, opts),
        commitSha: curSha,
        commitSummary: curSummary,
      });
    }
  }
  return hits;
}
|
|
523
|
+
|
|
524
|
+
/**
 * Heuristic binary detector: a NUL byte anywhere in the first 8 KiB of
 * the buffer is taken as a strong sign that the content is not text.
 *
 * @param buf File contents.
 * @returns `true` when a zero byte occurs within the first 8192 bytes.
 */
function looksBinary(buf: Buffer): boolean {
  const SAMPLE_BYTES = 8192;
  // `subarray` clamps the end to the buffer length, so short buffers are fine.
  return buf.subarray(0, SAMPLE_BYTES).indexOf(0) !== -1;
}
|