pkgxray 0.6.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/audit.js CHANGED
@@ -12,7 +12,7 @@ function printUsage() {
12
12
  " pkgxray < evidence.json",
13
13
  " pkgxray --format json < evidence.json",
14
14
  " pkgxray --file evidence.json --format markdown",
15
- " pkgxray guard <npm-package|npm:name@version|./path> [--promote-to dir] [--no-source-scan]",
15
+ " pkgxray guard <npm-package|npm:name@version|github:owner/repo[#ref]|./path> [--promote-to dir] [--no-source-scan]",
16
16
  "",
17
17
  "Evidence JSON fields:",
18
18
  " packageName, npmMetadata, githubMetadata, webPresence, sourceFiles",
@@ -51,6 +51,9 @@ function parseArgs(argv) {
51
51
  options.vulnerabilityCheck = false;
52
52
  } else if (arg === "--no-github") {
53
53
  options.githubMetadata = false;
54
+ options.githubDiff = false;
55
+ } else if (arg === "--no-github-diff") {
56
+ options.githubDiff = false;
54
57
  } else {
55
58
  throw new Error(`Unknown argument: ${arg}`);
56
59
  }
package/bin/mcp-server.js CHANGED
@@ -107,6 +107,11 @@ function guardToolDefinition() {
107
107
  default: true,
108
108
  description: "Set false to skip the GitHub provenance cross-check."
109
109
  },
110
+ githubDiff: {
111
+ type: "boolean",
112
+ default: true,
113
+ description: "Set false to skip the npm-vs-GitHub source diff (saves a tarball download)."
114
+ },
110
115
  outputFormat: {
111
116
  type: "string",
112
117
  enum: ["markdown", "json"],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pkgxray",
3
- "version": "0.6.0",
3
+ "version": "0.8.0",
4
4
  "description": "Zero-dep local CLI and MCP server that scans npm packages for supply-chain risk. OSV vuln pre-check, sandboxed quarantine, tarball-integrity verification, calibrated static heuristics, GitHub provenance cross-check.",
5
5
  "license": "MIT",
6
6
  "author": "Jack Adams-Lovell",
package/src/auditor.js CHANGED
@@ -13,25 +13,24 @@ const SEVERITY_ORDER = {
13
13
  high: 3
14
14
  };
15
15
 
16
+ // Suspicious credential / wallet read targets. Each entry is a regex that
17
+ // requires a path or quote boundary so we don't match identifiers like
18
+ // `process.env` or `someObj.ledger`.
16
19
  const SUSPICIOUS_READ_TARGETS = [
17
- "~/.ssh",
18
- ".ssh/",
19
- "id_rsa",
20
- "id_dsa",
21
- "id_ecdsa",
22
- "id_ed25519",
23
- "~/.aws",
24
- ".aws/credentials",
25
- ".npmrc",
26
- ".env",
27
- "keychain",
28
- "login.keychain",
29
- "cookies.sqlite",
30
- "local state",
31
- "metamask",
32
- "electrum",
33
- "exodus",
34
- "ledger"
20
+ { re: /['"`\/\\]\.?ssh\/(?:id_(?:rsa|dsa|ecdsa|ed25519)|authorized_keys)/i, label: "ssh-private-key" },
21
+ { re: /['"`\/\\]id_(?:rsa|dsa|ecdsa|ed25519)\b/i, label: "ssh-key-file" },
22
+ { re: /['"`\/\\]\.ssh(?:\/|['"`])/, label: ".ssh-dir" },
23
+ { re: /['"`\/\\]\.aws\/credentials\b/, label: ".aws/credentials" },
24
+ { re: /['"`\/\\]\.aws\/(?:config|credentials)\b/, label: ".aws-files" },
25
+ { re: /['"`\/\\]\.npmrc(?:['"`]|\s|$)/, label: ".npmrc" },
26
+ { re: /['"`\/\\]\.env(?:\.[a-z]+)?(?:['"`]|\s|$)/i, label: ".env-file" },
27
+ { re: /['"`]login\.keychain(?:-db)?['"`]/i, label: "macOS keychain" },
28
+ { re: /\bsecurity\s+find-(?:generic|internet)-password\b/, label: "macOS security CLI" },
29
+ { re: /['"`]\/?(?:Cookies|Login Data|Web Data|cookies\.sqlite)['"`]/i, label: "browser-creds" },
30
+ { re: /['"`]Local State['"`]/, label: "browser local-state" },
31
+ { re: /\bkeytar\.[a-z]+Password\(/i, label: "keytar API" },
32
+ { re: /\bmetamask['"`\s\/]/i, label: "metamask wallet" },
33
+ { re: /\b(?:electrum|exodus|ledger live|atomic wallet)\b/i, label: "crypto wallet" }
35
34
  ];
36
35
 
37
36
  // Persistence destinations. Each pattern requires a quote/slash boundary
@@ -158,7 +157,8 @@ function normalizeEvidence(input) {
158
157
  evidence.knownVulnerabilities || evidence.vulnerabilities || evidence.osvVulnerabilities || [],
159
158
  sourceFiles: normalizeSourceFiles(
160
159
  evidence.sourceFiles || evidence.SOURCE_FILES || evidence.files || {}
161
- )
160
+ ),
161
+ npmVsGithubDiff: evidence.npmVsGithubDiff || null
162
162
  };
163
163
  }
164
164
 
@@ -244,7 +244,9 @@ const BAND_DEFINITIONS = [
244
244
  { band: "github-archived", label: "github-archived", categories: ["github-archived"], rationale: "Linked repository is archived or disabled — no maintenance, security issues will not be fixed." },
245
245
  { band: "github-young", label: "github-young", categories: ["github-young"], rationale: "Linked repository was created within the last 30 days — common slopsquat shape." },
246
246
  { band: "github-lonely", label: "github-lonely", categories: ["github-lonely"], rationale: "0 stars + 0 forks + low watcher count on a young repo. Low community signal." },
247
- { band: "github-stale", label: "github-stale", categories: ["github-stale"], rationale: "Repository hasn't been pushed to in over two years and isn't formally archived." }
247
+ { band: "github-stale", label: "github-stale", categories: ["github-stale"], rationale: "Repository hasn't been pushed to in over two years and isn't formally archived." },
248
+ { band: "npm-vs-github-divergence", label: "npm-vs-github-divergence", categories: ["npm-vs-github-divergence"], rationale: "Published npm tarball contains source files that aren't in (or differ from) the linked GitHub repo at the matching ref. Strong account-takeover / build-tampering signal." },
249
+ { band: "npm-vs-github-clean", label: "npm-vs-github-clean", categories: ["npm-vs-github-clean"], rationale: "npm tarball matches the linked GitHub repo at the published version." }
248
250
  ];
249
251
 
250
252
  const SEVERITY_RANK = { info: 0, low: 1, medium: 2, high: 3 };
@@ -292,6 +294,54 @@ function auditMetadata(evidence, findings) {
292
294
  inspectMetadataObject("NPM_METADATA", evidence.npmMetadata, findings);
293
295
  inspectGithubMetadata(evidence, findings);
294
296
  inspectKnownVulnerabilities(evidence.knownVulnerabilities, findings);
297
+ inspectNpmVsGithubDiff(evidence, findings);
298
+ }
299
+
300
/**
 * Translate the npm-vs-GitHub tree comparison into audit findings.
 *
 * Pushes onto `findings`:
 *   - one "high" `npm-vs-github-divergence` finding when the tarball holds
 *     source files that are absent from the GitHub ref,
 *   - one "high" `npm-vs-github-divergence` finding when source files differ
 *     in content,
 *   - one "info" `npm-vs-github-clean` finding when neither of the above
 *     applies AND at least one file actually matched.
 *
 * Does nothing when no comparison was performed (`compared` falsy).
 *
 * @param {object} evidence - normalized evidence; reads `evidence.npmVsGithubDiff`.
 * @param {object[]} findings - array mutated in place.
 */
function inspectNpmVsGithubDiff(evidence, findings) {
  const diff = evidence.npmVsGithubDiff;
  if (!diff || !diff.compared) {
    // Not gating — silent skip. Common reasons: no github repo,
    // ref not found, github fetch failed.
    return;
  }
  const c = diff.counts || {};
  const ref = diff.githubRef || "unknown-ref";

  // Up to five example paths of the given category, formatted as a snippet
  // suffix. Returns "" when there is nothing to show so the snippet does not
  // end in a dangling ": ".
  const exampleSuffix = (list, category) => {
    const paths = (Array.isArray(list) ? list : [])
      .filter((f) => f && f.category === category)
      .slice(0, 5)
      .map((f) => f.path);
    return paths.length > 0 ? `: ${paths.join(", ")}` : "";
  };

  if (c.extraSource > 0) {
    findings.push({
      severity: "high",
      category: "npm-vs-github-divergence",
      file: "NPM_VS_GITHUB",
      snippet: `npm tarball contains ${c.extraSource} source file(s) not in the linked GitHub repo @${ref}${exampleSuffix(diff.suspiciousExtras, "extra-source")}`,
      rationale:
        "Source files present in the published tarball but absent from the matching GitHub ref. Classic account-takeover / build-server-compromise signal."
    });
  }
  if (c.mismatchedSource > 0) {
    findings.push({
      severity: "high",
      category: "npm-vs-github-divergence",
      file: "NPM_VS_GITHUB",
      snippet: `${c.mismatchedSource} source file(s) differ between npm tarball and GitHub repo @${ref}${exampleSuffix(diff.suspiciousMismatches, "content-mismatch-source")}`,
      rationale:
        "Source files with the same path but different SHA256 in the published tarball vs the linked GitHub repo at the matching ref. Strong tampering signal."
    });
  }
  // Only vouch for the tarball when something was positively matched; a
  // comparison with zero matching files proves nothing about the contents.
  if (c.extraSource === 0 && c.mismatchedSource === 0 && c.matched > 0) {
    findings.push({
      severity: "info",
      category: "npm-vs-github-clean",
      file: "NPM_VS_GITHUB",
      snippet: `${c.matched}/${c.npmFiles} files match GitHub @${ref}`,
      rationale: "npm tarball source files match the linked GitHub repo at the matching ref."
    });
  }
}
296
346
 
297
347
  const YOUNG_REPO_DAYS = 30;
@@ -607,16 +657,16 @@ const BULK_ENV_REGEXES = [
607
657
 
608
658
  function inspectCredentialAccess(file, content, lower, findings) {
609
659
  for (const target of SUSPICIOUS_READ_TARGETS) {
610
- const index = lower.indexOf(target.toLowerCase());
611
- if (index === -1) continue;
612
- if (!looksLikeCredentialRead(content, lower, index)) continue;
660
+ const match = target.re.exec(content);
661
+ if (!match) continue;
662
+ if (!looksLikeCredentialRead(content, lower, match.index)) continue;
613
663
  findings.push({
614
664
  severity: "high",
615
665
  category: "credential-access",
616
666
  file: file.path,
617
- snippet: clipAround(file.content, index),
667
+ snippet: clipAround(file.content, match.index),
618
668
  rationale:
619
- "Package reads (or constructs a path to) a credential / wallet / key store in proximity to a filesystem read primitive."
669
+ `Reads or references ${target.label} near a filesystem read primitive.`
620
670
  });
621
671
  return;
622
672
  }
package/src/diff.js ADDED
@@ -0,0 +1,281 @@
1
+ "use strict";
2
+
3
+ const fs = require("node:fs");
4
+ const fsp = require("node:fs/promises");
5
+ const path = require("node:path");
6
+ const crypto = require("node:crypto");
7
+
8
+ const SKIP_DIRS = new Set([
9
+ ".git",
10
+ "node_modules",
11
+ ".github",
12
+ ".vscode",
13
+ ".idea",
14
+ "coverage",
15
+ "__pycache__"
16
+ ]);
17
+
18
+ // File patterns that are expected to differ — never used to drive findings.
19
+ const ALWAYS_IGNORE = [
20
+ /(?:^|\/)package\.json$/,
21
+ /(?:^|\/)package-lock\.json$/,
22
+ /(?:^|\/)yarn\.lock$/,
23
+ /(?:^|\/)pnpm-lock\.yaml$/,
24
+ /(?:^|\/)\.npmignore$/,
25
+ /(?:^|\/)\.gitignore$/,
26
+ /(?:^|\/)\.gitattributes$/,
27
+ /(?:^|\/)CHANGELOG(?:\.md)?$/i,
28
+ /(?:^|\/)CONTRIBUTING(?:\.md)?$/i,
29
+ /(?:^|\/)\.npmrc$/
30
+ ];
31
+
32
+ // Patterns that mean "this is build output" — only flagged if no build script
33
+ // exists. With a prepare/prepack script, extras here are expected.
34
+ const BUILD_OUTPUT_PATTERNS = [
35
+ /(?:^|\/)dist\//,
36
+ /(?:^|\/)build\//,
37
+ /(?:^|\/)lib\//,
38
+ /(?:^|\/)es\//,
39
+ /(?:^|\/)esm\//,
40
+ /(?:^|\/)cjs\//,
41
+ /(?:^|\/)umd\//,
42
+ /\.min\.js$/,
43
+ /\.min\.mjs$/,
44
+ /\.min\.css$/,
45
+ /\.d\.ts$/,
46
+ /\.d\.cts$/,
47
+ /\.d\.mts$/,
48
+ /\.js\.map$/,
49
+ /\.css\.map$/
50
+ ];
51
+
52
+ // Source extensions whose contents we care about most. Mismatches or extras
53
+ // here are the strongest ATO signal.
54
+ const SOURCE_EXTENSIONS = new Set([
55
+ ".js", ".cjs", ".mjs", ".jsx",
56
+ ".ts", ".tsx",
57
+ ".vue", ".svelte",
58
+ ".py", ".rb", ".go", ".rs", ".java", ".cs", ".php",
59
+ ".sh", ".ps1", ".bash",
60
+ ".json", ".toml", ".yaml", ".yml"
61
+ ]);
62
+
63
// True when relPath matches any ALWAYS_IGNORE pattern.
function isAlwaysIgnored(relPath) {
  for (const pattern of ALWAYS_IGNORE) {
    if (pattern.test(relPath)) {
      return true;
    }
  }
  return false;
}
66
+
67
// True when relPath matches any BUILD_OUTPUT_PATTERNS entry.
function isBuildOutput(relPath) {
  for (const pattern of BUILD_OUTPUT_PATTERNS) {
    if (pattern.test(relPath)) {
      return true;
    }
  }
  return false;
}
70
+
71
// True when relPath's lower-cased extension is listed in SOURCE_EXTENSIONS.
function isSourceFile(relPath) {
  return SOURCE_EXTENSIONS.has(path.extname(relPath).toLowerCase());
}
75
+
76
/**
 * Walk a directory breadth-first and record size + SHA-256 for each regular file.
 *
 * @param {string} root - directory to walk.
 * @param {string} subdir - optional sub-path under `root`; falsy means `root` itself.
 * @param {{maxFiles?: number, maxBytes?: number, maxFileBytes?: number}} [limits]
 *   caps on hashed file count, total hashed bytes, and per-file size.
 * @returns {Promise<Map<string, {size: number, sha256: string}>|null>}
 *   map keyed by "/"-joined relative path, or null when the base directory
 *   is not accessible. `sha256` is "skipped:<reason>" for files that were
 *   seen but not hashed (too large, over the byte budget, or unreadable).
 */
async function hashTree(root, subdir, limits) {
  const baseDir = subdir ? path.join(root, subdir) : root;
  try {
    await fsp.access(baseDir);
  } catch {
    return null;
  }
  const caps = limits || {};
  const maxFiles = caps.maxFiles || 5000;
  const maxBytes = caps.maxBytes || 50 * 1024 * 1024;
  const maxFileBytes = caps.maxFileBytes || 1024 * 1024;

  const result = new Map();
  const queue = [""];
  let totalBytes = 0;
  let totalFiles = 0;

  // A cursor instead of queue.shift() keeps dequeueing O(1).
  for (
    let cursor = 0;
    cursor < queue.length && totalFiles < maxFiles && totalBytes < maxBytes;
    cursor += 1
  ) {
    const rel = queue[cursor];
    let entries;
    try {
      entries = await fsp.readdir(path.join(baseDir, rel), { withFileTypes: true });
    } catch {
      // Unreadable directory: skip it, keep walking the rest.
      continue;
    }
    for (const entry of entries) {
      const childRel = rel ? `${rel}/${entry.name}` : entry.name;
      if (entry.isDirectory()) {
        if (SKIP_DIRS.has(entry.name)) continue;
        queue.push(childRel);
        continue;
      }
      if (!entry.isFile()) continue;
      const childFull = path.join(baseDir, childRel);
      let stat;
      try {
        stat = await fsp.stat(childFull);
      } catch {
        continue;
      }
      if (stat.size > maxFileBytes) {
        result.set(childRel, { size: stat.size, sha256: "skipped:too-large" });
        continue;
      }
      if (totalBytes + stat.size > maxBytes) {
        // soft cap — record presence without hash
        result.set(childRel, { size: stat.size, sha256: "skipped:tree-budget" });
        continue;
      }
      let hash;
      try {
        hash = await hashFile(childFull);
      } catch {
        // One unreadable file must not abort the whole walk; stat/readdir
        // failures above are tolerated the same way.
        result.set(childRel, { size: stat.size, sha256: "skipped:unreadable" });
        continue;
      }
      result.set(childRel, { size: stat.size, sha256: hash });
      totalBytes += stat.size;
      totalFiles += 1;
      if (totalFiles >= maxFiles) break;
    }
  }
  return result;
}
133
+
134
/**
 * Stream a file through SHA-256.
 *
 * @param {string} filePath - file to read.
 * @returns {Promise<string>} lowercase hex digest; rejects if the read stream errors.
 */
async function hashFile(filePath) {
  const digest = crypto.createHash("sha256");
  for await (const chunk of fs.createReadStream(filePath)) {
    digest.update(chunk);
  }
  return digest.digest("hex");
}
143
+
144
// Pick the category for a file that is extra in, or differs from, the GitHub
// tree. `labels` supplies the build/source/other category names for the kind
// of divergence being classified.
function classifyDivergence(rel, hasBuildScript, labels) {
  if (isBuildOutput(rel)) {
    return hasBuildScript ? "expected-build-output" : labels.build;
  }
  if (!isSourceFile(rel)) {
    return labels.other;
  }
  // With a build script, source-looking files outside these directories are
  // probably bundled / generated, so they are demoted. Files under `src/`,
  // `test(s)/`, `scripts/` and `spec/` should be 1:1 even with a build step.
  const inLikelySourceDir = /^(?:src|tests?|scripts|spec)\//.test(rel);
  return hasBuildScript && !inLikelySourceDir ? "expected-build-output" : labels.source;
}

/**
 * Compare a staged npm package against the matching GitHub repo subtree.
 *
 * @param {object} params
 * @param {string} params.npmStagedPath - extracted npm tarball directory.
 * @param {string} params.githubStagedPath - extracted GitHub tarball directory.
 * @param {string|null} [params.subdir] - sub-path of the repo to compare against.
 * @param {boolean} params.hasBuildScript - whether the package has a build script.
 * @returns {Promise<object>} `{ compared: false, reason, ... }` when the
 *   comparison was not possible or would be unreliable; otherwise
 *   `{ compared: true, counts, suspiciousExtras, suspiciousMismatches, ... }`.
 */
async function diffNpmVsGithub({ npmStagedPath, githubStagedPath, subdir, hasBuildScript }) {
  const limits = { maxFiles: 5000, maxBytes: 50 * 1024 * 1024 };

  // `subdir` is supplied by the caller and may come from package metadata, so
  // it must not be able to point outside the staged GitHub tree (`../..` or
  // an absolute path).
  let safeSubdir = "";
  if (typeof subdir === "string" && subdir.trim() !== "") {
    const treeRoot = path.resolve(githubStagedPath);
    const relative = path.relative(treeRoot, path.resolve(treeRoot, subdir));
    const escapes =
      relative === ".." ||
      relative.startsWith(`..${path.sep}`) ||
      path.isAbsolute(relative);
    if (escapes) {
      return { compared: false, reason: "github-subdir-invalid" };
    }
    safeSubdir = relative;
  }

  const [npmTree, ghTree] = await Promise.all([
    hashTree(npmStagedPath, "", limits),
    hashTree(githubStagedPath, safeSubdir, limits)
  ]);
  if (!npmTree || !ghTree) {
    return {
      compared: false,
      reason: !ghTree ? "github-subdir-missing" : "npm-tree-missing"
    };
  }

  // A tree that reached the file cap may have been cut short. Files missing
  // from a truncated GitHub tree would be reported as "extra in npm", so the
  // comparison is skipped rather than risk false HIGH findings.
  if (npmTree.size >= limits.maxFiles || ghTree.size >= limits.maxFiles) {
    return {
      compared: false,
      reason: "tree-truncated",
      hasBuildScript,
      subdir: subdir || null,
      counts: { npmFiles: npmTree.size, ghFiles: ghTree.size }
    };
  }

  const extraInNpm = [];
  const mismatched = [];
  const matched = [];
  let uncompared = 0;

  for (const [rel, npmEntry] of npmTree.entries()) {
    if (isAlwaysIgnored(rel)) continue;
    const ghEntry = ghTree.get(rel);
    if (!ghEntry) {
      // In the npm tarball but NOT in the github tree.
      const category = classifyDivergence(rel, hasBuildScript, {
        build: "extra-build-output",
        source: "extra-source",
        other: "extra-other"
      });
      extraInNpm.push({ path: rel, category, size: npmEntry.size });
      continue;
    }
    // Entries without a real hash can be neither a match nor a mismatch.
    // Checking this first keeps two "skipped:…" markers from comparing equal
    // and being counted as matched.
    if (npmEntry.sha256.startsWith("skipped") || ghEntry.sha256.startsWith("skipped")) {
      uncompared += 1;
      continue;
    }
    if (npmEntry.sha256 === ghEntry.sha256) {
      matched.push(rel);
      continue;
    }
    const category = classifyDivergence(rel, hasBuildScript, {
      build: "content-mismatch-build",
      source: "content-mismatch-source",
      other: "content-mismatch-other"
    });
    mismatched.push({ path: rel, category, npmSize: npmEntry.size, ghSize: ghEntry.size });
  }

  const extraSource = extraInNpm.filter((f) => f.category === "extra-source");
  const extraBuild = extraInNpm.filter((f) => f.category === "extra-build-output");
  const mismatchedSource = mismatched.filter((f) => f.category === "content-mismatch-source");
  const mismatchedBuild = mismatched.filter((f) => f.category === "content-mismatch-build");

  // Tree-overlap sanity check. Many real packages don't publish a 1:1 mirror
  // of their repo (lodash publishes flat per-function modules, react bundles
  // src/ to root, monorepos publish a subtree). If the npm tarball and the
  // github tree have very little overlap at matching paths, comparing them
  // produces mostly noise. Bail out with an honest reason rather than firing
  // HIGH on legit packages.
  // The denominator is every entry in the npm tree, always-ignored files included.
  const consideredFiles = npmTree.size;
  const overlapRatio = consideredFiles > 0 ? (matched.length + mismatched.length) / consideredFiles : 0;
  const MIN_OVERLAP_RATIO = 0.3;

  const summary = {
    hasBuildScript,
    subdir: subdir || null,
    overlapRatio: Number(overlapRatio.toFixed(2))
  };
  const baseCounts = {
    npmFiles: npmTree.size,
    ghFiles: ghTree.size,
    matched: matched.length,
    mismatched: mismatched.length,
    extraInNpm: extraInNpm.length
  };

  if (overlapRatio < MIN_OVERLAP_RATIO && extraSource.length > 20) {
    return {
      compared: false,
      reason: "tree-layout-differs",
      ...summary,
      counts: { ...baseCounts },
      note:
        "npm tarball and GitHub tree have very little overlap at matching paths — the package is likely built / bundled before publish. Diff would be unreliable; skipping."
    };
  }

  // Second skip case: of the files that DO exist at matching paths, most
  // mismatch. That means the published artifacts are generated from the repo
  // source (typical of `prepublish` / `release-please` / `changesets` build
  // flows that minify or transform entry files). Diff is unreliable here.
  const overlapCount = matched.length + mismatched.length;
  if (overlapCount >= 5 && mismatched.length > matched.length * 2) {
    return {
      compared: false,
      reason: "tree-mostly-generated",
      ...summary,
      counts: { ...baseCounts },
      note:
        "Most overlapping files differ in content — published artifacts are likely generated from the repo source (e.g. bundling, transpilation). Diff would be unreliable; skipping."
    };
  }

  return {
    compared: true,
    ...summary,
    counts: {
      ...baseCounts,
      extraSource: extraSource.length,
      mismatchedSource: mismatchedSource.length,
      uncompared
    },
    suspiciousExtras: extraSource.concat(extraBuild).slice(0, 25),
    suspiciousMismatches: mismatchedSource.concat(mismatchedBuild).slice(0, 25)
  };
}
280
+
281
+ module.exports = { diffNpmVsGithub };
package/src/github.js CHANGED
@@ -1,9 +1,12 @@
1
1
  "use strict";
2
2
 
3
+ const fs = require("node:fs");
3
4
  const fsp = require("node:fs/promises");
4
5
  const os = require("node:os");
5
6
  const path = require("node:path");
6
7
  const https = require("node:https");
8
+ const crypto = require("node:crypto");
9
+ const { spawn } = require("node:child_process");
7
10
 
8
11
  const USER_AGENT = "pkgxray/0.6.0";
9
12
  const CACHE_DIR = path.join(os.homedir(), ".cache", "pkgxray", "github");
@@ -160,7 +163,135 @@ async function fetchRepoMetadata(repository, options = {}) {
160
163
  }
161
164
  }
162
165
 
166
+ // ---- Tarball download + ref resolution ----
167
+
168
+ const TARBALL_CACHE_DIR = path.join(os.homedir(), ".cache", "pkgxray", "tarballs");
169
+ const TARBALL_TTL_MS = 24 * 60 * 60 * 1000; // 24h
170
+ const TARBALL_TIMEOUT_MS = 15000;
171
+ const MAX_TARBALL_BYTES = 100 * 1024 * 1024; // 100MB
172
+
173
// Cache file location for a repo tarball: TARBALL_CACHE_DIR/<sha1 of "owner/repo@ref">.tgz
function tarballCachePath(owner, repo, ref) {
  const hasher = crypto.createHash("sha1");
  hasher.update(`${owner}/${repo}@${ref}`);
  const fileName = `${hasher.digest("hex")}.tgz`;
  return path.join(TARBALL_CACHE_DIR, fileName);
}
180
+
181
/**
 * Download `url` to `destination`, following up to 5 redirects.
 *
 * The body is written to a sibling ".partial" file and renamed into place
 * only after the write has fully completed, so an interrupted download never
 * leaves a truncated file at `destination`.
 *
 * @param {string} url - HTTPS URL to fetch.
 * @param {string} destination - final file path.
 * @returns {Promise<void>} rejects on HTTP errors (a 404 error carries
 *   `statusCode = 404`), on timeouts, on stream errors, on a body larger than
 *   MAX_TARBALL_BYTES, and on non-HTTPS URLs or redirect targets.
 */
function downloadCodeload(url, destination) {
  return new Promise((resolve, reject) => {
    const partialPath = `${destination}.${process.pid}.partial`;
    const file = fs.createWriteStream(partialPath, { mode: 0o600 });
    let written = 0;
    let settled = false;
    let activeRequest = null;

    // Single failure path: stop the transfer, drop the partial file, reject once.
    const fail = (err) => {
      if (settled) return;
      settled = true;
      if (activeRequest) activeRequest.destroy();
      file.destroy();
      fs.unlink(partialPath, () => reject(err));
    };

    // Single success path: publish the finished file under its final name.
    const succeed = () => {
      if (settled) return;
      settled = true;
      fs.rename(partialPath, destination, (renameErr) => {
        if (!renameErr) {
          resolve();
          return;
        }
        fs.unlink(partialPath, () => reject(renameErr));
      });
    };

    // Without this handler a disk error (ENOSPC, EACCES, ...) would surface
    // as an unhandled 'error' event.
    file.on("error", fail);
    file.on("finish", () => file.close((closeErr) => (closeErr ? fail(closeErr) : succeed())));

    const get = (currentUrl, hops) => {
      if (hops > 5) return fail(new Error("Too many redirects"));
      let parsed;
      try {
        parsed = new URL(currentUrl);
      } catch (err) {
        return fail(err);
      }
      if (parsed.protocol !== "https:") {
        return fail(new Error(`Refusing non-HTTPS codeload URL: ${currentUrl}`));
      }
      const request = https.get(
        {
          hostname: parsed.hostname,
          port: parsed.port || undefined,
          path: parsed.pathname + parsed.search,
          headers: { "user-agent": USER_AGENT },
          timeout: TARBALL_TIMEOUT_MS
        },
        (response) => {
          const status = response.statusCode;
          if ([301, 302, 303, 307, 308].includes(status) && response.headers.location) {
            response.resume();
            let nextUrl;
            try {
              nextUrl = new URL(response.headers.location, currentUrl).toString();
            } catch (err) {
              return fail(err);
            }
            return get(nextUrl, hops + 1);
          }
          if (status === 404) {
            response.resume();
            const err = new Error(`GitHub codeload 404: ${currentUrl}`);
            err.statusCode = 404;
            return fail(err);
          }
          if (status < 200 || status >= 300) {
            response.resume();
            return fail(new Error(`Codeload HTTP ${status}`));
          }
          response.on("data", (chunk) => {
            written += chunk.length;
            if (written > MAX_TARBALL_BYTES) {
              fail(new Error(`Codeload exceeded ${MAX_TARBALL_BYTES} bytes`));
            }
          });
          // pipe() does not forward source errors, so a dropped connection
          // has to be handled here or the promise would never settle.
          response.on("error", fail);
          response.on("close", () => {
            if (!response.complete) {
              fail(new Error("Codeload response ended before completion"));
            }
          });
          response.pipe(file);
        }
      );
      activeRequest = request;
      request.on("error", fail);
      request.on("timeout", () => request.destroy(new Error("Codeload request timed out")));
    };
    get(url, 0);
  });
}
234
+
235
/**
 * Spawn a command and wait for it to exit.
 *
 * @param {string} command - executable to run.
 * @param {string[]} args - argument vector (no shell is involved).
 * @returns {Promise<void>} resolves on exit code 0; rejects if the process
 *   cannot be spawned, exits non-zero, or is terminated by a signal. The
 *   rejection message includes the (capped) stderr output.
 */
function run(command, args) {
  return new Promise((resolve, reject) => {
    const MAX_STDERR_CHARS = 64 * 1024;
    const child = spawn(command, args, { stdio: ["ignore", "pipe", "pipe"] });
    let stderr = "";

    // stdout is piped but unused: drain it so a chatty child cannot block on
    // a full pipe.
    child.stdout.resume();

    // Decode as UTF-8 on the stream so multi-byte characters split across
    // chunks are not corrupted, and cap what is kept for the error message.
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      const room = MAX_STDERR_CHARS - stderr.length;
      if (room > 0) stderr += chunk.slice(0, room);
    });

    child.on("error", reject);
    child.on("close", (code, signal) => {
      if (code === 0) {
        resolve();
      } else if (code === null) {
        reject(new Error(`${command} was terminated by signal ${signal}: ${stderr.trim()}`));
      } else {
        reject(new Error(`${command} exited with ${code}: ${stderr.trim()}`));
      }
    });
  });
}
249
+
250
/**
 * Unpack a gzipped tarball into `destination` with the system `tar`,
 * dropping the archive's top-level directory.
 *
 * @param {string} archivePath - path to the .tgz file.
 * @param {string} destination - directory to extract into (created if missing, mode 0700).
 * @returns {Promise<void>} rejects when `tar` fails.
 */
async function extractTarball(archivePath, destination) {
  await fsp.mkdir(destination, { recursive: true, mode: 0o700 });
  const tarArgs = [
    "-xzf", archivePath,
    "-C", destination,
    "--strip-components", "1",
    "--no-same-owner", "--no-same-permissions"
  ];
  return run("tar", tarArgs);
}
259
+
260
/**
 * Find a repo tarball for a package version, trying refs in order until one
 * downloads: `v<version>`, `<version>`, then the default branch.
 *
 * A cached tarball younger than TARBALL_TTL_MS is reused without a download.
 *
 * @param {string} owner - repository owner.
 * @param {string} repo - repository name.
 * @param {string} [version] - package version to look for as a tag.
 * @param {string} [defaultBranch] - fallback ref when no version tag exists.
 * @returns {Promise<{ref: string, archivePath: string, fromCache: boolean}|null>}
 *   null when every candidate ref returned 404. Any other download error is rethrown.
 */
async function fetchRepoTarballForVersion(owner, repo, version, defaultBranch) {
  await fsp.mkdir(TARBALL_CACHE_DIR, { recursive: true, mode: 0o700 });

  // Ordered and de-duplicated, so the same ref is never fetched twice.
  const candidates = [];
  const addCandidate = (ref) => {
    if (ref && !candidates.includes(ref)) candidates.push(ref);
  };
  if (version) {
    const text = String(version);
    // Avoid producing "vv1.2.3" when the version already carries the prefix.
    if (!text.startsWith("v")) addCandidate(`v${text}`);
    addCandidate(text);
  }
  if (defaultBranch) addCandidate(String(defaultBranch));

  for (const ref of candidates) {
    const cachePath = tarballCachePath(owner, repo, ref);
    try {
      const stat = await fsp.stat(cachePath);
      // An empty or non-file entry is not a usable archive; treat it as a miss.
      const usable = stat.isFile() && stat.size > 0;
      if (usable && Date.now() - stat.mtimeMs < TARBALL_TTL_MS) {
        return { ref, archivePath: cachePath, fromCache: true };
      }
    } catch {
      // not cached, fall through to download
    }
    // Every path segment is encoded: owner and repo are not guaranteed to be
    // URL-safe at this point.
    const url = `https://codeload.github.com/${encodeURIComponent(owner)}/${encodeURIComponent(repo)}/tar.gz/${encodeURIComponent(ref)}`;
    try {
      await downloadCodeload(url, cachePath);
      return { ref, archivePath: cachePath, fromCache: false };
    } catch (error) {
      if (error.statusCode === 404) continue;
      throw error;
    }
  }
  return null;
}
291
+
163
292
  module.exports = {
164
293
  parseGithubRepo,
165
- fetchRepoMetadata
294
+ fetchRepoMetadata,
295
+ fetchRepoTarballForVersion,
296
+ extractTarball
166
297
  };
package/src/quarantine.js CHANGED
@@ -8,7 +8,8 @@ const os = require("node:os");
8
8
  const path = require("node:path");
9
9
  const { spawn } = require("node:child_process");
10
10
  const { auditEvidence } = require("./auditor");
11
- const { fetchRepoMetadata } = require("./github");
11
+ const { fetchRepoMetadata, fetchRepoTarballForVersion, extractTarball: extractTarballGh } = require("./github");
12
+ const { diffNpmVsGithub } = require("./diff");
12
13
 
13
14
  const DEFAULT_MAX_FILE_BYTES = 256 * 1024;
14
15
  const DEFAULT_MAX_FILES = 600;
@@ -104,13 +105,41 @@ async function guardExtension(reference, options = {}) {
104
105
  const githubMetadata = await githubMetadataPromise;
105
106
  timings.githubMetadataMs = elapsed(githubStart);
106
107
 
108
+ // npm vs GitHub diff (Phase 3) — only for npm packages where we have repo
109
+ // metadata. Runs serially after we have both trees; tarballs are cached so
110
+ // re-runs are fast.
111
+ let npmVsGithubDiff = null;
112
+ if (
113
+ options.githubDiff !== false &&
114
+ resolved.type === "npm" &&
115
+ githubMetadata && githubMetadata.found &&
116
+ vulnerabilities.length === 0 &&
117
+ Object.keys(sourceFiles).length > 0
118
+ ) {
119
+ const diffStart = now();
120
+ try {
121
+ npmVsGithubDiff = await runNpmVsGithubDiff({
122
+ resolved,
123
+ npmStagedPath: stagedPath,
124
+ githubMetadata,
125
+ workspace
126
+ });
127
+ } catch (error) {
128
+ npmVsGithubDiff = { compared: false, reason: "diff-error", message: error.message };
129
+ }
130
+ timings.diffMs = elapsed(diffStart);
131
+ } else {
132
+ timings.diffMs = 0;
133
+ }
134
+
107
135
  const evidence = {
108
136
  packageName: resolved.packageName || reference,
109
137
  npmMetadata: resolved.npmMetadata || null,
110
138
  githubMetadata,
111
139
  webPresence: null,
112
140
  knownVulnerabilities: vulnerabilities,
113
- sourceFiles
141
+ sourceFiles,
142
+ npmVsGithubDiff
114
143
  };
115
144
  const auditStart = now();
116
145
  const report = auditEvidence(evidence);
@@ -123,6 +152,7 @@ async function guardExtension(reference, options = {}) {
123
152
  resolved,
124
153
  sourceFiles,
125
154
  githubMetadata,
155
+ npmVsGithubDiff,
126
156
  vulnerabilityPrecheck: {
127
157
  enabled: options.vulnerabilityCheck !== false,
128
158
  database: "OSV",
@@ -143,6 +173,53 @@ async function guardExtension(reference, options = {}) {
143
173
  return result;
144
174
  }
145
175
 
176
/**
 * Fetch the GitHub tree that corresponds to the resolved npm version, extract
 * it into the workspace, and diff it against the staged npm package.
 *
 * @param {object} params
 * @param {object} params.resolved - resolved package; reads `version` and `npmMetadata.repository`.
 * @param {string} params.npmStagedPath - extracted npm package directory.
 * @param {object} params.githubMetadata - reads `owner`, `repo`, `default_branch`.
 * @param {string} params.workspace - scratch directory; the tree goes to `<workspace>/github-tree`.
 * @returns {Promise<object>} the diff result plus `githubRef` and
 *   `tarballFromCache`, or `{ compared: false, reason: "no-matching-ref" }`.
 */
async function runNpmVsGithubDiff({ resolved, npmStagedPath, githubMetadata, workspace }) {
  const version = resolved.version;
  const tarball = await fetchRepoTarballForVersion(
    githubMetadata.owner,
    githubMetadata.repo,
    version,
    githubMetadata.default_branch
  );
  if (!tarball) {
    return { compared: false, reason: "no-matching-ref", versionTried: version };
  }

  const ghStagePath = path.join(workspace, "github-tree");
  await extractTarballGh(tarball.archivePath, ghStagePath);

  // package.json may set repository.directory for monorepos — narrow the
  // comparison to that subpath if present. Only a non-empty string is
  // accepted; any other type is ignored instead of reaching a path join.
  const pkgRepo = resolved.npmMetadata && resolved.npmMetadata.repository;
  const directory = pkgRepo && typeof pkgRepo === "object" ? pkgRepo.directory : null;
  const subdir = typeof directory === "string" && directory !== "" ? directory : null;

  // Detect a publish-time build script (means built artifacts ≠ repo is normal).
  // `prepublishOnly` runs on `npm publish`; `prepublish` is the legacy hook
  // older packages used for the same purpose.
  // NOTE(review): these scripts are read from the published tarball itself,
  // so their presence is controlled by whoever published the package.
  const scripts = await readScripts(npmStagedPath);
  const BUILD_HOOKS = ["prepare", "prepack", "prepublishOnly", "prepublish", "build"];
  const hasBuildScript = BUILD_HOOKS.some((hook) => Boolean(scripts[hook]));

  const diff = await diffNpmVsGithub({
    npmStagedPath,
    githubStagedPath: ghStagePath,
    subdir,
    hasBuildScript
  });

  return {
    ...diff,
    githubRef: tarball.ref,
    tarballFromCache: tarball.fromCache
  };
}
213
+
214
/**
 * Read the `scripts` map from `<stagedPath>/package.json`.
 *
 * Best effort: a missing file, invalid JSON, or a `scripts` value that is not
 * a plain object all yield `{}`.
 *
 * @param {string} stagedPath - directory containing package.json.
 * @returns {Promise<object>} the scripts object, or `{}`.
 */
async function readScripts(stagedPath) {
  try {
    const raw = await fsp.readFile(path.join(stagedPath, "package.json"), "utf8");
    const pkg = JSON.parse(raw);
    const scripts = pkg && pkg.scripts;
    const isPlainObject =
      scripts !== null && typeof scripts === "object" && !Array.isArray(scripts);
    return isPlainObject ? scripts : {};
  } catch {
    return {};
  }
}
222
+
146
223
  async function stageReference(reference, stagedPath, options) {
147
224
  const parsed = parseReference(reference);
148
225
  if (parsed.type === "local") {
@@ -158,6 +235,10 @@ async function stageReference(reference, stagedPath, options) {
158
235
  return resolveNpmPackage(parsed.specifier, options);
159
236
  }
160
237
 
238
+ if (parsed.type === "github") {
239
+ return resolveGithubRepo(parsed, options);
240
+ }
241
+
161
242
  throw new Error(`Unsupported reference type: ${reference}`);
162
243
  }
163
244
 
@@ -170,6 +251,21 @@ function parseReference(reference) {
170
251
  return { type: "local", path: path.resolve(reference.slice("file:".length)) };
171
252
  }
172
253
 
254
+ if (reference.startsWith("github:")) {
255
+ return parseGithubReference(reference.slice("github:".length));
256
+ }
257
+
258
+ // github.com URLs as a convenience shorthand
259
+ const ghMatch = reference.match(/^https?:\/\/github\.com\/([^/]+)\/([^/?#]+?)(?:\.git)?(?:#(.+))?$/);
260
+ if (ghMatch) {
261
+ return {
262
+ type: "github",
263
+ owner: ghMatch[1],
264
+ repo: ghMatch[2],
265
+ ref: ghMatch[3] || null
266
+ };
267
+ }
268
+
173
269
  if (
174
270
  reference.startsWith(".") ||
175
271
  reference.startsWith("/") ||
@@ -184,6 +280,62 @@ function parseReference(reference) {
184
280
  return { type: "npm", specifier: reference };
185
281
  }
186
282
 
283
/**
 * Parse the part after `github:` in a reference.
 *
 * Supports `owner/repo`, `owner/repo#ref` and `owner/repo@ref`; a trailing
 * `.git` on the repo name is dropped.
 *
 * @param {string} spec - e.g. "octocat/hello-world#main".
 * @returns {{type: "github", owner: string, repo: string, ref: string|null}}
 * @throws {Error} when the spec is malformed or the owner/repo contain
 *   characters outside letters, digits, `_`, `.` and `-`.
 */
function parseGithubReference(spec) {
  const invalid = () => new Error(`Invalid github reference: github:${spec}`);
  const text = typeof spec === "string" ? spec.trim() : "";
  const match = text.match(/^([^/#@]+)\/([^/#@]+?)(?:[#@](.+))?$/);
  if (!match) throw invalid();

  const owner = match[1];
  const repo = match[2].replace(/\.git$/, "");

  // Both names end up in URLs, so reject anything that is not a plain name
  // segment (this also rejects ".", ".." and an empty repo such as "owner/.git").
  const isSafeSegment = (segment) =>
    /^[A-Za-z0-9_.-]+$/.test(segment) && segment !== "." && segment !== "..";
  if (!isSafeSegment(owner) || !isSafeSegment(repo)) throw invalid();

  return {
    type: "github",
    owner,
    repo,
    ref: match[3] || null
  };
}
294
+
295
/**
 * Resolve a parsed `github:` reference into a download descriptor.
 *
 * When no ref is pinned, the repository metadata is fetched to find the
 * default branch, falling back to "HEAD" if the lookup fails.
 *
 * @param {{owner: string, repo: string, ref: string|null}} parsed
 * @param {object} [options] - currently unused.
 * @returns {Promise<object>} descriptor with `tarballUrl`, `ref`,
 *   `packageName` and a synthetic `npmMetadata` (null when no metadata was fetched).
 * @throws {Error} when the metadata lookup reports the repository as not found.
 */
async function resolveGithubRepo(parsed, options) {
  // Resolve default branch if no ref pinned. Uses the module-level
  // fetchRepoMetadata import; no local require is needed.
  let ref = parsed.ref;
  let resolvedMeta = null;
  if (!ref) {
    // Best effort: a failed lookup is tolerated and falls back to "HEAD".
    const meta = await fetchRepoMetadata(`https://github.com/${parsed.owner}/${parsed.repo}`).catch(() => null);
    if (meta && meta.found === false && meta.reason === "not-found") {
      throw new Error(`GitHub repository not found: ${parsed.owner}/${parsed.repo}`);
    }
    if (meta && meta.found) {
      ref = meta.default_branch || "HEAD";
      resolvedMeta = meta;
    } else {
      ref = "HEAD";
    }
  }

  // GitHub's "codeload" endpoint returns a .tar.gz of the repo at the given
  // ref. Every path segment is encoded so unexpected characters in owner or
  // repo cannot alter the request path.
  const tarballUrl = `https://codeload.github.com/${encodeURIComponent(parsed.owner)}/${encodeURIComponent(parsed.repo)}/tar.gz/${encodeURIComponent(ref)}`;

  // NOTE(review): with a pinned ref no metadata is fetched, so npmMetadata is
  // null and the synthetic repository URL below is not provided — confirm
  // that is intended.
  return {
    type: "github",
    owner: parsed.owner,
    repo: parsed.repo,
    ref,
    needsDownload: true,
    tarballUrl,
    packageName: `${parsed.owner}/${parsed.repo}`,
    githubArchive: true,
    npmMetadata: resolvedMeta
      ? {
          // Synthetic shape so the downstream auditor still sees a repository
          // URL and the github cross-check finds the same data we already have.
          name: parsed.repo,
          repository: { url: resolvedMeta.html_url, type: "git" },
          maintainers: []
        }
      : null
  };
}
+
187
339
  async function copyLocalPath(sourcePath, stagedPath) {
188
340
  const stat = await fsp.stat(sourcePath);
189
341
  if (!stat.isDirectory()) {