@better-internet/oss-verify 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,135 @@
1
+ // SPDX-License-Identifier: MIT
2
+ //
3
+ // Heuristic license detection from LICENSE / LICENCE / COPYING body text.
4
+ // Used as a fallback when a project declares its license via the file content
5
+ // alone (no SPDX-License-Identifier header, no package.json license field).
6
+ //
7
+ // Order matters: more specific patterns must come first (AGPL contains the
8
+ // GPL phrase, BSD-3-Clause includes the BSD-2 boilerplate, etc.).
9
+ //
10
+ // This isn't a replacement for full license-scanning tooling (licensee,
11
+ // ScanCode). It catches the ~10 most common OSI licenses with high precision
12
+ // — enough to keep checkOsiLicense and checkReuse from emitting false
13
+ // negatives against the majority of real-world OSS repos.
14
+ import { existsSync, readFileSync } from "node:fs";
15
+ import { join } from "node:path";
16
const LICENSE_FILES = ["LICENSE", "LICENSE.md", "LICENSE.txt", "LICENCE", "COPYING"];

// A canonical GNU license text names itself on a stand-alone title line
// ("GNU GENERAL PUBLIC LICENSE", optionally with AFFERO / LESSER / LIBRARY),
// possibly indented or wrapped in markdown heading/emphasis markers. The
// title is the only reliable discriminator: the GPL-3.0 body itself mentions
// both the Affero GPL (section 13) and the Lesser GPL (closing paragraph),
// so "does the word appear anywhere" misclassifies the complete text.
const GNU_TITLE_LINE = /^[ \t#*]*GNU[ \t]+(?:(AFFERO|LESSER|LIBRARY)[ \t]+)?GENERAL[ \t]+PUBLIC[ \t]+LICENSE[ \t#*]*$/im;

// "<title qualifier>:<version number>" -> SPDX identifier.
const GNU_SPDX_BY_TITLE = {
    ":2": "GPL-2.0-only",
    ":3": "GPL-3.0-only",
    "AFFERO:3": "AGPL-3.0-only",
    "LESSER:2.1": "LGPL-2.1-only",
    "LESSER:3": "LGPL-3.0-only",
    "LIBRARY:2": "LGPL-2.0-only",
};

/**
 * Classifies a GNU-family license body.
 *
 * Strategy 1: find the stand-alone title line and read the first
 * "Version N" that follows it. Strategy 2 (fallback, used for short
 * "this program is free software..." notices that have no title line): the
 * mention-based heuristics, evaluated most-specific first.
 *
 * @param {string} body LICENSE file content (or a prefix of it).
 * @returns {string|null} SPDX identifier, or null when the text is not a
 *   recognisable GNU license.
 */
function detectGnuLicense(body) {
    const text = String(body);
    const title = GNU_TITLE_LINE.exec(text);
    if (title) {
        const afterTitle = text.slice(title.index + title[0].length);
        const version = /Version\s+(\d+(?:\.\d+)?)/i.exec(afterTitle);
        if (version) {
            // Number() folds "2.0" and "2" onto the same key.
            const key = `${(title[1] ?? "").toUpperCase()}:${Number(version[1])}`;
            const spdx = GNU_SPDX_BY_TITLE[key];
            if (spdx)
                return spdx;
        }
    }
    // No usable title line: fall back to "is it mentioned anywhere".
    const hasV3 = /Version 3/i.test(text);
    if (/GNU AFFERO GENERAL PUBLIC LICENSE/i.test(text) && hasV3)
        return "AGPL-3.0-only";
    const mentionsGpl = /GNU GENERAL PUBLIC LICENSE/i.test(text);
    const mentionsLesser = /LESSER/i.test(text);
    if (mentionsGpl && hasV3 && !/AFFERO/i.test(text) && !mentionsLesser)
        return "GPL-3.0-only";
    if (mentionsGpl && /Version 2/i.test(text) && !mentionsLesser)
        return "GPL-2.0-only";
    if (/GNU LESSER GENERAL PUBLIC LICENSE/i.test(text)) {
        if (hasV3)
            return "LGPL-3.0-only";
        if (/Version 2\.1/i.test(text))
            return "LGPL-2.1-only";
    }
    return null;
}

// Builds a detector entry for one GNU-family identifier.
const gnuDetector = (spdx) => ({ spdx, matches: (b) => detectGnuLicense(b) === spdx });

// Ordered list of { spdx, matches(body) } detectors; the first match wins.
// The GNU entries are mutually exclusive (detectGnuLicense yields at most one
// identifier), so only their position ahead of the other licenses matters.
const DETECTORS = [
    gnuDetector("AGPL-3.0-only"),
    gnuDetector("GPL-3.0-only"),
    gnuDetector("GPL-2.0-only"),
    gnuDetector("LGPL-3.0-only"),
    gnuDetector("LGPL-2.1-only"),
    gnuDetector("LGPL-2.0-only"),
    {
        spdx: "Apache-2.0",
        matches: (b) => /Apache License/i.test(b) && /Version 2\.0/i.test(b),
    },
    {
        spdx: "MPL-2.0",
        matches: (b) => /Mozilla Public License/i.test(b) && /Version 2\.0/i.test(b),
    },
    // BSD-3-Clause must precede BSD-2-Clause: it is the 2-clause text plus
    // the "Neither the name..." non-endorsement clause.
    {
        spdx: "BSD-3-Clause",
        matches: (b) => /Redistribution and use/i.test(b) && /Neither the name/i.test(b),
    },
    {
        spdx: "BSD-2-Clause",
        matches: (b) => /Redistribution and use/i.test(b) && !/Neither the name/i.test(b),
    },
    {
        spdx: "MIT",
        matches: (b) => /Permission is hereby granted, free of charge/i.test(b) && /MERCHANTABILITY/i.test(b),
    },
    {
        spdx: "ISC",
        matches: (b) => /Permission to use, copy, modify, and\/or distribute/i.test(b),
    },
    {
        spdx: "Unlicense",
        matches: (b) => /This is free and unencumbered software released into the public domain/i.test(b),
    },
];
72
/**
 * Runs the LICENSE body through DETECTORS in declaration order.
 *
 * @param {string} body Text of a license file (or a prefix of it).
 * @returns {string|null} The `spdx` of the first detector whose `matches`
 *   accepts the body, or null when no detector does.
 */
export function detectLicenseFromText(body) {
    const firstHit = DETECTORS.find((detector) => detector.matches(body));
    return firstHit ? firstHit.spdx : null;
}
83
/**
 * Looks for a license file directly under `repoRoot`, trying each name in
 * LICENSE_FILES in order, and text-detects the first 16384 characters of
 * each one that exists.
 *
 * @param {string} repoRoot Directory to search.
 * @returns {string|null} SPDX identifier from the first file whose content is
 *   recognised; null when no candidate exists, none is readable, or none
 *   matches a known license.
 */
export function detectRootLicense(repoRoot) {
    for (const fileName of LICENSE_FILES) {
        const candidate = join(repoRoot, fileName);
        if (existsSync(candidate)) {
            let spdxId = null;
            try {
                const text = readFileSync(candidate, "utf8");
                spdxId = detectLicenseFromText(text.slice(0, 16384));
            }
            catch {
                // Best effort: an unreadable candidate is skipped and the next
                // file name is tried.
            }
            if (spdxId)
                return spdxId;
        }
    }
    return null;
}
103
/**
 * Reports whether the repo carries any license declaration these checks can
 * recognise. Three probes run in order and the first success wins:
 *
 *   1. a root license file whose text detectRootLicense recognises;
 *   2. an "SPDX-License-Identifier:" tag within the first 8192 characters
 *      of any root license file;
 *   3. a truthy package.json `license` field other than "UNLICENSED".
 *
 * Unreadable files and malformed JSON count as "no declaration here".
 *
 * @param {string} repoRoot Directory to inspect.
 * @returns {boolean} True when at least one probe succeeds.
 */
export function hasAnyLicenseDeclaration(repoRoot) {
    if (detectRootLicense(repoRoot))
        return true;
    // Probe 2: SPDX tag near the top of a root license file.
    const carriesSpdxTag = (fileName) => {
        const candidate = join(repoRoot, fileName);
        if (!existsSync(candidate))
            return false;
        try {
            const head = readFileSync(candidate, "utf8").slice(0, 8192);
            return /SPDX-License-Identifier:/i.test(head);
        }
        catch {
            return false;
        }
    };
    if (LICENSE_FILES.some(carriesSpdxTag))
        return true;
    // Probe 3: package.json license field.
    const manifestPath = join(repoRoot, "package.json");
    if (!existsSync(manifestPath))
        return false;
    try {
        const manifest = JSON.parse(readFileSync(manifestPath, "utf8"));
        return Boolean(manifest.license) && manifest.license !== "UNLICENSED";
    }
    catch {
        return false;
    }
}
@@ -62,6 +62,15 @@ export async function runLlmAudit(ctx, opts) {
62
62
  // SPEC §7.4: three independent calls at temperature=0; majority verdict wins.
63
63
  // "Block" must be a strict majority — a 1:1:1 outcome (one of each + an error
64
64
  // or unparseable) defaults to block, since the audit is a veto layer.
65
+ //
66
+ // The three calls run *sequentially* rather than in parallel. Parallel
67
+ // firing creates a 3x burst within ~1s against Anthropic's per-minute
68
+ // token rate limit (30k tpm on default tier). Posthog-sized envelopes
69
+ // (~10k input tokens × 3) reliably tripped that ceiling. Sequential adds
70
+ // ~20s of wallclock per project — invisible for an offline watchlist —
71
+ // and lets the per-minute window relax between passes. callAnthropic
72
+ // also retries with backoff on 429, so even sequential bursts that
73
+ // exceed the limit recover instead of failing.
65
74
  const apiKey = opts.apiKey;
66
75
  const callOnce = () => callAnthropic({
67
76
  modelId: opts.modelId,
@@ -70,7 +79,9 @@ export async function runLlmAudit(ctx, opts) {
70
79
  system: SYSTEM_PROMPT,
71
80
  envelope: envelope.text,
72
81
  });
73
- const verdicts = await Promise.all([callOnce(), callOnce(), callOnce()]);
82
+ const verdicts = [];
83
+ for (let i = 0; i < 3; i++)
84
+ verdicts.push(await callOnce());
74
85
  const verdict = majorityVerdict(verdicts);
75
86
  return { verdict, promptHash, modelId: opts.modelId };
76
87
  }
@@ -141,47 +152,79 @@ function containsNul(buf, max) {
141
152
  return true;
142
153
  return false;
143
154
  }
155
// Retry policy for transient Anthropic failures. 429 (rate-limit) and 5xx
// (gateway / server) are retryable; 4xx-except-429 (auth, bad request, etc.)
// are not. Honors `retry-after-ms` (milliseconds) and `retry-after` (either
// delay-seconds or an HTTP-date) when present; otherwise exponential backoff
// with jitter. Every delay is capped at 30s. Max 4 attempts (1 + MAX_RETRIES).
const MAX_RETRIES = 3;
const BASE_DELAY_MS = 1000;
const MAX_DELAY_MS = 30_000;

/**
 * @param {number} status HTTP status code of a failed response.
 * @returns {boolean} True for 429 and for any 5xx status.
 */
function isRetryable(status) {
    return status === 429 || (status >= 500 && status < 600);
}

/**
 * Computes how long to wait before the next attempt.
 *
 * @param {number} attempt Zero-based index of the attempt that just failed.
 * @param {{ headers: { get(name: string): string | null } }} res The failed
 *   response; only `headers.get` is used.
 * @returns {number} Delay in milliseconds, in [0, MAX_DELAY_MS].
 */
function backoffDelay(attempt, res) {
    const ms = res.headers.get("retry-after-ms")?.trim();
    if (ms && /^\d+$/.test(ms))
        return Math.min(Number(ms), MAX_DELAY_MS);
    const retryAfter = res.headers.get("retry-after")?.trim();
    if (retryAfter) {
        if (/^\d+$/.test(retryAfter))
            return Math.min(Number(retryAfter) * 1000, MAX_DELAY_MS);
        // Retry-After may also be an HTTP-date. Only values that start with
        // a letter (the weekday name) are handed to Date.parse, so stray
        // numeric junk such as "1.5" is not misread as a calendar date.
        if (/^[A-Za-z]/.test(retryAfter)) {
            const at = Date.parse(retryAfter);
            if (Number.isFinite(at))
                return Math.min(Math.max(at - Date.now(), 0), MAX_DELAY_MS);
        }
    }
    // Exponential with full jitter: the upper bound doubles per attempt
    // (1s, 2s, 4s, ...) up to the 30s cap, and the actual delay is drawn
    // uniformly from [0, bound).
    const bound = Math.min(BASE_DELAY_MS * 2 ** attempt, MAX_DELAY_MS);
    return Math.floor(Math.random() * bound);
}

/** Resolves after `ms` milliseconds. */
const sleep = (ms) => new Promise((r) => setTimeout(r, ms));
144
177
  async function callAnthropic(args) {
145
- const res = await fetch(args.endpoint, {
146
- method: "POST",
147
- headers: {
148
- "x-api-key": args.apiKey,
149
- "anthropic-version": "2023-06-01",
150
- "content-type": "application/json",
151
- },
152
- body: JSON.stringify({
153
- model: args.modelId,
154
- max_tokens: 256,
155
- temperature: 0,
156
- system: args.system,
157
- messages: [{ role: "user", content: args.envelope }],
158
- }),
178
+ const body = JSON.stringify({
179
+ model: args.modelId,
180
+ max_tokens: 256,
181
+ temperature: 0,
182
+ system: args.system,
183
+ messages: [{ role: "user", content: args.envelope }],
159
184
  });
160
- if (!res.ok) {
161
- const body = await res.text();
162
- // Network/auth failure must BLOCK we have no opinion if the audit
163
- // didn't actually run, and the predicate must not be emitted on a
164
- // silent fallback.
165
- return {
166
- verdict: "block",
167
- rationale: `Anthropic API call failed (${res.status}): ${body.slice(0, 200)}`,
168
- passes: 0,
169
- };
170
- }
171
- const data = (await res.json());
172
- const text = data.content?.find((b) => b.type === "text")?.text?.trim() ?? "";
173
- const parsed = parseModelVerdict(text);
174
- // Belt-and-braces: if the response.model field doesn't match what we
175
- // asked for, block. Vendors can route to fallback models; we need the
176
- // exact one for predicate integrity.
177
- if (data.model && data.model !== args.modelId) {
178
- return {
179
- verdict: "block",
180
- rationale: `response.model '${data.model}' != requested '${args.modelId}'`,
181
- passes: 1,
182
- };
185
+ let lastStatus = 0;
186
+ let lastBody = "";
187
+ for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
188
+ const res = await fetch(args.endpoint, {
189
+ method: "POST",
190
+ headers: {
191
+ "x-api-key": args.apiKey,
192
+ "anthropic-version": "2023-06-01",
193
+ "content-type": "application/json",
194
+ },
195
+ body,
196
+ });
197
+ if (res.ok) {
198
+ const data = (await res.json());
199
+ const text = data.content?.find((b) => b.type === "text")?.text?.trim() ?? "";
200
+ const parsed = parseModelVerdict(text);
201
+ // Belt-and-braces: if the response.model field doesn't match what we
202
+ // asked for, block. Vendors can route to fallback models; we need
203
+ // the exact one for predicate integrity.
204
+ if (data.model && data.model !== args.modelId) {
205
+ return {
206
+ verdict: "block",
207
+ rationale: `response.model '${data.model}' != requested '${args.modelId}'`,
208
+ passes: 1,
209
+ };
210
+ }
211
+ return { ...parsed, passes: 1 };
212
+ }
213
+ lastStatus = res.status;
214
+ lastBody = await res.text();
215
+ if (!isRetryable(res.status) || attempt === MAX_RETRIES)
216
+ break;
217
+ await sleep(backoffDelay(attempt, res));
183
218
  }
184
- return { ...parsed, passes: 1 };
219
+ // Network/auth failure must BLOCK — we have no opinion if the audit
220
+ // didn't actually run, and the predicate must not be emitted on a
221
+ // silent fallback. The "retried N times" suffix flags this as the
222
+ // outcome of a giving-up retry vs a single-shot failure.
223
+ return {
224
+ verdict: "block",
225
+ rationale: `Anthropic API call failed (${lastStatus}) after ${MAX_RETRIES + 1} attempts: ${lastBody.slice(0, 200)}`,
226
+ passes: 0,
227
+ };
185
228
  }
186
229
  function parseModelVerdict(text) {
187
230
  // Per the system prompt, the model returns one JSON object. Be forgiving
@@ -3,6 +3,7 @@ import { join } from "node:path";
3
3
  import parseSpdx from "spdx-expression-parse";
4
4
  import licenseIds from "spdx-license-ids" with { type: "json" };
5
5
  import { sha256Hex } from "../hash.js";
6
+ import { detectRootLicense } from "./license-text.js";
6
7
  // OSI used to publish a JSON API at api.opensource.org/licenses; that's been
7
8
  // deprecated. SPDX maintains the canonical list of licenses with an
8
9
  // isOsiApproved field, refreshed when OSI approves new ones. Source of truth.
@@ -31,8 +32,9 @@ function readDeclaredLicense(repoRoot) {
31
32
  if (existsSync(pkgPath)) {
32
33
  try {
33
34
  const pkg = JSON.parse(readFileSync(pkgPath, "utf8"));
34
- if (pkg.license && pkg.license !== "UNLICENSED")
35
- return pkg.license;
35
+ if (pkg.license && pkg.license !== "UNLICENSED") {
36
+ return { spdx: pkg.license, source: "package.json" };
37
+ }
36
38
  }
37
39
  catch { }
38
40
  }
@@ -43,9 +45,16 @@ function readDeclaredLicense(repoRoot) {
43
45
  const head = readFileSync(p, "utf8").slice(0, 8192);
44
46
  const m = head.match(/SPDX-License-Identifier:\s*([A-Za-z0-9.+\-\s()]+)/);
45
47
  if (m)
46
- return m[1].trim();
48
+ return { spdx: m[1].trim(), source: "spdx-header" };
47
49
  }
48
50
  }
51
+ // 3. Fall back to text-pattern detection from the LICENSE body. Many older
52
+ // OSS repos declare their license via the file content alone, with no
53
+ // SPDX header (e.g. GPL/AGPL/Apache/BSD preambles). Less precise than an
54
+ // explicit header but covers the long tail of real repos.
55
+ const detected = detectRootLicense(repoRoot);
56
+ if (detected)
57
+ return { spdx: detected, source: "text-match" };
49
58
  return null;
50
59
  }
51
60
  export function leafIdentifiers(expr) {
@@ -55,26 +64,31 @@ export function leafIdentifiers(expr) {
55
64
  return [...leafIdentifiers(expr.left), ...leafIdentifiers(expr.right)];
56
65
  return [];
57
66
  }
67
+ const SOURCE_LABEL = {
68
+ "package.json": "package.json `license` field",
69
+ "spdx-header": "SPDX-License-Identifier header",
70
+ "text-match": "LICENSE text match",
71
+ };
58
72
  export async function checkOsiLicense(ctx) {
59
73
  const declared = readDeclaredLicense(ctx.repoRoot);
60
74
  if (!declared) {
61
75
  return {
62
76
  result: {
63
77
  pass: false,
64
- details: "No declared license found. Looked at package.json `license` field and SPDX-License-Identifier headers in LICENSE/LICENCE/COPYING.",
78
+ details: "No declared license found. Looked at package.json `license` field, SPDX-License-Identifier headers in LICENSE/LICENCE/COPYING, and text-pattern detection against the LICENSE body.",
65
79
  },
66
80
  osiResponseHash: "",
67
81
  };
68
82
  }
69
83
  let parsed;
70
84
  try {
71
- parsed = parseSpdx(declared);
85
+ parsed = parseSpdx(declared.spdx);
72
86
  }
73
87
  catch (e) {
74
88
  return {
75
89
  result: {
76
90
  pass: false,
77
- details: `Declared license '${declared}' is not a valid SPDX expression: ${e.message}`,
91
+ details: `Declared license '${declared.spdx}' is not a valid SPDX expression: ${e.message}`,
78
92
  },
79
93
  osiResponseHash: "",
80
94
  };
@@ -82,7 +96,10 @@ export async function checkOsiLicense(ctx) {
82
96
  const leaves = leafIdentifiers(parsed);
83
97
  if (leaves.length === 0) {
84
98
  return {
85
- result: { pass: false, details: `Could not extract any SPDX identifiers from '${declared}'` },
99
+ result: {
100
+ pass: false,
101
+ details: `Could not extract any SPDX identifiers from '${declared.spdx}'`,
102
+ },
86
103
  osiResponseHash: "",
87
104
  };
88
105
  }
@@ -101,14 +118,15 @@ export async function checkOsiLicense(ctx) {
101
118
  const unknownSpdx = leaves.filter((id) => !licenseIds.includes(id));
102
119
  if (nonOsi.length > 0) {
103
120
  const reason = unknownSpdx.length === leaves.length
104
- ? `'${declared}' contains identifiers not in the SPDX license list: ${unknownSpdx.join(", ")}`
105
- : `'${declared}' contains non-OSI-approved identifiers: ${nonOsi.join(", ")}`;
121
+ ? `'${declared.spdx}' contains identifiers not in the SPDX license list: ${unknownSpdx.join(", ")}`
122
+ : `'${declared.spdx}' contains non-OSI-approved identifiers: ${nonOsi.join(", ")}`;
106
123
  return { result: { pass: false, details: reason }, osiResponseHash: osi.hash };
107
124
  }
125
+ const sourceNote = declared.source === "text-match" ? ` (detected via ${SOURCE_LABEL[declared.source]})` : "";
108
126
  return {
109
127
  result: {
110
128
  pass: true,
111
- details: `Declared '${declared}' resolves to OSI-approved leaves: ${leaves.join(", ")}`,
129
+ details: `Declared '${declared.spdx}'${sourceNote} resolves to OSI-approved leaves: ${leaves.join(", ")}`,
112
130
  },
113
131
  osiResponseHash: osi.hash,
114
132
  };
@@ -1,6 +1,7 @@
1
- import { readFileSync } from "node:fs";
1
+ import { existsSync, readFileSync } from "node:fs";
2
2
  import { join } from "node:path";
3
3
  import { lsFiles } from "../git.js";
4
+ import { hasAnyLicenseDeclaration } from "./license-text.js";
4
5
  // Files that don't need a license header.
5
6
  // - License files themselves (the license declaration itself)
6
7
  // - Common config files that are factually un-copyrightable
@@ -39,7 +40,36 @@ const looksBinary = (buf) => {
39
40
  return false;
40
41
  };
41
42
  const skip = (path) => SKIP_PATTERNS.some((re) => re.test(path));
43
+ /**
44
+ * SPEC §3.1 "REUSE compliance". The REUSE standard itself accepts three valid
45
+ * declaration patterns; we recognise all three:
46
+ *
47
+ * 1. Per-file SPDX-License-Identifier headers across every source file
48
+ * (strict REUSE).
49
+ * 2. A repo-level .reuse/dep5 or REUSE.toml file (REUSE's own blanket-
50
+ * declaration mechanism). We don't parse the file's content — its
51
+ * presence indicates the maintainer has opted into REUSE format.
52
+ * 3. A root LICENSE / LICENCE / COPYING file with a recognisable license
53
+ * declaration, in the absence of a REUSE-format file. This is the
54
+ * common case for projects that declare one license repo-wide without
55
+ * using REUSE-style per-file headers.
56
+ *
57
+ * Only patterns (1) and (2) are strictly "REUSE-compliant" per the spec;
58
+ * pattern (3) is the pragmatic recognition that a project with a single
59
+ * top-level license has made an unambiguous declaration without going
60
+ * through REUSE's per-file ceremony. Treating (3) as a soft fail (or
61
+ * blanket pass with a note) avoids 100% false-positive rates on the
62
+ * majority of real OSS repos.
63
+ */
42
64
  export function checkReuse(ctx) {
65
+ const hasReuseFormat = existsSync(join(ctx.repoRoot, ".reuse", "dep5")) ||
66
+ existsSync(join(ctx.repoRoot, "REUSE.toml"));
67
+ if (hasReuseFormat) {
68
+ return {
69
+ pass: true,
70
+ details: "Project uses REUSE-format declarations (.reuse/dep5 or REUSE.toml). Per-file SPDX headers not required.",
71
+ };
72
+ }
43
73
  const files = lsFiles(ctx.repoRoot);
44
74
  const missing = [];
45
75
  let checked = 0;
@@ -69,10 +99,19 @@ export function checkReuse(ctx) {
69
99
  details: `${checked} text files all carry SPDX-License-Identifier headers`,
70
100
  };
71
101
  }
102
+ // No per-file SPDX headers, no REUSE-format file. Fall back to "is there
103
+ // a recognisable repo-level declaration?" If yes, accept it as a blanket
104
+ // declaration; if no, this is a real REUSE gap.
105
+ if (hasAnyLicenseDeclaration(ctx.repoRoot)) {
106
+ return {
107
+ pass: true,
108
+ details: `${missing.length} of ${checked} source files lack per-file SPDX headers, but a repo-level license declaration (LICENSE file or package.json) is present. Accepted as a blanket declaration.`,
109
+ };
110
+ }
72
111
  const sample = missing.slice(0, 10);
73
112
  const more = missing.length > sample.length ? ` (+${missing.length - sample.length} more)` : "";
74
113
  return {
75
114
  pass: false,
76
- details: `${missing.length} of ${checked} text files missing SPDX-License-Identifier:\n - ${sample.join("\n - ")}${more}\n\nNote: this MVP doesn't yet honor .reuse/dep5 or REUSE.toml exemptions; that's a known limitation.`,
115
+ details: `${missing.length} of ${checked} source files missing SPDX-License-Identifier and no repo-level license declaration was found:\n - ${sample.join("\n - ")}${more}`,
77
116
  };
78
117
  }
@@ -47,10 +47,20 @@ export async function checkSbom(ctx) {
47
47
  const sbom = buildCycloneDx(ctx, meta, allComponents);
48
48
  const sbomHash = sha256Hex(canonicalJson(sbom));
49
49
  if (allMissing.length > 0) {
50
+ // "Unresolved" = the registry/index lookup failed for this dependency
51
+ // (e.g. unpublished Go module on deps.dev, custom forked package, or
52
+ // transient network failure). Distinct from "found a non-OSI license":
53
+ // these may well be OSI-licensed but we can't confirm. SPEC §3.3
54
+ // requires us to be able to verify *every* dependency's license, so
55
+ // this still fails the check — but the details now make it clear this
56
+ // is a resolution gap, not a confirmed violation, and re-running may
57
+ // succeed (registry mirror, package republished, etc.).
58
+ const detName = (s) => s.split("@")[0];
59
+ const ecosystems = detections.map((d) => d.ecosystem).join("+");
50
60
  return {
51
61
  result: {
52
62
  pass: false,
53
- details: `${allMissing.length} unresolved entr${allMissing.length === 1 ? "y" : "ies"}: ${allMissing.slice(0, 5).join(" | ")}${allMissing.length > 5 ? `, +${allMissing.length - 5} more` : ""}`,
63
+ details: `${allMissing.length} dependenc${allMissing.length === 1 ? "y" : "ies"} (${ecosystems}) had no resolvable license — registry lookup failed (retry-eligible; these may be OSI-licensed but we can't confirm):\n - ${allMissing.slice(0, 10).map(detName).join("\n - ")}${allMissing.length > 10 ? `\n +${allMissing.length - 10} more` : ""}`,
54
64
  },
55
65
  sbomHash,
56
66
  sbomFormat: "cyclonedx-1.5",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@better-internet/oss-verify",
3
- "version": "0.1.1",
3
+ "version": "0.1.3",
4
4
  "private": false,
5
5
  "publishConfig": {
6
6
  "access": "public"