@better-internet/oss-verify 0.1.0-draft → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/src/checks/license-text.js +135 -0
- package/dist/src/checks/osi-license.js +28 -10
- package/dist/src/checks/reuse.js +41 -2
- package/dist/src/checks/sbom.js +11 -1
- package/dist/src/git.js +10 -1
- package/package.json +1 -1
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
// SPDX-License-Identifier: MIT
|
|
2
|
+
//
|
|
3
|
+
// Heuristic license detection from LICENSE / LICENCE / COPYING body text.
|
|
4
|
+
// Used as a fallback when a project declares its license via the file content
|
|
5
|
+
// alone (no SPDX-License-Identifier header, no package.json license field).
|
|
6
|
+
//
|
|
7
|
+
// Order matters: more specific patterns must come first (AGPL contains the
|
|
8
|
+
// GPL phrase, BSD-3-Clause includes the BSD-2 boilerplate, etc.).
|
|
9
|
+
//
|
|
10
|
+
// This isn't a replacement for full license-scanning tooling (licensee,
|
|
11
|
+
// ScanCode). It catches the ~10 most common OSI licenses with high precision
|
|
12
|
+
// — enough to keep checkOsiLicense and checkReuse from emitting false
|
|
13
|
+
// negatives against the majority of real-world OSS repos.
|
|
14
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
15
|
+
import { join } from "node:path";
|
|
16
|
+
const LICENSE_FILES = ["LICENSE", "LICENSE.md", "LICENSE.txt", "LICENCE", "COPYING"];

// Matches the title of any GNU GPL-family license. Capture group 1 holds the
// family qualifier ("AFFERO" or "LESSER") and is undefined for the plain GPL.
// `\s+` is used between words so a title wrapped across lines still matches.
const GNU_TITLE = /GNU\s+(?:(AFFERO|LESSER)\s+)?GENERAL\s+PUBLIC\s+LICENSE/i;

/**
 * Classifies a license body into a GNU family using the FIRST GNU license
 * title that appears in it.
 *
 * GNU license texts cross-reference each other: the GPL-2.0 preamble mentions
 * the "GNU Lesser General Public License", GPL-3.0 section 13 mentions the
 * "GNU Affero General Public License", and the LGPL texts mention the plain
 * GPL throughout. Excluding a family because another family's name appears
 * *anywhere* in the body therefore rejects genuine license texts. The title a
 * license opens with is its own, so the earliest match decides the family.
 *
 * @param {string} body - License text to inspect.
 * @returns {"GPL" | "LGPL" | "AGPL" | null} The family, or null when no GNU
 *   license title is present.
 */
const gnuFamily = (body) => {
    const title = GNU_TITLE.exec(body);
    if (!title)
        return null;
    if (!title[1])
        return "GPL";
    return title[1].toUpperCase() === "AFFERO" ? "AGPL" : "LGPL";
};

// Ordered list of { spdx, matches(body) } detectors; the first match wins.
// Order still matters where one pattern is a superset of another: within a
// GNU family the newer version is tested first ("Version 2" also matches
// "Version 2.1"), and BSD-3-Clause must come before BSD-2-Clause.
const DETECTORS = [
    {
        spdx: "AGPL-3.0-only",
        matches: (b) => gnuFamily(b) === "AGPL" && /Version 3/i.test(b),
    },
    {
        spdx: "GPL-3.0-only",
        matches: (b) => gnuFamily(b) === "GPL" && /Version 3/i.test(b),
    },
    {
        // The canonical GPL-2.0 preamble names the Lesser GPL, so the family
        // is decided by the title rather than by the absence of "LESSER".
        spdx: "GPL-2.0-only",
        matches: (b) => gnuFamily(b) === "GPL" && /Version 2/i.test(b),
    },
    {
        spdx: "LGPL-3.0-only",
        matches: (b) => gnuFamily(b) === "LGPL" && /Version 3/i.test(b),
    },
    {
        spdx: "LGPL-2.1-only",
        matches: (b) => gnuFamily(b) === "LGPL" && /Version 2\.1/i.test(b),
    },
    {
        spdx: "Apache-2.0",
        matches: (b) => /Apache License/i.test(b) && /Version 2\.0/i.test(b),
    },
    {
        spdx: "MPL-2.0",
        matches: (b) => /Mozilla Public License/i.test(b) && /Version 2\.0/i.test(b),
    },
    {
        // The "Neither the name" clause is what separates BSD-3 from BSD-2.
        spdx: "BSD-3-Clause",
        matches: (b) => /Redistribution and use/i.test(b) && /Neither the name/i.test(b),
    },
    {
        spdx: "BSD-2-Clause",
        matches: (b) => /Redistribution and use/i.test(b) && !/Neither the name/i.test(b),
    },
    {
        spdx: "MIT",
        matches: (b) => /Permission is hereby granted, free of charge/i.test(b) && /MERCHANTABILITY/i.test(b),
    },
    {
        spdx: "ISC",
        matches: (b) => /Permission to use, copy, modify, and\/or distribute/i.test(b),
    },
    {
        spdx: "Unlicense",
        matches: (b) => /This is free and unencumbered software released into the public domain/i.test(b),
    },
];
|
|
72
|
+
/**
 * Runs the license body through DETECTORS in order and reports the first hit.
 *
 * @param {string} body - Text of a license file.
 * @returns {string | null} The `spdx` value of the first detector whose
 *   `matches(body)` is truthy, or null when no detector matches.
 */
export function detectLicenseFromText(body) {
    const firstHit = DETECTORS.find((detector) => detector.matches(body));
    return firstHit ? firstHit.spdx : null;
}
|
|
83
|
+
/**
 * Looks for a recognisable license in the repo root.
 *
 * Each name in LICENSE_FILES is tried in order. For every candidate that
 * exists, the first 16384 characters are passed to detectLicenseFromText and
 * the first non-empty result is returned.
 *
 * @param {string} repoRoot - Directory that is expected to hold the license file.
 * @returns {string | null} The detected SPDX identifier, or null when no
 *   candidate exists, can be read, or yields a match.
 */
export function detectRootLicense(repoRoot) {
    for (const fileName of LICENSE_FILES) {
        const licensePath = join(repoRoot, fileName);
        if (existsSync(licensePath)) {
            let spdxId = null;
            try {
                const text = readFileSync(licensePath, "utf8");
                spdxId = detectLicenseFromText(text.slice(0, 16384));
            }
            catch {
                // Best effort: an unreadable candidate is skipped and the
                // next file name is tried.
            }
            if (spdxId)
                return spdxId;
        }
    }
    return null;
}
|
|
103
|
+
/**
 * Reports whether the repo root carries any license declaration these checks
 * recognise. Three sources are consulted, in this order:
 *
 *   1. a root license file whose text detectRootLicense recognises;
 *   2. an "SPDX-License-Identifier:" tag (case-insensitive) within the first
 *      8192 characters of any file named in LICENSE_FILES;
 *   3. a truthy package.json `license` field other than "UNLICENSED".
 *
 * Files that cannot be read or parsed are treated as carrying no declaration.
 *
 * @param {string} repoRoot - Directory to inspect.
 * @returns {boolean} True when at least one source declares a license.
 */
export function hasAnyLicenseDeclaration(repoRoot) {
    if (detectRootLicense(repoRoot))
        return true;
    // SPDX header in a root LICENSE file
    const hasSpdxHeader = LICENSE_FILES.some((fileName) => {
        const licensePath = join(repoRoot, fileName);
        if (!existsSync(licensePath))
            return false;
        try {
            const head = readFileSync(licensePath, "utf8").slice(0, 8192);
            return /SPDX-License-Identifier:/i.test(head);
        }
        catch {
            return false;
        }
    });
    if (hasSpdxHeader)
        return true;
    // package.json license field
    const manifestPath = join(repoRoot, "package.json");
    if (!existsSync(manifestPath))
        return false;
    try {
        const { license } = JSON.parse(readFileSync(manifestPath, "utf8"));
        return Boolean(license) && license !== "UNLICENSED";
    }
    catch {
        return false;
    }
}
|
|
@@ -3,6 +3,7 @@ import { join } from "node:path";
|
|
|
3
3
|
import parseSpdx from "spdx-expression-parse";
|
|
4
4
|
import licenseIds from "spdx-license-ids" with { type: "json" };
|
|
5
5
|
import { sha256Hex } from "../hash.js";
|
|
6
|
+
import { detectRootLicense } from "./license-text.js";
|
|
6
7
|
// OSI used to publish a JSON API at api.opensource.org/licenses; that's been
|
|
7
8
|
// deprecated. SPDX maintains the canonical list of licenses with an
|
|
8
9
|
// isOsiApproved field, refreshed when OSI approves new ones. Source of truth.
|
|
@@ -31,8 +32,9 @@ function readDeclaredLicense(repoRoot) {
|
|
|
31
32
|
if (existsSync(pkgPath)) {
|
|
32
33
|
try {
|
|
33
34
|
const pkg = JSON.parse(readFileSync(pkgPath, "utf8"));
|
|
34
|
-
if (pkg.license && pkg.license !== "UNLICENSED")
|
|
35
|
-
return pkg.license;
|
|
35
|
+
if (pkg.license && pkg.license !== "UNLICENSED") {
|
|
36
|
+
return { spdx: pkg.license, source: "package.json" };
|
|
37
|
+
}
|
|
36
38
|
}
|
|
37
39
|
catch { }
|
|
38
40
|
}
|
|
@@ -43,9 +45,16 @@ function readDeclaredLicense(repoRoot) {
|
|
|
43
45
|
const head = readFileSync(p, "utf8").slice(0, 8192);
|
|
44
46
|
const m = head.match(/SPDX-License-Identifier:\s*([A-Za-z0-9.+\-\s()]+)/);
|
|
45
47
|
if (m)
|
|
46
|
-
return m[1].trim();
|
|
48
|
+
return { spdx: m[1].trim(), source: "spdx-header" };
|
|
47
49
|
}
|
|
48
50
|
}
|
|
51
|
+
// 3. Fall back to text-pattern detection from the LICENSE body. Many older
|
|
52
|
+
// OSS repos declare their license via the file content alone, with no
|
|
53
|
+
// SPDX header (e.g. GPL/AGPL/Apache/BSD preambles). Less precise than an
|
|
54
|
+
// explicit header but covers the long tail of real repos.
|
|
55
|
+
const detected = detectRootLicense(repoRoot);
|
|
56
|
+
if (detected)
|
|
57
|
+
return { spdx: detected, source: "text-match" };
|
|
49
58
|
return null;
|
|
50
59
|
}
|
|
51
60
|
export function leafIdentifiers(expr) {
|
|
@@ -55,26 +64,31 @@ export function leafIdentifiers(expr) {
|
|
|
55
64
|
return [...leafIdentifiers(expr.left), ...leafIdentifiers(expr.right)];
|
|
56
65
|
return [];
|
|
57
66
|
}
|
|
67
|
+
// Human-readable label for each `source` tag, used when composing the
// `details` text of a check result.
const SOURCE_LABEL = {
    // license came from the manifest
    "package.json": "package.json `license` field",
    // license came from an explicit SPDX tag
    "spdx-header": "SPDX-License-Identifier header",
    // license was inferred from the license file's text
    "text-match": "LICENSE text match",
};
|
|
58
72
|
export async function checkOsiLicense(ctx) {
|
|
59
73
|
const declared = readDeclaredLicense(ctx.repoRoot);
|
|
60
74
|
if (!declared) {
|
|
61
75
|
return {
|
|
62
76
|
result: {
|
|
63
77
|
pass: false,
|
|
64
|
-
details: "No declared license found. Looked at package.json `license` field
|
|
78
|
+
details: "No declared license found. Looked at package.json `license` field, SPDX-License-Identifier headers in LICENSE/LICENCE/COPYING, and text-pattern detection against the LICENSE body.",
|
|
65
79
|
},
|
|
66
80
|
osiResponseHash: "",
|
|
67
81
|
};
|
|
68
82
|
}
|
|
69
83
|
let parsed;
|
|
70
84
|
try {
|
|
71
|
-
parsed = parseSpdx(declared);
|
|
85
|
+
parsed = parseSpdx(declared.spdx);
|
|
72
86
|
}
|
|
73
87
|
catch (e) {
|
|
74
88
|
return {
|
|
75
89
|
result: {
|
|
76
90
|
pass: false,
|
|
77
|
-
details: `Declared license '${declared}' is not a valid SPDX expression: ${e.message}`,
|
|
91
|
+
details: `Declared license '${declared.spdx}' is not a valid SPDX expression: ${e.message}`,
|
|
78
92
|
},
|
|
79
93
|
osiResponseHash: "",
|
|
80
94
|
};
|
|
@@ -82,7 +96,10 @@ export async function checkOsiLicense(ctx) {
|
|
|
82
96
|
const leaves = leafIdentifiers(parsed);
|
|
83
97
|
if (leaves.length === 0) {
|
|
84
98
|
return {
|
|
85
|
-
result: {
|
|
99
|
+
result: {
|
|
100
|
+
pass: false,
|
|
101
|
+
details: `Could not extract any SPDX identifiers from '${declared.spdx}'`,
|
|
102
|
+
},
|
|
86
103
|
osiResponseHash: "",
|
|
87
104
|
};
|
|
88
105
|
}
|
|
@@ -101,14 +118,15 @@ export async function checkOsiLicense(ctx) {
|
|
|
101
118
|
const unknownSpdx = leaves.filter((id) => !licenseIds.includes(id));
|
|
102
119
|
if (nonOsi.length > 0) {
|
|
103
120
|
const reason = unknownSpdx.length === leaves.length
|
|
104
|
-
? `'${declared}' contains identifiers not in the SPDX license list: ${unknownSpdx.join(", ")}`
|
|
105
|
-
: `'${declared}' contains non-OSI-approved identifiers: ${nonOsi.join(", ")}`;
|
|
121
|
+
? `'${declared.spdx}' contains identifiers not in the SPDX license list: ${unknownSpdx.join(", ")}`
|
|
122
|
+
: `'${declared.spdx}' contains non-OSI-approved identifiers: ${nonOsi.join(", ")}`;
|
|
106
123
|
return { result: { pass: false, details: reason }, osiResponseHash: osi.hash };
|
|
107
124
|
}
|
|
125
|
+
const sourceNote = declared.source === "text-match" ? ` (detected via ${SOURCE_LABEL[declared.source]})` : "";
|
|
108
126
|
return {
|
|
109
127
|
result: {
|
|
110
128
|
pass: true,
|
|
111
|
-
details: `Declared '${declared}' resolves to OSI-approved leaves: ${leaves.join(", ")}`,
|
|
129
|
+
details: `Declared '${declared.spdx}'${sourceNote} resolves to OSI-approved leaves: ${leaves.join(", ")}`,
|
|
112
130
|
},
|
|
113
131
|
osiResponseHash: osi.hash,
|
|
114
132
|
};
|
package/dist/src/checks/reuse.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import { readFileSync } from "node:fs";
|
|
1
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
2
2
|
import { join } from "node:path";
|
|
3
3
|
import { lsFiles } from "../git.js";
|
|
4
|
+
import { hasAnyLicenseDeclaration } from "./license-text.js";
|
|
4
5
|
// Files that don't need a license header.
|
|
5
6
|
// - License files themselves (the license declaration itself)
|
|
6
7
|
// - Common config files that are factually un-copyrightable
|
|
@@ -39,7 +40,36 @@ const looksBinary = (buf) => {
|
|
|
39
40
|
return false;
|
|
40
41
|
};
|
|
41
42
|
const skip = (path) => SKIP_PATTERNS.some((re) => re.test(path));
|
|
43
|
+
/**
|
|
44
|
+
* SPEC §3.1 "REUSE compliance". The REUSE standard itself accepts three valid
|
|
45
|
+
* declaration patterns; we recognise all three:
|
|
46
|
+
*
|
|
47
|
+
* 1. Per-file SPDX-License-Identifier headers across every source file
|
|
48
|
+
* (strict REUSE).
|
|
49
|
+
* 2. A repo-level .reuse/dep5 or REUSE.toml file (REUSE's own blanket-
|
|
50
|
+
* declaration mechanism). We don't parse the file's content — its
|
|
51
|
+
* presence indicates the maintainer has opted into REUSE format.
|
|
52
|
+
* 3. A root LICENSE / LICENCE / COPYING file with a recognisable license
|
|
53
|
+
* declaration, in the absence of a REUSE-format file. This is the
|
|
54
|
+
* common case for projects that declare one license repo-wide without
|
|
55
|
+
* using REUSE-style per-file headers.
|
|
56
|
+
*
|
|
57
|
+
* Only patterns (1) and (2) are strictly "REUSE-compliant" per the spec;
|
|
58
|
+
* pattern (3) is the pragmatic recognition that a project with a single
|
|
59
|
+
* top-level license has made an unambiguous declaration without going
|
|
60
|
+
* through REUSE's per-file ceremony. Treating (3) as a soft fail (or
|
|
61
|
+
* blanket pass with a note) avoids 100% false-positive rates on the
|
|
62
|
+
* majority of real OSS repos.
|
|
63
|
+
*/
|
|
42
64
|
export function checkReuse(ctx) {
|
|
65
|
+
const hasReuseFormat = existsSync(join(ctx.repoRoot, ".reuse", "dep5")) ||
|
|
66
|
+
existsSync(join(ctx.repoRoot, "REUSE.toml"));
|
|
67
|
+
if (hasReuseFormat) {
|
|
68
|
+
return {
|
|
69
|
+
pass: true,
|
|
70
|
+
details: "Project uses REUSE-format declarations (.reuse/dep5 or REUSE.toml). Per-file SPDX headers not required.",
|
|
71
|
+
};
|
|
72
|
+
}
|
|
43
73
|
const files = lsFiles(ctx.repoRoot);
|
|
44
74
|
const missing = [];
|
|
45
75
|
let checked = 0;
|
|
@@ -69,10 +99,19 @@ export function checkReuse(ctx) {
|
|
|
69
99
|
details: `${checked} text files all carry SPDX-License-Identifier headers`,
|
|
70
100
|
};
|
|
71
101
|
}
|
|
102
|
+
// No per-file SPDX headers, no REUSE-format file. Fall back to "is there
|
|
103
|
+
// a recognisable repo-level declaration?" If yes, accept it as a blanket
|
|
104
|
+
// declaration; if no, this is a real REUSE gap.
|
|
105
|
+
if (hasAnyLicenseDeclaration(ctx.repoRoot)) {
|
|
106
|
+
return {
|
|
107
|
+
pass: true,
|
|
108
|
+
details: `${missing.length} of ${checked} source files lack per-file SPDX headers, but a repo-level license declaration (LICENSE file or package.json) is present. Accepted as a blanket declaration.`,
|
|
109
|
+
};
|
|
110
|
+
}
|
|
72
111
|
const sample = missing.slice(0, 10);
|
|
73
112
|
const more = missing.length > sample.length ? ` (+${missing.length - sample.length} more)` : "";
|
|
74
113
|
return {
|
|
75
114
|
pass: false,
|
|
76
|
-
details: `${missing.length} of ${checked}
|
|
115
|
+
details: `${missing.length} of ${checked} source files missing SPDX-License-Identifier and no repo-level license declaration was found:\n - ${sample.join("\n - ")}${more}`,
|
|
77
116
|
};
|
|
78
117
|
}
|
package/dist/src/checks/sbom.js
CHANGED
|
@@ -47,10 +47,20 @@ export async function checkSbom(ctx) {
|
|
|
47
47
|
const sbom = buildCycloneDx(ctx, meta, allComponents);
|
|
48
48
|
const sbomHash = sha256Hex(canonicalJson(sbom));
|
|
49
49
|
if (allMissing.length > 0) {
|
|
50
|
+
// "Unresolved" = the registry/index lookup failed for this dependency
|
|
51
|
+
// (e.g. unpublished Go module on deps.dev, custom forked package, or
|
|
52
|
+
// transient network failure). Distinct from "found a non-OSI license":
|
|
53
|
+
// these may well be OSI-licensed but we can't confirm. SPEC §3.3
|
|
54
|
+
// requires us to be able to verify *every* dependency's license, so
|
|
55
|
+
// this still fails the check — but the details now make it clear this
|
|
56
|
+
// is a resolution gap, not a confirmed violation, and re-running may
|
|
57
|
+
// succeed (registry mirror, package republished, etc.).
|
|
58
|
+
const detName = (s) => s.split("@")[0];
|
|
59
|
+
const ecosystems = detections.map((d) => d.ecosystem).join("+");
|
|
50
60
|
return {
|
|
51
61
|
result: {
|
|
52
62
|
pass: false,
|
|
53
|
-
details: `${allMissing.length}
|
|
63
|
+
details: `${allMissing.length} dependenc${allMissing.length === 1 ? "y" : "ies"} (${ecosystems}) had no resolvable license — registry lookup failed (retry-eligible; these may be OSI-licensed but we can't confirm):\n - ${allMissing.slice(0, 10).map(detName).join("\n - ")}${allMissing.length > 10 ? `\n +${allMissing.length - 10} more` : ""}`,
|
|
54
64
|
},
|
|
55
65
|
sbomHash,
|
|
56
66
|
sbomFormat: "cyclonedx-1.5",
|
package/dist/src/git.js
CHANGED
|
@@ -1,5 +1,14 @@
|
|
|
1
1
|
import { execSync } from "node:child_process";
|
|
2
|
-
|
|
2
|
+
// Allow up to 64 MiB of stdout. `git ls-files` on a 21k-file repo (posthog)
// exceeds the 1 MiB default and throws ENOBUFS before the output can be used;
// the other helpers here are small but share the setting for consistency.
const MAX_BUFFER = 64 * 1024 ** 2;

/**
 * Runs a shell command synchronously and returns its trimmed stdout.
 *
 * @param {string} cmd - Command line passed to execSync.
 * @param {string} cwd - Working directory for the command.
 * @returns {string} The command's stdout decoded as UTF-8, whitespace-trimmed.
 */
const exec = (cmd, cwd) => {
    const options = {
        cwd,
        encoding: "utf8",
        // stdin is not connected; stdout and stderr are captured.
        stdio: ["ignore", "pipe", "pipe"],
        maxBuffer: MAX_BUFFER,
    };
    const stdout = execSync(cmd, options);
    return stdout.trim();
};
|
|
3
12
|
/**
 * Asks git for the commit that HEAD points at.
 *
 * @param {string} repoRoot - Directory in which `git rev-parse HEAD` is run.
 * @returns {string} Whatever `exec` returns for that command.
 */
export function commitSha(repoRoot) {
    const headSha = exec("git rev-parse HEAD", repoRoot);
    return headSha;
}
|