@better-internet/oss-verify 0.1.0-draft

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,78 @@
1
+ import { readFileSync } from "node:fs";
2
+ import { join } from "node:path";
3
+ import { lsFiles } from "../git.js";
4
// Files that don't need a license header.
// - License files themselves (the license declaration itself), including
//   nested per-package copies and the REUSE `LICENSES/` directory
// - Common config files that are factually un-copyrightable
// - Generated / lock files
//
// Each pattern is tested against the path handed to `skip()`; presumably that
// is a repo-relative, `/`-separated path (the lock-file patterns already rely
// on this), so anything that can live in a subdirectory uses `(^|\/)`.
const SKIP_PATTERNS = [
  /^LICENSE(\..+)?$/i,
  /^LICENCE(\..+)?$/i,
  /^COPYING(\..+)?$/i,
  /^NOTICE(\..+)?$/i,
  // Nested license files (monorepo packages, vendored code). Restricted to
  // bare names and .txt/.md variants so that real sources such as
  // `src/components/Notice.tsx` are still checked.
  /\/(LICEN[SC]E|COPYING|NOTICE)(\.(txt|md))?$/i,
  // REUSE keeps the license texts themselves under a LICENSES/ directory.
  /(^|\/)LICENSES\//,
  /(^|\/)\.gitignore$/,
  /(^|\/)\.gitattributes$/,
  /(^|\/)\.editorconfig$/,
  /(^|\/)\.npmrc$/,
  /(^|\/)\.nvmrc$/,
  /(^|\/)\.tool-versions$/,
  /(^|\/)pnpm-lock\.yaml$/,
  /(^|\/)package-lock\.json$/,
  /(^|\/)yarn\.lock$/,
  /(^|\/)bun\.lock(b)?$/,
  /(^|\/)Cargo\.lock$/,
  /(^|\/)go\.sum$/,
  /(^|\/)poetry\.lock$/,
  /\.terraform\.lock\.hcl$/,
  // JSON has no comment syntax — cannot carry an inline SPDX header. Real
  // REUSE-compliant projects register these via .reuse/dep5 or REUSE.toml;
  // MVP CLI doesn't yet parse those, so we skip JSON to avoid false negatives.
  /\.json$/,
  /\.jsonc$/,
];
// Matches an SPDX tag anywhere in the inspected text; the capture group holds
// the license expression that follows the colon.
const SPDX_HEADER_RE = /SPDX-License-Identifier:\s*([A-Za-z0-9.+\-\s()]+)/;
34
/**
 * Cheap binary sniff: reports true when a NUL byte appears within the first
 * 4096 bytes of `buf`; bytes past that window are never inspected.
 */
const looksBinary = (buf) => buf.subarray(0, 4096).includes(0);
41
/** True when `path` matches at least one entry of SKIP_PATTERNS. */
const skip = (path) => {
  for (const pattern of SKIP_PATTERNS) {
    if (pattern.test(path)) {
      return true;
    }
  }
  return false;
};
42
/**
 * Checks that every tracked text file carries an SPDX-License-Identifier tag.
 *
 * Files matching `skip()`, unreadable entries, zero-byte files and files that
 * `looksBinary()` flags are excluded from the count.
 *
 * @param {{ repoRoot: string }} ctx - `repoRoot` is passed to `lsFiles` and
 *   used as the base directory for each listed path.
 * @returns {{ pass: boolean, details: string }} `pass` is true when no checked
 *   file lacks the tag; `details` is a human-readable summary that lists up to
 *   ten offending paths.
 */
export function checkReuse(ctx) {
  const files = lsFiles(ctx.repoRoot);
  const missing = [];
  let checked = 0;
  for (const rel of files) {
    if (skip(rel)) {
      continue;
    }
    let buf;
    try {
      buf = readFileSync(join(ctx.repoRoot, rel));
    } catch {
      continue; // symlink to non-file, etc.
    }
    // Zero-byte files (.gitkeep, empty __init__.py, ...) have no content to
    // license; REUSE tooling ignores them, so don't report them as missing.
    if (buf.length === 0) {
      continue;
    }
    if (looksBinary(buf)) {
      continue;
    }
    checked++;
    // Only the first 8 KB is searched for the SPDX tag.
    const head = buf.subarray(0, 8192).toString("utf8");
    if (!SPDX_HEADER_RE.test(head)) {
      missing.push(rel);
    }
  }
  if (missing.length === 0) {
    return {
      pass: true,
      details: `${checked} text files all carry SPDX-License-Identifier headers`,
    };
  }
  const sample = missing.slice(0, 10);
  const more = missing.length > sample.length ? ` (+${missing.length - sample.length} more)` : "";
  return {
    pass: false,
    details: `${missing.length} of ${checked} text files missing SPDX-License-Identifier:\n - ${sample.join("\n - ")}${more}\n\nNote: this MVP doesn't yet honor .reuse/dep5 or REUSE.toml exemptions; that's a known limitation.`,
  };
}
@@ -0,0 +1,124 @@
1
+ // Cargo (Rust) ecosystem detector.
2
+ //
3
+ // Cargo.lock is the source of truth for the resolved dependency graph. It's
4
+ // a TOML file with `[[package]]` entries. We parse the minimal subset we
5
+ // need (name + version + dependencies) without pulling in a TOML library.
6
+ //
7
+ // Per-crate license metadata isn't in Cargo.lock — we have to look it up
8
+ // from the crates.io API:
9
+ // GET https://crates.io/api/v1/crates/<name>/<version>
10
+ // -> { "version": { "license": "MIT OR Apache-2.0", ... } }
11
+ //
12
+ // Lookups are concurrency-limited (CRATESIO_CONCURRENCY) and memoised
13
+ // in-process by (name, version), so repeated lookups within one run are fast and crates.io stays happy.
14
+ //
15
+ // Scope: includes ALL packages from Cargo.lock — Rust's dev-dependency
16
+ // distinction lives in Cargo.toml not Cargo.lock, and parsing the dep tree
17
+ // to filter would require a real resolver. SPEC §3.3 says runtime deps only,
18
+ // so this is conservative: we may include a few dev/build deps in the
19
+ // audited set. Fixing it requires shipping cargo-metadata output alongside,
20
+ // which is out of scope for the MVP detector.
21
+ import { existsSync, readFileSync } from "node:fs";
22
+ import { join } from "node:path";
23
const CRATESIO = "https://crates.io/api/v1/crates";
const CRATESIO_CONCURRENCY = 4;
const USER_AGENT = "oss-verify/0.1 (https://github.com/better-internet-org/oss-verify)";
/**
 * Cargo detector entry point.
 *
 * Reads `<repoRoot>/Cargo.lock`, drops entries without a `source` (local
 * packages), and resolves a license for every remaining entry through
 * `fetchCrateLicense`, running at most CRATESIO_CONCURRENCY lookups at once.
 *
 * @param {{ repoRoot: string }} ctx
 * @returns {Promise<null | { ecosystem: "cargo", components: object[], missing: string[], details?: string }>}
 *   `null` when there is no Cargo.lock. Otherwise `components` (sorted by name,
 *   then version) holds the entries whose license was resolved and `missing`
 *   (sorted) holds `name@version` for the rest, or a read-failure message.
 */
export async function detect(ctx) {
  const lockPath = join(ctx.repoRoot, "Cargo.lock");
  if (!existsSync(lockPath)) {
    return null;
  }
  let lockText;
  try {
    lockText = readFileSync(lockPath, "utf8");
  } catch (e) {
    return {
      ecosystem: "cargo",
      components: [],
      missing: [`Cargo.lock read failed: ${e.message}`],
    };
  }
  const packages = parseCargoLock(lockText);
  if (packages.length === 0) {
    return { ecosystem: "cargo", components: [], missing: [], details: "no Cargo.lock packages" };
  }
  // Filter out the root package — Cargo.lock includes the project itself.
  // Best signal: entries that have no `source` line are local (the workspace
  // root or path-deps). Drop them so the SBOM is about EXTERNAL deps only.
  const external = packages.filter((p) => p.source !== null);
  const components = [];
  const missing = [];
  // Workers share one cursor instead of shifting a queue; each claims the next
  // index synchronously before awaiting, so no entry is fetched twice.
  let next = 0;
  const worker = async () => {
    while (next < external.length) {
      const p = external[next++];
      const license = await fetchCrateLicense(p.name, p.version);
      if (license === undefined) {
        missing.push(`${p.name}@${p.version}`);
        continue;
      }
      components.push({
        name: p.name,
        version: p.version,
        license,
        purl: `pkg:cargo/${p.name}@${p.version}`,
      });
    }
  };
  await Promise.all(Array.from({ length: CRATESIO_CONCURRENCY }, () => worker()));
  // Results arrive in network-completion order; sort both lists so the report
  // is identical from run to run. The comparator returns 0 for equal keys.
  const compareText = (a, b) => (a < b ? -1 : a > b ? 1 : 0);
  components.sort((a, b) => compareText(a.name, b.name) || compareText(a.version, b.version));
  missing.sort(compareText);
  return { ecosystem: "cargo", components, missing };
}
75
/**
 * Minimal Cargo.lock parser. The file is TOML with one top-level
 * `[[package]]` array; only name/version/source are extracted. Lock files are
 * machine-generated and the format is stable, so we lean on simple regexes
 * rather than depending on a TOML library.
 *
 * @param {string} text - Full contents of Cargo.lock.
 * @returns {{ name: string, version: string, source: string | null }[]}
 *   One entry per `[[package]]` table that has both a name and a version;
 *   `source` is null when the table has no `source` line.
 */
function parseCargoLock(text) {
  const packages = [];
  // Element 0 is whatever precedes the first `[[package]]` header.
  const blocks = text.split(/^\[\[package\]\]\s*$/m);
  for (const raw of blocks.slice(1)) {
    // The last chunk runs to end-of-file and may include other tables such as
    // `[metadata]` or `[[patch.unused]]`. Cut at the next table header so their
    // fields (notably `source`) are not attributed to this package.
    const nextTable = raw.search(/^\[/m);
    const block = nextTable === -1 ? raw : raw.slice(0, nextTable);
    const name = matchField(block, "name");
    const version = matchField(block, "version");
    const source = matchField(block, "source");
    if (name && version) {
      packages.push({ name, version, source });
    }
  }
  return packages;
}
/**
 * Returns the double-quoted value of the first `field = "..."` line in
 * `block`, or null when there is none. `field` is inserted into the pattern
 * verbatim, so it must not contain regex metacharacters.
 */
function matchField(block, field) {
  const re = new RegExp(`^${field}\\s*=\\s*"([^"]+)"`, "m");
  const m = block.match(re);
  return m ? m[1] : null;
}
99
// Memo of lookup outcomes keyed by "name@version". A stored `undefined`
// records a lookup that failed or produced no license, so it is not retried.
const licenseCache = new Map();
/**
 * Resolves the license string crates.io reports for one crate version.
 *
 * @param {string} name
 * @param {string} version
 * @returns {Promise<string | undefined>} The trimmed `version.license` value,
 *   or undefined when the response is not OK, the field is absent or blank,
 *   or the request/JSON decoding throws.
 */
async function fetchCrateLicense(name, version) {
  const key = `${name}@${version}`;
  if (licenseCache.has(key)) {
    return licenseCache.get(key);
  }
  let outcome;
  try {
    const endpoint = [CRATESIO, encodeURIComponent(name), encodeURIComponent(version)].join("/");
    const response = await fetch(endpoint, {
      headers: { accept: "application/json", "user-agent": USER_AGENT },
    });
    if (response.ok) {
      const payload = await response.json();
      // crates.io returns a SPDX expression in `version.license` (since 2020).
      const spdx = payload.version?.license?.trim();
      outcome = spdx && spdx.length > 0 ? spdx : undefined;
    }
  } catch {
    outcome = undefined;
  }
  licenseCache.set(key, outcome);
  return outcome;
}
@@ -0,0 +1,137 @@
1
+ // Go modules ecosystem detector.
2
+ //
3
+ // Strategy:
4
+ // 1. Read go.mod to find the project's own `module` path (so we don't
5
+ // flag self-references as missing).
6
+ // 2. Parse go.sum to enumerate every (module, version) pair Go's
7
+ // resolver pinned. go.sum lists each dep twice (content hash and
8
+ // go.mod hash); we dedupe.
9
+ // 3. Look up the license for each via deps.dev v3:
10
+ // GET https://api.deps.dev/v3/systems/GO/packages/<urlencoded-name>/versions/<version>
11
+ // -> { "licenses": ["MIT", "Apache-2.0"], ... }
12
+ //
13
+ // Lookups are concurrency-limited (DEPSDEV_CONCURRENCY) and memoised
13
+ // in-process by (name, version), so repeated lookups within one run are fast.
15
+ //
16
+ // Scope note: go.sum doesn't distinguish runtime vs test/build deps. The
17
+ // SPEC §3.3 requirement is runtime-only, but accurate filtering requires
18
+ // `go list -m -json all` or `go mod why` output that's not in the lockfile.
19
+ // We err conservative: audit all modules in go.sum. If a test-only dep
20
+ // has a non-OSI license, the criterion fails. The remedy for affected
21
+ // projects is to remove or replace the offending dep — strictly correct
22
+ // per the SPEC; mildly inconvenient in edge cases.
23
+ import { existsSync, readFileSync } from "node:fs";
24
+ import { join } from "node:path";
25
const DEPSDEV = "https://api.deps.dev/v3/systems/GO/packages";
const DEPSDEV_CONCURRENCY = 4;
const USER_AGENT = "oss-verify/0.1 (https://github.com/better-internet-org/oss-verify)";
/**
 * Go modules detector entry point.
 *
 * Requires `<repoRoot>/go.mod`; enumerates (module, version) pairs from
 * `<repoRoot>/go.sum` via `parseGoSum` and resolves a license for each through
 * `fetchGoLicense`, running at most DEPSDEV_CONCURRENCY lookups at once.
 *
 * @param {{ repoRoot: string }} ctx
 * @returns {Promise<null | { ecosystem: "go", components: object[], missing: string[], details?: string }>}
 *   `null` when there is no go.mod. Otherwise `components` (sorted by name,
 *   then version) holds the modules whose license was resolved and `missing`
 *   (sorted) holds `name@version` for the rest, or a read-failure message.
 */
export async function detect(ctx) {
  const sumPath = join(ctx.repoRoot, "go.sum");
  const modPath = join(ctx.repoRoot, "go.mod");
  if (!existsSync(modPath)) {
    return null;
  }
  // Empty go.sum is legal if the project has no deps at all.
  if (!existsSync(sumPath)) {
    return { ecosystem: "go", components: [], missing: [], details: "no go.sum (no deps)" };
  }
  const selfModule = readGoMod(modPath);
  let sumText;
  try {
    sumText = readFileSync(sumPath, "utf8");
  } catch (e) {
    // Report an unreadable go.sum instead of letting the exception escape.
    return {
      ecosystem: "go",
      components: [],
      missing: [`go.sum read failed: ${e.message}`],
    };
  }
  const pairs = parseGoSum(sumText, selfModule);
  if (pairs.length === 0) {
    return { ecosystem: "go", components: [], missing: [], details: "no go.sum entries" };
  }
  const components = [];
  const missing = [];
  // Workers share one cursor; each claims the next index synchronously before
  // awaiting, so no pair is fetched twice.
  let next = 0;
  const worker = async () => {
    while (next < pairs.length) {
      const p = pairs[next++];
      const license = await fetchGoLicense(p.name, p.version);
      if (license === undefined) {
        missing.push(`${p.name}@${p.version}`);
        continue;
      }
      components.push({
        name: p.name,
        version: p.version,
        license,
        // The module path is used verbatim, including any major-version suffix.
        purl: `pkg:golang/${p.name}@${p.version}`,
      });
    }
  };
  await Promise.all(Array.from({ length: DEPSDEV_CONCURRENCY }, () => worker()));
  // Results arrive in network-completion order; sort both lists so the report
  // is identical from run to run. The comparator returns 0 for equal keys.
  const compareText = (a, b) => (a < b ? -1 : a > b ? 1 : 0);
  components.sort((a, b) => compareText(a.name, b.name) || compareText(a.version, b.version));
  missing.sort(compareText);
  return { ecosystem: "go", components, missing };
}
68
/**
 * Extracts the module path declared by the `module` directive of a go.mod.
 *
 * go.mod permits the path to be written bare or as a quoted string
 * (double-quoted or backquoted); quotes are stripped so the result can be
 * compared against go.sum module names.
 *
 * @param {string} modPath - Path of the go.mod file to read.
 * @returns {string | null} The module path, or null when the file cannot be
 *   read or has no `module` directive.
 */
function readGoMod(modPath) {
  try {
    const text = readFileSync(modPath, "utf8");
    const m = text.match(/^\s*module\s+(?:"([^"]+)"|`([^`]+)`|(\S+))/m);
    if (!m) {
      return null;
    }
    // Exactly one alternative captured: double-quoted, backquoted, or bare.
    return m[1] ?? m[2] ?? m[3];
  } catch {
    return null;
  }
}
78
/**
 * Turns go.sum text into a list of unique (name, version) pairs, in order of
 * first appearance. Each line looks like one of:
 *   <module> <version> h1:<base64-hash>
 *   <module> <version>/go.mod h1:<base64-hash>
 * The `/go.mod` suffix is removed so both line kinds collapse into one pair.
 * Versions (pre-releases, pseudo-versions) are otherwise passed through
 * untouched. Lines with fewer than three whitespace-separated fields are
 * ignored, as are `selfModule` itself and anything beneath `selfModule/`.
 *
 * @param {string} text - Contents of go.sum.
 * @param {string | null} selfModule - The project's own module path, if known.
 * @returns {{ name: string, version: string }[]}
 */
function parseGoSum(text, selfModule) {
  const GO_MOD_SUFFIX = "/go.mod";
  const isOwnModule = (mod) =>
    Boolean(selfModule) && (mod === selfModule || mod.startsWith(`${selfModule}/`));
  // Map keeps insertion order, so the first sighting of each pair wins.
  const unique = new Map();
  text.split("\n").forEach((rawLine) => {
    const [name, rawVersion, hash] = rawLine.trim().split(/\s+/);
    if (hash === undefined) {
      return; // fewer than three fields
    }
    if (isOwnModule(name)) {
      return;
    }
    const version = rawVersion.endsWith(GO_MOD_SUFFIX)
      ? rawVersion.slice(0, -GO_MOD_SUFFIX.length)
      : rawVersion;
    const id = `${name}@${version}`;
    if (!unique.has(id)) {
      unique.set(id, { name, version });
    }
  });
  return [...unique.values()];
}
106
// Memo of lookup outcomes keyed by "name@version". A stored `undefined`
// records a lookup that failed or produced no license, so it is not retried.
const licenseCache = new Map();
/**
 * Resolves a license expression for one Go module version via deps.dev.
 *
 * @param {string} name - Module path.
 * @param {string} version - Version string exactly as it appears in go.sum.
 * @returns {Promise<string | undefined>} A single license id, or several
 *   joined as `(A OR B)`; undefined when the response is not OK, lists no
 *   non-empty string licenses, or the request/JSON decoding throws.
 */
async function fetchGoLicense(name, version) {
  const key = `${name}@${version}`;
  if (licenseCache.has(key)) {
    return licenseCache.get(key);
  }
  // Store the outcome and hand it back in one step.
  const remember = (value) => {
    licenseCache.set(key, value);
    return value;
  };
  try {
    const url = `${DEPSDEV}/${encodeURIComponent(name)}/versions/${encodeURIComponent(version)}`;
    const res = await fetch(url, {
      headers: { accept: "application/json", "user-agent": USER_AGENT },
    });
    if (!res.ok) {
      return remember(undefined);
    }
    const data = await res.json();
    const usable = (data.licenses ?? []).filter((s) => typeof s === "string" && s.length > 0);
    switch (usable.length) {
      case 0:
        return remember(undefined);
      case 1:
        return remember(usable[0]);
      default:
        // Convert deps.dev's array to a SPDX expression. Multi-license
        // entries get joined with OR since Go has no metadata to express
        // AND-style co-licensing.
        return remember(`(${usable.join(" OR ")})`);
    }
  } catch {
    return remember(undefined);
  }
}
@@ -0,0 +1,125 @@
1
+ // JavaScript / Node ecosystem detector.
2
+ //
3
+ // Walks node_modules from the root package.json's `dependencies` (runtime
4
+ // only) using Node's resolution algorithm. Pulls licenses out of each
5
+ // dep's installed package.json so no network calls or registry lookups
6
+ // are needed.
7
+ //
8
+ // Requires `pnpm install` / `npm install` to have run in the consumer
9
+ // project before the CLI runs — direct + transitive deps must be on disk.
10
+ import { existsSync, readFileSync, realpathSync } from "node:fs";
11
+ import { dirname, join } from "node:path";
12
/**
 * JavaScript detector entry point.
 *
 * Starting from the `dependencies` of `<repoRoot>/package.json`, walks the
 * installed dependency graph breadth-first. Each dependency is located with
 * `resolvePackageJson`, its installed package.json is read, and its own
 * `dependencies` are queued relative to its install directory.
 *
 * @param {{ repoRoot: string }} ctx
 * @returns {Promise<null | { ecosystem: "javascript", components: object[], missing: string[], details?: string }>}
 *   `null` when there is no root package.json. Otherwise `components` holds one
 *   entry per distinct installed `name@version` (sorted by name, then version)
 *   and `missing` holds the requested names that could not be resolved or
 *   parsed, or a parse-failure message for the root manifest.
 */
export async function detect(ctx) {
  const rootPkgPath = join(ctx.repoRoot, "package.json");
  if (!existsSync(rootPkgPath)) {
    return null;
  }
  let rootPkg;
  try {
    rootPkg = JSON.parse(readFileSync(rootPkgPath, "utf8"));
  } catch (e) {
    return {
      ecosystem: "javascript",
      components: [],
      missing: [`package.json parse failed: ${e.message}`],
    };
  }
  // `?.` tolerates a manifest whose JSON is valid but not an object (`null`).
  const directDeps = Object.keys(rootPkg?.dependencies ?? {});
  if (directDeps.length === 0) {
    return { ecosystem: "javascript", components: [], missing: [], details: "no runtime deps" };
  }
  const visited = new Map();
  // A Set dedupes in O(1) and keeps first-seen order for the final report.
  const missing = new Set();
  const queue = directDeps.map((name) => ({ name, requestedFrom: ctx.repoRoot }));
  // Index-based walk: entries appended during the loop are picked up too.
  for (let i = 0; i < queue.length; i++) {
    const { name, requestedFrom } = queue[i];
    const pkgJsonPath = resolvePackageJson(requestedFrom, name);
    if (!pkgJsonPath) {
      missing.add(name);
      continue;
    }
    let pkg;
    try {
      pkg = JSON.parse(readFileSync(pkgJsonPath, "utf8"));
    } catch {
      missing.add(name);
      continue;
    }
    if (pkg === null || typeof pkg !== "object") {
      missing.add(name);
      continue;
    }
    // With an npm alias ("my-alias": "npm:real-pkg@1") the directory is named
    // after the alias but the manifest carries the real package name; report
    // the real one so name and purl identify the actual package.
    const installedName = typeof pkg.name === "string" && pkg.name.length > 0 ? pkg.name : name;
    const version = pkg.version ?? "0.0.0";
    const key = `${installedName}@${version}`;
    if (visited.has(key)) {
      continue;
    }
    visited.set(key, {
      name: installedName,
      version,
      license: normaliseLicense(pkg.license ?? pkg.licenses),
      purl: `pkg:npm/${installedName.replace(/^@/, "%40")}@${version}`,
    });
    const ownDir = dirname(pkgJsonPath);
    for (const dep of Object.keys(pkg.dependencies ?? {})) {
      queue.push({ name: dep, requestedFrom: ownDir });
    }
  }
  const compareText = (a, b) => (a < b ? -1 : a > b ? 1 : 0);
  return {
    ecosystem: "javascript",
    // Version tiebreak keeps the order stable when one package is installed
    // in several versions.
    components: [...visited.values()].sort(
      (a, b) => compareText(a.name, b.name) || compareText(a.version, b.version),
    ),
    missing: [...missing],
  };
}
75
/**
 * Locates the package.json of `name` as seen from `fromDir`: probes
 * `<dir>/node_modules/<name>/package.json` in `fromDir` and then in each
 * ancestor directory, stopping at the filesystem root. This covers both
 * pnpm's per-package node_modules layout and npm's hoisted flat tree.
 *
 * @param {string} fromDir - Directory the lookup starts from.
 * @param {string} name - Package name (may include a scope).
 * @returns {string | null} The first hit with symlinks resolved (or the
 *   unresolved candidate path if resolution throws); null when nothing is
 *   found.
 */
function resolvePackageJson(fromDir, name) {
  let current = fromDir;
  let previous;
  do {
    const manifest = join(current, "node_modules", name, "package.json");
    if (existsSync(manifest)) {
      try {
        return realpathSync(manifest);
      } catch {
        return manifest;
      }
    }
    previous = current;
    current = dirname(current);
    // dirname() of the root is the root itself — that ends the climb.
  } while (current !== previous);
  return null;
}
99
// Longer strings are treated as "not a license identifier" and rejected.
const MAX_LICENSE_LEN = 256;
/**
 * Reduces a package.json `license` / `licenses` value to one license string.
 *
 * Accepted shapes:
 * - a string: trimmed; rejected when empty, longer than MAX_LICENSE_LEN,
 *   `UNLICENSED`, or starting with `SEE LICENSE IN` (case-insensitive)
 * - an object with a string `type` property
 * - an array whose elements are strings or such objects; usable entries are
 *   joined as `(A OR B)`, and a single usable entry is returned bare
 *
 * @param {unknown} raw
 * @returns {string | null} The license string, or null when nothing usable
 *   was found.
 */
function normaliseLicense(raw) {
  if (typeof raw === "string") {
    const s = raw.trim();
    if (!s || s.length > MAX_LICENSE_LEN) {
      return null;
    }
    const upper = s.toUpperCase();
    if (upper === "UNLICENSED" || upper.startsWith("SEE LICENSE IN")) {
      return null;
    }
    return s;
  }
  if (Array.isArray(raw)) {
    const parts = raw
      .map((entry) => {
        // Plain string entries (`["MIT", "Apache-2.0"]`) occur in the wild.
        if (typeof entry === "string") {
          return normaliseLicense(entry);
        }
        // Guard against null / primitive entries, which have no `.type`.
        if (entry && typeof entry === "object") {
          return normaliseLicense(entry.type);
        }
        return null;
      })
      .filter((v) => Boolean(v));
    if (parts.length === 0) {
      return null;
    }
    return parts.length === 1 ? parts[0] : `(${parts.join(" OR ")})`;
  }
  if (raw && typeof raw === "object") {
    const t = raw.type;
    if (typeof t === "string") {
      return normaliseLicense(t);
    }
  }
  return null;
}