sbom-generator 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,209 @@
1
+ Metadata-Version: 2.4
2
+ Name: sbom-generator
3
+ Version: 1.1.0
4
+ Summary: Extract SPDX and CycloneDX SBOMs from open-source projects, including the Linux kernel
5
+ Author-email: Fatih Tekin <fatih.tekin.de@googlemail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/DrFatihTekin/sbom-generator
8
+ Project-URL: Repository, https://github.com/DrFatihTekin/sbom-generator
9
+ Project-URL: Issues, https://github.com/DrFatihTekin/sbom-generator/issues
10
+ Keywords: sbom,spdx,cyclonedx,security,linux-kernel,compliance
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Topic :: Security
19
+ Classifier: Topic :: Software Development :: Build Tools
20
+ Classifier: Intended Audience :: Developers
21
+ Requires-Python: >=3.9
22
+ Description-Content-Type: text/markdown
23
+ Requires-Dist: rich>=13.0
24
+
25
+ # OpenSBOM Extractor
26
+
27
+ [![CI](https://github.com/DrFatihTekin/sbom-generator/actions/workflows/ci.yml/badge.svg)](https://github.com/DrFatihTekin/sbom-generator/actions/workflows/ci.yml)
28
+
29
+ A production-ready Python CLI for extracting Software Bills of Materials (SBOMs) from open-source codebases. Built for scale — from small libraries to the Linux kernel (70k+ files).
30
+
31
+ ---
32
+
33
+ ## Key Features
34
+
35
+ - **Multi-ecosystem dependency extraction** — parses manifests and lock files for Python, Node.js, Rust, Go, and Java (Maven + Gradle). Lock files are preferred over manifests for exact pinned versions.
36
+ - **Parallel file scanning** — thread pool for hashing and license extraction with a live progress bar.
37
+ - **Streaming JSON output** — SPDX and CycloneDX documents are written one entry at a time; the full document is never held in memory, making 70k+ file projects practical.
38
+ - **Correct PURL generation** — all package references follow the [Package URL spec](https://github.com/package-url/purl-spec) (`pkg:pypi/…`, `pkg:maven/…`, etc.).
39
+ - **CPE identifiers** — best-effort CPE 2.3 strings generated for every dependency, enabling vulnerability matching against the NVD.
40
+ - **SPDX expression support** — correctly preserves compound identifiers like `GPL-2.0-only OR MIT` and `GPL-2.0-only WITH Linux-syscall-note`.
41
+ - **Git VCS metadata** — embeds commit, branch, tag, and remote URL into every SBOM format.
42
+ - **Reproducible output** — `--reproducible` produces bit-identical SBOMs across runs (fixed timestamp, deterministic UUID).
43
+ - **NTIA minimum elements check** — validates the 7 NTIA-required fields at runtime and reports any gaps.
44
+ - **SBOM structural validation** — validates generated SPDX 2.3 and CycloneDX 1.5 documents before writing.
45
+ - **Standards-compliant output** — SPDX 2.3, SPDX 3.0.1, and CycloneDX 1.5 JSON; plus an interactive HTML dashboard.
46
+ - **Precision C/C++ build tracing** — via Clang `compile_commands.json` or Linux kernel Kbuild `.cmd` files.
47
+
48
+ ---
49
+
50
+ ## Supported Languages
51
+
52
+ ### License detection
53
+
54
+ SPDX license tags are extracted from any source file, including: C, C++, Python, JavaScript, TypeScript, Go, Rust, Java, Kotlin, Swift, C#, Shell, Perl, Ruby, PHP, Lua, Assembly, and common config formats (YAML, TOML, JSON, Makefile, Kconfig).
55
+
56
+ ### Dependency extraction
57
+
58
+ | Ecosystem | Files parsed (lock file preferred) |
59
+ |---|---|
60
+ | Python | `poetry.lock` / `requirements.txt` / `requirements.in`, `pyproject.toml` |
61
+ | Node.js | `package-lock.json` / `package.json` |
62
+ | Rust | `Cargo.lock` / `Cargo.toml` |
63
+ | Go | `go.sum` / `go.mod` |
64
+ | Java (Maven) | `pom.xml` — including sub-modules and `<properties>` resolution |
65
+ | Java (Gradle) | `gradle.lockfile` / `build.gradle` / `build.gradle.kts` |
66
+
67
+ ---
68
+
69
+ ## Installation
70
+
71
+ ```bash
72
+ pip install -e .
73
+ ```
74
+
75
+ Requires Python 3.9+. The only runtime dependency is [`rich`](https://github.com/Textualize/rich) for progress display.
76
+
77
+ ---
78
+
79
+ ## Usage
80
+
81
+ ### General directory scan
82
+
83
+ ```bash
84
+ sbom-extractor /path/to/project -o my-project-sbom
85
+ ```
86
+
87
+ ### With supplier name (required for NTIA compliance)
88
+
89
+ ```bash
90
+ sbom-extractor /path/to/project --supplier "Acme Corp" -o my-project-sbom
91
+ ```
92
+
93
+ ### Reproducible output (for SBOM diffing in CI)
94
+
95
+ ```bash
96
+ sbom-extractor /path/to/project --reproducible -o my-project-sbom
97
+ ```
98
+
99
+ ### C/C++ project with a Clang compilation database
100
+
101
+ ```bash
102
+ sbom-extractor /path/to/project \
103
+ --compile-commands /path/to/project/compile_commands.json \
104
+ -o project-sbom
105
+ ```
106
+
107
+ Generate `compile_commands.json` with CMake (`-DCMAKE_EXPORT_COMPILE_COMMANDS=ON`) or [Bear](https://github.com/rizsotto/Bear).
108
+
109
+ ### Linux kernel (compile_commands.json)
110
+
111
+ ```bash
112
+ cd /path/to/linux
113
+ make defconfig && make -j$(nproc)
114
+ python3 scripts/clang-tools/gen_compile_commands.py
115
+
116
+ sbom-extractor /path/to/linux \
117
+ --compile-commands /path/to/linux/compile_commands.json \
118
+ --no-hashes \
119
+ -o linux-sbom
120
+ ```
121
+
122
+ ### Linux kernel (Kbuild .cmd files)
123
+
124
+ ```bash
125
+ sbom-extractor /path/to/linux \
126
+ --kernel-build /path/to/linux/build-output \
127
+ --no-hashes \
128
+ -o linux-sbom
129
+ ```
130
+
131
+ `--no-hashes` is recommended for kernel-scale projects to skip SHA-256/SHA-1 computation.
132
+
133
+ ---
134
+
135
+ ## CLI Options
136
+
137
+ ```
138
+ positional arguments:
139
+ path Path to the project directory to scan
140
+
141
+ options:
142
+ -h, --help Show this help message and exit
143
+ -o, --output OUTPUT Base filename for output files (default: sbom)
144
+ --format {spdx,spdx3,cyclonedx,html,all}
145
+ Output format (default: all)
146
+ --project-name NAME Project name (default: directory name)
147
+ --project-version VERSION Project version (default: 1.0.0)
148
+ --supplier NAME Supplier / organization name — required for NTIA compliance
149
+ --no-hashes Skip SHA-256/SHA-1 hashing (faster for large projects)
150
+ --reproducible Deterministic output: fixed timestamp, UUID derived from
151
+ project name/version — useful for SBOM diffing in CI
152
+ --compile-commands PATH Path to compile_commands.json
153
+ --kernel-build PATH Path to kernel build directory (Kbuild .cmd files)
154
+ --exclude DIR Exclude a directory name from scanning (repeatable)
155
+ --workers N Number of parallel worker threads (default: 2 × CPU count)
156
+ -q, --quiet Suppress all progress output
157
+ -v, --verbose Show extra detail (full license list, validation results)
158
+ ```
159
+
160
+ ---
161
+
162
+ ## Output Files
163
+
164
+ | File | Format | Notes |
165
+ |---|---|---|
166
+ | `sbom.spdx.json` | SPDX 2.3 | Stream-written; validated before save |
167
+ | `sbom.spdx3.json` | SPDX 3.0.1 | JSON-LD graph format |
168
+ | `sbom.cdx.json` | CycloneDX 1.5 | Stream-written; validated before save; includes CPE |
169
+ | `sbom.html` | Interactive HTML | Dark-mode dashboard; file list capped at 5,000 for browser performance |
170
+
171
+ Use `--format spdx`, `--format cyclonedx`, etc. to generate only what you need.
172
+
173
+ ---
174
+
175
+ ## NTIA Compliance
176
+
177
+ The tool checks the [NTIA minimum elements](https://www.ntia.gov/report/2021/minimum-elements-software-bill-materials) at runtime:
178
+
179
+ | Element | How it's satisfied |
180
+ |---|---|
181
+ | Supplier name | `--supplier` flag |
182
+ | Component name | `--project-name` (or directory name) |
183
+ | Component version | `--project-version` |
184
+ | Unique identifiers | PURL + CPE generated for every dependency |
185
+ | Dependency relationships | `CONTAINS` / `DEPENDS_ON` relationships in all formats |
186
+ | Author of SBOM data | Tool name + version in `creationInfo` |
187
+ | Timestamp | UTC timestamp at generation time (or epoch with `--reproducible`) |
188
+
189
+ Any missing elements are reported as warnings at the end of every run.
190
+
191
+ ---
192
+
193
+ ## Architecture
194
+
195
+ | Module | Responsibility |
196
+ |---|---|
197
+ | `cli.py` | Entry point — argument parsing, progress display, orchestration |
198
+ | `scanner.py` | Parallel directory walk, license extraction, file hashing |
199
+ | `manifest_parser.py` | Manifest and lock file parsing for all supported ecosystems |
200
+ | `compilation_db.py` | Clang `compile_commands.json` and Kbuild `.cmd` parsing |
201
+ | `purl.py` | Canonical PURL generation |
202
+ | `cpe.py` | Best-effort CPE 2.3 generation |
203
+ | `vcs.py` | Git metadata extraction |
204
+ | `ntia.py` | NTIA minimum elements compliance check |
205
+ | `validator.py` | Structural validation for SPDX 2.3 and CycloneDX 1.5 |
206
+ | `spdx_generator.py` | SPDX 2.3 JSON output (in-memory + streaming) |
207
+ | `spdx3_generator.py` | SPDX 3.0.1 JSON-LD output |
208
+ | `cyclonedx_generator.py` | CycloneDX 1.5 JSON output (in-memory + streaming) |
209
+ | `html_generator.py` | Self-contained interactive HTML report |
@@ -0,0 +1,185 @@
1
+ # OpenSBOM Extractor
2
+
3
+ [![CI](https://github.com/DrFatihTekin/sbom-generator/actions/workflows/ci.yml/badge.svg)](https://github.com/DrFatihTekin/sbom-generator/actions/workflows/ci.yml)
4
+
5
+ A production-ready Python CLI for extracting Software Bills of Materials (SBOMs) from open-source codebases. Built for scale — from small libraries to the Linux kernel (70k+ files).
6
+
7
+ ---
8
+
9
+ ## Key Features
10
+
11
+ - **Multi-ecosystem dependency extraction** — parses manifests and lock files for Python, Node.js, Rust, Go, and Java (Maven + Gradle). Lock files are preferred over manifests for exact pinned versions.
12
+ - **Parallel file scanning** — thread pool for hashing and license extraction with a live progress bar.
13
+ - **Streaming JSON output** — SPDX and CycloneDX documents are written one entry at a time; the full document is never held in memory, making 70k+ file projects practical.
14
+ - **Correct PURL generation** — all package references follow the [Package URL spec](https://github.com/package-url/purl-spec) (`pkg:pypi/…`, `pkg:maven/…`, etc.).
15
+ - **CPE identifiers** — best-effort CPE 2.3 strings generated for every dependency, enabling vulnerability matching against the NVD.
16
+ - **SPDX expression support** — correctly preserves compound identifiers like `GPL-2.0-only OR MIT` and `GPL-2.0-only WITH Linux-syscall-note`.
17
+ - **Git VCS metadata** — embeds commit, branch, tag, and remote URL into every SBOM format.
18
+ - **Reproducible output** — `--reproducible` produces bit-identical SBOMs across runs (fixed timestamp, deterministic UUID).
19
+ - **NTIA minimum elements check** — validates the 7 NTIA-required fields at runtime and reports any gaps.
20
+ - **SBOM structural validation** — validates generated SPDX 2.3 and CycloneDX 1.5 documents before writing.
21
+ - **Standards-compliant output** — SPDX 2.3, SPDX 3.0.1, and CycloneDX 1.5 JSON; plus an interactive HTML dashboard.
22
+ - **Precision C/C++ build tracing** — via Clang `compile_commands.json` or Linux kernel Kbuild `.cmd` files.
23
+
24
+ ---
25
+
26
+ ## Supported Languages
27
+
28
+ ### License detection
29
+
30
+ SPDX license tags are extracted from any source file, including: C, C++, Python, JavaScript, TypeScript, Go, Rust, Java, Kotlin, Swift, C#, Shell, Perl, Ruby, PHP, Lua, Assembly, and common config formats (YAML, TOML, JSON, Makefile, Kconfig).
31
+
32
+ ### Dependency extraction
33
+
34
+ | Ecosystem | Files parsed (lock file preferred) |
35
+ |---|---|
36
+ | Python | `poetry.lock` / `requirements.txt` / `requirements.in`, `pyproject.toml` |
37
+ | Node.js | `package-lock.json` / `package.json` |
38
+ | Rust | `Cargo.lock` / `Cargo.toml` |
39
+ | Go | `go.sum` / `go.mod` |
40
+ | Java (Maven) | `pom.xml` — including sub-modules and `<properties>` resolution |
41
+ | Java (Gradle) | `gradle.lockfile` / `build.gradle` / `build.gradle.kts` |
42
+
43
+ ---
44
+
45
+ ## Installation
46
+
47
+ ```bash
48
+ pip install -e .
49
+ ```
50
+
51
+ Requires Python 3.9+. The only runtime dependency is [`rich`](https://github.com/Textualize/rich) for progress display.
52
+
53
+ ---
54
+
55
+ ## Usage
56
+
57
+ ### General directory scan
58
+
59
+ ```bash
60
+ sbom-extractor /path/to/project -o my-project-sbom
61
+ ```
62
+
63
+ ### With supplier name (required for NTIA compliance)
64
+
65
+ ```bash
66
+ sbom-extractor /path/to/project --supplier "Acme Corp" -o my-project-sbom
67
+ ```
68
+
69
+ ### Reproducible output (for SBOM diffing in CI)
70
+
71
+ ```bash
72
+ sbom-extractor /path/to/project --reproducible -o my-project-sbom
73
+ ```
74
+
75
+ ### C/C++ project with a Clang compilation database
76
+
77
+ ```bash
78
+ sbom-extractor /path/to/project \
79
+ --compile-commands /path/to/project/compile_commands.json \
80
+ -o project-sbom
81
+ ```
82
+
83
+ Generate `compile_commands.json` with CMake (`-DCMAKE_EXPORT_COMPILE_COMMANDS=ON`) or [Bear](https://github.com/rizsotto/Bear).
84
+
85
+ ### Linux kernel (compile_commands.json)
86
+
87
+ ```bash
88
+ cd /path/to/linux
89
+ make defconfig && make -j$(nproc)
90
+ python3 scripts/clang-tools/gen_compile_commands.py
91
+
92
+ sbom-extractor /path/to/linux \
93
+ --compile-commands /path/to/linux/compile_commands.json \
94
+ --no-hashes \
95
+ -o linux-sbom
96
+ ```
97
+
98
+ ### Linux kernel (Kbuild .cmd files)
99
+
100
+ ```bash
101
+ sbom-extractor /path/to/linux \
102
+ --kernel-build /path/to/linux/build-output \
103
+ --no-hashes \
104
+ -o linux-sbom
105
+ ```
106
+
107
+ `--no-hashes` is recommended for kernel-scale projects to skip SHA-256/SHA-1 computation.
108
+
109
+ ---
110
+
111
+ ## CLI Options
112
+
113
+ ```
114
+ positional arguments:
115
+ path Path to the project directory to scan
116
+
117
+ options:
118
+ -h, --help Show this help message and exit
119
+ -o, --output OUTPUT Base filename for output files (default: sbom)
120
+ --format {spdx,spdx3,cyclonedx,html,all}
121
+ Output format (default: all)
122
+ --project-name NAME Project name (default: directory name)
123
+ --project-version VERSION Project version (default: 1.0.0)
124
+ --supplier NAME Supplier / organization name — required for NTIA compliance
125
+ --no-hashes Skip SHA-256/SHA-1 hashing (faster for large projects)
126
+ --reproducible Deterministic output: fixed timestamp, UUID derived from
127
+ project name/version — useful for SBOM diffing in CI
128
+ --compile-commands PATH Path to compile_commands.json
129
+ --kernel-build PATH Path to kernel build directory (Kbuild .cmd files)
130
+ --exclude DIR Exclude a directory name from scanning (repeatable)
131
+ --workers N Number of parallel worker threads (default: 2 × CPU count)
132
+ -q, --quiet Suppress all progress output
133
+ -v, --verbose Show extra detail (full license list, validation results)
134
+ ```
135
+
136
+ ---
137
+
138
+ ## Output Files
139
+
140
+ | File | Format | Notes |
141
+ |---|---|---|
142
+ | `sbom.spdx.json` | SPDX 2.3 | Stream-written; validated before save |
143
+ | `sbom.spdx3.json` | SPDX 3.0.1 | JSON-LD graph format |
144
+ | `sbom.cdx.json` | CycloneDX 1.5 | Stream-written; validated before save; includes CPE |
145
+ | `sbom.html` | Interactive HTML | Dark-mode dashboard; file list capped at 5,000 for browser performance |
146
+
147
+ Use `--format spdx`, `--format cyclonedx`, etc. to generate only what you need.
148
+
149
+ ---
150
+
151
+ ## NTIA Compliance
152
+
153
+ The tool checks the [NTIA minimum elements](https://www.ntia.gov/report/2021/minimum-elements-software-bill-materials) at runtime:
154
+
155
+ | Element | How it's satisfied |
156
+ |---|---|
157
+ | Supplier name | `--supplier` flag |
158
+ | Component name | `--project-name` (or directory name) |
159
+ | Component version | `--project-version` |
160
+ | Unique identifiers | PURL + CPE generated for every dependency |
161
+ | Dependency relationships | `CONTAINS` / `DEPENDS_ON` relationships in all formats |
162
+ | Author of SBOM data | Tool name + version in `creationInfo` |
163
+ | Timestamp | UTC timestamp at generation time (or epoch with `--reproducible`) |
164
+
165
+ Any missing elements are reported as warnings at the end of every run.
166
+
167
+ ---
168
+
169
+ ## Architecture
170
+
171
+ | Module | Responsibility |
172
+ |---|---|
173
+ | `cli.py` | Entry point — argument parsing, progress display, orchestration |
174
+ | `scanner.py` | Parallel directory walk, license extraction, file hashing |
175
+ | `manifest_parser.py` | Manifest and lock file parsing for all supported ecosystems |
176
+ | `compilation_db.py` | Clang `compile_commands.json` and Kbuild `.cmd` parsing |
177
+ | `purl.py` | Canonical PURL generation |
178
+ | `cpe.py` | Best-effort CPE 2.3 generation |
179
+ | `vcs.py` | Git metadata extraction |
180
+ | `ntia.py` | NTIA minimum elements compliance check |
181
+ | `validator.py` | Structural validation for SPDX 2.3 and CycloneDX 1.5 |
182
+ | `spdx_generator.py` | SPDX 2.3 JSON output (in-memory + streaming) |
183
+ | `spdx3_generator.py` | SPDX 3.0.1 JSON-LD output |
184
+ | `cyclonedx_generator.py` | CycloneDX 1.5 JSON output (in-memory + streaming) |
185
+ | `html_generator.py` | Self-contained interactive HTML report |
@@ -0,0 +1,38 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77.0.3", "wheel"]  # the SPDX string form `license = "MIT"` (PEP 639) is rejected by setuptools < 77
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]  # PEP 621 core metadata for the distribution
6
+ name = "sbom-generator"  # distribution name; the packaged import name is sbom_extractor
7
+ version = "1.1.0"  # a __version__ string with the same value exists in the Python sources; presumably both must be bumped together
8
+ description = "Extract SPDX and CycloneDX SBOMs from open-source projects, including the Linux kernel"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "MIT"  # SPDX license expression (string form, not a table)
12
+ authors = [{name = "Fatih Tekin", email = "fatih.tekin.de@googlemail.com"}]
13
+ keywords = ["sbom", "spdx", "cyclonedx", "security", "linux-kernel", "compliance"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Programming Language :: Python :: 3",
17
+ "Programming Language :: Python :: 3.9",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Operating System :: OS Independent",
22
+ "Topic :: Security",
23
+ "Topic :: Software Development :: Build Tools",
24
+ "Intended Audience :: Developers",
25
+ ]
26
+ dependencies = ["rich>=13.0"]  # the only runtime dependency
27
+
28
+ [project.urls]  # each entry becomes a Project-URL field in the built package metadata
29
+ Homepage = "https://github.com/DrFatihTekin/sbom-generator"
30
+ Repository = "https://github.com/DrFatihTekin/sbom-generator"
31
+ Issues = "https://github.com/DrFatihTekin/sbom-generator/issues"
32
+
33
+ [project.scripts]  # console commands created on install
34
+ sbom-extractor = "sbom_extractor.cli:main"  # `sbom-extractor` command -> `main` in the sbom_extractor.cli module
35
+
36
+ [tool.setuptools.packages.find]  # automatic package discovery settings for setuptools
37
+ where = ["."]  # search for packages starting at the project root
38
+ include = ["sbom_extractor*"]  # only package names matching this glob are included in the distribution
@@ -0,0 +1,7 @@
1
+ """
2
+ OpenSBOM Extractor
3
+ A Python-based utility to extract SPDX and CycloneDX Software Bills of Materials (SBOMs)
4
+ for open-source projects, including complex C/C++ projects like the Linux kernel.
5
+ """
6
+
7
+ __version__ = "1.1.0"