gnn-vuln 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gnn_vuln-0.1.1/.gitignore +104 -0
- gnn_vuln-0.1.1/PKG-INFO +204 -0
- gnn_vuln-0.1.1/pyproject.toml +98 -0
- gnn_vuln-0.1.1/src/gnn_vuln/README.md +169 -0
- gnn_vuln-0.1.1/src/gnn_vuln/__init__.py +8 -0
- gnn_vuln-0.1.1/src/gnn_vuln/baselines.py +343 -0
- gnn_vuln-0.1.1/src/gnn_vuln/config.py +422 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/__init__.py +1 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/build_pt.py +147 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/__init__.py +39 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/builder.py +134 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/constants.py +103 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/features.py +221 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/parser.py +148 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/cwe_taxonomy.py +299 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/dataset_lm.py +1671 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/graph_builder_lm.py +16 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/graph_partition.py +170 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/graph_partition_ref.py +129 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/joern_runner.py +329 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/merge.py +180 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/node_embedder.py +188 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/prepare.py +780 -0
- gnn_vuln-0.1.1/src/gnn_vuln/data/preprocess.py +206 -0
- gnn_vuln-0.1.1/src/gnn_vuln/evaluate.py +341 -0
- gnn_vuln-0.1.1/src/gnn_vuln/evaluation/__init__.py +6 -0
- gnn_vuln-0.1.1/src/gnn_vuln/evaluation/localize.py +162 -0
- gnn_vuln-0.1.1/src/gnn_vuln/evaluation/plots.py +104 -0
- gnn_vuln-0.1.1/src/gnn_vuln/inference.py +479 -0
- gnn_vuln-0.1.1/src/gnn_vuln/losses/__init__.py +3 -0
- gnn_vuln-0.1.1/src/gnn_vuln/losses/hierarchical_supcon.py +353 -0
- gnn_vuln-0.1.1/src/gnn_vuln/metrics.py +242 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/__init__.py +31 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/_lm_utils.py +663 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/base.py +393 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/cross_task.py +401 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/encoders.py +1125 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/graph_vit.py +263 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/heads.py +593 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_codebert.py +565 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_codebert_mtl.py +149 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_dualflow.py +101 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_hcdfgat.py +120 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_interp.py +87 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_mcs.py +72 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_seq.py +115 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_seqgnn.py +169 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_waves_seq.py +124 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/registry.py +84 -0
- gnn_vuln-0.1.1/src/gnn_vuln/models/supcon_head.py +28 -0
- gnn_vuln-0.1.1/src/gnn_vuln/pretrain_jepa.py +261 -0
- gnn_vuln-0.1.1/src/gnn_vuln/train.py +979 -0
- gnn_vuln-0.1.1/src/gnn_vuln/training/__init__.py +1 -0
- gnn_vuln-0.1.1/src/gnn_vuln/training/ewc.py +193 -0
- gnn_vuln-0.1.1/src/gnn_vuln/training/losses.py +195 -0
- gnn_vuln-0.1.1/src/gnn_vuln/training/mtl_balance.py +194 -0
- gnn_vuln-0.1.1/src/gnn_vuln/training/optimizer.py +153 -0
- gnn_vuln-0.1.1/src/gnn_vuln/training/pgd.py +226 -0
- gnn_vuln-0.1.1/src/gnn_vuln/training/sampler.py +93 -0
- gnn_vuln-0.1.1/src/gnn_vuln/training/trainer.py +740 -0
- gnn_vuln-0.1.1/src/gnn_vuln/training/unfreezer.py +78 -0
- gnn_vuln-0.1.1/src/gnn_vuln/utils.py +298 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
*.pyd
|
|
6
|
+
*.so
|
|
7
|
+
*.egg
|
|
8
|
+
*.egg-info/
|
|
9
|
+
dist/
|
|
10
|
+
build/
|
|
11
|
+
.eggs/
|
|
12
|
+
|
|
13
|
+
# uv / virtualenv
|
|
14
|
+
.venv/
|
|
15
|
+
.uv/
|
|
16
|
+
|
|
17
|
+
# embedded Joern clone (nested .git, large) + local materialize cache
|
|
18
|
+
src/gnn_vuln/joern/
|
|
19
|
+
data/.materialized/
|
|
20
|
+
|
|
21
|
+
# Jupyter
|
|
22
|
+
.ipynb_checkpoints/
|
|
23
|
+
*.ipynb_metadata/
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Logs
|
|
27
|
+
logs/
|
|
28
|
+
*.log
|
|
29
|
+
|
|
30
|
+
# OS
|
|
31
|
+
.DS_Store
|
|
32
|
+
Thumbs.db
|
|
33
|
+
|
|
34
|
+
# IDE
|
|
35
|
+
.vscode/settings.json
|
|
36
|
+
.idea/
|
|
37
|
+
|
|
38
|
+
# Claude Code local config (commands, hooks — project-local, not for repo)
|
|
39
|
+
.claude/
|
|
40
|
+
.serena/
|
|
41
|
+
.agents/
|
|
42
|
+
|
|
43
|
+
# Env files
|
|
44
|
+
.env
|
|
45
|
+
*.env
|
|
46
|
+
|
|
47
|
+
# Joern CLI install + workspace/output cache
|
|
48
|
+
joern-cli/
|
|
49
|
+
joern_workspace/
|
|
50
|
+
|
|
51
|
+
# Large generated / binary / third-party — do NOT push
|
|
52
|
+
/data/raw/**
|
|
53
|
+
!/data/raw/*/
|
|
54
|
+
!/data/raw/*/cwe_vocab.json
|
|
55
|
+
/data/datasets/
|
|
56
|
+
/data/processed/
|
|
57
|
+
/data/graphs/
|
|
58
|
+
/checkpoints/
|
|
59
|
+
/results/
|
|
60
|
+
/paper/
|
|
61
|
+
# Baselines tracked in-repo (LineVul, LineVD, LOSVER) so the pod gets them via git pull.
|
|
62
|
+
# Ignore their generated junk, plus other baseline/third-party checkouts that are not tracked:
|
|
63
|
+
src/*/__pycache__/
|
|
64
|
+
src/**/*.pyc
|
|
65
|
+
src/LineVul/saved_models/
|
|
66
|
+
src/linevd/storage/cache/
|
|
67
|
+
src/linevd/storage/processed/
|
|
68
|
+
src/VulChecker/
|
|
69
|
+
src/vul-LMGNN/
|
|
70
|
+
src/gnn_vuln/joern/
|
|
71
|
+
src/SCLCVD/
|
|
72
|
+
src/EDAT/
|
|
73
|
+
|
|
74
|
+
# Downloaded tools / secrets
|
|
75
|
+
gdrive.exe
|
|
76
|
+
gdrive3.zip
|
|
77
|
+
gdrive*.zip
|
|
78
|
+
rclone.zip
|
|
79
|
+
rclone*/
|
|
80
|
+
secret.txt
|
|
81
|
+
*.secret
|
|
82
|
+
|
|
83
|
+
.kiro/
|
|
84
|
+
src/GNNPlus/
|
|
85
|
+
src/Graph-Mixture-of-Experts/
|
|
86
|
+
src/mixture-of-experts/
|
|
87
|
+
src/CodeGraphSMOTE/
|
|
88
|
+
src/GMixup/
|
|
89
|
+
src/GraphSMOTE/
|
|
90
|
+
src/FLAG/
|
|
91
|
+
src/graph-jepa/
|
|
92
|
+
src/graph-vit-mlpmixer/
|
|
93
|
+
src/llm-jepa/
|
|
94
|
+
src/ijepa/
|
|
95
|
+
src/GraphMAE/
|
|
96
|
+
src/GraphMAE2/
|
|
97
|
+
src/bgrl/
|
|
98
|
+
src/StructMAE/
|
|
99
|
+
src/hypermixing/
|
|
100
|
+
src/pnlp-mixer/
|
|
101
|
+
src/GraphGPS/
|
|
102
|
+
src/LineVul/cppcheck/
|
|
103
|
+
/data/baselines/
|
|
104
|
+
docs/
|
gnn_vuln-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gnn-vuln
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: GNN-based vulnerability detection for code — Final Project (Tugas Akhir)
|
|
5
|
+
Author: Otzzu
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Requires-Dist: datasets>=4.8.4
|
|
9
|
+
Requires-Dist: h5py>=3.16.0
|
|
10
|
+
Requires-Dist: ijson>=3.5.0
|
|
11
|
+
Requires-Dist: loguru>=0.7.2
|
|
12
|
+
Requires-Dist: matplotlib>=3.8.0
|
|
13
|
+
Requires-Dist: networkx>=3.3
|
|
14
|
+
Requires-Dist: numpy>=1.26.0
|
|
15
|
+
Requires-Dist: pandas>=2.2.0
|
|
16
|
+
Requires-Dist: pdfplumber>=0.11.9
|
|
17
|
+
Requires-Dist: python-docx>=1.2.0
|
|
18
|
+
Requires-Dist: pyyaml>=6.0.1
|
|
19
|
+
Requires-Dist: scikit-learn>=1.4.0
|
|
20
|
+
Requires-Dist: seaborn>=0.13.0
|
|
21
|
+
Requires-Dist: torch-geometric>=2.5.0
|
|
22
|
+
Requires-Dist: torch>=2.2.0
|
|
23
|
+
Requires-Dist: tqdm>=4.66.0
|
|
24
|
+
Requires-Dist: transformers<5.0,>=4.48
|
|
25
|
+
Requires-Dist: tree-sitter-c>=0.23.0
|
|
26
|
+
Requires-Dist: tree-sitter>=0.23.0
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: ipykernel>=6.29.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: jupyter>=1.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: jupyterlab>=4.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
33
|
+
Requires-Dist: ruff>=0.4.0; extra == 'dev'
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
|
|
36
|
+
# `gnn_vuln` — Library API Reference
|
|
37
|
+
|
|
38
|
+
The installable model library behind the vulnerability-detection service. This is the
|
|
39
|
+
complete public surface: what to import, the inputs, and the outputs.
|
|
40
|
+
|
|
41
|
+
**Not everything is file-based.** You pass a function **source string** and get a **result
|
|
42
|
+
dict** back. The only files involved are the model checkpoint + config (normal — weights and
|
|
43
|
+
config live on disk) and the Joern CPG, which is created in a private temp dir and hidden
|
|
44
|
+
from you. In-memory in, in-memory out.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
# 1. torch + PyG sparse ext from their own indexes (PyPI can't resolve these alone)
|
|
52
|
+
pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu # or cu124
|
|
53
|
+
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.6.0+cpu.html # use torch-2.6.0+cu124.html with a cu124 torch
|
|
54
|
+
# 2. the library
|
|
55
|
+
pip install gnn-vuln
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
You also need **Joern** (for CPG generation) and a **JDK 21** on the host. Point the predictor at the
|
|
59
|
+
`joern-cli` directory.
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Inference — `gnn_vuln.inference`
|
|
64
|
+
|
|
65
|
+
### `VulnPredictor` (high-level, recommended)
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
from gnn_vuln.inference import VulnPredictor
|
|
69
|
+
|
|
70
|
+
predictor = VulnPredictor.from_checkpoint(
|
|
71
|
+
checkpoint="checkpoints/<run>/best_model.pt", # trained weights (.pt file)
|
|
72
|
+
config="configs/<arch>/config.yaml", # its config (file, or pass a list)
|
|
73
|
+
device="cuda", # "cpu" | "cuda"
|
|
74
|
+
)
|
|
75
|
+
predictor.class_names = ["benign", "CWE-787", ...] # optional: override label names
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
| Method | Input | Output |
|
|
79
|
+
| --- | --- | --- |
|
|
80
|
+
| `predict_code(code, joern_cli, max_nodes=2500, top_k_lines=None)` | function **source string** | result `dict`, or `None` if Joern produced no CPG |
|
|
81
|
+
| `predict_codes(codes, joern_cli, max_nodes=2500, top_k_lines=None)` | `list[str]` | list of result dicts (`None` per entry on Joern failure) |
|
|
82
|
+
| `predict(data, top_k_lines=None)` | a PyG `Data` object (already built) | result `dict` |
|
|
83
|
+
| `predict_from_file(cpg_path, max_nodes=1000, top_k_lines=None)` | path to a Joern CPG file | result `dict`, or `None` |
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
# the everyday call — string in, dict out (Joern handled internally)
|
|
87
|
+
result = predictor.predict_code(
|
|
88
|
+
"void f(char *s){ char b[8]; strcpy(b, s); }",
|
|
89
|
+
joern_cli="C:/joern/joern-cli",
|
|
90
|
+
top_k_lines=5,
|
|
91
|
+
)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Result dict (schema)
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
{
|
|
98
|
+
"prediction": "CWE-120", # predicted class name
|
|
99
|
+
"class_id": 7, # predicted class index
|
|
100
|
+
"is_vulnerable": True, # class_id > 0
|
|
101
|
+
"confidence": 0.87, # softmax prob of the predicted class [0,1]
|
|
102
|
+
"class_probabilities": {"benign": 0.01, "CWE-120": 0.87, ...},
|
|
103
|
+
"suspicious_lines": [{"line": 3, "score": 0.92, "code": "strcpy(b, s);"}, ...], # score-desc
|
|
104
|
+
"cls_embedding": [0.013, -0.44, ...], # pre-head function vector (for search/drift)
|
|
105
|
+
}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
`suspicious_lines` may also carry `predicted_cwe` + per-line `class_probabilities` for the
|
|
109
|
+
multiclass statement head. `cls_embedding` is the representation fed to the output head.
|
|
110
|
+
|
|
111
|
+
### Module functions (lower-level)
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from gnn_vuln.inference import load_model, predict, predict_from_file
|
|
115
|
+
|
|
116
|
+
model, class_names = load_model(checkpoint, config, device="cpu") # -> (nn.Module, list[str])
|
|
117
|
+
result = predict(model, data, class_names, device=None, top_k_lines=None) # PyG Data -> dict
|
|
118
|
+
result = predict_from_file(model, cpg_path, class_names, pretrained_lm=..., ...) # file -> dict
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## CPG generation — `gnn_vuln.data.joern_runner`
|
|
124
|
+
|
|
125
|
+
Only needed if you want the CPG file yourself; `predict_code` calls this for you.
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from gnn_vuln.data.joern_runner import process_function
|
|
129
|
+
from pathlib import Path
|
|
130
|
+
|
|
131
|
+
cpg_path = process_function(
|
|
132
|
+
code="int add(int a,int b){return a+b;}", # source string
|
|
133
|
+
idx=0,
|
|
134
|
+
out_dir=Path("./out"),
|
|
135
|
+
joern_cli_dir=Path("C:/joern/joern-cli"),
|
|
136
|
+
fmt="graphml", # "graphml" | "json"
|
|
137
|
+
lang=None, # None = auto-detect (c/cpp/java/js/py)
|
|
138
|
+
) # -> Path to the written CPG, or None on failure
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Config — `gnn_vuln.config`
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from gnn_vuln.config import Config
|
|
147
|
+
|
|
148
|
+
cfg = Config.from_yaml("N48.yaml") # one monolithic file
|
|
149
|
+
cfg = Config.from_yamls(["data.yaml", "model.yaml", "train.yaml"]) # split, merged in order
|
|
150
|
+
# cfg.data, cfg.model, cfg.train, cfg.ewc, cfg.replay — dataclasses
|
|
151
|
+
cfg.data.mode # "binary" | "multiclass"
|
|
152
|
+
cfg.model.architecture # "lmgat_codebert" | "lmgat_seqgnn"
|
|
153
|
+
cfg.train.epochs # 100
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
`from_yamls` lets you split data / model / train configs into separate files; a single file
|
|
157
|
+
is just the one-element case (identical behaviour).
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Data pipeline & training — module CLIs (`python -m`)
|
|
162
|
+
|
|
163
|
+
Each step is a runnable module. Every `--config` option accepts **one** config file or **several** split files
|
|
164
|
+
(merged section-by-section).
|
|
165
|
+
|
|
166
|
+
| Command | In | Out |
|
|
167
|
+
| ------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------- |
|
|
168
|
+
| `python -m gnn_vuln.data.prepare --input <parquet> --format bigvul --out-dir <dir> --joern-cli <joern>` | raw rows (parquet) | per-function CPGs + `cwe_vocab.json` |
|
|
169
|
+
| `python -m gnn_vuln.data.build_pt --config <yaml…> --split train` | CPG dir | processed `.pt` (UniXcoder node features) |
|
|
170
|
+
| `python -m gnn_vuln.data.merge --config <yaml…> --sources <s1> <s2> … --out-source <name> [--dedup]` | built `.pt`s | one merged `.pt` (label space unified) |
|
|
171
|
+
| `python -m gnn_vuln.train --config <yaml…>` | `.pt` + config | trained checkpoint + metrics |
|
|
172
|
+
|
|
173
|
+
`prepare` flags: `--binary`, `--top-cwe N`, `--sample-per-class N`, `--workers N`.
|
|
174
|
+
Installed console scripts: `train`, `evaluate` (= `python -m gnn_vuln.train` / `.evaluate`).
|
|
175
|
+
|
|
176
|
+
The whole raw→pt→train flow:
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
python -m gnn_vuln.data.prepare --input data.parquet --format bigvul --out-dir data/raw --joern-cli <joern>
|
|
180
|
+
python -m gnn_vuln.data.build_pt --config config.yaml --split train
|
|
181
|
+
python -m gnn_vuln.train --config config.yaml
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## Package layout
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
gnn_vuln/
|
|
190
|
+
inference.py VulnPredictor, load_model, predict, predict_from_file
|
|
191
|
+
config.py Config (data/model/train/ewc/replay), from_yaml / from_yamls
|
|
192
|
+
train.py trainer (python -m gnn_vuln.train)
|
|
193
|
+
evaluate.py evaluation (python -m gnn_vuln.evaluate)
|
|
194
|
+
models/ lmgat_codebert, lmgat_seqgnn — the architectures (built via config)
|
|
195
|
+
data/
|
|
196
|
+
prepare.py raw rows → Joern CPG (python -m)
|
|
197
|
+
build_pt.py CPG → .pt (python -m)
|
|
198
|
+
joern_runner.py process_function — Joern wrapper
|
|
199
|
+
dataset_lm.py CodeBERTGraphDataset (PyG InMemoryDataset, UniXcoder features)
|
|
200
|
+
node_embedder.py frozen LM per-node embeddings
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
The library resolves its data/checkpoint root from `$GNN_VULN_ROOT` (else the current working
|
|
204
|
+
directory), so it behaves the same installed-from-PyPI as in a source checkout.
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "gnn-vuln"
|
|
3
|
+
version = "0.1.1"
|
|
4
|
+
description = "GNN-based vulnerability detection for code — Final Project (Tugas Akhir)"
|
|
5
|
+
readme = "src/gnn_vuln/README.md"
|
|
6
|
+
requires-python = ">=3.11"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
authors = [{ name = "Otzzu" }]
|
|
9
|
+
|
|
10
|
+
dependencies = [
|
|
11
|
+
# Deep learning
|
|
12
|
+
"torch>=2.2.0",
|
|
13
|
+
# Graph neural networks (PyTorch Geometric)
|
|
14
|
+
"torch-geometric>=2.5.0",
|
|
15
|
+
# Graph utilities
|
|
16
|
+
"networkx>=3.3",
|
|
17
|
+
# Data science
|
|
18
|
+
"numpy>=1.26.0",
|
|
19
|
+
"pandas>=2.2.0",
|
|
20
|
+
"scikit-learn>=1.4.0",
|
|
21
|
+
# Visualisation
|
|
22
|
+
"matplotlib>=3.8.0",
|
|
23
|
+
"seaborn>=0.13.0",
|
|
24
|
+
# Progress & config
|
|
25
|
+
"tqdm>=4.66.0",
|
|
26
|
+
"pyyaml>=6.0.1",
|
|
27
|
+
# Parsing (AST extraction helper)
|
|
28
|
+
"tree-sitter>=0.23.0",
|
|
29
|
+
"tree-sitter-c>=0.23.0",
|
|
30
|
+
# Logging
|
|
31
|
+
"loguru>=0.7.2",
|
|
32
|
+
"datasets>=4.8.4",
|
|
33
|
+
"transformers>=4.48,<5.0",
|
|
34
|
+
"pdfplumber>=0.11.9",
|
|
35
|
+
"ijson>=3.5.0",
|
|
36
|
+
"python-docx>=1.2.0",
|
|
37
|
+
"h5py>=3.16.0",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.optional-dependencies]
|
|
41
|
+
dev = [
|
|
42
|
+
"jupyter>=1.0.0",
|
|
43
|
+
"ipykernel>=6.29.0",
|
|
44
|
+
"jupyterlab>=4.0.0",
|
|
45
|
+
"pytest>=8.0.0",
|
|
46
|
+
"pytest-cov>=5.0.0",
|
|
47
|
+
"ruff>=0.4.0",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[project.scripts]
|
|
51
|
+
train = "gnn_vuln.train:main"
|
|
52
|
+
evaluate = "gnn_vuln.evaluate:main"
|
|
53
|
+
|
|
54
|
+
[build-system]
|
|
55
|
+
requires = ["hatchling"]
|
|
56
|
+
build-backend = "hatchling.build"
|
|
57
|
+
|
|
58
|
+
[tool.hatch.build.targets.wheel]
|
|
59
|
+
packages = ["src/gnn_vuln"]
|
|
60
|
+
|
|
61
|
+
# Bound the sdist to the package only. Without this hatchling walks the whole src/ tree
|
|
62
|
+
# (LIVABLE, MVulD, VulPCL, … baseline clones — GBs) into the source tarball, which is
|
|
63
|
+
# slow and wrong. hatchling always adds pyproject.toml, .gitignore and the configured readme.
|
|
64
|
+
[tool.hatch.build.targets.sdist]
|
|
65
|
+
only-include = ["src/gnn_vuln"]
|
|
66
|
+
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
# Tool configuration
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
[tool.ruff]
|
|
72
|
+
line-length = 100
|
|
73
|
+
target-version = "py311"
|
|
74
|
+
src = ["src"]
|
|
75
|
+
|
|
76
|
+
[tool.ruff.lint]
|
|
77
|
+
select = ["E", "F", "W", "I", "UP"]
|
|
78
|
+
ignore = ["E501"]
|
|
79
|
+
|
|
80
|
+
[tool.pytest.ini_options]
|
|
81
|
+
testpaths = ["tests"]
|
|
82
|
+
addopts = "-v --tb=short"
|
|
83
|
+
|
|
84
|
+
# NOTE: PyTorch Geometric sparse extensions (torch-scatter, torch-sparse)
|
|
85
|
+
# require wheels built for the installed torch version and build (CPU or CUDA). Install them AFTER `uv sync`, e.g.:
|
|
86
|
+
# uv run pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}+${CUDA}.html (e.g. TORCH=2.6.0, CUDA=cu124 or cpu)
|
|
87
|
+
# (See the "Install" section of src/gnn_vuln/README.md for full instructions.)
|
|
88
|
+
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
# PyTorch CUDA source (RTX 4060 / CUDA 12.4)
|
|
91
|
+
# ---------------------------------------------------------------------------
|
|
92
|
+
[tool.uv.sources]
|
|
93
|
+
torch = { index = "pytorch-cu124" }
|
|
94
|
+
|
|
95
|
+
[[tool.uv.index]]
|
|
96
|
+
name = "pytorch-cu124"
|
|
97
|
+
url = "https://download.pytorch.org/whl/cu124"
|
|
98
|
+
explicit = true
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# `gnn_vuln` — Library API Reference
|
|
2
|
+
|
|
3
|
+
The installable model library behind the vulnerability-detection service. This is the
|
|
4
|
+
complete public surface: what to import, the inputs, and the outputs.
|
|
5
|
+
|
|
6
|
+
**Not everything is file-based.** You pass a function **source string** and get a **result
|
|
7
|
+
dict** back. The only files involved are the model checkpoint + config (normal — weights and
|
|
8
|
+
config live on disk) and the Joern CPG, which is created in a private temp dir and hidden
|
|
9
|
+
from you. In-memory in, in-memory out.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# 1. torch + PyG sparse ext from their own indexes (PyPI can't resolve these alone)
|
|
17
|
+
pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu # or cu124
|
|
18
|
+
pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.6.0+cpu.html # use torch-2.6.0+cu124.html with a cu124 torch
|
|
19
|
+
# 2. the library
|
|
20
|
+
pip install gnn-vuln
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
You also need **Joern** (for CPG generation) and a **JDK 21** on the host. Point the predictor at the
|
|
24
|
+
`joern-cli` directory.
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Inference — `gnn_vuln.inference`
|
|
29
|
+
|
|
30
|
+
### `VulnPredictor` (high-level, recommended)
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from gnn_vuln.inference import VulnPredictor
|
|
34
|
+
|
|
35
|
+
predictor = VulnPredictor.from_checkpoint(
|
|
36
|
+
checkpoint="checkpoints/<run>/best_model.pt", # trained weights (.pt file)
|
|
37
|
+
config="configs/<arch>/config.yaml", # its config (file, or pass a list)
|
|
38
|
+
device="cuda", # "cpu" | "cuda"
|
|
39
|
+
)
|
|
40
|
+
predictor.class_names = ["benign", "CWE-787", ...] # optional: override label names
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
| Method | Input | Output |
|
|
44
|
+
| --- | --- | --- |
|
|
45
|
+
| `predict_code(code, joern_cli, max_nodes=2500, top_k_lines=None)` | function **source string** | result `dict`, or `None` if Joern produced no CPG |
|
|
46
|
+
| `predict_codes(codes, joern_cli, max_nodes=2500, top_k_lines=None)` | `list[str]` | list of result dicts (`None` per entry on Joern failure) |
|
|
47
|
+
| `predict(data, top_k_lines=None)` | a PyG `Data` object (already built) | result `dict` |
|
|
48
|
+
| `predict_from_file(cpg_path, max_nodes=1000, top_k_lines=None)` | path to a Joern CPG file | result `dict`, or `None` |
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
# the everyday call — string in, dict out (Joern handled internally)
|
|
52
|
+
result = predictor.predict_code(
|
|
53
|
+
"void f(char *s){ char b[8]; strcpy(b, s); }",
|
|
54
|
+
joern_cli="C:/joern/joern-cli",
|
|
55
|
+
top_k_lines=5,
|
|
56
|
+
)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Result dict (schema)
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
{
|
|
63
|
+
"prediction": "CWE-120", # predicted class name
|
|
64
|
+
"class_id": 7, # predicted class index
|
|
65
|
+
"is_vulnerable": True, # class_id > 0
|
|
66
|
+
"confidence": 0.87, # softmax prob of the predicted class [0,1]
|
|
67
|
+
"class_probabilities": {"benign": 0.01, "CWE-120": 0.87, ...},
|
|
68
|
+
"suspicious_lines": [{"line": 3, "score": 0.92, "code": "strcpy(b, s);"}, ...], # score-desc
|
|
69
|
+
"cls_embedding": [0.013, -0.44, ...], # pre-head function vector (for search/drift)
|
|
70
|
+
}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
`suspicious_lines` may also carry `predicted_cwe` + per-line `class_probabilities` for the
|
|
74
|
+
multiclass statement head. `cls_embedding` is the representation fed to the output head.
|
|
75
|
+
|
|
76
|
+
### Module functions (lower-level)
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from gnn_vuln.inference import load_model, predict, predict_from_file
|
|
80
|
+
|
|
81
|
+
model, class_names = load_model(checkpoint, config, device="cpu") # -> (nn.Module, list[str])
|
|
82
|
+
result = predict(model, data, class_names, device=None, top_k_lines=None) # PyG Data -> dict
|
|
83
|
+
result = predict_from_file(model, cpg_path, class_names, pretrained_lm=..., ...) # file -> dict
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## CPG generation — `gnn_vuln.data.joern_runner`
|
|
89
|
+
|
|
90
|
+
Only needed if you want the CPG file yourself; `predict_code` calls this for you.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from gnn_vuln.data.joern_runner import process_function
|
|
94
|
+
from pathlib import Path
|
|
95
|
+
|
|
96
|
+
cpg_path = process_function(
|
|
97
|
+
code="int add(int a,int b){return a+b;}", # source string
|
|
98
|
+
idx=0,
|
|
99
|
+
out_dir=Path("./out"),
|
|
100
|
+
joern_cli_dir=Path("C:/joern/joern-cli"),
|
|
101
|
+
fmt="graphml", # "graphml" | "json"
|
|
102
|
+
lang=None, # None = auto-detect (c/cpp/java/js/py)
|
|
103
|
+
) # -> Path to the written CPG, or None on failure
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Config — `gnn_vuln.config`
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from gnn_vuln.config import Config
|
|
112
|
+
|
|
113
|
+
cfg = Config.from_yaml("N48.yaml") # one monolithic file
|
|
114
|
+
cfg = Config.from_yamls(["data.yaml", "model.yaml", "train.yaml"]) # split, merged in order
|
|
115
|
+
# cfg.data, cfg.model, cfg.train, cfg.ewc, cfg.replay — dataclasses
|
|
116
|
+
cfg.data.mode # "binary" | "multiclass"
|
|
117
|
+
cfg.model.architecture # "lmgat_codebert" | "lmgat_seqgnn"
|
|
118
|
+
cfg.train.epochs # 100
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
`from_yamls` lets you split data / model / train configs into separate files; a single file
|
|
122
|
+
is just the one-element case (identical behaviour).
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Data pipeline & training — module CLIs (`python -m`)
|
|
127
|
+
|
|
128
|
+
Each step is a runnable module. Every `--config` option accepts **one** config file or **several** split files
|
|
129
|
+
(merged section-by-section).
|
|
130
|
+
|
|
131
|
+
| Command | In | Out |
|
|
132
|
+
| ------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------- |
|
|
133
|
+
| `python -m gnn_vuln.data.prepare --input <parquet> --format bigvul --out-dir <dir> --joern-cli <joern>` | raw rows (parquet) | per-function CPGs + `cwe_vocab.json` |
|
|
134
|
+
| `python -m gnn_vuln.data.build_pt --config <yaml…> --split train` | CPG dir | processed `.pt` (UniXcoder node features) |
|
|
135
|
+
| `python -m gnn_vuln.data.merge --config <yaml…> --sources <s1> <s2> … --out-source <name> [--dedup]` | built `.pt`s | one merged `.pt` (label space unified) |
|
|
136
|
+
| `python -m gnn_vuln.train --config <yaml…>` | `.pt` + config | trained checkpoint + metrics |
|
|
137
|
+
|
|
138
|
+
`prepare` flags: `--binary`, `--top-cwe N`, `--sample-per-class N`, `--workers N`.
|
|
139
|
+
Installed console scripts: `train`, `evaluate` (= `python -m gnn_vuln.train` / `.evaluate`).
|
|
140
|
+
|
|
141
|
+
The whole raw→pt→train flow:
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
python -m gnn_vuln.data.prepare --input data.parquet --format bigvul --out-dir data/raw --joern-cli <joern>
|
|
145
|
+
python -m gnn_vuln.data.build_pt --config config.yaml --split train
|
|
146
|
+
python -m gnn_vuln.train --config config.yaml
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
---
|
|
150
|
+
|
|
151
|
+
## Package layout
|
|
152
|
+
|
|
153
|
+
```
|
|
154
|
+
gnn_vuln/
|
|
155
|
+
inference.py VulnPredictor, load_model, predict, predict_from_file
|
|
156
|
+
config.py Config (data/model/train/ewc/replay), from_yaml / from_yamls
|
|
157
|
+
train.py trainer (python -m gnn_vuln.train)
|
|
158
|
+
evaluate.py evaluation (python -m gnn_vuln.evaluate)
|
|
159
|
+
models/ lmgat_codebert, lmgat_seqgnn — the architectures (built via config)
|
|
160
|
+
data/
|
|
161
|
+
prepare.py raw rows → Joern CPG (python -m)
|
|
162
|
+
build_pt.py CPG → .pt (python -m)
|
|
163
|
+
joern_runner.py process_function — Joern wrapper
|
|
164
|
+
dataset_lm.py CodeBERTGraphDataset (PyG InMemoryDataset, UniXcoder features)
|
|
165
|
+
node_embedder.py frozen LM per-node embeddings
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
The library resolves its data/checkpoint root from `$GNN_VULN_ROOT` (else the current working
|
|
169
|
+
directory), so it behaves the same installed-from-PyPI as in a source checkout.
|