gnn-vuln 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. gnn_vuln-0.1.1/.gitignore +104 -0
  2. gnn_vuln-0.1.1/PKG-INFO +204 -0
  3. gnn_vuln-0.1.1/pyproject.toml +98 -0
  4. gnn_vuln-0.1.1/src/gnn_vuln/README.md +169 -0
  5. gnn_vuln-0.1.1/src/gnn_vuln/__init__.py +8 -0
  6. gnn_vuln-0.1.1/src/gnn_vuln/baselines.py +343 -0
  7. gnn_vuln-0.1.1/src/gnn_vuln/config.py +422 -0
  8. gnn_vuln-0.1.1/src/gnn_vuln/data/__init__.py +1 -0
  9. gnn_vuln-0.1.1/src/gnn_vuln/data/build_pt.py +147 -0
  10. gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/__init__.py +39 -0
  11. gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/builder.py +134 -0
  12. gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/constants.py +103 -0
  13. gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/features.py +221 -0
  14. gnn_vuln-0.1.1/src/gnn_vuln/data/cpg/parser.py +148 -0
  15. gnn_vuln-0.1.1/src/gnn_vuln/data/cwe_taxonomy.py +299 -0
  16. gnn_vuln-0.1.1/src/gnn_vuln/data/dataset_lm.py +1671 -0
  17. gnn_vuln-0.1.1/src/gnn_vuln/data/graph_builder_lm.py +16 -0
  18. gnn_vuln-0.1.1/src/gnn_vuln/data/graph_partition.py +170 -0
  19. gnn_vuln-0.1.1/src/gnn_vuln/data/graph_partition_ref.py +129 -0
  20. gnn_vuln-0.1.1/src/gnn_vuln/data/joern_runner.py +329 -0
  21. gnn_vuln-0.1.1/src/gnn_vuln/data/merge.py +180 -0
  22. gnn_vuln-0.1.1/src/gnn_vuln/data/node_embedder.py +188 -0
  23. gnn_vuln-0.1.1/src/gnn_vuln/data/prepare.py +780 -0
  24. gnn_vuln-0.1.1/src/gnn_vuln/data/preprocess.py +206 -0
  25. gnn_vuln-0.1.1/src/gnn_vuln/evaluate.py +341 -0
  26. gnn_vuln-0.1.1/src/gnn_vuln/evaluation/__init__.py +6 -0
  27. gnn_vuln-0.1.1/src/gnn_vuln/evaluation/localize.py +162 -0
  28. gnn_vuln-0.1.1/src/gnn_vuln/evaluation/plots.py +104 -0
  29. gnn_vuln-0.1.1/src/gnn_vuln/inference.py +479 -0
  30. gnn_vuln-0.1.1/src/gnn_vuln/losses/__init__.py +3 -0
  31. gnn_vuln-0.1.1/src/gnn_vuln/losses/hierarchical_supcon.py +353 -0
  32. gnn_vuln-0.1.1/src/gnn_vuln/metrics.py +242 -0
  33. gnn_vuln-0.1.1/src/gnn_vuln/models/__init__.py +31 -0
  34. gnn_vuln-0.1.1/src/gnn_vuln/models/_lm_utils.py +663 -0
  35. gnn_vuln-0.1.1/src/gnn_vuln/models/base.py +393 -0
  36. gnn_vuln-0.1.1/src/gnn_vuln/models/cross_task.py +401 -0
  37. gnn_vuln-0.1.1/src/gnn_vuln/models/encoders.py +1125 -0
  38. gnn_vuln-0.1.1/src/gnn_vuln/models/graph_vit.py +263 -0
  39. gnn_vuln-0.1.1/src/gnn_vuln/models/heads.py +593 -0
  40. gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_codebert.py +565 -0
  41. gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_codebert_mtl.py +149 -0
  42. gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_dualflow.py +101 -0
  43. gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_hcdfgat.py +120 -0
  44. gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_interp.py +87 -0
  45. gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_mcs.py +72 -0
  46. gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_seq.py +115 -0
  47. gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_seqgnn.py +169 -0
  48. gnn_vuln-0.1.1/src/gnn_vuln/models/lmgat_waves_seq.py +124 -0
  49. gnn_vuln-0.1.1/src/gnn_vuln/models/registry.py +84 -0
  50. gnn_vuln-0.1.1/src/gnn_vuln/models/supcon_head.py +28 -0
  51. gnn_vuln-0.1.1/src/gnn_vuln/pretrain_jepa.py +261 -0
  52. gnn_vuln-0.1.1/src/gnn_vuln/train.py +979 -0
  53. gnn_vuln-0.1.1/src/gnn_vuln/training/__init__.py +1 -0
  54. gnn_vuln-0.1.1/src/gnn_vuln/training/ewc.py +193 -0
  55. gnn_vuln-0.1.1/src/gnn_vuln/training/losses.py +195 -0
  56. gnn_vuln-0.1.1/src/gnn_vuln/training/mtl_balance.py +194 -0
  57. gnn_vuln-0.1.1/src/gnn_vuln/training/optimizer.py +153 -0
  58. gnn_vuln-0.1.1/src/gnn_vuln/training/pgd.py +226 -0
  59. gnn_vuln-0.1.1/src/gnn_vuln/training/sampler.py +93 -0
  60. gnn_vuln-0.1.1/src/gnn_vuln/training/trainer.py +740 -0
  61. gnn_vuln-0.1.1/src/gnn_vuln/training/unfreezer.py +78 -0
  62. gnn_vuln-0.1.1/src/gnn_vuln/utils.py +298 -0
@@ -0,0 +1,104 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.pyo
5
+ *.pyd
6
+ *.so
7
+ *.egg
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+ .eggs/
12
+
13
+ # uv / virtualenv
14
+ .venv/
15
+ .uv/
16
+
17
+ # embedded Joern clone (nested .git, large) + local materialize cache
18
+ src/gnn_vuln/joern/
19
+ data/.materialized/
20
+
21
+ # Jupyter
22
+ .ipynb_checkpoints/
23
+ *.ipynb_metadata/
24
+
25
+
26
+ # Logs
27
+ logs/
28
+ *.log
29
+
30
+ # OS
31
+ .DS_Store
32
+ Thumbs.db
33
+
34
+ # IDE
35
+ .vscode/settings.json
36
+ .idea/
37
+
38
+ # Claude Code local config (commands, hooks — project-local, not for repo)
39
+ .claude/
40
+ .serena/
41
+ .agents/
42
+
43
+ # Env files
44
+ .env
45
+ *.env
46
+
47
+ # Joern output cache
48
+ joern-cli/
49
+ joern_workspace/
50
+
51
+ # Large generated / binary / third-party — do NOT push
52
+ /data/raw/**
53
+ !/data/raw/*/
54
+ !/data/raw/*/cwe_vocab.json
55
+ /data/datasets/
56
+ /data/processed/
57
+ /data/graphs/
58
+ /checkpoints/
59
+ /results/
60
+ /paper/
61
+ # Baselines tracked in-repo (LineVul, LineVD, LOSVER) so the pod gets them via git pull.
62
+ # Ignore only their generated junk:
63
+ src/*/__pycache__/
64
+ src/**/*.pyc
65
+ src/LineVul/saved_models/
66
+ src/linevd/storage/cache/
67
+ src/linevd/storage/processed/
68
+ src/VulChecker/
69
+ src/vul-LMGNN/
70
+ src/gnn_vuln/joern/
71
+ src/SCLCVD/
72
+ src/EDAT/
73
+
74
+ # Downloaded tools / secrets
75
+ gdrive.exe
76
+ gdrive3.zip
77
+ gdrive*.zip
78
+ rclone.zip
79
+ rclone*/
80
+ secret.txt
81
+ *.secret
82
+
83
+ .kiro/
84
+ src/GNNPlus/
85
+ src/Graph-Mixture-of-Experts/
86
+ src/mixture-of-experts/
87
+ src/CodeGraphSMOTE/
88
+ src/GMixup/
89
+ src/GraphSMOTE/
90
+ src/FLAG/
91
+ src/graph-jepa/
92
+ src/graph-vit-mlpmixer/
93
+ src/llm-jepa/
94
+ src/ijepa/
95
+ src/GraphMAE/
96
+ src/GraphMAE2/
97
+ src/bgrl/
98
+ src/StructMAE/
99
+ src/hypermixing/
100
+ src/pnlp-mixer/
101
+ src/GraphGPS/
102
+ src/LineVul/cppcheck/
103
+ /data/baselines/
104
+ docs/
@@ -0,0 +1,204 @@
1
+ Metadata-Version: 2.4
2
+ Name: gnn-vuln
3
+ Version: 0.1.1
4
+ Summary: GNN-based vulnerability detection for code — Final Project (Tugas Akhir)
5
+ Author: Otzzu
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Requires-Dist: datasets>=4.8.4
9
+ Requires-Dist: h5py>=3.16.0
10
+ Requires-Dist: ijson>=3.5.0
11
+ Requires-Dist: loguru>=0.7.2
12
+ Requires-Dist: matplotlib>=3.8.0
13
+ Requires-Dist: networkx>=3.3
14
+ Requires-Dist: numpy>=1.26.0
15
+ Requires-Dist: pandas>=2.2.0
16
+ Requires-Dist: pdfplumber>=0.11.9
17
+ Requires-Dist: python-docx>=1.2.0
18
+ Requires-Dist: pyyaml>=6.0.1
19
+ Requires-Dist: scikit-learn>=1.4.0
20
+ Requires-Dist: seaborn>=0.13.0
21
+ Requires-Dist: torch-geometric>=2.5.0
22
+ Requires-Dist: torch>=2.2.0
23
+ Requires-Dist: tqdm>=4.66.0
24
+ Requires-Dist: transformers<5.0,>=4.48
25
+ Requires-Dist: tree-sitter-c>=0.23.0
26
+ Requires-Dist: tree-sitter>=0.23.0
27
+ Provides-Extra: dev
28
+ Requires-Dist: ipykernel>=6.29.0; extra == 'dev'
29
+ Requires-Dist: jupyter>=1.0.0; extra == 'dev'
30
+ Requires-Dist: jupyterlab>=4.0.0; extra == 'dev'
31
+ Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
32
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
33
+ Requires-Dist: ruff>=0.4.0; extra == 'dev'
34
+ Description-Content-Type: text/markdown
35
+
36
+ # `gnn_vuln` — Library API Reference
37
+
38
+ The installable model library behind the vulnerability-detection service. This is the
39
+ complete public surface: what to import, the inputs, and the outputs.
40
+
41
+ **The API itself is not file-based.** You pass a function **source string** and get a **result
42
+ dict** back. The only files involved are the model checkpoint + config (normal — weights and
43
+ config live on disk) and the Joern CPG, which is created in a private temp dir and hidden
44
+ from you. In-memory in, in-memory out.
45
+
46
+ ---
47
+
48
+ ## Install
49
+
50
+ ```bash
51
+ # 1. torch + PyG sparse ext from their own indexes (PyPI can't resolve these alone)
52
+ pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu # or cu124
53
+ pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.6.0+cpu.html
54
+ # 2. the library
55
+ pip install gnn-vuln
56
+ ```
57
+
58
+ You also need **Joern** (for CPG generation) and a **JDK 21** on the host. Point the predictor at the
59
+ `joern-cli` directory.
60
+
61
+ ---
62
+
63
+ ## Inference — `gnn_vuln.inference`
64
+
65
+ ### `VulnPredictor` (high-level, recommended)
66
+
67
+ ```python
68
+ from gnn_vuln.inference import VulnPredictor
69
+
70
+ predictor = VulnPredictor.from_checkpoint(
71
+ checkpoint="checkpoints/<run>/best_model.pt", # trained weights (.pt file)
72
+ config="configs/<arch>/config.yaml", # its config (file, or pass a list)
73
+ device="cuda", # "cpu" | "cuda"
74
+ )
75
+ predictor.class_names = ["benign", "CWE-787", ...] # optional: override label names
76
+ ```
77
+
78
+ | Method | Input | Output |
79
+ | --- | --- | --- |
80
+ | `predict_code(code, joern_cli, max_nodes=2500, top_k_lines=None)` | function **source string** | result `dict`, or `None` if Joern produced no CPG |
81
+ | `predict_codes(codes, joern_cli, max_nodes=2500, top_k_lines=None)` | `list[str]` | list of result dicts (`None` per entry on Joern failure) |
82
+ | `predict(data, top_k_lines=None)` | a PyG `Data` object (already built) | result `dict` |
83
+ | `predict_from_file(cpg_path, max_nodes=1000, top_k_lines=None)` | path to a Joern CPG file | result `dict`, or `None` |
84
+
85
+ ```python
86
+ # the everyday call — string in, dict out (Joern handled internally)
87
+ result = predictor.predict_code(
88
+ "void f(char *s){ char b[8]; strcpy(b, s); }",
89
+ joern_cli="C:/joern/joern-cli",
90
+ top_k_lines=5,
91
+ )
92
+ ```
93
+
94
+ ### Result dict (schema)
95
+
96
+ ```python
97
+ {
98
+ "prediction": "CWE-120", # predicted class name
99
+ "class_id": 7, # predicted class index
100
+ "is_vulnerable": True, # class_id > 0
101
+ "confidence": 0.87, # softmax prob of the predicted class [0,1]
102
+ "class_probabilities": {"benign": 0.01, "CWE-120": 0.87, ...},
103
+ "suspicious_lines": [{"line": 3, "score": 0.92, "code": "strcpy(b, s);"}, ...], # score-desc
104
+ "cls_embedding": [0.013, -0.44, ...], # pre-head function vector (for search/drift)
105
+ }
106
+ ```
107
+
108
+ `suspicious_lines` may also carry `predicted_cwe` + per-line `class_probabilities` for the
109
+ multiclass statement head. `cls_embedding` is the representation fed to the output head.
110
+
111
+ ### Module functions (lower-level)
112
+
113
+ ```python
114
+ from gnn_vuln.inference import load_model, predict, predict_from_file
115
+
116
+ model, class_names = load_model(checkpoint, config, device="cpu") # -> (nn.Module, list[str])
117
+ result = predict(model, data, class_names, device=None, top_k_lines=None) # PyG Data -> dict
118
+ result = predict_from_file(model, cpg_path, class_names, pretrained_lm=..., ...) # file -> dict
119
+ ```
120
+
121
+ ---
122
+
123
+ ## CPG generation — `gnn_vuln.data.joern_runner`
124
+
125
+ Only needed if you want the CPG file yourself; `predict_code` calls this for you.
126
+
127
+ ```python
128
+ from gnn_vuln.data.joern_runner import process_function
129
+ from pathlib import Path
130
+
131
+ cpg_path = process_function(
132
+ code="int add(int a,int b){return a+b;}", # source string
133
+ idx=0,
134
+ out_dir=Path("./out"),
135
+ joern_cli_dir=Path("C:/joern/joern-cli"),
136
+ fmt="graphml", # "graphml" | "json"
137
+ lang=None, # None = auto-detect (c/cpp/java/js/py)
138
+ ) # -> Path to the written CPG, or None on failure
139
+ ```
140
+
141
+ ---
142
+
143
+ ## Config — `gnn_vuln.config`
144
+
145
+ ```python
146
+ from gnn_vuln.config import Config
147
+
148
+ cfg = Config.from_yaml("N48.yaml") # one monolithic file
149
+ cfg = Config.from_yamls(["data.yaml", "model.yaml", "train.yaml"]) # split, merged in order
150
+ # cfg.data, cfg.model, cfg.train, cfg.ewc, cfg.replay — dataclasses
151
+ cfg.data.mode # "binary" | "multiclass"
152
+ cfg.model.architecture # "lmgat_codebert" | "lmgat_seqgnn"
153
+ cfg.train.epochs # 100
154
+ ```
155
+
156
+ `from_yamls` lets you split data / model / train configs into separate files; a single file
157
+ is just the one-element case (identical behaviour).
158
+
159
+ ---
160
+
161
+ ## Data pipeline & training — module CLIs (`python -m`)
162
+
163
+ Each step is a runnable module. Every `--config` option accepts **one** config file or **several** split files
164
+ (merged section-by-section).
165
+
166
+ | Command | In | Out |
167
+ | ------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------- |
168
+ | `python -m gnn_vuln.data.prepare --input <parquet> --format bigvul --out-dir <dir> --joern-cli <joern>` | raw rows (parquet) | per-function CPGs + `cwe_vocab.json` |
169
+ | `python -m gnn_vuln.data.build_pt --config <yaml…> --split train` | CPG dir | processed `.pt` (UniXcoder node features) |
170
+ | `python -m gnn_vuln.data.merge --config <yaml…> --sources <s1> <s2> … --out-source <name> [--dedup]` | built `.pt`s | one merged `.pt` (label space unified) |
171
+ | `python -m gnn_vuln.train --config <yaml…>` | `.pt` + config | trained checkpoint + metrics |
172
+
173
+ `prepare` flags: `--binary`, `--top-cwe N`, `--sample-per-class N`, `--workers N`.
174
+ Installed console scripts: `train`, `evaluate` (= `python -m gnn_vuln.train` / `.evaluate`).
175
+
176
+ The whole raw→pt→train flow:
177
+
178
+ ```bash
179
+ python -m gnn_vuln.data.prepare --input data.parquet --format bigvul --out-dir data/raw --joern-cli <joern>
180
+ python -m gnn_vuln.data.build_pt --config config.yaml --split train
181
+ python -m gnn_vuln.train --config config.yaml
182
+ ```
183
+
184
+ ---
185
+
186
+ ## Package layout
187
+
188
+ ```
189
+ gnn_vuln/
190
+ inference.py VulnPredictor, load_model, predict, predict_from_file
191
+ config.py Config (data/model/train/ewc/replay), from_yaml / from_yamls
192
+ train.py trainer (python -m gnn_vuln.train)
193
+ evaluate.py evaluation (python -m gnn_vuln.evaluate)
194
+ models/ lmgat_codebert, lmgat_seqgnn — the architectures (built via config)
195
+ data/
196
+ prepare.py raw rows → Joern CPG (python -m)
197
+ build_pt.py CPG → .pt (python -m)
198
+ joern_runner.py process_function — Joern wrapper
199
+ dataset_lm.py CodeBERTGraphDataset (PyG InMemoryDataset, UniXcoder features)
200
+ node_embedder.py frozen LM per-node embeddings
201
+ ```
202
+
203
+ The library resolves its data/checkpoint root from `$GNN_VULN_ROOT` (else the current working
204
+ directory), so it behaves the same installed-from-PyPI as in a source checkout.
@@ -0,0 +1,98 @@
1
+ [project]
2
+ name = "gnn-vuln"
3
+ version = "0.1.1"
4
+ description = "GNN-based vulnerability detection for code — Final Project (Tugas Akhir)"
5
+ readme = "src/gnn_vuln/README.md"
6
+ requires-python = ">=3.11"
7
+ license = { text = "MIT" }
8
+ authors = [{ name = "Otzzu" }]
9
+
10
+ dependencies = [
11
+ # Deep learning
12
+ "torch>=2.2.0",
13
+ # Graph neural networks (PyTorch Geometric)
14
+ "torch-geometric>=2.5.0",
15
+ # Graph utilities
16
+ "networkx>=3.3",
17
+ # Data science
18
+ "numpy>=1.26.0",
19
+ "pandas>=2.2.0",
20
+ "scikit-learn>=1.4.0",
21
+ # Visualisation
22
+ "matplotlib>=3.8.0",
23
+ "seaborn>=0.13.0",
24
+ # Progress & config
25
+ "tqdm>=4.66.0",
26
+ "pyyaml>=6.0.1",
27
+ # Parsing (AST extraction helper)
28
+ "tree-sitter>=0.23.0",
29
+ "tree-sitter-c>=0.23.0",
30
+ # Logging
31
+ "loguru>=0.7.2",
32
+ "datasets>=4.8.4",
33
+ "transformers>=4.48,<5.0",
34
+ "pdfplumber>=0.11.9",
35
+ "ijson>=3.5.0",
36
+ "python-docx>=1.2.0",
37
+ "h5py>=3.16.0",
38
+ ]
39
+
40
+ [project.optional-dependencies]
41
+ dev = [
42
+ "jupyter>=1.0.0",
43
+ "ipykernel>=6.29.0",
44
+ "jupyterlab>=4.0.0",
45
+ "pytest>=8.0.0",
46
+ "pytest-cov>=5.0.0",
47
+ "ruff>=0.4.0",
48
+ ]
49
+
50
+ [project.scripts]
51
+ train = "gnn_vuln.train:main"
52
+ evaluate = "gnn_vuln.evaluate:main"
53
+
54
+ [build-system]
55
+ requires = ["hatchling"]
56
+ build-backend = "hatchling.build"
57
+
58
+ [tool.hatch.build.targets.wheel]
59
+ packages = ["src/gnn_vuln"]
60
+
61
+ # Bound the sdist to the package only. Without this hatchling walks the whole src/ tree
62
+ # (LIVABLE, MVulD, VulPCL, … baseline clones — GBs) into the source tarball, which is
63
+ # slow and wrong. pyproject.toml + README.md are always added by hatchling.
64
+ [tool.hatch.build.targets.sdist]
65
+ only-include = ["src/gnn_vuln"]
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # Tool configuration
69
+ # ---------------------------------------------------------------------------
70
+
71
+ [tool.ruff]
72
+ line-length = 100
73
+ target-version = "py311"
74
+ src = ["src"]
75
+
76
+ [tool.ruff.lint]
77
+ select = ["E", "F", "W", "I", "UP"]
78
+ ignore = ["E501"]
79
+
80
+ [tool.pytest.ini_options]
81
+ testpaths = ["tests"]
82
+ addopts = "-v --tb=short"
83
+
84
+ # NOTE: PyTorch Geometric sparse extensions (torch-scatter, torch-sparse) require wheels
85
+ # matching the installed torch version and build (cpu or cu124). Install them AFTER `uv sync`, e.g.:
86
+ # uv run pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.6.0+cpu.html
87
+ # (See the Install section of src/gnn_vuln/README.md for full instructions.)
88
+
89
+ # ---------------------------------------------------------------------------
90
+ # PyTorch CUDA source (RTX 4060 / CUDA 12.4)
91
+ # ---------------------------------------------------------------------------
92
+ [tool.uv.sources]
93
+ torch = { index = "pytorch-cu124" }
94
+
95
+ [[tool.uv.index]]
96
+ name = "pytorch-cu124"
97
+ url = "https://download.pytorch.org/whl/cu124"
98
+ explicit = true
@@ -0,0 +1,169 @@
1
+ # `gnn_vuln` — Library API Reference
2
+
3
+ The installable model library behind the vulnerability-detection service. This is the
4
+ complete public surface: what to import, the inputs, and the outputs.
5
+
6
+ **The API itself is not file-based.** You pass a function **source string** and get a **result
7
+ dict** back. The only files involved are the model checkpoint + config (normal — weights and
8
+ config live on disk) and the Joern CPG, which is created in a private temp dir and hidden
9
+ from you. In-memory in, in-memory out.
10
+
11
+ ---
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ # 1. torch + PyG sparse ext from their own indexes (PyPI can't resolve these alone)
17
+ pip install torch==2.6.0 --index-url https://download.pytorch.org/whl/cpu # or cu124
18
+ pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-2.6.0+cpu.html
19
+ # 2. the library
20
+ pip install gnn-vuln
21
+ ```
22
+
23
+ You also need **Joern** (for CPG generation) and a **JDK 21** on the host. Point the predictor at the
24
+ `joern-cli` directory.
25
+
26
+ ---
27
+
28
+ ## Inference — `gnn_vuln.inference`
29
+
30
+ ### `VulnPredictor` (high-level, recommended)
31
+
32
+ ```python
33
+ from gnn_vuln.inference import VulnPredictor
34
+
35
+ predictor = VulnPredictor.from_checkpoint(
36
+ checkpoint="checkpoints/<run>/best_model.pt", # trained weights (.pt file)
37
+ config="configs/<arch>/config.yaml", # its config (file, or pass a list)
38
+ device="cuda", # "cpu" | "cuda"
39
+ )
40
+ predictor.class_names = ["benign", "CWE-787", ...] # optional: override label names
41
+ ```
42
+
43
+ | Method | Input | Output |
44
+ | --- | --- | --- |
45
+ | `predict_code(code, joern_cli, max_nodes=2500, top_k_lines=None)` | function **source string** | result `dict`, or `None` if Joern produced no CPG |
46
+ | `predict_codes(codes, joern_cli, max_nodes=2500, top_k_lines=None)` | `list[str]` | list of result dicts (`None` per entry on Joern failure) |
47
+ | `predict(data, top_k_lines=None)` | a PyG `Data` object (already built) | result `dict` |
48
+ | `predict_from_file(cpg_path, max_nodes=1000, top_k_lines=None)` | path to a Joern CPG file | result `dict`, or `None` |
49
+
50
+ ```python
51
+ # the everyday call — string in, dict out (Joern handled internally)
52
+ result = predictor.predict_code(
53
+ "void f(char *s){ char b[8]; strcpy(b, s); }",
54
+ joern_cli="C:/joern/joern-cli",
55
+ top_k_lines=5,
56
+ )
57
+ ```
58
+
59
+ ### Result dict (schema)
60
+
61
+ ```python
62
+ {
63
+ "prediction": "CWE-120", # predicted class name
64
+ "class_id": 7, # predicted class index
65
+ "is_vulnerable": True, # class_id > 0
66
+ "confidence": 0.87, # softmax prob of the predicted class [0,1]
67
+ "class_probabilities": {"benign": 0.01, "CWE-120": 0.87, ...},
68
+ "suspicious_lines": [{"line": 3, "score": 0.92, "code": "strcpy(b, s);"}, ...], # score-desc
69
+ "cls_embedding": [0.013, -0.44, ...], # pre-head function vector (for search/drift)
70
+ }
71
+ ```
72
+
73
+ `suspicious_lines` may also carry `predicted_cwe` + per-line `class_probabilities` for the
74
+ multiclass statement head. `cls_embedding` is the representation fed to the output head.
75
+
76
+ ### Module functions (lower-level)
77
+
78
+ ```python
79
+ from gnn_vuln.inference import load_model, predict, predict_from_file
80
+
81
+ model, class_names = load_model(checkpoint, config, device="cpu") # -> (nn.Module, list[str])
82
+ result = predict(model, data, class_names, device=None, top_k_lines=None) # PyG Data -> dict
83
+ result = predict_from_file(model, cpg_path, class_names, pretrained_lm=..., ...) # file -> dict
84
+ ```
85
+
86
+ ---
87
+
88
+ ## CPG generation — `gnn_vuln.data.joern_runner`
89
+
90
+ Only needed if you want the CPG file yourself; `predict_code` calls this for you.
91
+
92
+ ```python
93
+ from gnn_vuln.data.joern_runner import process_function
94
+ from pathlib import Path
95
+
96
+ cpg_path = process_function(
97
+ code="int add(int a,int b){return a+b;}", # source string
98
+ idx=0,
99
+ out_dir=Path("./out"),
100
+ joern_cli_dir=Path("C:/joern/joern-cli"),
101
+ fmt="graphml", # "graphml" | "json"
102
+ lang=None, # None = auto-detect (c/cpp/java/js/py)
103
+ ) # -> Path to the written CPG, or None on failure
104
+ ```
105
+
106
+ ---
107
+
108
+ ## Config — `gnn_vuln.config`
109
+
110
+ ```python
111
+ from gnn_vuln.config import Config
112
+
113
+ cfg = Config.from_yaml("N48.yaml") # one monolithic file
114
+ cfg = Config.from_yamls(["data.yaml", "model.yaml", "train.yaml"]) # split, merged in order
115
+ # cfg.data, cfg.model, cfg.train, cfg.ewc, cfg.replay — dataclasses
116
+ cfg.data.mode # "binary" | "multiclass"
117
+ cfg.model.architecture # "lmgat_codebert" | "lmgat_seqgnn"
118
+ cfg.train.epochs # 100
119
+ ```
120
+
121
+ `from_yamls` lets you split data / model / train configs into separate files; a single file
122
+ is just the one-element case (identical behaviour).
123
+
124
+ ---
125
+
126
+ ## Data pipeline & training — module CLIs (`python -m`)
127
+
128
+ Each step is a runnable module. Every `--config` option accepts **one** config file or **several** split files
129
+ (merged section-by-section).
130
+
131
+ | Command | In | Out |
132
+ | ------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------- |
133
+ | `python -m gnn_vuln.data.prepare --input <parquet> --format bigvul --out-dir <dir> --joern-cli <joern>` | raw rows (parquet) | per-function CPGs + `cwe_vocab.json` |
134
+ | `python -m gnn_vuln.data.build_pt --config <yaml…> --split train` | CPG dir | processed `.pt` (UniXcoder node features) |
135
+ | `python -m gnn_vuln.data.merge --config <yaml…> --sources <s1> <s2> … --out-source <name> [--dedup]` | built `.pt`s | one merged `.pt` (label space unified) |
136
+ | `python -m gnn_vuln.train --config <yaml…>` | `.pt` + config | trained checkpoint + metrics |
137
+
138
+ `prepare` flags: `--binary`, `--top-cwe N`, `--sample-per-class N`, `--workers N`.
139
+ Installed console scripts: `train`, `evaluate` (= `python -m gnn_vuln.train` / `.evaluate`).
140
+
141
+ The whole raw→pt→train flow:
142
+
143
+ ```bash
144
+ python -m gnn_vuln.data.prepare --input data.parquet --format bigvul --out-dir data/raw --joern-cli <joern>
145
+ python -m gnn_vuln.data.build_pt --config config.yaml --split train
146
+ python -m gnn_vuln.train --config config.yaml
147
+ ```
148
+
149
+ ---
150
+
151
+ ## Package layout
152
+
153
+ ```
154
+ gnn_vuln/
155
+ inference.py VulnPredictor, load_model, predict, predict_from_file
156
+ config.py Config (data/model/train/ewc/replay), from_yaml / from_yamls
157
+ train.py trainer (python -m gnn_vuln.train)
158
+ evaluate.py evaluation (python -m gnn_vuln.evaluate)
159
+ models/ lmgat_codebert, lmgat_seqgnn — the architectures (built via config)
160
+ data/
161
+ prepare.py raw rows → Joern CPG (python -m)
162
+ build_pt.py CPG → .pt (python -m)
163
+ joern_runner.py process_function — Joern wrapper
164
+ dataset_lm.py CodeBERTGraphDataset (PyG InMemoryDataset, UniXcoder features)
165
+ node_embedder.py frozen LM per-node embeddings
166
+ ```
167
+
168
+ The library resolves its data/checkpoint root from `$GNN_VULN_ROOT` (else the current working
169
+ directory), so it behaves the same installed-from-PyPI as in a source checkout.
@@ -0,0 +1,8 @@
1
+ """
2
+ gnn_vuln — GNN-based Vulnerability Detection
3
+ =============================================
4
+ Final project package.
5
+ """
6
+
7
+ __version__ = "0.1.1"
8
+ __author__ = "Otzzu"