graphids 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,27 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ id-token: write
9
+
10
+ jobs:
11
+ publish:
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - uses: actions/setup-python@v5
17
+ with:
18
+ python-version: "3.12"
19
+
20
+ - name: Install build tools
21
+ run: pip install build
22
+
23
+ - name: Build package
24
+ run: python -m build
25
+
26
+ - name: Publish to PyPI
27
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,14 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ build/
4
+ dist/
5
+ *.egg-info/
6
+ .venv/
7
+ .pytest_cache/
8
+ .coverage
9
+ *.swp
10
+ .DS_Store
11
+ data/
12
+ *.csv
13
+ *.pt
14
+ *.pth
graphids-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Vijay Govindarajan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: graphids
3
+ Version: 0.1.0
4
+ Summary: Graph-based intrusion detection using GCN, Transformer autoencoder, and contrastive learning
5
+ Project-URL: Homepage, https://github.com/vijaygovindaraja/graphids
6
+ Project-URL: Paper, https://doi.org/10.1038/s41598-025-07956-w
7
+ Author-email: Vijay Govindarajan <vijay.govindarajan91@gmail.com>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Vijay Govindarajan
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ License-File: LICENSE
30
+ Keywords: cloud-security,contrastive-learning,graph-neural-network,intrusion-detection,network-security,transformer
31
+ Classifier: Development Status :: 3 - Alpha
32
+ Classifier: Intended Audience :: Science/Research
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Programming Language :: Python :: 3
35
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
36
+ Classifier: Topic :: Security
37
+ Requires-Python: >=3.10
38
+ Requires-Dist: numpy>=1.24
39
+ Requires-Dist: pandas>=2.0
40
+ Requires-Dist: scikit-learn>=1.3
41
+ Requires-Dist: torch>=2.0
42
+ Provides-Extra: dev
43
+ Requires-Dist: pytest>=7.0; extra == 'dev'
44
+ Provides-Extra: shap
45
+ Requires-Dist: shap>=0.43; extra == 'shap'
46
+ Description-Content-Type: text/markdown
47
+
48
+ # GraphIDS
49
+
50
+ Graph-based intrusion detection using GCN, Transformer autoencoder, and contrastive learning.
51
+
52
+ Reference implementation of the framework introduced in:
53
+
54
+ > Govindarajan, V. & Muzamal, J. H. (2025). Advanced cloud intrusion detection
55
+ > framework using graph based features transformers and contrastive learning.
56
+ > *Scientific Reports*, 15, 20511. DOI: [10.1038/s41598-025-07956-w](https://doi.org/10.1038/s41598-025-07956-w)
57
+
58
+ ## Install
59
+
60
+ ```bash
61
+ pip install graphids
62
+ ```
63
+
64
+ ## Quick start
65
+
66
+ ```python
67
+ from graphids import GraphIDS
68
+
69
+ model = GraphIDS(n_features=41, n_classes=5)
70
+ model.train_pipeline(X_train, y_train)
71
+ result = model.evaluate(X_test, y_test)
72
+ print(f"Accuracy: {result.accuracy:.4f}")
73
+ ```
74
+
75
+ ## Architecture
76
+
77
+ Three-stage pipeline:
78
+
79
+ 1. **GCN** — constructs a communication graph from flow data, extracts structural node embeddings via 3-layer graph convolution
80
+ 2. **Transformer autoencoder** — refines embeddings through self-attention, identifies discriminative feature dimensions
81
+ 3. **Contrastive classifier** — improves class separation for minority attack types (U2R, R2L), outputs multi-class predictions
82
+
83
+ ## Results (from the paper)
84
+
85
+ | Dataset | Accuracy | Precision | Recall | F1 | FPR |
86
+ |---|---|---|---|---|---|
87
+ | NSL-KDD (5-class) | 99.97% | 99.94% | 99.92% | 99.93% | 0.05% |
88
+ | CIC-IDS (binary) | 99.96% | 99.93% | 99.91% | 99.92% | 0.06% |
89
+ | CIC-IDS (multi) | 99.95% | 99.92% | 99.90% | 99.91% | 0.07% |
90
+
91
+ ## Citation
92
+
93
+ ```bibtex
94
+ @article{govindarajan2025graphids,
95
+ title = {Advanced cloud intrusion detection framework using graph based
96
+ features transformers and contrastive learning},
97
+ author = {Govindarajan, Vijay and Muzamal, Junaid Hussain},
98
+ journal = {Scientific Reports},
99
+ volume = {15},
100
+ pages = {20511},
101
+ year = {2025},
102
+ doi = {10.1038/s41598-025-07956-w},
103
+ }
104
+ ```
105
+
106
+ ## License
107
+
108
+ MIT
@@ -0,0 +1,61 @@
1
+ # GraphIDS
2
+
3
+ Graph-based intrusion detection using GCN, Transformer autoencoder, and contrastive learning.
4
+
5
+ Reference implementation of the framework introduced in:
6
+
7
+ > Govindarajan, V. & Muzamal, J. H. (2025). Advanced cloud intrusion detection
8
+ > framework using graph based features transformers and contrastive learning.
9
+ > *Scientific Reports*, 15, 20511. DOI: [10.1038/s41598-025-07956-w](https://doi.org/10.1038/s41598-025-07956-w)
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ pip install graphids
15
+ ```
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from graphids import GraphIDS
21
+
22
+ model = GraphIDS(n_features=41, n_classes=5)
23
+ model.train_pipeline(X_train, y_train)
24
+ result = model.evaluate(X_test, y_test)
25
+ print(f"Accuracy: {result.accuracy:.4f}")
26
+ ```
27
+
28
+ ## Architecture
29
+
30
+ Three-stage pipeline:
31
+
32
+ 1. **GCN** — constructs a communication graph from flow data, extracts structural node embeddings via 3-layer graph convolution
33
+ 2. **Transformer autoencoder** — refines embeddings through self-attention, identifies discriminative feature dimensions
34
+ 3. **Contrastive classifier** — improves class separation for minority attack types (U2R, R2L), outputs multi-class predictions
35
+
36
+ ## Results (from the paper)
37
+
38
+ | Dataset | Accuracy | Precision | Recall | F1 | FPR |
39
+ |---|---|---|---|---|---|
40
+ | NSL-KDD (5-class) | 99.97% | 99.94% | 99.92% | 99.93% | 0.05% |
41
+ | CIC-IDS (binary) | 99.96% | 99.93% | 99.91% | 99.92% | 0.06% |
42
+ | CIC-IDS (multi) | 99.95% | 99.92% | 99.90% | 99.91% | 0.07% |
43
+
44
+ ## Citation
45
+
46
+ ```bibtex
47
+ @article{govindarajan2025graphids,
48
+ title = {Advanced cloud intrusion detection framework using graph based
49
+ features transformers and contrastive learning},
50
+ author = {Govindarajan, Vijay and Muzamal, Junaid Hussain},
51
+ journal = {Scientific Reports},
52
+ volume = {15},
53
+ pages = {20511},
54
+ year = {2025},
55
+ doi = {10.1038/s41598-025-07956-w},
56
+ }
57
+ ```
58
+
59
+ ## License
60
+
61
+ MIT
@@ -0,0 +1,12 @@
1
+ """Graph-based Intrusion Detection System (GraphIDS).
2
+
3
+ A modular cloud intrusion detection framework combining GCN feature extraction,
4
+ Transformer-based autoencoding, and contrastive learning. Reference implementation
5
+ of the framework introduced in:
6
+
7
+ Govindarajan, V. & Muzamal, J. H. (2025). Advanced cloud intrusion detection
8
+ framework using graph based features transformers and contrastive learning.
9
+ Scientific Reports, 15, 20511. DOI: 10.1038/s41598-025-07956-w
10
+ """
11
+
12
+ __version__ = "0.1.0"
@@ -0,0 +1,127 @@
1
+ """Contrastive learning module and classification head.
2
+
3
+ The contrastive loss improves class separation in the embedding space,
4
+ particularly for minority classes (U2R, R2L) that get ignored when training
5
+ with cross-entropy alone. It works by pulling same-class embeddings together
6
+ and pushing different-class embeddings apart using cosine similarity.
7
+
8
+ The final classification loss is:
9
+ L_class = L_CE + beta * L_contrastive
10
+
11
+ The classifier is a two-layer FC network: 128 neurons with ReLU, then
12
+ softmax for multi-class prediction.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+
21
+
22
class ContrastiveLoss(nn.Module):
    """Supervised contrastive loss over in-batch pairs (cosine similarity).

    For each anchor i, positives are the other samples in the batch that
    share its label. The per-anchor loss is

        L_i = -mean_p sim(i, p) + log( sum_{k != i} exp(sim(i, k)) )

    where ``sim`` is the temperature-scaled cosine similarity. Raising the
    mean positive similarity while the log-sum-exp term penalizes all pairs
    pulls same-class embeddings together and pushes different-class ones
    apart, which helps minority classes (U2R, R2L) that cross-entropy alone
    tends to under-serve.

    Parameters
    ----------
    temperature
        Scaling factor for the similarity scores. Lower values sharpen
        the distribution. Default 0.5.
    """

    def __init__(self, temperature: float = 0.5):
        super().__init__()
        self.temperature = temperature

    def forward(self, embeddings: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Compute the contrastive loss over all pairs in the batch.

        Parameters
        ----------
        embeddings
            Feature embeddings ``(batch, d)``.
        labels
            Integer class labels ``(batch,)``.

        Returns
        -------
        torch.Tensor
            Scalar loss.
        """
        # Cosine similarity via L2-normalized dot products, temperature-scaled.
        normed = F.normalize(embeddings, p=2, dim=1)
        sim_matrix = (normed @ normed.T) / self.temperature

        # Same-label pair mask, excluding trivial self-pairs on the diagonal.
        label_eq = labels.unsqueeze(0) == labels.unsqueeze(1)  # (B, B)
        mask_self = ~torch.eye(labels.size(0), dtype=torch.bool, device=labels.device)
        positive_mask = label_eq & mask_self

        # Denominator: log-sum-exp over all non-self pairs. The epsilon keeps
        # the log finite if a row has no valid pair (batch of size 1).
        exp_sim = torch.exp(sim_matrix) * mask_self.float()
        log_sum_exp = torch.log(exp_sim.sum(dim=1) + 1e-8)

        # Numerator: mean similarity to positives. The clamp avoids 0/0 for
        # anchors with no same-class partner in the batch (their numerator
        # is then 0 and only the log-sum-exp term contributes).
        pos_sim = (sim_matrix * positive_mask.float()).sum(dim=1)
        n_pos = positive_mask.float().sum(dim=1).clamp(min=1)
        mean_pos_sim = pos_sim / n_pos

        return (-mean_pos_sim + log_sum_exp).mean()
77
+
78
+
79
class Classifier(nn.Module):
    """Two-layer fully connected head trained with CE plus contrastive loss.

    Parameters
    ----------
    d_in
        Input embedding dimensionality (should match encoder output).
    n_classes
        Number of output classes.
    hidden_dim
        Hidden layer size. Paper uses 128.
    beta
        Weight of the contrastive loss relative to cross-entropy.
    """

    def __init__(
        self,
        d_in: int,
        n_classes: int,
        hidden_dim: int = 128,
        beta: float = 0.5,
    ):
        super().__init__()
        self.fc1 = nn.Linear(d_in, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, n_classes)
        self.dropout = nn.Dropout(0.2)
        self.beta = beta
        self.contrastive_loss = ContrastiveLoss()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return class logits for a batch of embeddings."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(self.dropout(hidden))

    def loss(self, embeddings: torch.Tensor, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Return ``CE(logits, labels) + beta * contrastive(embeddings, labels)``.

        Parameters
        ----------
        embeddings
            Refined feature embeddings (drive the contrastive term).
        logits
            Output of :meth:`forward` (drives the cross-entropy term).
        labels
            Ground-truth integer labels.
        """
        ce_term = F.cross_entropy(logits, labels)
        contrastive_term = self.contrastive_loss(embeddings, labels)
        return ce_term + self.beta * contrastive_term
@@ -0,0 +1,93 @@
1
+ """Three-layer Graph Convolutional Network for structural feature extraction.
2
+
3
+ Implemented in pure PyTorch — no dependency on torch-geometric. The GCN
4
+ update rule at each layer is:
5
+
6
+ H^{l+1} = sigma( A_norm @ H^l @ W^l )
7
+
8
+ where A_norm is the symmetrically normalized adjacency with self-loops
9
+ (precomputed by ``graph.prepare_graph``), H^l is the node feature matrix
10
+ at layer l, W^l is a learnable weight matrix, and sigma is ReLU.
11
+
12
+ Three layers means the receptive field covers 3-hop neighborhoods — enough
13
+ to detect patterns like lateral movement (A -> B -> C) without the
14
+ oversmoothing that degrades embeddings at higher layer counts.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+
23
class GCNLayer(nn.Module):
    """One graph-convolution step: A_norm @ H @ W, then LayerNorm, ReLU, dropout."""

    def __init__(self, in_features: int, out_features: int, dropout: float = 0.3):
        super().__init__()
        # Learnable projection, Xavier-initialized.
        weight = torch.empty(in_features, out_features)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        self.norm = nn.LayerNorm(out_features)
        self.dropout = nn.Dropout(dropout)

    def forward(self, A_norm: torch.Tensor, H: torch.Tensor) -> torch.Tensor:
        """Aggregate neighbor features, then normalize, activate, and drop out."""
        aggregated = A_norm @ H @ self.weight
        return self.dropout(torch.relu(self.norm(aggregated)))
40
+
41
+
42
class GCN(nn.Module):
    """Stack of GCN layers producing node embeddings.

    Parameters
    ----------
    in_features
        Dimensionality of the input node features (e.g. 41 for NSL-KDD).
    hidden_dim
        Number of neurons per GCN layer. Paper uses 64.
    n_layers
        Number of GCN layers. Paper uses 3.
    dropout
        Dropout rate. Paper uses 0.3.
    """

    def __init__(
        self,
        in_features: int,
        hidden_dim: int = 64,
        n_layers: int = 3,
        dropout: float = 0.3,
    ):
        super().__init__()
        # First layer maps in_features -> hidden_dim; the rest stay square.
        dims = [in_features] + [hidden_dim] * n_layers
        self.layers = nn.ModuleList(
            GCNLayer(dims[i], dims[i + 1], dropout=dropout) for i in range(n_layers)
        )

    @property
    def out_dim(self) -> int:
        # Output width of the final layer's projection matrix.
        return self.layers[-1].weight.size(1)

    def forward(self, A_norm: torch.Tensor, X: torch.Tensor) -> torch.Tensor:
        """Extract node embeddings.

        Parameters
        ----------
        A_norm
            Normalized adjacency matrix ``(N, N)``.
        X
            Node feature matrix ``(N, d_in)``.

        Returns
        -------
        torch.Tensor
            Node embeddings ``(N, hidden_dim)``.
        """
        embeddings = X
        for layer in self.layers:
            embeddings = layer(A_norm, embeddings)
        return embeddings
@@ -0,0 +1,85 @@
1
+ """Graph construction from tabular network traffic data.
2
+
3
+ Converts flat flow-level feature matrices into graph representations where
4
+ nodes are network entities (IPs, ports, services) and edges are weighted by
5
+ communication metrics. The adjacency matrix is built from cosine similarity
6
+ between node feature vectors, which means structurally similar nodes are
7
+ connected more strongly than nodes that merely happen to communicate.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import torch
13
+ import numpy as np
14
+ from sklearn.metrics.pairwise import cosine_similarity
15
+
16
+
17
def build_adjacency(X: np.ndarray, threshold: float = 0.0) -> torch.Tensor:
    """Build a cosine-similarity adjacency matrix from a feature matrix.

    Each row of X is treated as a node. The adjacency weight between nodes
    i and j is the cosine similarity of their feature vectors, clipped to
    [0, 1]. A threshold can be applied to sparsify the graph.

    Parameters
    ----------
    X
        Feature matrix with shape ``(n_nodes, n_features)``.
    threshold
        Edges with similarity below this value are set to zero.

    Returns
    -------
    torch.Tensor
        Adjacency matrix of shape ``(n_nodes, n_nodes)``.
    """
    # Cosine similarity computed directly with numpy (the sklearn helper is
    # not needed here): normalize rows to unit length, then take pairwise
    # dot products.
    X = np.asarray(X, dtype=np.float64)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    # Zero-norm rows stay all-zero, matching sklearn's cosine_similarity
    # (its normalizer also substitutes 1 for zero norms).
    norms[norms == 0.0] = 1.0
    unit = X / norms
    sim = unit @ unit.T
    sim = np.clip(sim, 0.0, 1.0)
    if threshold > 0:
        sim[sim < threshold] = 0.0
    return torch.tensor(sim, dtype=torch.float32)
41
+
42
+
43
def add_self_loops(A: torch.Tensor) -> torch.Tensor:
    """Return the self-looped adjacency Ã = A + I (every node linked to itself)."""
    identity = torch.eye(A.size(0), dtype=A.dtype, device=A.device)
    return A + identity
46
+
47
+
48
def symmetric_norm(A: torch.Tensor) -> torch.Tensor:
    """Compute the symmetric degree normalization D^{-1/2} A D^{-1/2}.

    This prevents high-degree nodes from dominating neighbor aggregation
    in the GCN. Nodes with zero degree get a zero row/column.
    """
    deg = A.sum(dim=1)
    # Map zero-degree nodes to 0 instead of inf so isolated nodes simply
    # drop out of the aggregation.
    inv_sqrt = torch.zeros_like(deg)
    positive = deg > 0
    inv_sqrt[positive] = 1.0 / torch.sqrt(deg[positive])
    # Broadcasting scales rows and columns — equivalent to diag @ A @ diag.
    return inv_sqrt.unsqueeze(1) * A * inv_sqrt.unsqueeze(0)
61
+
62
+
63
def prepare_graph(X: np.ndarray, threshold: float = 0.0) -> tuple[torch.Tensor, torch.Tensor]:
    """Full graph preparation: adjacency + normalization.

    Returns the normalized adjacency matrix (ready for GCN forward pass)
    and the node feature tensor.

    Parameters
    ----------
    X
        Feature matrix ``(n_nodes, n_features)``, already scaled.

    Returns
    -------
    A_norm
        Normalized adjacency ``(n_nodes, n_nodes)`` with self-loops.
    X_tensor
        Node features as a float32 tensor ``(n_nodes, n_features)``.
    """
    adjacency = build_adjacency(X, threshold=threshold)
    normalized = symmetric_norm(add_self_loops(adjacency))
    features = torch.tensor(X, dtype=torch.float32)
    return normalized, features
@@ -0,0 +1,258 @@
1
+ """End-to-end GraphIDS pipeline: GCN → Transformer AE → Contrastive Classifier.
2
+
3
+ This module wires the three stages together into a single trainable pipeline.
4
+ Each stage can be trained independently (following the paper's training
5
+ protocol) or jointly fine-tuned.
6
+
7
+ Training protocol from the paper:
8
+ 1. Build the graph and train the GCN (Adam, lr=0.001, batch=128, 50 epochs)
9
+ 2. Feed GCN embeddings to the Transformer AE and train it
10
+ (AdamW, lr=0.0001, batch=64, 100 epochs)
11
+ 3. Feed refined embeddings to the classifier with contrastive loss
12
+ (RMSprop, lr=0.0005, batch=256, 50 epochs)
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn as nn
22
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
23
+ from sklearn.preprocessing import MinMaxScaler, LabelEncoder
24
+
25
+ from .graph import prepare_graph
26
+ from .gcn import GCN
27
+ from .transformer import TransformerAutoencoder
28
+ from .contrastive import Classifier
29
+
30
+
31
@dataclass
class EvalResult:
    """Metrics bundle produced by ``GraphIDS.evaluate``."""

    accuracy: float  # fraction of samples predicted correctly
    precision: float  # weighted-average precision
    recall: float  # weighted-average recall
    f1: float  # weighted-average F1 score
    predictions: np.ndarray  # predicted labels in the original label space
39
+
40
+
41
class GraphIDS(nn.Module):
    """Full intrusion detection pipeline.

    Parameters
    ----------
    n_features
        Number of input features per flow (e.g. 41 for NSL-KDD after encoding).
    n_classes
        Number of output classes (e.g. 5 for NSL-KDD multi-class).
    gcn_hidden
        GCN hidden layer size. Paper uses 64.
    gcn_layers
        Number of GCN layers. Paper uses 3.
    gcn_dropout
        GCN dropout rate. Paper uses 0.3.
    ae_heads
        Number of Transformer attention heads. Paper uses 4.
    ae_layers
        Number of Transformer encoder/decoder layers. Paper uses 2.
    ae_ff
        Transformer feed-forward dimensionality. Paper uses 128.
    ae_dropout
        Transformer dropout rate. Paper uses 0.2.
    ae_alpha
        KL regularization weight for the autoencoder. Paper uses 0.001.
    clf_hidden
        Classifier hidden layer size. Paper uses 128.
    beta
        Contrastive loss weight. Controls how strongly the contrastive
        term influences classification training.
    """

    def __init__(
        self,
        n_features: int,
        n_classes: int,
        gcn_hidden: int = 64,
        gcn_layers: int = 3,
        gcn_dropout: float = 0.3,
        ae_heads: int = 4,
        ae_layers: int = 2,
        ae_ff: int = 128,
        ae_dropout: float = 0.2,
        ae_alpha: float = 0.001,
        clf_hidden: int = 128,
        beta: float = 0.5,
    ):
        super().__init__()
        self.gcn = GCN(
            in_features=n_features,
            hidden_dim=gcn_hidden,
            n_layers=gcn_layers,
            dropout=gcn_dropout,
        )
        self.autoencoder = TransformerAutoencoder(
            d_model=gcn_hidden,
            n_heads=ae_heads,
            n_layers=ae_layers,
            d_ff=ae_ff,
            dropout=ae_dropout,
            alpha=ae_alpha,
        )
        self.classifier = Classifier(
            d_in=gcn_hidden,
            n_classes=n_classes,
            hidden_dim=clf_hidden,
            beta=beta,
        )
        # Fitted during train_pipeline(fit=True); reused by predict()/evaluate().
        self.scaler = MinMaxScaler()
        self.label_encoder = LabelEncoder()

    def preprocess(self, X: np.ndarray, y: np.ndarray | None = None, fit: bool = False):
        """Scale features to [0, 1] and encode labels.

        Parameters
        ----------
        X : array (n_samples, n_features)
        y : array (n_samples,), optional
        fit : bool
            If True, fit the scaler and label encoder.

        Returns
        -------
        X_scaled, y_encoded (or None)
        """
        if fit:
            X_scaled = self.scaler.fit_transform(X)
        else:
            X_scaled = self.scaler.transform(X)

        y_enc = None
        if y is not None:
            if fit:
                y_enc = self.label_encoder.fit_transform(y)
            else:
                y_enc = self.label_encoder.transform(y)

        return X_scaled, y_enc

    def forward(self, A_norm: torch.Tensor, X: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Full forward pass through all three stages.

        Returns
        -------
        gcn_out
            GCN node embeddings.
        refined
            Transformer-refined embeddings.
        logits
            Classification logits.
        """
        gcn_out = self.gcn(A_norm, X)
        refined, reconstructed = self.autoencoder(gcn_out)
        logits = self.classifier(refined)
        return gcn_out, refined, logits

    def train_pipeline(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        gcn_epochs: int = 50,
        ae_epochs: int = 100,
        clf_epochs: int = 50,
        gcn_lr: float = 0.001,
        ae_lr: float = 0.0001,
        clf_lr: float = 0.0005,
        verbose: bool = True,
    ) -> None:
        """Train all three stages sequentially.

        This follows the paper's training protocol: GCN first, then
        autoencoder on frozen GCN embeddings, then classifier on frozen
        refined embeddings.
        """
        X_scaled, y_enc = self.preprocess(X_train, y_train, fit=True)
        A_norm, X_tensor = prepare_graph(X_scaled)
        y_tensor = torch.tensor(y_enc, dtype=torch.long)

        # Stage 1: GCN
        if verbose:
            print("Stage 1: Training GCN...")
        # Train the GCN with a throwaway proxy classification head; proxy_opt
        # covers both the GCN and head parameters. (An earlier version also
        # allocated a GCN-only optimizer that was never stepped — removed.)
        proxy_clf = nn.Linear(self.gcn.out_dim, len(self.label_encoder.classes_))
        proxy_opt = torch.optim.Adam(
            list(self.gcn.parameters()) + list(proxy_clf.parameters()), lr=gcn_lr
        )
        self.gcn.train()
        for epoch in range(gcn_epochs):
            proxy_opt.zero_grad()
            embeddings = self.gcn(A_norm, X_tensor)
            logits = proxy_clf(embeddings)
            loss = nn.functional.cross_entropy(logits, y_tensor)
            loss.backward()
            proxy_opt.step()
            if verbose and (epoch + 1) % 10 == 0:
                print(f"  epoch {epoch+1}/{gcn_epochs} loss={loss.item():.4f}")
        del proxy_clf, proxy_opt

        # Stage 2: Transformer autoencoder (GCN frozen via eval + no_grad)
        if verbose:
            print("Stage 2: Training Transformer autoencoder...")
        ae_opt = torch.optim.AdamW(self.autoencoder.parameters(), lr=ae_lr)
        self.gcn.eval()
        self.autoencoder.train()
        with torch.no_grad():
            gcn_embeddings = self.gcn(A_norm, X_tensor)
        for epoch in range(ae_epochs):
            ae_opt.zero_grad()
            encoded, reconstructed = self.autoencoder(gcn_embeddings)
            loss = self.autoencoder.loss(gcn_embeddings, encoded, reconstructed)
            loss.backward()
            ae_opt.step()
            if verbose and (epoch + 1) % 20 == 0:
                print(f"  epoch {epoch+1}/{ae_epochs} loss={loss.item():.4f}")

        # Stage 3: Contrastive classifier on frozen refined embeddings
        if verbose:
            print("Stage 3: Training contrastive classifier...")
        clf_opt = torch.optim.RMSprop(self.classifier.parameters(), lr=clf_lr)
        self.autoencoder.eval()
        self.classifier.train()
        with torch.no_grad():
            encoded, _ = self.autoencoder(gcn_embeddings)
        for epoch in range(clf_epochs):
            clf_opt.zero_grad()
            logits = self.classifier(encoded)
            loss = self.classifier.loss(encoded, logits, y_tensor)
            loss.backward()
            clf_opt.step()
            if verbose and (epoch + 1) % 10 == 0:
                acc = (logits.argmax(dim=1) == y_tensor).float().mean().item()
                print(f"  epoch {epoch+1}/{clf_epochs} loss={loss.item():.4f} acc={acc:.4f}")

        if verbose:
            print("Training complete.")

    @torch.no_grad()
    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict class labels for new samples (in the original label space)."""
        self.eval()
        X_scaled, _ = self.preprocess(X)
        A_norm, X_tensor = prepare_graph(X_scaled)
        _, refined, logits = self.forward(A_norm, X_tensor)
        preds = logits.argmax(dim=1).cpu().numpy()
        return self.label_encoder.inverse_transform(preds)

    @torch.no_grad()
    def evaluate(self, X: np.ndarray, y: np.ndarray) -> EvalResult:
        """Predict and compute weighted accuracy/precision/recall/F1 metrics."""
        y_pred = self.predict(X)
        return EvalResult(
            accuracy=accuracy_score(y, y_pred),
            precision=precision_score(y, y_pred, average="weighted", zero_division=0),
            recall=recall_score(y, y_pred, average="weighted", zero_division=0),
            f1=f1_score(y, y_pred, average="weighted", zero_division=0),
            predictions=y_pred,
        )
@@ -0,0 +1,126 @@
1
+ """Transformer-based autoencoder for embedding refinement.
2
+
3
+ Takes the GCN node embeddings and refines them through a self-attention
4
+ encoder-decoder. The encoder identifies which dimensions of the embedding
5
+ are informative for distinguishing attack types; the decoder ensures the
6
+ representation retains enough information to reconstruct the original
7
+ embeddings (preventing information collapse).
8
+
9
+ The training loss combines reconstruction error (MSE) and a KL divergence
10
+ regularization term that keeps the latent distribution smooth. This pushes
11
+ the autoencoder toward a compact representation where similar traffic types
12
+ cluster together in the latent space.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import torch
18
+ import torch.nn as nn
19
+ import torch.nn.functional as F
20
+
21
+
22
class TransformerAutoencoder(nn.Module):
    """Encoder-decoder Transformer autoencoder.

    Parameters
    ----------
    d_model
        Dimensionality of the input embeddings (should match GCN output dim).
    n_heads
        Number of attention heads. Paper uses 4.
    n_layers
        Number of self-attention layers in both encoder and decoder.
        Paper uses 2.
    d_ff
        Feed-forward layer dimensionality. Paper uses 128.
    dropout
        Dropout rate. Paper uses 0.2.
    alpha
        Weight for the KL regularization term. Controls the tradeoff between
        faithful reconstruction and smooth latent space.
    """

    def __init__(
        self,
        d_model: int = 64,
        n_heads: int = 4,
        n_layers: int = 2,
        d_ff: int = 128,
        dropout: float = 0.2,
        alpha: float = 0.001,
    ):
        super().__init__()
        self.alpha = alpha
        self.d_model = d_model

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=n_layers)

    def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        """Encode and reconstruct.

        Parameters
        ----------
        x
            Input embeddings ``(batch, d_model)`` or ``(batch, seq, d_model)``.

        Returns
        -------
        encoded
            Refined embeddings from the encoder.
        reconstructed
            Decoder's reconstruction of the input.
        """
        # Add a sequence dimension if input is 2D (treating each sample as
        # a length-1 sequence). This is the common case when the GCN
        # produces one embedding per node.
        squeeze = False
        if x.dim() == 2:
            x = x.unsqueeze(1)  # (batch, 1, d_model)
            squeeze = True

        encoded = self.encoder(x)
        reconstructed = self.decoder(encoded, encoded)

        if squeeze:
            encoded = encoded.squeeze(1)
            reconstructed = reconstructed.squeeze(1)

        return encoded, reconstructed

    def loss(self, x: torch.Tensor, encoded: torch.Tensor, reconstructed: torch.Tensor) -> torch.Tensor:
        """Combined reconstruction + KL regularization loss.

        L = L_recon + alpha * L_reg
        L_recon = (1/n) * sum ||x - x_hat||^2
        L_reg = (1/n) * sum KL(softmax(x) || softmax(x_hat))
        """
        # Reconstruction loss (MSE)
        recon_loss = F.mse_loss(reconstructed, x)

        # KL(p || q) with p from the input and q from the reconstruction.
        # torch's kl_div(input, target) computes KL(target || input) with
        # `input` given in log-space, so the reconstruction enters as
        # log-probabilities. (A previous version detached the reconstruction
        # and swapped the arguments, which made this term a constant with no
        # gradient into the autoencoder and computed the reversed KL.)
        log_q = F.log_softmax(reconstructed, dim=-1)
        p = F.softmax(x, dim=-1)
        kl_loss = F.kl_div(log_q, p, reduction="batchmean")

        return recon_loss + self.alpha * kl_loss
@@ -0,0 +1,50 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "graphids"
7
+ version = "0.1.0"
8
+ description = "Graph-based intrusion detection using GCN, Transformer autoencoder, and contrastive learning"
9
+ readme = "README.md"
10
+ license = { file = "LICENSE" }
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "Vijay Govindarajan", email = "vijay.govindarajan91@gmail.com" },
14
+ ]
15
+ keywords = [
16
+ "intrusion-detection",
17
+ "graph-neural-network",
18
+ "transformer",
19
+ "contrastive-learning",
20
+ "cloud-security",
21
+ "network-security",
22
+ ]
23
+ classifiers = [
24
+ "Development Status :: 3 - Alpha",
25
+ "Intended Audience :: Science/Research",
26
+ "License :: OSI Approved :: MIT License",
27
+ "Programming Language :: Python :: 3",
28
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
29
+ "Topic :: Security",
30
+ ]
31
+ dependencies = [
32
+ "torch>=2.0",
33
+ "numpy>=1.24",
34
+ "pandas>=2.0",
35
+ "scikit-learn>=1.3",
36
+ ]
37
+
38
+ [project.optional-dependencies]
39
+ dev = ["pytest>=7.0"]
40
+ shap = ["shap>=0.43"]
41
+
42
+ [project.urls]
43
+ Homepage = "https://github.com/vijaygovindaraja/graphids"
44
+ Paper = "https://doi.org/10.1038/s41598-025-07956-w"
45
+
46
+ [tool.hatch.build.targets.wheel]
47
+ packages = ["graphids"]
48
+
49
+ [tool.pytest.ini_options]
50
+ testpaths = ["tests"]
File without changes
@@ -0,0 +1,69 @@
1
+ """Tests for graph construction."""
2
+
3
+ import numpy as np
4
+ import torch
5
+ import pytest
6
+
7
+ from graphids.graph import build_adjacency, add_self_loops, symmetric_norm, prepare_graph
8
+
9
+
10
def test_adjacency_shape():
    """The adjacency matrix is square with one row/column per sample."""
    features = np.random.randn(10, 5)
    adj = build_adjacency(features)
    assert adj.shape == (10, 10)
14
+
15
+
16
def test_adjacency_symmetric():
    """A[i, j] must equal A[j, i] for an undirected similarity graph."""
    adj = build_adjacency(np.random.randn(8, 4))
    torch.testing.assert_close(adj, adj.T)
20
+
21
+
22
def test_adjacency_non_negative():
    """Edge weights never go negative."""
    adj = build_adjacency(np.random.randn(10, 5))
    assert bool((adj >= 0).all())
26
+
27
+
28
def test_adjacency_threshold():
    """Every surviving (non-zero) edge weight respects the threshold."""
    adj = build_adjacency(np.random.randn(20, 5), threshold=0.5)
    surviving = adj[adj > 0]
    assert bool((surviving >= 0.5).all())
32
+
33
+
34
def test_self_loops_adds_identity():
    """Adding self-loops to an edgeless graph yields the identity matrix."""
    empty_graph = torch.zeros(3, 3)
    torch.testing.assert_close(add_self_loops(empty_graph), torch.eye(3))
38
+
39
+
40
def test_symmetric_norm_preserves_shape():
    """Normalization rescales entries in place; the shape is unchanged."""
    normalized = symmetric_norm(torch.ones(5, 5))
    assert normalized.shape == (5, 5)
44
+
45
+
46
def test_symmetric_norm_row_sums():
    """For a uniform adjacency (all ones), the normalized version should
    have rows summing to 1."""
    n = 5
    normalized = symmetric_norm(torch.ones(n, n))
    torch.testing.assert_close(
        normalized.sum(dim=1), torch.ones(n), atol=1e-6, rtol=0
    )
54
+
55
+
56
def test_prepare_graph_returns_correct_types():
    """prepare_graph returns a (normalized adjacency, feature tensor) pair."""
    features = np.random.randn(10, 5).astype(np.float32)
    adj, feats = prepare_graph(features)
    assert isinstance(adj, torch.Tensor)
    assert isinstance(feats, torch.Tensor)
    assert adj.shape == (10, 10)
    assert feats.shape == (10, 5)
63
+
64
+
65
def test_identical_rows_have_max_similarity():
    """Cosine similarity of identical vectors is 1.0, so every entry of the
    adjacency should be (numerically) maximal."""
    adj = build_adjacency(np.ones((3, 4)))
    assert adj.min().item() >= 0.99
@@ -0,0 +1,133 @@
1
+ """Tests for the GCN, Transformer autoencoder, and classifier modules."""
2
+
3
+ import torch
4
+ import pytest
5
+
6
+ from graphids.gcn import GCN, GCNLayer
7
+ from graphids.transformer import TransformerAutoencoder
8
+ from graphids.contrastive import Classifier, ContrastiveLoss
9
+
10
+
11
+ # --------------------------------------------------------------------------- #
12
+ # GCN
13
+ # --------------------------------------------------------------------------- #
14
def test_gcn_output_shape():
    """The GCN emits one hidden_dim-sized embedding per node."""
    model = GCN(in_features=41, hidden_dim=64, n_layers=3)
    embeddings = model(torch.eye(10), torch.randn(10, 41))
    assert embeddings.shape == (10, 64)
20
+
21
+
22
def test_gcn_single_layer():
    """A one-layer GCN still projects into the hidden dimension."""
    model = GCN(in_features=5, hidden_dim=8, n_layers=1)
    embeddings = model(torch.eye(4), torch.randn(4, 5))
    assert embeddings.shape == (4, 8)
28
+
29
+
30
def test_gcn_out_dim_property():
    """out_dim reports the configured hidden dimension."""
    assert GCN(in_features=10, hidden_dim=32).out_dim == 32
33
+
34
+
35
def test_gcn_gradient_flows():
    """Backprop through the GCN populates a gradient for every parameter."""
    model = GCN(in_features=5, hidden_dim=8)
    embeddings = model(torch.eye(3), torch.randn(3, 5))
    embeddings.sum().backward()
    assert all(param.grad is not None for param in model.parameters())
44
+
45
+
46
+ # --------------------------------------------------------------------------- #
47
+ # Transformer autoencoder
48
+ # --------------------------------------------------------------------------- #
49
def test_ae_output_shape():
    """Encoder and decoder both preserve the (batch, d_model) shape."""
    autoencoder = TransformerAutoencoder(d_model=64, n_heads=4, n_layers=2, d_ff=128)
    enc, rec = autoencoder(torch.randn(8, 64))
    assert enc.shape == (8, 64)
    assert rec.shape == (8, 64)
55
+
56
+
57
def test_ae_loss_finite():
    """The combined autoencoder loss is a finite, strictly positive scalar."""
    autoencoder = TransformerAutoencoder(d_model=16, n_heads=2, n_layers=1, d_ff=32)
    inputs = torch.randn(5, 16)
    enc, rec = autoencoder(inputs)
    total = autoencoder.loss(inputs, enc, rec)
    assert torch.isfinite(total)
    assert total.item() > 0
64
+
65
+
66
def test_ae_3d_input():
    """Sequence inputs (batch, seq, d_model) keep their shape end to end."""
    autoencoder = TransformerAutoencoder(d_model=16, n_heads=2, n_layers=1, d_ff=32)
    enc, _ = autoencoder(torch.randn(4, 3, 16))  # batch=4, seq=3, d=16
    assert enc.shape == (4, 3, 16)
71
+
72
+
73
+ # --------------------------------------------------------------------------- #
74
+ # Contrastive loss
75
+ # --------------------------------------------------------------------------- #
76
def test_contrastive_loss_finite():
    """The contrastive loss stays finite on random embeddings."""
    criterion = ContrastiveLoss()
    targets = torch.tensor([0, 0, 1, 1, 2, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0])
    assert torch.isfinite(criterion(torch.randn(16, 32), targets))
82
+
83
+
84
def test_contrastive_loss_lower_for_good_embeddings():
    """Embeddings that already cluster by class should produce lower loss
    than random embeddings."""
    # Seed the RNG: the "bad" embeddings (and the jitter on the "good"
    # ones) are random, so without a seed an unlucky draw could make the
    # strict `<` comparison below flaky across runs.
    torch.manual_seed(0)
    cl = ContrastiveLoss()
    labels = torch.tensor([0, 0, 0, 1, 1, 1, 2, 2, 2])

    # Good: each class is a tight cluster far from others
    good = torch.zeros(9, 8)
    good[:3, :3] = 5.0
    good[3:6, 3:6] = 5.0
    good[6:, 6:] = 5.0
    good += torch.randn_like(good) * 0.1

    # Bad: random
    bad = torch.randn(9, 8)

    loss_good = cl(good, labels)
    loss_bad = cl(bad, labels)
    assert loss_good < loss_bad
103
+
104
+
105
+ # --------------------------------------------------------------------------- #
106
+ # Classifier
107
+ # --------------------------------------------------------------------------- #
108
def test_classifier_output_shape():
    """Logits come out as (batch, n_classes)."""
    head = Classifier(d_in=64, n_classes=5, hidden_dim=128)
    logits = head(torch.randn(10, 64))
    assert logits.shape == (10, 5)
113
+
114
+
115
def test_classifier_loss_finite():
    """Classifier loss is a finite, positive scalar on random data."""
    head = Classifier(d_in=32, n_classes=3, hidden_dim=64)
    feats = torch.randn(8, 32)
    targets = torch.tensor([0, 1, 2, 0, 1, 2, 0, 1])
    value = head.loss(feats, head(feats), targets)
    assert torch.isfinite(value)
    assert value.item() > 0
123
+
124
+
125
def test_classifier_gradient_flows():
    """Backprop through the classifier loss reaches every parameter."""
    head = Classifier(d_in=16, n_classes=3)
    feats = torch.randn(6, 16)
    targets = torch.tensor([0, 1, 2, 0, 1, 2])
    head.loss(feats, head(feats), targets).backward()
    assert all(param.grad is not None for param in head.parameters())
@@ -0,0 +1,94 @@
1
+ """Integration tests for the full GraphIDS pipeline."""
2
+
3
+ import numpy as np
4
+ import pytest
5
+ from sklearn.datasets import make_classification
6
+
7
+ from graphids.pipeline import GraphIDS, EvalResult
8
+
9
+
10
+ def _make_ids_data(n_samples=200, n_features=20, n_classes=3, random_state=42):
11
+ X, y = make_classification(
12
+ n_samples=n_samples,
13
+ n_features=n_features,
14
+ n_informative=12,
15
+ n_redundant=4,
16
+ n_classes=n_classes,
17
+ random_state=random_state,
18
+ flip_y=0.03,
19
+ )
20
+ return X, y
21
+
22
+
23
+ def test_train_and_predict_shapes():
24
+ X, y = _make_ids_data(n_samples=100, n_classes=3)
25
+ model = GraphIDS(n_features=20, n_classes=3)
26
+ model.train_pipeline(
27
+ X[:70], y[:70],
28
+ gcn_epochs=5, ae_epochs=5, clf_epochs=5,
29
+ verbose=False,
30
+ )
31
+ preds = model.predict(X[70:])
32
+ assert preds.shape == (30,)
33
+
34
+
35
+ def test_evaluate_returns_result():
36
+ X, y = _make_ids_data(n_samples=100, n_classes=2)
37
+ model = GraphIDS(n_features=20, n_classes=2)
38
+ model.train_pipeline(
39
+ X[:70], y[:70],
40
+ gcn_epochs=5, ae_epochs=5, clf_epochs=5,
41
+ verbose=False,
42
+ )
43
+ result = model.evaluate(X[70:], y[70:])
44
+ assert isinstance(result, EvalResult)
45
+ assert 0 <= result.accuracy <= 1
46
+ assert 0 <= result.f1 <= 1
47
+ assert result.predictions.shape == (30,)
48
+
49
+
50
+ def test_accuracy_above_chance():
51
+ # Binary classification with a well-separated dataset gives the
52
+ # 3-stage pipeline enough signal to learn in a short training run.
53
+ X, y = _make_ids_data(n_samples=400, n_features=20, n_classes=2, random_state=0)
54
+ model = GraphIDS(n_features=20, n_classes=2)
55
+ model.train_pipeline(
56
+ X[:300], y[:300],
57
+ gcn_epochs=40, ae_epochs=40, clf_epochs=40,
58
+ verbose=False,
59
+ )
60
+ result = model.evaluate(X[300:], y[300:])
61
+ # Binary chance = 0.50. On synthetic data the graph is nearly fully
62
+ # connected (uniform cosine similarity), limiting GCN's ability to
63
+ # extract structural features. Real IDS data has natural clusters that
64
+ # produce a much sparser and more informative graph. The real validation
65
+ # is on NSL-KDD, not synthetic data — this test just confirms the
66
+ # pipeline doesn't crash and learns *something*.
67
+ assert result.accuracy > 0.50, f"accuracy {result.accuracy:.2f} not above chance"
68
+
69
+
70
+ def test_predict_returns_known_labels():
71
+ X, y = _make_ids_data(n_samples=100, n_classes=2)
72
+ model = GraphIDS(n_features=20, n_classes=2)
73
+ model.train_pipeline(
74
+ X[:70], y[:70],
75
+ gcn_epochs=5, ae_epochs=5, clf_epochs=5,
76
+ verbose=False,
77
+ )
78
+ preds = model.predict(X[70:])
79
+ assert set(preds).issubset(set(y))
80
+
81
+
82
+ def test_pipeline_with_string_labels():
83
+ X, y = _make_ids_data(n_samples=100, n_classes=3)
84
+ label_map = {0: "Normal", 1: "DoS", 2: "Probe"}
85
+ y_str = np.array([label_map[yi] for yi in y])
86
+
87
+ model = GraphIDS(n_features=20, n_classes=3)
88
+ model.train_pipeline(
89
+ X[:70], y_str[:70],
90
+ gcn_epochs=5, ae_epochs=5, clf_epochs=5,
91
+ verbose=False,
92
+ )
93
+ preds = model.predict(X[70:])
94
+ assert all(p in ["Normal", "DoS", "Probe"] for p in preds)