asmdetect 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- asmdetect-1.0.0/PKG-INFO +307 -0
- asmdetect-1.0.0/README.md +261 -0
- asmdetect-1.0.0/asmdetect/__init__.py +6 -0
- asmdetect-1.0.0/asmdetect/cli.py +43 -0
- asmdetect-1.0.0/asmdetect/detector.py +395 -0
- asmdetect-1.0.0/asmdetect/preprocessing.py +42 -0
- asmdetect-1.0.0/asmdetect/result.py +42 -0
- asmdetect-1.0.0/asmdetect/version.py +1 -0
- asmdetect-1.0.0/asmdetect.egg-info/PKG-INFO +307 -0
- asmdetect-1.0.0/asmdetect.egg-info/SOURCES.txt +22 -0
- asmdetect-1.0.0/asmdetect.egg-info/dependency_links.txt +1 -0
- asmdetect-1.0.0/asmdetect.egg-info/entry_points.txt +2 -0
- asmdetect-1.0.0/asmdetect.egg-info/requires.txt +36 -0
- asmdetect-1.0.0/asmdetect.egg-info/top_level.txt +7 -0
- asmdetect-1.0.0/examples/basic_usage.py +76 -0
- asmdetect-1.0.0/pyproject.toml +41 -0
- asmdetect-1.0.0/scripts/batch_triage.py +92 -0
- asmdetect-1.0.0/scripts/optimize.py +247 -0
- asmdetect-1.0.0/scripts/predict_from_hf.py +115 -0
- asmdetect-1.0.0/scripts/verify_security.py +330 -0
- asmdetect-1.0.0/setup.cfg +26 -0
- asmdetect-1.0.0/tests/__init__.py +0 -0
- asmdetect-1.0.0/tests/test_asmdetect.py +139 -0
asmdetect-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: asmdetect
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Assembly-level malware detection using fine-tuned CodeBERT
|
|
5
|
+
Author: Swarnadharshini S
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://huggingface.co/Swarnadharshini/codebert-malware-detector
|
|
8
|
+
Project-URL: Source, https://github.com/SwarnaDharshiniS/asmdetect
|
|
9
|
+
Keywords: malware-detection,assembly,opcodes,codebert,cybersecurity
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Topic :: Security
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
Requires-Dist: torch>=2.0.0
|
|
17
|
+
Requires-Dist: transformers>=4.35.0
|
|
18
|
+
Requires-Dist: numpy>=1.24.0
|
|
19
|
+
Requires-Dist: pandas>=2.0.0
|
|
20
|
+
Provides-Extra: ui
|
|
21
|
+
Requires-Dist: streamlit>=1.28.0; extra == "ui"
|
|
22
|
+
Provides-Extra: watcher
|
|
23
|
+
Requires-Dist: watchdog>=3.0.0; extra == "watcher"
|
|
24
|
+
Requires-Dist: plyer>=2.1.0; extra == "watcher"
|
|
25
|
+
Provides-Extra: onnx
|
|
26
|
+
Requires-Dist: optimum[onnxruntime]>=1.13.0; extra == "onnx"
|
|
27
|
+
Requires-Dist: onnxruntime>=1.16.0; extra == "onnx"
|
|
28
|
+
Provides-Extra: train
|
|
29
|
+
Requires-Dist: datasets>=2.14.0; extra == "train"
|
|
30
|
+
Requires-Dist: accelerate>=0.24.0; extra == "train"
|
|
31
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == "train"
|
|
32
|
+
Requires-Dist: matplotlib>=3.7.0; extra == "train"
|
|
33
|
+
Requires-Dist: tqdm>=4.65.0; extra == "train"
|
|
34
|
+
Provides-Extra: all
|
|
35
|
+
Requires-Dist: streamlit>=1.28.0; extra == "all"
|
|
36
|
+
Requires-Dist: watchdog>=3.0.0; extra == "all"
|
|
37
|
+
Requires-Dist: plyer>=2.1.0; extra == "all"
|
|
38
|
+
Requires-Dist: datasets>=2.14.0; extra == "all"
|
|
39
|
+
Requires-Dist: accelerate>=0.24.0; extra == "all"
|
|
40
|
+
Requires-Dist: scikit-learn>=1.3.0; extra == "all"
|
|
41
|
+
Requires-Dist: matplotlib>=3.7.0; extra == "all"
|
|
42
|
+
Requires-Dist: tqdm>=4.65.0; extra == "all"
|
|
43
|
+
Provides-Extra: dev
|
|
44
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
45
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
46
|
+
|
|
47
|
+
# asmdetect
|
|
48
|
+
|
|
49
|
+
**Compiler-Aware Assembly-Level Malware Detection using Fine-Tuned Language Models**
|
|
50
|
+
|
|
51
|
+
[Model on HuggingFace](https://huggingface.co/Swarnadharshini/codebert-malware-detector)
|
|
52
|
+
[]()
|
|
53
|
+
|
|
54
|
+
A Python library for **static malware detection** using x86 assembly opcode sequences.
|
|
55
|
+
Fine-tunes `microsoft/codebert-base` on disassembled binaries to classify files as
|
|
56
|
+
malware or benign — **no execution required**.
|
|
57
|
+
|
|
58
|
+
Built for SOC teams who need fast, explainable triage of suspicious binaries before deeper sandbox analysis.
|
|
59
|
+
|
|
60
|
+
---
|
|
61
|
+
|
|
62
|
+
## Model
|
|
63
|
+
|
|
64
|
+
**HuggingFace:** [Swarnadharshini/codebert-malware-detector](https://huggingface.co/Swarnadharshini/codebert-malware-detector)
|
|
65
|
+
|
|
66
|
+
| Property | Value |
|
|
67
|
+
|---|---|
|
|
68
|
+
| Base model | `microsoft/codebert-base` |
|
|
69
|
+
| Task | Binary sequence classification |
|
|
70
|
+
| Input | x86 opcode sequences (max 512 tokens) |
|
|
71
|
+
| Output | `malware` / `benign` + confidence score |
|
|
72
|
+
| Parameters | ~125M (8 frozen layers + 4 fine-tuned + classifier) |
|
|
73
|
+
| Training data | 1,458 samples (with augmentation) |
|
|
74
|
+
| Dataset | Arun152k — objdump-disassembled PE binaries |
|
|
75
|
+
| Decision threshold | 0.62 (calibrated on test set) |
|
|
76
|
+
|
|
77
|
+
## Dataset
|
|
78
|
+
Processed opcode sequences available on Kaggle:
|
|
79
|
+
https://www.kaggle.com/datasets/swarnadharshini/malware-opcodes
|
|
80
|
+
|
|
81
|
+
Original source: [Arun152k/Malware-Detection-using-N-Gram-Frequency](https://github.com/Arun152k/Malware-Detection-using-N-Gram-Frequency)
|
|
82
|
+
|
|
83
|
+
### Test set results
|
|
84
|
+
|
|
85
|
+
| Metric | Value |
|
|
86
|
+
|---|---|
|
|
87
|
+
| Accuracy | **86.0%** |
|
|
88
|
+
| F1 Score | **0.857** |
|
|
89
|
+
| Precision | **86.8%** |
|
|
90
|
+
| Recall | **84.6%** |
|
|
91
|
+
| AUC-ROC | **0.910** |
|
|
92
|
+
| False Negatives | 12 |
|
|
93
|
+
| False Positives | 10 |
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Installation
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
pip install -r requirements.txt
|
|
101
|
+
pip install -e .
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Quick start
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from asmdetect import MalwareDetector
|
|
110
|
+
|
|
111
|
+
# Load model from HuggingFace Hub (downloads once, cached locally)
|
|
112
|
+
detector = MalwareDetector.from_pretrained()
|
|
113
|
+
|
|
114
|
+
# Predict from a .csv assembly file (IDA Pro / objdump format)
|
|
115
|
+
result = detector.predict_file("suspicious_binary.csv")
|
|
116
|
+
print(result)
|
|
117
|
+
# DetectionResult(
|
|
118
|
+
# source = suspicious_binary.csv
|
|
119
|
+
# prediction = MALWARE [HIGH] HIGH
|
|
120
|
+
# confidence = 99.60%
|
|
121
|
+
# malware_p = 99.60%
|
|
122
|
+
# benign_p = 0.40%
|
|
123
|
+
# tokens = 312 (truncated=False)
|
|
124
|
+
# threshold = 0.62
|
|
125
|
+
# )
|
|
126
|
+
|
|
127
|
+
# Predict from a raw opcode string
|
|
128
|
+
result = detector.predict_text("push mov xor xor call ret add nop")
|
|
129
|
+
|
|
130
|
+
# Batch triage a folder — sorted by malware probability (highest first)
|
|
131
|
+
results = detector.predict_batch("/soc/incoming/suspicious/")
|
|
132
|
+
for r in results[:5]:
|
|
133
|
+
print(f"{r.malware_probability*100:.1f}% {r.risk_level} {r.source}")
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
## CLI
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
# Single file
|
|
142
|
+
asmdetect --file suspicious.csv
|
|
143
|
+
|
|
144
|
+
# Raw opcode string
|
|
145
|
+
asmdetect --text "push mov xor call ret"
|
|
146
|
+
|
|
147
|
+
# Batch triage a folder
|
|
148
|
+
asmdetect --batch /soc/incoming/
|
|
149
|
+
|
|
150
|
+
# Custom threshold
|
|
151
|
+
asmdetect --file suspicious.csv --threshold 0.70
|
|
152
|
+
|
|
153
|
+
# JSON output (for SIEM/SOAR integration)
|
|
154
|
+
asmdetect --file suspicious.csv --json
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
---
|
|
158
|
+
|
|
159
|
+
## Scripts
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
# Run demo predictions from HuggingFace
|
|
163
|
+
python scripts/predict_from_hf.py
|
|
164
|
+
|
|
165
|
+
# Predict a specific file
|
|
166
|
+
python scripts/predict_from_hf.py --file path/to/sample.csv
|
|
167
|
+
|
|
168
|
+
# Predict raw opcodes
|
|
169
|
+
python scripts/predict_from_hf.py --text "push mov xor call ret"
|
|
170
|
+
|
|
171
|
+
# SOC batch triage with JSON report
|
|
172
|
+
python scripts/batch_triage.py --folder /soc/incoming/ --report report.json
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## API reference
|
|
178
|
+
|
|
179
|
+
### `MalwareDetector`
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
MalwareDetector.from_pretrained(
|
|
183
|
+
model_id = "Swarnadharshini/codebert-malware-detector", # or local path
|
|
184
|
+
threshold = 0.62, # decision threshold — lower = more sensitive
|
|
185
|
+
device = "auto", # 'cuda', 'cpu', or 'auto'
|
|
186
|
+
)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
| Method | Description |
|
|
190
|
+
|---|---|
|
|
191
|
+
| `predict_file(filepath)` | Classify a `.csv` assembly file |
|
|
192
|
+
| `predict_text(text)` | Classify a raw opcode string |
|
|
193
|
+
| `predict_batch(folder)` | Classify all CSVs in a folder, sorted by risk |
|
|
194
|
+
| `set_threshold(value)` | Update threshold at runtime |
|
|
195
|
+
| `benchmark(text, n=10)` | Measure inference latency |
|
|
196
|
+
|
|
197
|
+
### `DetectionResult`
|
|
198
|
+
|
|
199
|
+
| Field | Type | Description |
|
|
200
|
+
|---|---|---|
|
|
201
|
+
| `prediction` | `str` | `'malware'` or `'benign'` |
|
|
202
|
+
| `label` | `int` | `1` = malware, `0` = benign |
|
|
203
|
+
| `confidence` | `float` | Probability of predicted class |
|
|
204
|
+
| `malware_probability` | `float` | Raw P(malware) |
|
|
205
|
+
| `benign_probability` | `float` | Raw P(benign) |
|
|
206
|
+
| `risk_level` | `str` | `'HIGH'` ≥80% \| `'MEDIUM'` ≥55% \| `'LOW'` |
|
|
207
|
+
| `input_tokens` | `int` | Token count before truncation |
|
|
208
|
+
| `truncated` | `bool` | True if input exceeded 512 tokens |
|
|
209
|
+
| `is_malware` | `bool` | Convenience property |
|
|
210
|
+
| `is_benign` | `bool` | Convenience property |
|
|
211
|
+
| `to_dict()` | `dict` | Serialise to dictionary |
|
|
212
|
+
| `to_json()` | `str` | Serialise to JSON string |
|
|
213
|
+
|
|
214
|
+
### Threshold tuning
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
# High-security: catch more malware (more false alarms)
|
|
218
|
+
detector.set_threshold(0.45)
|
|
219
|
+
|
|
220
|
+
# Low-noise: fewer false alarms (may miss borderline cases)
|
|
221
|
+
detector.set_threshold(0.75)
|
|
222
|
+
|
|
223
|
+
# Reset to calibrated default
|
|
224
|
+
detector.set_threshold(0.62)
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
## Input format
|
|
230
|
+
|
|
231
|
+
**CSV files** — from IDA Pro or `objdump -d`:
|
|
232
|
+
```csv
|
|
233
|
+
Address,Hex_Opcode,Opcode,Operand 1,Operand 2
|
|
234
|
+
402000:,30 7d 07,xor,%bh,
|
|
235
|
+
402003:,00 00,add,%al,
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
**Raw opcode strings:**
|
|
239
|
+
```python
|
|
240
|
+
detector.predict_text("push mov sub lea call add pop ret")
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
---
|
|
244
|
+
|
|
245
|
+
## Project structure
|
|
246
|
+
|
|
247
|
+
```
|
|
248
|
+
asmdetect/
|
|
249
|
+
├── asmdetect/
|
|
250
|
+
│ ├── __init__.py ← public API
|
|
251
|
+
│ ├── detector.py ← MalwareDetector class
|
|
252
|
+
│ ├── preprocessing.py ← opcode extraction utilities
|
|
253
|
+
│ ├── result.py ← DetectionResult dataclass
|
|
254
|
+
│ ├── cli.py ← asmdetect CLI
|
|
255
|
+
│ └── version.py
|
|
256
|
+
├── tests/
|
|
257
|
+
│ └── test_asmdetect.py ← 26 unit tests
|
|
258
|
+
├── scripts/
|
|
259
|
+
│ ├── predict_from_hf.py ← standalone HF demo script
|
|
260
|
+
│ └── batch_triage.py ← SOC batch triage with report
|
|
261
|
+
├── examples/
|
|
262
|
+
│ └── basic_usage.py ← usage examples
|
|
263
|
+
├── notebooks/
|
|
264
|
+
│ └── malware-detect.ipynb ← Kaggle training notebook
|
|
265
|
+
├── pyproject.toml
|
|
266
|
+
├── requirements.txt
|
|
267
|
+
└── README.md
|
|
268
|
+
```
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
## Training
|
|
273
|
+
|
|
274
|
+
The model was trained on Kaggle (Tesla P100 GPU) using the pipeline in
|
|
275
|
+
`notebooks/malware-detect.ipynb`.
|
|
276
|
+
|
|
277
|
+
**Training iterations:**
|
|
278
|
+
|
|
279
|
+
| Run | Key changes | Accuracy | F1 | AUC-ROC |
|
|
280
|
+
|---|---|---|---|---|
|
|
281
|
+
| Baseline | Default cross-entropy loss | 78.9% | 0.802 | 0.890 |
|
|
282
|
+
| Run 2 | Frozen layers + weighted loss (2.0) | 76.4% | 0.800 | 0.854 |
|
|
283
|
+
| **Run 3** | **weight=1.5 + threshold calibration + augmentation** | **86.0%** | **0.857** | **0.910** |
|
|
284
|
+
|
|
285
|
+
---
|
|
286
|
+
|
|
287
|
+
## Limitations
|
|
288
|
+
|
|
289
|
+
- Trained on 1,042 samples — a larger dataset will improve generalisation
|
|
290
|
+
- Input truncated to 256 opcodes; signatures in binary tails may be missed
|
|
291
|
+
- Obfuscated/packed malware that alters opcode distribution may evade detection
|
|
292
|
+
- Designed for x86 Windows PE binaries; accuracy on ARM or ELF is untested
|
|
293
|
+
- Static analysis only — does not detect runtime/memory-resident malware
|
|
294
|
+
|
|
295
|
+
# AsmDetect
|
|
296
|
+
|
|
297
|
+
Assembly-level malware detection using CodeBERT.
|
|
298
|
+
|
|
299
|
+
## Install
|
|
300
|
+
```bash
pip install asmdetect
```
|
|
301
|
+
|
|
302
|
+
## Usage
|
|
303
|
+
```python
|
|
304
|
+
from asmdetect import MalwareDetector
|
|
305
|
+
|
|
306
|
+
detector = MalwareDetector.from_pretrained()
|
|
307
|
+
detector.predict_text("push mov xor call")
```
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# asmdetect
|
|
2
|
+
|
|
3
|
+
**Compiler-Aware Assembly-Level Malware Detection using Fine-Tuned Language Models**
|
|
4
|
+
|
|
5
|
+
[Model on HuggingFace](https://huggingface.co/Swarnadharshini/codebert-malware-detector)
|
|
6
|
+
[]()
|
|
7
|
+
|
|
8
|
+
A Python library for **static malware detection** using x86 assembly opcode sequences.
|
|
9
|
+
Fine-tunes `microsoft/codebert-base` on disassembled binaries to classify files as
|
|
10
|
+
malware or benign — **no execution required**.
|
|
11
|
+
|
|
12
|
+
Built for SOC teams who need fast, explainable triage of suspicious binaries before deeper sandbox analysis.
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Model
|
|
17
|
+
|
|
18
|
+
**HuggingFace:** [Swarnadharshini/codebert-malware-detector](https://huggingface.co/Swarnadharshini/codebert-malware-detector)
|
|
19
|
+
|
|
20
|
+
| Property | Value |
|
|
21
|
+
|---|---|
|
|
22
|
+
| Base model | `microsoft/codebert-base` |
|
|
23
|
+
| Task | Binary sequence classification |
|
|
24
|
+
| Input | x86 opcode sequences (max 512 tokens) |
|
|
25
|
+
| Output | `malware` / `benign` + confidence score |
|
|
26
|
+
| Parameters | ~125M (8 frozen layers + 4 fine-tuned + classifier) |
|
|
27
|
+
| Training data | 1,458 samples (with augmentation) |
|
|
28
|
+
| Dataset | Arun152k — objdump-disassembled PE binaries |
|
|
29
|
+
| Decision threshold | 0.62 (calibrated on test set) |
|
|
30
|
+
|
|
31
|
+
## Dataset
|
|
32
|
+
Processed opcode sequences available on Kaggle:
|
|
33
|
+
https://www.kaggle.com/datasets/swarnadharshini/malware-opcodes
|
|
34
|
+
|
|
35
|
+
Original source: [Arun152k/Malware-Detection-using-N-Gram-Frequency](https://github.com/Arun152k/Malware-Detection-using-N-Gram-Frequency)
|
|
36
|
+
|
|
37
|
+
### Test set results
|
|
38
|
+
|
|
39
|
+
| Metric | Value |
|
|
40
|
+
|---|---|
|
|
41
|
+
| Accuracy | **86.0%** |
|
|
42
|
+
| F1 Score | **0.857** |
|
|
43
|
+
| Precision | **86.8%** |
|
|
44
|
+
| Recall | **84.6%** |
|
|
45
|
+
| AUC-ROC | **0.910** |
|
|
46
|
+
| False Negatives | 12 |
|
|
47
|
+
| False Positives | 10 |
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install -r requirements.txt
|
|
55
|
+
pip install -e .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Quick start
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from asmdetect import MalwareDetector
|
|
64
|
+
|
|
65
|
+
# Load model from HuggingFace Hub (downloads once, cached locally)
|
|
66
|
+
detector = MalwareDetector.from_pretrained()
|
|
67
|
+
|
|
68
|
+
# Predict from a .csv assembly file (IDA Pro / objdump format)
|
|
69
|
+
result = detector.predict_file("suspicious_binary.csv")
|
|
70
|
+
print(result)
|
|
71
|
+
# DetectionResult(
|
|
72
|
+
# source = suspicious_binary.csv
|
|
73
|
+
# prediction = MALWARE [HIGH] HIGH
|
|
74
|
+
# confidence = 99.60%
|
|
75
|
+
# malware_p = 99.60%
|
|
76
|
+
# benign_p = 0.40%
|
|
77
|
+
# tokens = 312 (truncated=False)
|
|
78
|
+
# threshold = 0.62
|
|
79
|
+
# )
|
|
80
|
+
|
|
81
|
+
# Predict from a raw opcode string
|
|
82
|
+
result = detector.predict_text("push mov xor xor call ret add nop")
|
|
83
|
+
|
|
84
|
+
# Batch triage a folder — sorted by malware probability (highest first)
|
|
85
|
+
results = detector.predict_batch("/soc/incoming/suspicious/")
|
|
86
|
+
for r in results[:5]:
|
|
87
|
+
print(f"{r.malware_probability*100:.1f}% {r.risk_level} {r.source}")
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
## CLI
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# Single file
|
|
96
|
+
asmdetect --file suspicious.csv
|
|
97
|
+
|
|
98
|
+
# Raw opcode string
|
|
99
|
+
asmdetect --text "push mov xor call ret"
|
|
100
|
+
|
|
101
|
+
# Batch triage a folder
|
|
102
|
+
asmdetect --batch /soc/incoming/
|
|
103
|
+
|
|
104
|
+
# Custom threshold
|
|
105
|
+
asmdetect --file suspicious.csv --threshold 0.70
|
|
106
|
+
|
|
107
|
+
# JSON output (for SIEM/SOAR integration)
|
|
108
|
+
asmdetect --file suspicious.csv --json
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Scripts
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Run demo predictions from HuggingFace
|
|
117
|
+
python scripts/predict_from_hf.py
|
|
118
|
+
|
|
119
|
+
# Predict a specific file
|
|
120
|
+
python scripts/predict_from_hf.py --file path/to/sample.csv
|
|
121
|
+
|
|
122
|
+
# Predict raw opcodes
|
|
123
|
+
python scripts/predict_from_hf.py --text "push mov xor call ret"
|
|
124
|
+
|
|
125
|
+
# SOC batch triage with JSON report
|
|
126
|
+
python scripts/batch_triage.py --folder /soc/incoming/ --report report.json
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## API reference
|
|
132
|
+
|
|
133
|
+
### `MalwareDetector`
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
MalwareDetector.from_pretrained(
|
|
137
|
+
model_id = "Swarnadharshini/codebert-malware-detector", # or local path
|
|
138
|
+
threshold = 0.62, # decision threshold — lower = more sensitive
|
|
139
|
+
device = "auto", # 'cuda', 'cpu', or 'auto'
|
|
140
|
+
)
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
| Method | Description |
|
|
144
|
+
|---|---|
|
|
145
|
+
| `predict_file(filepath)` | Classify a `.csv` assembly file |
|
|
146
|
+
| `predict_text(text)` | Classify a raw opcode string |
|
|
147
|
+
| `predict_batch(folder)` | Classify all CSVs in a folder, sorted by risk |
|
|
148
|
+
| `set_threshold(value)` | Update threshold at runtime |
|
|
149
|
+
| `benchmark(text, n=10)` | Measure inference latency |
|
|
150
|
+
|
|
151
|
+
### `DetectionResult`
|
|
152
|
+
|
|
153
|
+
| Field | Type | Description |
|
|
154
|
+
|---|---|---|
|
|
155
|
+
| `prediction` | `str` | `'malware'` or `'benign'` |
|
|
156
|
+
| `label` | `int` | `1` = malware, `0` = benign |
|
|
157
|
+
| `confidence` | `float` | Probability of predicted class |
|
|
158
|
+
| `malware_probability` | `float` | Raw P(malware) |
|
|
159
|
+
| `benign_probability` | `float` | Raw P(benign) |
|
|
160
|
+
| `risk_level` | `str` | `'HIGH'` ≥80% \| `'MEDIUM'` ≥55% \| `'LOW'` |
|
|
161
|
+
| `input_tokens` | `int` | Token count before truncation |
|
|
162
|
+
| `truncated` | `bool` | True if input exceeded 512 tokens |
|
|
163
|
+
| `is_malware` | `bool` | Convenience property |
|
|
164
|
+
| `is_benign` | `bool` | Convenience property |
|
|
165
|
+
| `to_dict()` | `dict` | Serialise to dictionary |
|
|
166
|
+
| `to_json()` | `str` | Serialise to JSON string |
|
|
167
|
+
|
|
168
|
+
### Threshold tuning
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
# High-security: catch more malware (more false alarms)
|
|
172
|
+
detector.set_threshold(0.45)
|
|
173
|
+
|
|
174
|
+
# Low-noise: fewer false alarms (may miss borderline cases)
|
|
175
|
+
detector.set_threshold(0.75)
|
|
176
|
+
|
|
177
|
+
# Reset to calibrated default
|
|
178
|
+
detector.set_threshold(0.62)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Input format
|
|
184
|
+
|
|
185
|
+
**CSV files** — from IDA Pro or `objdump -d`:
|
|
186
|
+
```csv
|
|
187
|
+
Address,Hex_Opcode,Opcode,Operand 1,Operand 2
|
|
188
|
+
402000:,30 7d 07,xor,%bh,
|
|
189
|
+
402003:,00 00,add,%al,
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
**Raw opcode strings:**
|
|
193
|
+
```python
|
|
194
|
+
detector.predict_text("push mov sub lea call add pop ret")
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
---
|
|
198
|
+
|
|
199
|
+
## Project structure
|
|
200
|
+
|
|
201
|
+
```
|
|
202
|
+
asmdetect/
|
|
203
|
+
├── asmdetect/
|
|
204
|
+
│ ├── __init__.py ← public API
|
|
205
|
+
│ ├── detector.py ← MalwareDetector class
|
|
206
|
+
│ ├── preprocessing.py ← opcode extraction utilities
|
|
207
|
+
│ ├── result.py ← DetectionResult dataclass
|
|
208
|
+
│ ├── cli.py ← asmdetect CLI
|
|
209
|
+
│ └── version.py
|
|
210
|
+
├── tests/
|
|
211
|
+
│ └── test_asmdetect.py ← 26 unit tests
|
|
212
|
+
├── scripts/
|
|
213
|
+
│ ├── predict_from_hf.py ← standalone HF demo script
|
|
214
|
+
│ └── batch_triage.py ← SOC batch triage with report
|
|
215
|
+
├── examples/
|
|
216
|
+
│ └── basic_usage.py ← usage examples
|
|
217
|
+
├── notebooks/
|
|
218
|
+
│ └── malware-detect.ipynb ← Kaggle training notebook
|
|
219
|
+
├── pyproject.toml
|
|
220
|
+
├── requirements.txt
|
|
221
|
+
└── README.md
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Training
|
|
227
|
+
|
|
228
|
+
The model was trained on Kaggle (Tesla P100 GPU) using the pipeline in
|
|
229
|
+
`notebooks/malware-detect.ipynb`.
|
|
230
|
+
|
|
231
|
+
**Training iterations:**
|
|
232
|
+
|
|
233
|
+
| Run | Key changes | Accuracy | F1 | AUC-ROC |
|
|
234
|
+
|---|---|---|---|---|
|
|
235
|
+
| Baseline | Default cross-entropy loss | 78.9% | 0.802 | 0.890 |
|
|
236
|
+
| Run 2 | Frozen layers + weighted loss (2.0) | 76.4% | 0.800 | 0.854 |
|
|
237
|
+
| **Run 3** | **weight=1.5 + threshold calibration + augmentation** | **86.0%** | **0.857** | **0.910** |
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
## Limitations
|
|
242
|
+
|
|
243
|
+
- Trained on 1,042 samples — a larger dataset will improve generalisation
|
|
244
|
+
- Input truncated to 256 opcodes; signatures in binary tails may be missed
|
|
245
|
+
- Obfuscated/packed malware that alters opcode distribution may evade detection
|
|
246
|
+
- Designed for x86 Windows PE binaries; accuracy on ARM or ELF is untested
|
|
247
|
+
- Static analysis only — does not detect runtime/memory-resident malware
|
|
248
|
+
|
|
249
|
+
# AsmDetect
|
|
250
|
+
|
|
251
|
+
Assembly-level malware detection using CodeBERT.
|
|
252
|
+
|
|
253
|
+
## Install
|
|
254
|
+
```bash
pip install asmdetect
```
|
|
255
|
+
|
|
256
|
+
## Usage
|
|
257
|
+
```python
|
|
258
|
+
from asmdetect import MalwareDetector
|
|
259
|
+
|
|
260
|
+
detector = MalwareDetector.from_pretrained()
|
|
261
|
+
detector.predict_text("push mov xor call")
```
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import argparse, json, sys, time
|
|
2
|
+
from .detector import MalwareDetector, DEFAULT_MODEL_ID, DEFAULT_THRESHOLD
|
|
3
|
+
from .version import __version__
|
|
4
|
+
|
|
5
|
+
def _print_result(r, use_json=False):
|
|
6
|
+
if use_json:
|
|
7
|
+
print(r.to_json()); return
|
|
8
|
+
icons = {"HIGH":"[HIGH]","MEDIUM":"[MED] ","LOW":"[LOW] "}
|
|
9
|
+
print(f"\n {'─'*46}")
|
|
10
|
+
print(f" File : {r.source}")
|
|
11
|
+
print(f" Verdict : {r.prediction.upper()} {icons.get(r.risk_level,'')}")
|
|
12
|
+
print(f" Confidence: {r.confidence*100:.2f}%")
|
|
13
|
+
print(f" Malware P : {r.malware_probability*100:.2f}%")
|
|
14
|
+
print(f" Tokens : {r.input_tokens}")
|
|
15
|
+
print(f" {'─'*46}")
|
|
16
|
+
|
|
17
|
+
def main():
    """Command-line entry point for ``asmdetect``.

    Parses the arguments, loads the detector with
    ``MalwareDetector.from_pretrained(model, threshold)`` and runs exactly one
    of the input modes, checked in this order: ``--file``, ``--text``,
    ``--batch``. If several are supplied, only the first in that order is
    used. With none supplied, the help text is printed and the process exits
    with status 0.

    The timing line covers prediction and result printing only; the model is
    loaded before the timer starts. With ``--json`` the timing line goes to
    stderr so stdout holds nothing but the JSON document and stays parseable
    by downstream tools. Without ``--json`` it is printed to stdout.
    """
    parser = argparse.ArgumentParser(
        prog="asmdetect",
        description="Assembly-Level Malware Detector",
    )
    parser.add_argument("--file")
    parser.add_argument("--text")
    parser.add_argument("--batch")
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--model", default=DEFAULT_MODEL_ID)
    parser.add_argument("--threshold", type=float, default=DEFAULT_THRESHOLD)
    parser.add_argument("--version", action="version", version=f"asmdetect {__version__}")
    args = parser.parse_args()

    # Nothing to classify: show usage instead of loading the model.
    if not any([args.file, args.text, args.batch]):
        parser.print_help()
        sys.exit(0)

    det = MalwareDetector.from_pretrained(args.model, args.threshold)

    t0 = time.perf_counter()
    if args.file:
        _print_result(det.predict_file(args.file), args.json)
    elif args.text:
        _print_result(det.predict_text(args.text), args.json)
    elif args.batch:
        results = det.predict_batch(args.batch)
        if args.json:
            # One JSON array for the whole batch rather than one object per line.
            print(json.dumps([r.to_dict() for r in results], indent=2))
        else:
            for r in results:
                _print_result(r)
    elapsed_ms = (time.perf_counter()-t0)*1000

    # Keep stdout machine-readable in JSON mode; the timing is diagnostic only.
    timing_stream = sys.stderr if args.json else sys.stdout
    print(f"\n Inference: {elapsed_ms:.0f} ms", file=timing_stream)


if __name__ == "__main__":
    main()
|