disasm2vec 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- disasm2vec-0.1.0/LICENSE +21 -0
- disasm2vec-0.1.0/MANIFEST.in +4 -0
- disasm2vec-0.1.0/PKG-INFO +84 -0
- disasm2vec-0.1.0/README.md +69 -0
- disasm2vec-0.1.0/models/base_tfidf_asm.json +27 -0
- disasm2vec-0.1.0/models/base_tfidf_asm.pkl +0 -0
- disasm2vec-0.1.0/models/base_tfidf_asm_keepreg.json +27 -0
- disasm2vec-0.1.0/models/base_tfidf_asm_keepreg.pkl +0 -0
- disasm2vec-0.1.0/pyproject.toml +22 -0
- disasm2vec-0.1.0/setup.cfg +4 -0
- disasm2vec-0.1.0/src/disasm2vec/__init__.py +17 -0
- disasm2vec-0.1.0/src/disasm2vec/compiler/__init__.py +3 -0
- disasm2vec-0.1.0/src/disasm2vec/compiler/errors.py +3 -0
- disasm2vec-0.1.0/src/disasm2vec/compiler/gcc.py +106 -0
- disasm2vec-0.1.0/src/disasm2vec/disassembler/__init__.py +6 -0
- disasm2vec-0.1.0/src/disasm2vec/disassembler/errors.py +2 -0
- disasm2vec-0.1.0/src/disasm2vec/disassembler/objdump.py +127 -0
- disasm2vec-0.1.0/src/disasm2vec/pipeline/__init__.py +7 -0
- disasm2vec-0.1.0/src/disasm2vec/pipeline/config.py +32 -0
- disasm2vec-0.1.0/src/disasm2vec/pipeline/runner.py +78 -0
- disasm2vec-0.1.0/src/disasm2vec/tokenizer/__init__.py +4 -0
- disasm2vec-0.1.0/src/disasm2vec/tokenizer/cleaner.py +10 -0
- disasm2vec-0.1.0/src/disasm2vec/tokenizer/core.py +184 -0
- disasm2vec-0.1.0/src/disasm2vec/tokenizer/normalizer.py +26 -0
- disasm2vec-0.1.0/src/disasm2vec/vectorizer/__init__.py +5 -0
- disasm2vec-0.1.0/src/disasm2vec/vectorizer/base.py +50 -0
- disasm2vec-0.1.0/src/disasm2vec/vectorizer/factory.py +27 -0
- disasm2vec-0.1.0/src/disasm2vec/vectorizer/tfidf.py +123 -0
- disasm2vec-0.1.0/src/disasm2vec.egg-info/PKG-INFO +84 -0
- disasm2vec-0.1.0/src/disasm2vec.egg-info/SOURCES.txt +36 -0
- disasm2vec-0.1.0/src/disasm2vec.egg-info/dependency_links.txt +1 -0
- disasm2vec-0.1.0/src/disasm2vec.egg-info/requires.txt +2 -0
- disasm2vec-0.1.0/src/disasm2vec.egg-info/top_level.txt +1 -0
- disasm2vec-0.1.0/tests/test_compiler.py +72 -0
- disasm2vec-0.1.0/tests/test_disassembler.py +75 -0
- disasm2vec-0.1.0/tests/test_pipeline.py +56 -0
- disasm2vec-0.1.0/tests/test_tokenizer.py +54 -0
- disasm2vec-0.1.0/tests/test_vectorizer.py +119 -0
disasm2vec-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ahmad Nur Rohim
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: disasm2vec
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: disasm2vec is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
|
|
5
|
+
Author-email: Ahmad Nur Rohim <ahmadnurrohim2812@gmail.com>
|
|
6
|
+
Project-URL: Repository, https://github.com/Anro128/disasm2vec
|
|
7
|
+
Project-URL: Issues, https://github.com/Anro128/disasm2vec/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/Anro128/disasm2vec/blob/main/CHANGELOG.md
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
13
|
+
Requires-Dist: numpy>=1.20.0
|
|
14
|
+
Dynamic: license-file
|
|
15
|
+
|
|
16
|
+
# disasm2vec
|
|
17
|
+
|
|
18
|
+
**disasm2vec** is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
|
|
22
|
+
- **Automated Compilation**: Seamlessly compiles C and C++ source files using GCC.
|
|
23
|
+
- **Disassembly Wrapper**: Extracts assembly instructions using `objdump`, supporting both full and function-specific disassembly.
|
|
24
|
+
- **Intelligent Tokenization**: Normalizes and cleans assembly instructions, with options to preserve or abstract register names.
|
|
25
|
+
- **Vectorization**: Implements TF-IDF vectorization with a flexible factory pattern for easy model management.
|
|
26
|
+
- **End-to-End Pipeline**: Orchestrates the entire process from source code to vector embedding.
|
|
27
|
+
- **Extensible Architecture**: Built with abstract base classes to easily support new compilers, disassemblers, or vectorizers.
|
|
28
|
+
|
|
29
|
+
## Prerequisites
|
|
30
|
+
|
|
31
|
+
- **Python**: version 3.10 or higher.
|
|
32
|
+
- **Operating System**: Linux or Windows Subsystem for Linux (WSL).
|
|
33
|
+
- **GCC**: Required for compiling source files.
|
|
34
|
+
- **Objdump**: Required for disassembling binaries.
|
|
35
|
+
|
|
36
|
+
**Note**: The compilation (`gcc`) and disassembly (`objdump`) modules rely on system-level tools typically found in Linux environments. If you are on Windows, please use WSL.
|
|
37
|
+
|
|
38
|
+
Ensure both `gcc` and `objdump` are installed and available in your system's PATH.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
Install directly from PyPI:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install disasm2vec
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Or install from source:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
git clone https://github.com/Anro128/disasm2vec.git
|
|
52
|
+
cd disasm2vec
|
|
53
|
+
pip install .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
The core of the framework is the `run_pipeline` function, which processes a source file based on a `PipelineConfig` configuration object.
|
|
59
|
+
|
|
60
|
+
### Basic Example
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from disasm2vec.pipeline import PipelineConfig, run_pipeline
|
|
64
|
+
|
|
65
|
+
# Configure the pipeline
|
|
66
|
+
config = PipelineConfig(
|
|
67
|
+
source_file="examples/sample.c",
|
|
68
|
+
build_dir="build",
|
|
69
|
+
asm_dir="asm",
|
|
70
|
+
model_path="models/base_tfidf_asm.pkl" # Path to pre-trained model or where to save a new one
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Run the pipeline
|
|
74
|
+
# returns:
|
|
75
|
+
# vector: The vector representation of the source file
|
|
76
|
+
# vectorizer: The fitted vectorizer instance
|
|
77
|
+
vector, vectorizer = run_pipeline(config)
|
|
78
|
+
|
|
79
|
+
print(f"Generated Vector Shape: {vector.shape}")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
|
|
84
|
+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
# disasm2vec
|
|
2
|
+
|
|
3
|
+
**disasm2vec** is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Automated Compilation**: Seamlessly compiles C and C++ source files using GCC.
|
|
8
|
+
- **Disassembly Wrapper**: Extracts assembly instructions using `objdump`, supporting both full and function-specific disassembly.
|
|
9
|
+
- **Intelligent Tokenization**: Normalizes and cleans assembly instructions, with options to preserve or abstract register names.
|
|
10
|
+
- **Vectorization**: Implements TF-IDF vectorization with a flexible factory pattern for easy model management.
|
|
11
|
+
- **End-to-End Pipeline**: Orchestrates the entire process from source code to vector embedding.
|
|
12
|
+
- **Extensible Architecture**: Built with abstract base classes to easily support new compilers, disassemblers, or vectorizers.
|
|
13
|
+
|
|
14
|
+
## Prerequisites
|
|
15
|
+
|
|
16
|
+
- **Python**: version 3.10 or higher.
|
|
17
|
+
- **Operating System**: Linux or Windows Subsystem for Linux (WSL).
|
|
18
|
+
- **GCC**: Required for compiling source files.
|
|
19
|
+
- **Objdump**: Required for disassembling binaries.
|
|
20
|
+
|
|
21
|
+
**Note**: The compilation (`gcc`) and disassembly (`objdump`) modules rely on system-level tools typically found in Linux environments. If you are on Windows, please use WSL.
|
|
22
|
+
|
|
23
|
+
Ensure both `gcc` and `objdump` are installed and available in your system's PATH.
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
Install directly from PyPI:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install disasm2vec
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Or install from source:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
git clone https://github.com/Anro128/disasm2vec.git
|
|
37
|
+
cd disasm2vec
|
|
38
|
+
pip install .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Usage
|
|
42
|
+
|
|
43
|
+
The core of the framework is the `run_pipeline` function, which processes a source file based on a `PipelineConfig` configuration object.
|
|
44
|
+
|
|
45
|
+
### Basic Example
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from disasm2vec.pipeline import PipelineConfig, run_pipeline
|
|
49
|
+
|
|
50
|
+
# Configure the pipeline
|
|
51
|
+
config = PipelineConfig(
|
|
52
|
+
source_file="examples/sample.c",
|
|
53
|
+
build_dir="build",
|
|
54
|
+
asm_dir="asm",
|
|
55
|
+
model_path="models/base_tfidf_asm.pkl" # Path to pre-trained model or where to save a new one
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
# Run the pipeline
|
|
59
|
+
# returns:
|
|
60
|
+
# vector: The vector representation of the source file
|
|
61
|
+
# vectorizer: The fitted vectorizer instance
|
|
62
|
+
vector, vectorizer = run_pipeline(config)
|
|
63
|
+
|
|
64
|
+
print(f"Generated Vector Shape: {vector.shape}")
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## License
|
|
68
|
+
|
|
69
|
+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"model_type": "tfidf",
|
|
3
|
+
"vectorizer": {
|
|
4
|
+
"max_features": 5000,
|
|
5
|
+
"ngram_range": [
|
|
6
|
+
1,
|
|
7
|
+
2
|
|
8
|
+
],
|
|
9
|
+
"min_df": 2,
|
|
10
|
+
"max_df": 1.0,
|
|
11
|
+
"norm": "l2",
|
|
12
|
+
"use_idf": true
|
|
13
|
+
},
|
|
14
|
+
"dataset": {
|
|
15
|
+
"num_documents": 653,
|
|
16
|
+
"avg_length": 153.9770290964778,
|
|
17
|
+
"hash": "a1ac3063bb6cd0ac601cb6e5421402411b722c90fca288a03db2c518425bb284"
|
|
18
|
+
},
|
|
19
|
+
"environment": {
|
|
20
|
+
"python": "3.12.3",
|
|
21
|
+
"platform": "Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39"
|
|
22
|
+
},
|
|
23
|
+
"training": {
|
|
24
|
+
"timestamp": "2026-02-17T16:14:47.363169",
|
|
25
|
+
"source_dataset": "dataset_tokens.csv"
|
|
26
|
+
}
|
|
27
|
+
}
|
|
Binary file
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
{
|
|
2
|
+
"model_type": "tfidf",
|
|
3
|
+
"vectorizer": {
|
|
4
|
+
"max_features": 5000,
|
|
5
|
+
"ngram_range": [
|
|
6
|
+
1,
|
|
7
|
+
2
|
|
8
|
+
],
|
|
9
|
+
"min_df": 2,
|
|
10
|
+
"max_df": 1.0,
|
|
11
|
+
"norm": "l2",
|
|
12
|
+
"use_idf": true
|
|
13
|
+
},
|
|
14
|
+
"dataset": {
|
|
15
|
+
"num_documents": 653,
|
|
16
|
+
"avg_length": 153.9770290964778,
|
|
17
|
+
"hash": "2ad5da17cc369457374e2a4465e8bd5c753981f79fe9e8cd61cd2ad231be26fb"
|
|
18
|
+
},
|
|
19
|
+
"environment": {
|
|
20
|
+
"python": "3.12.3",
|
|
21
|
+
"platform": "Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39"
|
|
22
|
+
},
|
|
23
|
+
"training": {
|
|
24
|
+
"timestamp": "2026-02-17T16:14:02.894795",
|
|
25
|
+
"source_dataset": "dataset_tokens_keep_register.csv"
|
|
26
|
+
}
|
|
27
|
+
}
|
|
Binary file
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "disasm2vec"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "disasm2vec is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Ahmad Nur Rohim", email = "ahmadnurrohim2812@gmail.com" }
|
|
13
|
+
]
|
|
14
|
+
dependencies = [
|
|
15
|
+
"scikit-learn>=1.0.0",
|
|
16
|
+
"numpy>=1.20.0",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
[project.urls]
|
|
20
|
+
Repository = "https://github.com/Anro128/disasm2vec"
|
|
21
|
+
Issues = "https://github.com/Anro128/disasm2vec/issues"
|
|
22
|
+
Changelog = "https://github.com/Anro128/disasm2vec/blob/main/CHANGELOG.md"
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""disasm2vec"""
|
|
2
|
+
|
|
3
|
+
from . import compiler
|
|
4
|
+
from . import disassembler
|
|
5
|
+
from . import tokenizer
|
|
6
|
+
from . import vectorizer
|
|
7
|
+
from . import pipeline
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"compiler",
|
|
11
|
+
"disassembler",
|
|
12
|
+
"tokenizer",
|
|
13
|
+
"vectorizer",
|
|
14
|
+
"pipeline",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from .errors import CompilationError
|
|
4
|
+
|
|
5
|
+
def compile_c(
    source: str | Path,
    output: str | Path,
    flags: list[str] | None = None,
) -> None:
    """
    Compile a single C source file with ``gcc``.

    Parameters
    ----------
    source : str | Path
        Path to the C source file.
    output : str | Path
        Path of the binary to produce (passed to the compiler via ``-o``).
    flags : list[str] | None
        Extra command-line arguments, appended after ``-o <output>``.

    Raises
    ------
    FileNotFoundError
        If ``source`` does not exist.
    CompilationError
        If the compiler exits with a non-zero status.
    """
    # Both str and Path are accepted: the shared helper normalises its
    # arguments with Path(...), and the in-package callers pass Path objects.
    _compile(
        compiler="gcc",
        source=source,
        output=output,
        flags=flags,
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def compile_cpp(
    source: str | Path,
    output: str | Path,
    flags: list[str] | None = None,
) -> None:
    """
    Compile a single C++ source file with ``g++``.

    Parameters
    ----------
    source : str | Path
        Path to the C++ source file.
    output : str | Path
        Path of the binary to produce (passed to the compiler via ``-o``).
    flags : list[str] | None
        Extra command-line arguments, appended after ``-o <output>``.

    Raises
    ------
    FileNotFoundError
        If ``source`` does not exist.
    CompilationError
        If the compiler exits with a non-zero status.
    """
    # Both str and Path are accepted: the shared helper normalises its
    # arguments with Path(...), and the in-package callers pass Path objects.
    _compile(
        compiler="g++",
        source=source,
        output=output,
        flags=flags,
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _compile(
|
|
38
|
+
compiler: str,
|
|
39
|
+
source: str,
|
|
40
|
+
output: str,
|
|
41
|
+
flags: list[str] | None = None,
|
|
42
|
+
):
|
|
43
|
+
source = Path(source)
|
|
44
|
+
output = Path(output)
|
|
45
|
+
|
|
46
|
+
if not source.exists():
|
|
47
|
+
raise FileNotFoundError(source)
|
|
48
|
+
|
|
49
|
+
cmd = [
|
|
50
|
+
compiler,
|
|
51
|
+
str(source),
|
|
52
|
+
"-o",
|
|
53
|
+
str(output),
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
if flags:
|
|
57
|
+
cmd.extend(flags)
|
|
58
|
+
|
|
59
|
+
try:
|
|
60
|
+
subprocess.run(
|
|
61
|
+
cmd,
|
|
62
|
+
check=True,
|
|
63
|
+
stdout=subprocess.PIPE,
|
|
64
|
+
stderr=subprocess.PIPE,
|
|
65
|
+
text=True,
|
|
66
|
+
)
|
|
67
|
+
except subprocess.CalledProcessError as e:
|
|
68
|
+
raise CompilationError(
|
|
69
|
+
f"Compilation failed for {source}:\n{e.stderr}"
|
|
70
|
+
) from e
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def compile_folder(
|
|
74
|
+
src_dir: str,
|
|
75
|
+
out_dir: str,
|
|
76
|
+
optimize: str = "-O0",
|
|
77
|
+
extra_flags: list[str] | None = None,
|
|
78
|
+
):
|
|
79
|
+
"""
|
|
80
|
+
Compile all .c and .cpp files in a folder (recursively).
|
|
81
|
+
"""
|
|
82
|
+
src_dir = Path(src_dir)
|
|
83
|
+
out_dir = Path(out_dir)
|
|
84
|
+
out_dir.mkdir(parents=True, exist_ok=True)
|
|
85
|
+
|
|
86
|
+
extra_flags = extra_flags or []
|
|
87
|
+
|
|
88
|
+
sources = list(src_dir.rglob("*.c")) + list(src_dir.rglob("*.cpp"))
|
|
89
|
+
|
|
90
|
+
if not sources:
|
|
91
|
+
raise ValueError(f"No C/C++ files found in {src_dir}")
|
|
92
|
+
|
|
93
|
+
for src in sources:
|
|
94
|
+
output = out_dir / src.stem
|
|
95
|
+
flags = [optimize, *extra_flags]
|
|
96
|
+
|
|
97
|
+
try:
|
|
98
|
+
if src.suffix == ".c":
|
|
99
|
+
compile_c(src, output, flags)
|
|
100
|
+
else:
|
|
101
|
+
compile_cpp(src, output, flags)
|
|
102
|
+
|
|
103
|
+
except CompilationError as e:
|
|
104
|
+
raise CompilationError(
|
|
105
|
+
f"Compilation failed for {src}:\n{e}"
|
|
106
|
+
) from e
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from .errors import DisassemblyError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def disassemble(
|
|
7
|
+
binary: str,
|
|
8
|
+
output: str,
|
|
9
|
+
arch: str | None = None,
|
|
10
|
+
full: bool = False,
|
|
11
|
+
):
|
|
12
|
+
"""
|
|
13
|
+
Disassemble a single binary using objdump.
|
|
14
|
+
|
|
15
|
+
Parameters
|
|
16
|
+
----------
|
|
17
|
+
binary : str
|
|
18
|
+
Path to compiled binary
|
|
19
|
+
output : str
|
|
20
|
+
Output .asm file
|
|
21
|
+
arch : str | None
|
|
22
|
+
Optional architecture (e.g. i386:x86-64)
|
|
23
|
+
full : bool
|
|
24
|
+
If True, disassemble all functions.
|
|
25
|
+
If False, exclude builtin / PLT functions.
|
|
26
|
+
"""
|
|
27
|
+
binary = Path(binary)
|
|
28
|
+
output = Path(output)
|
|
29
|
+
|
|
30
|
+
if not binary.exists():
|
|
31
|
+
raise FileNotFoundError(binary)
|
|
32
|
+
|
|
33
|
+
output.parent.mkdir(parents=True, exist_ok=True)
|
|
34
|
+
|
|
35
|
+
# Base command
|
|
36
|
+
cmd = ["objdump", "-d", "--section=.text", str(binary)]
|
|
37
|
+
|
|
38
|
+
if arch:
|
|
39
|
+
cmd.extend(["-m", arch])
|
|
40
|
+
|
|
41
|
+
try:
|
|
42
|
+
result = subprocess.run(
|
|
43
|
+
cmd,
|
|
44
|
+
check=True,
|
|
45
|
+
stdout=subprocess.PIPE,
|
|
46
|
+
stderr=subprocess.PIPE,
|
|
47
|
+
text=True,
|
|
48
|
+
)
|
|
49
|
+
except subprocess.CalledProcessError as e:
|
|
50
|
+
raise DisassemblyError(
|
|
51
|
+
f"objdump failed for {binary}:\n{e.stderr}"
|
|
52
|
+
) from e
|
|
53
|
+
|
|
54
|
+
asm = result.stdout
|
|
55
|
+
|
|
56
|
+
if not full:
|
|
57
|
+
asm = _filter_builtin_functions(asm)
|
|
58
|
+
|
|
59
|
+
output.write_text(asm)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def disassemble_folder(
|
|
63
|
+
bin_dir: str,
|
|
64
|
+
out_dir: str,
|
|
65
|
+
full: bool = False,
|
|
66
|
+
):
|
|
67
|
+
"""
|
|
68
|
+
Disassemble all binaries in a folder.
|
|
69
|
+
|
|
70
|
+
Parameters
|
|
71
|
+
----------
|
|
72
|
+
bin_dir : str
|
|
73
|
+
Folder containing compiled binaries
|
|
74
|
+
out_dir : str
|
|
75
|
+
Folder to store .asm outputs
|
|
76
|
+
full : bool
|
|
77
|
+
If True, disassemble all functions.
|
|
78
|
+
If False, exclude builtin / PLT functions.
|
|
79
|
+
"""
|
|
80
|
+
bin_dir = Path(bin_dir)
|
|
81
|
+
out_dir = Path(out_dir)
|
|
82
|
+
out_dir.mkdir(parents=True, exist_ok=True)
|
|
83
|
+
|
|
84
|
+
binaries = [p for p in bin_dir.iterdir() if p.is_file()]
|
|
85
|
+
|
|
86
|
+
if not binaries:
|
|
87
|
+
raise ValueError(f"No binaries found in {bin_dir}")
|
|
88
|
+
|
|
89
|
+
for binary in binaries:
|
|
90
|
+
asm_out = out_dir / f"{binary.name}.asm"
|
|
91
|
+
|
|
92
|
+
try:
|
|
93
|
+
disassemble(binary, asm_out, full=full)
|
|
94
|
+
except DisassemblyError as e:
|
|
95
|
+
raise DisassemblyError(
|
|
96
|
+
f"Disassembly failed for {binary}:\n{e}"
|
|
97
|
+
) from e
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _filter_builtin_functions(asm: str) -> str:
    """
    Strip builtin / PLT / runtime functions from an objdump listing.

    A line is treated as a function header when it contains ``<`` and ``>``
    and, once stripped, ends with ``:``. The symbol is the text between the
    first ``<`` and the following ``>``. A header whose symbol ends with
    ``@plt`` or starts with one of the excluded prefixes is dropped together
    with every line up to the next header. The kept lines are re-joined
    with ``\\n``.
    """
    excluded_prefixes = (
        "_start",
        "frame_dummy",
        "register_tm_clones",
        "deregister_tm_clones",
        "__",
    )

    kept = []
    dropping = False

    for row in asm.splitlines():
        is_header = "<" in row and ">" in row and row.strip().endswith(":")
        if is_header:
            symbol = row.split("<")[1].split(">")[0]
            # Each header decides the fate of itself and the lines after it.
            dropping = symbol.endswith("@plt") or symbol.startswith(
                excluded_prefixes
            )

        if not dropping:
            kept.append(row)

    return "\n".join(kept)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional, Tuple
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class PipelineConfig:
    """Settings for one run of the single-source-file pipeline."""

    # Source file to process.
    source_file: str

    # Directories for the compiled binary and for the .asm listing.
    build_dir: str
    asm_dir: str

    # --- compiler ---
    # Optimisation flag, followed by any additional compiler arguments.
    optimize: str = "-O0"
    extra_flags: Optional[list[str]] = None

    # --- disassembler ---
    # Optional architecture name, and whether to keep every function.
    arch: Optional[str] = None
    full_disasm: bool = False

    # --- tokenizer ---
    # NOTE(review): exact semantics live in the tokenizer module.
    entry: str = "main"
    keep_register: bool = False

    # --- vectorizer ---
    model_path: Optional[str] = None
    max_features: Optional[int] = None
    ngram_range: Tuple[int, int] = (1, 2)
    min_df: int = 1

    # --- stage switches ---
    do_compile: bool = True
    do_disassemble: bool = True
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from disasm2vec.compiler import compile_c, compile_cpp
|
|
4
|
+
from disasm2vec.disassembler import disassemble
|
|
5
|
+
from disasm2vec.tokenizer import tokenize
|
|
6
|
+
from disasm2vec.vectorizer import Tfidf
|
|
7
|
+
|
|
8
|
+
from .config import PipelineConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def run_pipeline(config: "PipelineConfig"):
    """
    Run pipeline for single source file.

    Flow:
    source -> compile -> disassemble -> tokenizer -> vectorize

    The binary is written to ``<build_dir>/<stem>`` and the listing to
    ``<asm_dir>/<stem>.asm``, where ``stem`` is the source file name without
    its extension. The compile and disassemble stages can be skipped through
    ``config.do_compile`` / ``config.do_disassemble``.

    Parameters
    ----------
    config : PipelineConfig
        Settings for every stage. ``model_path`` must be set.

    Returns
    -------
    tuple
        ``(X, vectorizer)``: the result of ``vectorizer.transform_one`` on
        the tokenized listing, and the ``Tfidf`` instance after ``load``.

    Raises
    ------
    FileNotFoundError
        If ``config.source_file`` does not exist.
    ValueError
        If compilation is requested for a file that is neither ``.c`` nor
        ``.cpp``, or if ``config.model_path`` is empty.
    """
    source = Path(config.source_file)

    if not source.exists():
        raise FileNotFoundError(source)

    # Validate the configuration before doing any work, so a bad config
    # fails immediately instead of after compiling and disassembling.
    if config.do_compile and source.suffix not in (".c", ".cpp"):
        raise ValueError(
            f"Unsupported source type: {source.suffix}"
        )

    if not config.model_path:
        raise ValueError("model_path is required for pipeline")

    stem = source.stem

    binary_path = Path(config.build_dir) / stem
    asm_path = Path(config.asm_dir) / f"{stem}.asm"

    binary_path.parent.mkdir(parents=True, exist_ok=True)
    asm_path.parent.mkdir(parents=True, exist_ok=True)

    # COMPILE
    if config.do_compile:
        flags = [config.optimize]
        if config.extra_flags:
            flags.extend(config.extra_flags)

        # The suffix was validated above, so anything that is not ".c"
        # here is ".cpp".
        if source.suffix == ".c":
            compile_c(source, binary_path, flags)
        else:
            compile_cpp(source, binary_path, flags)

    # DISASSEMBLE
    if config.do_disassemble:
        disassemble(
            binary=binary_path,
            output=asm_path,
            arch=config.arch,
            full=config.full_disasm,
        )

    # TOKENIZER
    corpus = tokenize(
        path=asm_path,
        entry=config.entry,
        keep_register=config.keep_register,
    )

    # VECTORIZE
    vectorizer = Tfidf(
        max_features=config.max_features,
        ngram_range=config.ngram_range,
        min_df=config.min_df,
    )
    vectorizer.load(config.model_path)

    X = vectorizer.transform_one(corpus)

    return X, vectorizer
|