disasm2vec 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- disasm2vec/__init__.py +17 -0
- disasm2vec/compiler/__init__.py +3 -0
- disasm2vec/compiler/errors.py +3 -0
- disasm2vec/compiler/gcc.py +106 -0
- disasm2vec/disassembler/__init__.py +6 -0
- disasm2vec/disassembler/errors.py +2 -0
- disasm2vec/disassembler/objdump.py +127 -0
- disasm2vec/pipeline/__init__.py +7 -0
- disasm2vec/pipeline/config.py +32 -0
- disasm2vec/pipeline/runner.py +78 -0
- disasm2vec/tokenizer/__init__.py +4 -0
- disasm2vec/tokenizer/cleaner.py +10 -0
- disasm2vec/tokenizer/core.py +184 -0
- disasm2vec/tokenizer/normalizer.py +26 -0
- disasm2vec/vectorizer/__init__.py +5 -0
- disasm2vec/vectorizer/base.py +50 -0
- disasm2vec/vectorizer/factory.py +27 -0
- disasm2vec/vectorizer/tfidf.py +123 -0
- disasm2vec-0.1.0.dist-info/METADATA +84 -0
- disasm2vec-0.1.0.dist-info/RECORD +23 -0
- disasm2vec-0.1.0.dist-info/WHEEL +5 -0
- disasm2vec-0.1.0.dist-info/licenses/LICENSE +21 -0
- disasm2vec-0.1.0.dist-info/top_level.txt +1 -0
disasm2vec/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""disasm2vec"""
|
|
2
|
+
|
|
3
|
+
from . import compiler
|
|
4
|
+
from . import disassembler
|
|
5
|
+
from . import tokenizer
|
|
6
|
+
from . import vectorizer
|
|
7
|
+
from . import pipeline
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"compiler",
|
|
11
|
+
"disassembler",
|
|
12
|
+
"tokenizer",
|
|
13
|
+
"vectorizer",
|
|
14
|
+
"pipeline",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from .errors import CompilationError
|
|
4
|
+
|
|
5
|
+
def compile_c(
    source: str,
    output: str,
    flags: list[str] | None = None
):
    """Compile a C source file to *output* using gcc.

    Thin wrapper over :func:`_compile` with the compiler fixed to gcc;
    *flags* (if any) are forwarded unchanged.
    """
    _compile(compiler="gcc", source=source, output=output, flags=flags)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def compile_cpp(
    source: str,
    output: str,
    flags: list[str] | None = None
):
    """Compile a C++ source file to *output* using g++.

    Thin wrapper over :func:`_compile` with the compiler fixed to g++;
    *flags* (if any) are forwarded unchanged.
    """
    _compile(compiler="g++", source=source, output=output, flags=flags)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _compile(
    compiler: str,
    source: str,
    output: str,
    flags: list[str] | None = None,
):
    """Invoke *compiler* on *source*, writing the binary to *output*.

    Raises
    ------
    FileNotFoundError
        If *source* does not exist.
    CompilationError
        If the compiler exits non-zero; the captured stderr is included.
    """
    src_path = Path(source)
    out_path = Path(output)

    if not src_path.exists():
        raise FileNotFoundError(src_path)

    cmd = [compiler, str(src_path), "-o", str(out_path)]
    if flags:
        cmd += list(flags)

    try:
        # Capture output so compiler diagnostics can be surfaced on failure.
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        raise CompilationError(
            f"Compilation failed for {src_path}:\n{e.stderr}"
        ) from e
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def compile_folder(
    src_dir: str,
    out_dir: str,
    optimize: str = "-O0",
    extra_flags: list[str] | None = None,
):
    """Compile every .c and .cpp file under *src_dir* (recursively).

    Binaries land in *out_dir*, named after each source's stem.
    NOTE(review): sources sharing a stem (e.g. a.c and sub/a.cpp) write
    the same output path and overwrite each other — confirm intended.

    Raises
    ------
    ValueError
        If no C/C++ sources are found.
    CompilationError
        Re-raised with the offending source path prepended.
    """
    src_root = Path(src_dir)
    out_root = Path(out_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    tail_flags = list(extra_flags) if extra_flags else []

    sources = [*src_root.rglob("*.c"), *src_root.rglob("*.cpp")]
    if not sources:
        raise ValueError(f"No C/C++ files found in {src_root}")

    for src in sources:
        target = out_root / src.stem
        build_flags = [optimize, *tail_flags]

        compile_fn = compile_c if src.suffix == ".c" else compile_cpp
        try:
            compile_fn(src, target, build_flags)
        except CompilationError as e:
            raise CompilationError(
                f"Compilation failed for {src}:\n{e}"
            ) from e
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from .errors import DisassemblyError
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def disassemble(
    binary: str,
    output: str,
    arch: str | None = None,
    full: bool = False,
):
    """Disassemble one binary's .text section with objdump into *output*.

    Parameters
    ----------
    binary : str
        Path to the compiled binary.
    output : str
        Destination .asm file; parent directories are created.
    arch : str | None
        Optional objdump architecture, passed via ``-m``
        (e.g. ``i386:x86-64``).
    full : bool
        When False, builtin/PLT/runtime functions are filtered out of
        the listing before writing.
    """
    bin_path = Path(binary)
    out_path = Path(output)

    if not bin_path.exists():
        raise FileNotFoundError(bin_path)

    out_path.parent.mkdir(parents=True, exist_ok=True)

    cmd = ["objdump", "-d", "--section=.text", str(bin_path)]
    if arch:
        cmd += ["-m", arch]

    try:
        proc = subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        raise DisassemblyError(
            f"objdump failed for {bin_path}:\n{e.stderr}"
        ) from e

    listing = proc.stdout if full else _filter_builtin_functions(proc.stdout)
    out_path.write_text(listing)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def disassemble_folder(
    bin_dir: str,
    out_dir: str,
    full: bool = False,
):
    """Disassemble every regular file in *bin_dir* into *out_dir*.

    Each binary ``name`` produces ``name.asm``.

    Parameters
    ----------
    bin_dir : str
        Folder containing compiled binaries.
    out_dir : str
        Folder to receive .asm outputs (created if missing).
    full : bool
        When False, builtin/PLT functions are excluded (see
        :func:`disassemble`).

    Raises
    ------
    ValueError
        If *bin_dir* holds no files.
    DisassemblyError
        Re-raised with the offending binary path prepended.
    """
    bin_root = Path(bin_dir)
    out_root = Path(out_dir)
    out_root.mkdir(parents=True, exist_ok=True)

    targets = [entry for entry in bin_root.iterdir() if entry.is_file()]
    if not targets:
        raise ValueError(f"No binaries found in {bin_root}")

    for target in targets:
        asm_path = out_root / f"{target.name}.asm"
        try:
            disassemble(target, asm_path, full=full)
        except DisassemblyError as e:
            raise DisassemblyError(
                f"Disassembly failed for {target}:\n{e}"
            ) from e
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _filter_builtin_functions(asm: str) -> str:
|
|
101
|
+
"""
|
|
102
|
+
Remove builtin / PLT / runtime functions from objdump output.
|
|
103
|
+
"""
|
|
104
|
+
filtered_lines = []
|
|
105
|
+
|
|
106
|
+
skip = False
|
|
107
|
+
for line in asm.splitlines():
|
|
108
|
+
if "<" in line and ">" in line and line.strip().endswith(":"):
|
|
109
|
+
name = line.split("<")[1].split(">")[0]
|
|
110
|
+
|
|
111
|
+
if (
|
|
112
|
+
name.endswith("@plt")
|
|
113
|
+
or name.startswith("_start")
|
|
114
|
+
or name.startswith("frame_dummy")
|
|
115
|
+
or name.startswith("register_tm_clones")
|
|
116
|
+
or name.startswith("deregister_tm_clones")
|
|
117
|
+
or name.startswith("__")
|
|
118
|
+
):
|
|
119
|
+
skip = True
|
|
120
|
+
continue
|
|
121
|
+
else:
|
|
122
|
+
skip = False
|
|
123
|
+
|
|
124
|
+
if not skip:
|
|
125
|
+
filtered_lines.append(line)
|
|
126
|
+
|
|
127
|
+
return "\n".join(filtered_lines)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from typing import Optional, Tuple
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
|
|
6
|
+
class PipelineConfig:
|
|
7
|
+
source_file: str
|
|
8
|
+
|
|
9
|
+
build_dir: str
|
|
10
|
+
asm_dir: str
|
|
11
|
+
|
|
12
|
+
# compiler
|
|
13
|
+
optimize: str = "-O0"
|
|
14
|
+
extra_flags: Optional[list[str]] = None
|
|
15
|
+
|
|
16
|
+
# disassembler
|
|
17
|
+
arch: Optional[str] = None
|
|
18
|
+
full_disasm: bool = False
|
|
19
|
+
|
|
20
|
+
# tokenizer
|
|
21
|
+
entry: str = "main"
|
|
22
|
+
keep_register: bool = False
|
|
23
|
+
|
|
24
|
+
# vectorizer
|
|
25
|
+
model_path: Optional[str] = None
|
|
26
|
+
max_features: Optional[int] = None
|
|
27
|
+
ngram_range: Tuple[int, int] = (1, 2)
|
|
28
|
+
min_df: int = 1
|
|
29
|
+
|
|
30
|
+
# switches
|
|
31
|
+
do_compile: bool = True
|
|
32
|
+
do_disassemble: bool = True
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
from disasm2vec.compiler import compile_c, compile_cpp
|
|
4
|
+
from disasm2vec.disassembler import disassemble
|
|
5
|
+
from disasm2vec.tokenizer import tokenize
|
|
6
|
+
from disasm2vec.vectorizer import Tfidf
|
|
7
|
+
|
|
8
|
+
from .config import PipelineConfig
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def run_pipeline(config: PipelineConfig):
    """Run the end-to-end pipeline for a single source file.

    Flow: source -> compile -> disassemble -> tokenize -> vectorize.

    Returns
    -------
    (X, vectorizer)
        The vector for the file and the loaded ``Tfidf`` instance.

    Raises
    ------
    FileNotFoundError
        If the source file is missing.
    ValueError
        For an unsupported source suffix, or when ``model_path`` is unset.
    """
    source = Path(config.source_file)
    if not source.exists():
        raise FileNotFoundError(source)

    stem = source.stem
    binary_path = Path(config.build_dir) / stem
    asm_path = Path(config.asm_dir) / f"{stem}.asm"

    binary_path.parent.mkdir(parents=True, exist_ok=True)
    asm_path.parent.mkdir(parents=True, exist_ok=True)

    # --- compile -------------------------------------------------------
    if config.do_compile:
        flags = [config.optimize, *(config.extra_flags or [])]

        if source.suffix == ".c":
            compile_c(source, binary_path, flags)
        elif source.suffix == ".cpp":
            compile_cpp(source, binary_path, flags)
        else:
            raise ValueError(
                f"Unsupported source type: {source.suffix}"
            )

    # --- disassemble ---------------------------------------------------
    if config.do_disassemble:
        disassemble(
            binary=binary_path,
            output=asm_path,
            arch=config.arch,
            full=config.full_disasm,
        )

    # --- tokenize ------------------------------------------------------
    corpus = tokenize(
        path=asm_path,
        entry=config.entry,
        keep_register=config.keep_register,
    )

    # --- vectorize -----------------------------------------------------
    if not config.model_path:
        raise ValueError("model_path is required for pipeline")

    vectorizer = Tfidf(
        max_features=config.max_features,
        ngram_range=config.ngram_range,
        min_df=config.min_df,
    )
    vectorizer.load(config.model_path)

    return vectorizer.transform_one(corpus), vectorizer
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from .cleaner import is_instruction_line
|
|
4
|
+
from .normalizer import normalize_operand
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Hex byte pairs objdump prints before the mnemonic (e.g. "48", "89").
BYTE_PATTERN = re.compile(r"^[0-9a-fA-F]{2}$")
# A plausible mnemonic: a letter followed by alphanumerics.
MNEMONIC_PATTERN = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")
# Function header in an objdump listing: "<name>:".
FUNCTION_HEADER = re.compile(r"<(.+?)>:")


def tokenize_instruction(line: str, keep_register: bool = False):
    """Turn one objdump instruction line into a token list, or None.

    The trailing "#" comment, the leading "addr:" column, and the raw
    encoding bytes are stripped. ``call`` collapses to ["call", "FUNC"]
    and any ``j*`` branch to [mnemonic, "JMP"]; otherwise each
    comma-separated operand is normalized via :func:`normalize_operand`.
    Returns None for lines that do not parse as instructions.
    """
    code = line.split("#", 1)[0]
    if ":" not in code:
        return None

    fields = code.split(":", 1)[1].strip().split()
    if not fields:
        return None

    # Skip the raw encoding bytes preceding the mnemonic.
    idx = 0
    while idx < len(fields) and BYTE_PATTERN.match(fields[idx]):
        idx += 1
    if idx >= len(fields):
        return None

    mnemonic = fields[idx].lower()
    if not MNEMONIC_PATTERN.match(mnemonic):
        return None

    tokens = [mnemonic]

    # Calls and jumps are abstracted — the concrete target is irrelevant
    # for the downstream bag-of-instructions representation.
    if mnemonic == "call":
        tokens.append("FUNC")
        return tokens
    if mnemonic.startswith("j"):
        tokens.append("JMP")
        return tokens

    operand_str = " ".join(fields[idx + 1 :]).strip()
    if operand_str:
        tokens.extend(
            normalize_operand(part.strip(), keep_register=keep_register)
            for part in operand_str.split(",")
        )

    return tokens
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _split_functions(path: Path) -> dict[str, list[str]]:
    """Map each function name in the .asm file to its instruction lines.

    Lines before the first "<name>:" header are ignored, as are lines
    rejected by :func:`is_instruction_line`.
    """
    functions: dict[str, list[str]] = {}
    current: str | None = None

    with path.open() as handle:
        for raw in handle:
            match = FUNCTION_HEADER.search(raw)
            if match:
                current = match.group(1)
                functions[current] = []
            elif current and is_instruction_line(raw):
                functions[current].append(raw)

    return functions
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _extract_call_target(line: str) -> str | None:
|
|
75
|
+
if "call" not in line:
|
|
76
|
+
return None
|
|
77
|
+
|
|
78
|
+
if "<" in line and ">" in line:
|
|
79
|
+
return line.split("<")[1].split(">")[0]
|
|
80
|
+
|
|
81
|
+
return None
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _expand_function(
    func_name: str,
    functions: dict[str, list[str]],
    keep_register: bool,
    visited: set,
) -> list[str]:
    """Tokenize *func_name*, inlining user-defined callees at call sites.

    *visited* guards recursion: a function already being (or previously)
    expanded contributes nothing on later calls. Calls to PLT stubs or
    unknown symbols stay in the stream as abstract "call FUNC" tokens.
    """
    if func_name in visited:
        return []
    visited.add(func_name)

    expanded: list[str] = []

    for raw_line in functions.get(func_name, []):
        tokens = tokenize_instruction(raw_line, keep_register=keep_register)
        if not tokens:
            continue

        if tokens[0] == "call":
            callee = _extract_call_target(raw_line)
            is_local = (
                callee is not None
                and "@plt" not in callee
                and callee in functions
            )
            if is_local:
                # Replace the call with the callee's own token stream.
                expanded += _expand_function(
                    callee, functions, keep_register, visited
                )
                continue

        expanded.append(" ".join(tokens))

    return expanded
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def tokenize(
    path: str,
    keep_register: bool = False,
    entry: str = "main",
) -> list[str]:
    """Tokenize one .asm file starting at *entry*, inlining local calls.

    Raises
    ------
    ValueError
        If *entry* is not among the file's functions.
    """
    functions = _split_functions(Path(path))

    if entry not in functions:
        raise ValueError(f"Function '{entry}' not found.")

    return _expand_function(entry, functions, keep_register, visited=set())
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def tokenize_batch(
    asm_dir: str,
    keep_register: bool = False,
    entry: str = "main",
) -> dict[str, list[str]]:
    """Tokenize every *.asm file in *asm_dir*, each file independently.

    Returns a mapping {file name: token list}, processed in sorted order.

    Raises
    ------
    FileNotFoundError
        If the folder is missing.
    ValueError
        If the folder holds no .asm files.
    """
    root = Path(asm_dir)
    if not root.exists():
        raise FileNotFoundError(root)

    asm_files = sorted(root.glob("*.asm"))
    if not asm_files:
        raise ValueError(f"No .asm files found in {root}")

    return {
        asm_file.name: tokenize(
            asm_file,
            keep_register=keep_register,
            entry=entry,
        )
        for asm_file in asm_files
    }
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
# AT&T-syntax register reference, e.g. "%rax".
REGISTER_PATTERN = re.compile(r"%([a-zA-Z0-9]+)")
# Immediate value, e.g. "$0x1f" or "$42".
IMMEDIATE_PATTERN = re.compile(r"\$0x[0-9a-fA-F]+|\$\d+")
# Memory reference: segment override, parenthesized addressing,
# or a bare hex address/displacement.
MEMORY_PATTERN = re.compile(
    r"%[a-z]{2}:|"
    r"\([^)]+\)|"
    r"0x[0-9a-fA-F]+"
)
# Symbol annotation appended by objdump, e.g. "<main+0x4>".
SYMBOL_PATTERN = re.compile(r"<.*?>")


def normalize_operand(operand: str, keep_register: bool = False) -> str:
    """Collapse one operand to an abstract token: IMM, MEM, REG, or itself.

    Symbol annotations are stripped first. Immediates are classified
    BEFORE memory references: a hex immediate such as "$0x10" also
    matches the bare-hex memory pattern, so testing MEM first mislabeled
    every hex immediate as MEM while decimal "$42" stayed IMM. In AT&T
    syntax "$" always marks an immediate, so the IMM-first order is safe.

    Parameters
    ----------
    operand : str
        One comma-separated operand from an instruction line.
    keep_register : bool
        When True, return the concrete register name (lowercased)
        instead of the abstract "REG" token.
    """
    operand = SYMBOL_PATTERN.sub("", operand)

    if IMMEDIATE_PATTERN.search(operand):
        return "IMM"

    if MEMORY_PATTERN.search(operand):
        return "MEM"

    m = REGISTER_PATTERN.search(operand)
    if m:
        return m.group(1).lower() if keep_register else "REG"

    # Unrecognized operands pass through unchanged.
    return operand
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import List, Union
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
class VectorizerBase(ABC):
    """Interface that every vectorizer implementation must satisfy.

    Concrete subclasses operate on documents shaped as
    ``List[List[str]]``: one inner token list per file.
    """

    @abstractmethod
    def fit(self, documents: List[List[str]]):
        """Learn vocabulary/statistics from *documents*."""

    @abstractmethod
    def transform(self, documents: List[List[str]]):
        """Map *documents* to their vector representations."""

    @abstractmethod
    def fit_transform(self, documents: List[List[str]]):
        """Fit on *documents*, then return their vectors."""

    @abstractmethod
    def transform_one(self, document: List[str]):
        """Map a single *document* to its vector."""

    @abstractmethod
    def save(self, path: Union[str, Path]):
        """Persist the fitted vectorizer to *path*."""

    @abstractmethod
    def load(self, path: Union[str, Path]):
        """Restore a previously saved vectorizer from *path*."""
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
from .base import VectorizerBase
|
|
4
|
+
from .tfidf import Tfidf
|
|
5
|
+
|
|
6
|
+
# Directory containing this module.
_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): climbing three levels assumes a source-checkout layout
# (repo_root/disasm2vec/vectorizer/); in an installed wheel this resolves
# outside site-packages — confirm the bundled-model location is intended.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(_CURRENT_DIR)))
# Default location of the pre-trained TF-IDF model used by load_vectorizer().
DEFAULT_MODEL_PATH = os.path.join(_PROJECT_ROOT, "models", "base_tfidf_asm.pkl")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_vectorizer(model_type: str = "tfidf", **kwargs) -> VectorizerBase:
    """Build an unfitted vectorizer of the requested type.

    *kwargs* are forwarded to the implementation's constructor.
    Raises ValueError for an unrecognized *model_type*.
    """
    if model_type != "tfidf":
        raise ValueError(f"Unknown vectorizer type: {model_type}")
    return Tfidf(**kwargs)
|
|
19
|
+
|
|
20
|
+
def load_vectorizer(path: str = DEFAULT_MODEL_PATH, model_type: str = "tfidf") -> VectorizerBase:
    """Load a fitted vectorizer of *model_type* from *path*.

    Raises ValueError for an unrecognized *model_type*.
    """
    if model_type != "tfidf":
        raise ValueError(f"Unknown vectorizer type: {model_type}")
    return Tfidf().load(path)
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
from typing import List, Iterable, Optional, Tuple, Union
|
|
2
|
+
import pickle
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
5
|
+
from .base import VectorizerBase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def identity(x):
    """Return *x* unchanged.

    Used as TfidfVectorizer's tokenizer/preprocessor so pre-tokenized
    documents pass through untouched. Kept as a named module-level
    function (not a lambda) so that fitted vectorizers referencing it
    remain picklable; renaming it would break previously saved models.
    """
    return x
|
|
10
|
+
|
|
11
|
+
class Tfidf(VectorizerBase):
    """TF-IDF vectorizer over pre-tokenized assembly instructions.

    Documents are ``List[List[str]]``: one inner list of instruction
    tokens per file. sklearn's own tokenization/preprocessing is
    disabled (identity functions, ``token_pattern=None``) because the
    input is already tokenized.
    """

    def __init__(
        self,
        *,
        max_features: Optional[int] = None,
        ngram_range: Tuple[int, int] = (1, 2),
        min_df: int = 1,
        max_df: float | int = 1.0,
        use_idf: bool = True,
        norm: str | None = "l2",
    ):
        self.vectorizer = TfidfVectorizer(
            tokenizer=identity,
            preprocessor=identity,
            token_pattern=None,
            lowercase=False,
            max_features=max_features,
            ngram_range=ngram_range,
            min_df=min_df,
            max_df=max_df,
            use_idf=use_idf,
            norm=norm,
        )
        # Flipped by fit()/fit_transform()/load(); guards transform paths.
        self._fitted = False

    def fit(self, documents: List[List[str]]):
        """Learn vocabulary and IDF weights from *documents*."""
        self._validate_docs(documents)
        self.vectorizer.fit(documents)
        self._fitted = True
        return self

    def transform(self, documents: List[List[str]]):
        """Vectorize *documents* with the fitted model."""
        self._check_fitted()
        self._validate_docs(documents)
        return self.vectorizer.transform(documents)

    def fit_transform(self, documents: List[List[str]]):
        """Fit on *documents* and return their vectors in one pass."""
        self._validate_docs(documents)
        matrix = self.vectorizer.fit_transform(documents)
        self._fitted = True
        return matrix

    def transform_one(self, document: List[str]):
        """Vectorize a single token list (returns a one-row matrix)."""
        self._check_fitted()
        return self.vectorizer.transform([document])

    def features(self) -> List[str]:
        """Return the learned feature (n-gram) names."""
        self._check_fitted()
        return self.vectorizer.get_feature_names_out().tolist()

    def save(self, path: Union[str, Path]):
        """Pickle the fitted sklearn vectorizer to *path*."""
        self._check_fitted()
        with open(path, "wb") as fh:
            pickle.dump(self.vectorizer, fh)

    def load(self, path: Union[str, Path]):
        """Restore a pickled vectorizer from *path*; marks self as fitted."""
        with open(path, "rb") as fh:
            self.vectorizer = pickle.load(fh)
        self._fitted = True
        return self

    def _validate_docs(self, docs):
        """Reject inputs that are not an iterable of token lists."""
        if not isinstance(docs, Iterable):
            raise TypeError("documents must be iterable")
        for doc in docs:
            if not isinstance(doc, list):
                raise TypeError(
                    "Each document must be List[str]"
                )

    def _check_fitted(self):
        """Raise unless fit()/fit_transform()/load() has run."""
        if not self._fitted:
            raise RuntimeError(
                "Vectorizer not fitted. Call fit() first."
            )
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: disasm2vec
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: disasm2vec is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
|
|
5
|
+
Author-email: Ahmad Nur Rohim <ahmadnurrohim2812@gmail.com>
|
|
6
|
+
Project-URL: Repository, https://github.com/Anro128/disasm2vec
|
|
7
|
+
Project-URL: Issues, https://github.com/Anro128/disasm2vec/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/Anro128/disasm2vec/blob/main/CHANGELOG.md
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: scikit-learn>=1.0.0
|
|
13
|
+
Requires-Dist: numpy>=1.20.0
|
|
14
|
+
Dynamic: license-file
|
|
15
|
+
|
|
16
|
+
# disasm2vec
|
|
17
|
+
|
|
18
|
+
**disasm2vec** is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
|
|
19
|
+
|
|
20
|
+
## Features
|
|
21
|
+
|
|
22
|
+
- **Automated Compilation**: Seamlessly compiles C and C++ source files using GCC.
|
|
23
|
+
- **Disassembly Wrapper**: Extracts assembly instructions using `objdump`, supporting both full and function-specific disassembly.
|
|
24
|
+
- **Intelligent Tokenization**: Normalizes and cleans assembly instructions, with options to preserve or abstract register names.
|
|
25
|
+
- **Vectorization**: Implements TF-IDF vectorization with a flexible factory pattern for easy model management.
|
|
26
|
+
- **End-to-End Pipeline**: Orchestrates the entire process from source code to vector embedding.
|
|
27
|
+
- **Extensible Architecture**: Built with abstract base classes to easily support new compilers, disassemblers, or vectorizers.
|
|
28
|
+
|
|
29
|
+
## Prerequisites
|
|
30
|
+
|
|
31
|
+
- **Python**: version 3.10 or higher.
|
|
32
|
+
- **Operating System**: Linux or Windows Subsystem for Linux (WSL).
|
|
33
|
+
- **GCC**: Required for compiling source files.
|
|
34
|
+
- **Objdump**: Required for disassembling binaries.
|
|
35
|
+
|
|
36
|
+
**Note**: The compilation (`gcc`) and disassembly (`objdump`) modules rely on system-level tools typically found in Linux environments. If you are on Windows, please use WSL.
|
|
37
|
+
|
|
38
|
+
Ensure both `gcc` and `objdump` are installed and available in your system's PATH.
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
Install directly from PyPI:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install disasm2vec
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Or install from source:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
git clone https://github.com/Anro128/disasm2vec.git
|
|
52
|
+
cd disasm2vec
|
|
53
|
+
pip install .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Usage
|
|
57
|
+
|
|
58
|
+
The core of the framework is the `run_pipeline` function, which processes a source file based on a `PipelineConfig` object.
|
|
59
|
+
|
|
60
|
+
### Basic Example
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from disasm2vec.pipeline import PipelineConfig, run_pipeline
|
|
64
|
+
|
|
65
|
+
# Configure the pipeline
|
|
66
|
+
config = PipelineConfig(
|
|
67
|
+
source_file="examples/sample.c",
|
|
68
|
+
build_dir="build",
|
|
69
|
+
asm_dir="asm",
|
|
70
|
+
model_path="models/base_tfidf_asm.pkl" # Path to pre-trained model or where to save a new one
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
# Run the pipeline
|
|
74
|
+
# returns:
|
|
75
|
+
# vector: The vector representation of the source file
|
|
76
|
+
# vectorizer: The fitted vectorizer instance
|
|
77
|
+
vector, vectorizer = run_pipeline(config)
|
|
78
|
+
|
|
79
|
+
print(f"Generated Vector Shape: {vector.shape}")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
|
|
84
|
+
This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
disasm2vec/__init__.py,sha256=Noj0HXMvVAx8pDKbFHjrGbwHMa4Nax6UF4PNhCQXBGA,280
|
|
2
|
+
disasm2vec/compiler/__init__.py,sha256=i0EbRV8qOxOS3WNF604UqA_oHq0y8NYfs7Bsm0jokB8,117
|
|
3
|
+
disasm2vec/compiler/errors.py,sha256=RVehQDn0v-mpERoyIsOzMOZBTvYVrn2zsVwH-05mr1o,88
|
|
4
|
+
disasm2vec/compiler/gcc.py,sha256=nrdqCgmAU0tSWJXVbZdZnnwwcsqrzHsUD06-Q0KTWJQ,2311
|
|
5
|
+
disasm2vec/disassembler/__init__.py,sha256=d2lRMPfwl4D4pMf1QdBU9F62BXORnqRLOrOF9ML7XJg,119
|
|
6
|
+
disasm2vec/disassembler/errors.py,sha256=Jjlj22kmSnAITo3RAxPHmSN7XedupeeB1ok9S7HiZyE,89
|
|
7
|
+
disasm2vec/disassembler/objdump.py,sha256=TOckxmwXKUbQ6kUxsvkpPEo9wwLPiXGehPHEK0m8IEA,3245
|
|
8
|
+
disasm2vec/pipeline/__init__.py,sha256=TY5yYVka9uVWPa4Bgbxii_Llf21aZ2d2hy9lVbAMnv0,131
|
|
9
|
+
disasm2vec/pipeline/config.py,sha256=SHjVjWcXS6GJPbn53CkQpUi-CkXsSeuk3ssNa2JjDVI,666
|
|
10
|
+
disasm2vec/pipeline/runner.py,sha256=ylBhaGZ-1XxEhLBdvzzYDdrYBaEA5H11ZiNj9IyIU8s,2011
|
|
11
|
+
disasm2vec/tokenizer/__init__.py,sha256=CMO7ZS1-_weaHmC7HWJkOn9sju3ZYjnV27lcT3YugSo,102
|
|
12
|
+
disasm2vec/tokenizer/cleaner.py,sha256=exyjO1Vbd2Tulzw4k1_jGM-QP6rz9I4u8z2uEaYvaNE,251
|
|
13
|
+
disasm2vec/tokenizer/core.py,sha256=7UvVP3W8sAHFV70cUqgti7ZuzLrU6Szgxy-UsiY1cHM,4288
|
|
14
|
+
disasm2vec/tokenizer/normalizer.py,sha256=4iG9bS8vIGkbNqpg32lCzgu8q4VozpfzvQ1LPRp59hE,653
|
|
15
|
+
disasm2vec/vectorizer/__init__.py,sha256=bxqYPOky_0LDWF2dDPazcDOykUp0WAgA_ImBIVvreBU,192
|
|
16
|
+
disasm2vec/vectorizer/base.py,sha256=zjwZxHcoweq52xvAmSBHWVT1GyzGyQ5RKJv9prmLbwo,1136
|
|
17
|
+
disasm2vec/vectorizer/factory.py,sha256=bUbJ3DmffdFhw-x8vwyKIQqzW4PKayZ6IL65MNjvNdM,889
|
|
18
|
+
disasm2vec/vectorizer/tfidf.py,sha256=ReWQKlnh0HLOhAY6f688ViCJjQkimwVX4OdJClX9AjU,3239
|
|
19
|
+
disasm2vec-0.1.0.dist-info/licenses/LICENSE,sha256=RalBa8eUTXV7G8B7ZD0rD4su69Pr55NFd65d6W0UQNc,1093
|
|
20
|
+
disasm2vec-0.1.0.dist-info/METADATA,sha256=dSld2Qy4LlDwXtfE65PTIhQSdkVjEuckLLiU5rY22-o,3335
|
|
21
|
+
disasm2vec-0.1.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
|
|
22
|
+
disasm2vec-0.1.0.dist-info/top_level.txt,sha256=3OV2TKP_vsNJzItCmKI9CRtoqdoq53QrWrLfL8TxNkQ,11
|
|
23
|
+
disasm2vec-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ahmad Nur Rohim
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
disasm2vec
|