disasm2vec 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
disasm2vec/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """disasm2vec"""
2
+
3
+ from . import compiler
4
+ from . import disassembler
5
+ from . import tokenizer
6
+ from . import vectorizer
7
+ from . import pipeline
8
+
9
+ __all__ = [
10
+ "compiler",
11
+ "disassembler",
12
+ "tokenizer",
13
+ "vectorizer",
14
+ "pipeline",
15
+ ]
16
+
17
+ __version__ = "0.1.0"
@@ -0,0 +1,3 @@
1
+ from .gcc import compile_c, compile_cpp, compile_folder
2
+
3
+ __all__ = ["compile_c", "compile_cpp", "compile_folder"]
@@ -0,0 +1,3 @@
1
class CompilationError(Exception):
    """Raised when invoking the compiler fails (non-zero exit or missing tool)."""
@@ -0,0 +1,106 @@
1
+ import subprocess
2
+ from pathlib import Path
3
+ from .errors import CompilationError
4
+
5
def compile_c(
    source: str,
    output: str,
    flags: list[str] | None = None
):
    """Compile a C source file into *output* using gcc.

    Thin wrapper around :func:`_compile`; see it for the raised errors.
    """
    _compile(compiler="gcc", source=source, output=output, flags=flags)
19
+
20
+
21
def compile_cpp(
    source: str,
    output: str,
    flags: list[str] | None = None
):
    """Compile a C++ source file into *output* using g++.

    Thin wrapper around :func:`_compile`; see it for the raised errors.
    """
    _compile(compiler="g++", source=source, output=output, flags=flags)
35
+
36
+
37
+ def _compile(
38
+ compiler: str,
39
+ source: str,
40
+ output: str,
41
+ flags: list[str] | None = None,
42
+ ):
43
+ source = Path(source)
44
+ output = Path(output)
45
+
46
+ if not source.exists():
47
+ raise FileNotFoundError(source)
48
+
49
+ cmd = [
50
+ compiler,
51
+ str(source),
52
+ "-o",
53
+ str(output),
54
+ ]
55
+
56
+ if flags:
57
+ cmd.extend(flags)
58
+
59
+ try:
60
+ subprocess.run(
61
+ cmd,
62
+ check=True,
63
+ stdout=subprocess.PIPE,
64
+ stderr=subprocess.PIPE,
65
+ text=True,
66
+ )
67
+ except subprocess.CalledProcessError as e:
68
+ raise CompilationError(
69
+ f"Compilation failed for {source}:\n{e.stderr}"
70
+ ) from e
71
+
72
+
73
+ def compile_folder(
74
+ src_dir: str,
75
+ out_dir: str,
76
+ optimize: str = "-O0",
77
+ extra_flags: list[str] | None = None,
78
+ ):
79
+ """
80
+ Compile all .c and .cpp files in a folder (recursively).
81
+ """
82
+ src_dir = Path(src_dir)
83
+ out_dir = Path(out_dir)
84
+ out_dir.mkdir(parents=True, exist_ok=True)
85
+
86
+ extra_flags = extra_flags or []
87
+
88
+ sources = list(src_dir.rglob("*.c")) + list(src_dir.rglob("*.cpp"))
89
+
90
+ if not sources:
91
+ raise ValueError(f"No C/C++ files found in {src_dir}")
92
+
93
+ for src in sources:
94
+ output = out_dir / src.stem
95
+ flags = [optimize, *extra_flags]
96
+
97
+ try:
98
+ if src.suffix == ".c":
99
+ compile_c(src, output, flags)
100
+ else:
101
+ compile_cpp(src, output, flags)
102
+
103
+ except CompilationError as e:
104
+ raise CompilationError(
105
+ f"Compilation failed for {src}:\n{e}"
106
+ ) from e
@@ -0,0 +1,6 @@
1
+ from .objdump import disassemble, disassemble_folder
2
+
3
+ __all__ = [
4
+ "disassemble",
5
+ "disassemble_folder",
6
+ ]
@@ -0,0 +1,2 @@
1
class DisassemblyError(RuntimeError):
    """Signals that running objdump on a binary did not succeed."""
@@ -0,0 +1,127 @@
1
+ import subprocess
2
+ from pathlib import Path
3
+ from .errors import DisassemblyError
4
+
5
+
6
def disassemble(
    binary: str,
    output: str,
    arch: str | None = None,
    full: bool = False,
):
    """
    Disassemble a single binary using objdump.

    Parameters
    ----------
    binary : str
        Path to compiled binary
    output : str
        Output .asm file
    arch : str | None
        Optional architecture (e.g. i386:x86-64)
    full : bool
        If True, disassemble all functions.
        If False, exclude builtin / PLT functions.
    """
    binary_path = Path(binary)
    asm_path = Path(output)

    if not binary_path.exists():
        raise FileNotFoundError(binary_path)

    asm_path.parent.mkdir(parents=True, exist_ok=True)

    # Disassemble only the executable .text section.
    cmd = ["objdump", "-d", "--section=.text", str(binary_path)]
    if arch:
        cmd.extend(["-m", arch])

    try:
        completed = subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        raise DisassemblyError(
            f"objdump failed for {binary_path}:\n{e.stderr}"
        ) from e

    listing = completed.stdout
    if not full:
        listing = _filter_builtin_functions(listing)

    asm_path.write_text(listing)
60
+
61
+
62
def disassemble_folder(
    bin_dir: str,
    out_dir: str,
    full: bool = False,
):
    """
    Disassemble all binaries in a folder.

    Parameters
    ----------
    bin_dir : str
        Folder containing compiled binaries
    out_dir : str
        Folder to store .asm outputs
    full : bool
        If True, disassemble all functions.
        If False, exclude builtin / PLT functions.
    """
    src = Path(bin_dir)
    dst = Path(out_dir)
    dst.mkdir(parents=True, exist_ok=True)

    binaries = [entry for entry in src.iterdir() if entry.is_file()]
    if not binaries:
        raise ValueError(f"No binaries found in {src}")

    for binary in binaries:
        target = dst / f"{binary.name}.asm"
        try:
            disassemble(binary, target, full=full)
        except DisassemblyError as e:
            raise DisassemblyError(
                f"Disassembly failed for {binary}:\n{e}"
            ) from e
98
+
99
+
100
+ def _filter_builtin_functions(asm: str) -> str:
101
+ """
102
+ Remove builtin / PLT / runtime functions from objdump output.
103
+ """
104
+ filtered_lines = []
105
+
106
+ skip = False
107
+ for line in asm.splitlines():
108
+ if "<" in line and ">" in line and line.strip().endswith(":"):
109
+ name = line.split("<")[1].split(">")[0]
110
+
111
+ if (
112
+ name.endswith("@plt")
113
+ or name.startswith("_start")
114
+ or name.startswith("frame_dummy")
115
+ or name.startswith("register_tm_clones")
116
+ or name.startswith("deregister_tm_clones")
117
+ or name.startswith("__")
118
+ ):
119
+ skip = True
120
+ continue
121
+ else:
122
+ skip = False
123
+
124
+ if not skip:
125
+ filtered_lines.append(line)
126
+
127
+ return "\n".join(filtered_lines)
@@ -0,0 +1,7 @@
1
+ from .config import PipelineConfig
2
+ from .runner import run_pipeline
3
+
4
+ __all__ = [
5
+ "run_pipeline",
6
+ "PipelineConfig"
7
+ ]
@@ -0,0 +1,32 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple
3
+
4
+
5
+ @dataclass
6
+ class PipelineConfig:
7
+ source_file: str
8
+
9
+ build_dir: str
10
+ asm_dir: str
11
+
12
+ # compiler
13
+ optimize: str = "-O0"
14
+ extra_flags: Optional[list[str]] = None
15
+
16
+ # disassembler
17
+ arch: Optional[str] = None
18
+ full_disasm: bool = False
19
+
20
+ # tokenizer
21
+ entry: str = "main"
22
+ keep_register: bool = False
23
+
24
+ # vectorizer
25
+ model_path: Optional[str] = None
26
+ max_features: Optional[int] = None
27
+ ngram_range: Tuple[int, int] = (1, 2)
28
+ min_df: int = 1
29
+
30
+ # switches
31
+ do_compile: bool = True
32
+ do_disassemble: bool = True
@@ -0,0 +1,78 @@
1
+ from pathlib import Path
2
+
3
+ from disasm2vec.compiler import compile_c, compile_cpp
4
+ from disasm2vec.disassembler import disassemble
5
+ from disasm2vec.tokenizer import tokenize
6
+ from disasm2vec.vectorizer import Tfidf
7
+
8
+ from .config import PipelineConfig
9
+
10
+
11
def run_pipeline(config: PipelineConfig):
    """
    Run pipeline for single source file.

    Flow:
        source -> compile -> disassemble -> tokenize -> vectorize

    Parameters
    ----------
    config : PipelineConfig
        All pipeline settings; ``model_path`` must point to a saved
        TF-IDF model because this function only loads, never fits.

    Returns
    -------
    tuple
        ``(X, vectorizer)`` — the vector representation of the source
        file and the loaded :class:`Tfidf` instance.

    Raises
    ------
    FileNotFoundError
        If ``config.source_file`` does not exist.
    ValueError
        If ``model_path`` is missing, or the source suffix is not
        ``.c`` / ``.cpp``.
    """
    source = Path(config.source_file)

    if not source.exists():
        raise FileNotFoundError(source)

    # Fail fast: the vectorize stage cannot run without a model, so
    # validate *before* spending time on compilation and disassembly.
    if not config.model_path:
        raise ValueError("model_path is required for pipeline")

    stem = source.stem

    binary_path = Path(config.build_dir) / stem
    asm_path = Path(config.asm_dir) / f"{stem}.asm"

    binary_path.parent.mkdir(parents=True, exist_ok=True)
    asm_path.parent.mkdir(parents=True, exist_ok=True)

    # COMPILE
    if config.do_compile:
        flags = [config.optimize]
        if config.extra_flags:
            flags.extend(config.extra_flags)

        if source.suffix == ".c":
            compile_c(source, binary_path, flags)
        elif source.suffix == ".cpp":
            compile_cpp(source, binary_path, flags)
        else:
            raise ValueError(
                f"Unsupported source type: {source.suffix}"
            )

    # DISASSEMBLE
    if config.do_disassemble:
        disassemble(
            binary=binary_path,
            output=asm_path,
            arch=config.arch,
            full=config.full_disasm,
        )

    # TOKENIZE
    corpus = tokenize(
        path=asm_path,
        entry=config.entry,
        keep_register=config.keep_register,
    )

    # VECTORIZE (load a pre-trained model, then embed the corpus)
    vectorizer = Tfidf(
        max_features=config.max_features,
        ngram_range=config.ngram_range,
        min_df=config.min_df,
    )
    vectorizer.load(config.model_path)

    X = vectorizer.transform_one(corpus)

    return X, vectorizer
@@ -0,0 +1,4 @@
1
+ from .core import tokenize, tokenize_batch
2
+
3
+ __all__ = ["tokenize",
4
+ "tokenize_batch",]
@@ -0,0 +1,10 @@
1
+ import re
2
+
3
# Objdump instruction lines: optional indent, hex address, colon, whitespace.
INSTRUCTION_PATTERN = re.compile(r"\s*[0-9a-fA-F]+:\s+")


def is_instruction_line(line: str) -> bool:
    """Return True when *line* looks like an objdump instruction line."""
    return INSTRUCTION_PATTERN.match(line) is not None
@@ -0,0 +1,184 @@
1
+ import re
2
+ from pathlib import Path
3
+ from .cleaner import is_instruction_line
4
+ from .normalizer import normalize_operand
5
+
6
+
7
# Raw opcode byte ("55"), mnemonic ("movq"), and "<name>:" header shapes.
BYTE_PATTERN = re.compile(r"^[0-9a-fA-F]{2}$")
MNEMONIC_PATTERN = re.compile(r"^[a-zA-Z][a-zA-Z0-9]*$")
FUNCTION_HEADER = re.compile(r"<(.+?)>:")


def tokenize_instruction(line: str, keep_register: bool = False):
    """
    Turn one objdump instruction line into a token list.

    Returns ``[mnemonic, operand_token, ...]``, or ``None`` when the
    line carries no decodable instruction.  ``call`` targets and jump
    (``j*``) targets are abstracted to ``FUNC`` / ``JMP``.
    """
    # Drop the trailing objdump comment, if any.
    code = line.split("#", 1)[0]

    if ":" not in code:
        return None

    parts = code.split(":", 1)[1].strip().split()
    if not parts:
        return None

    # Skip the raw opcode bytes that precede the mnemonic.
    idx = 0
    while idx < len(parts) and BYTE_PATTERN.match(parts[idx]):
        idx += 1
    if idx == len(parts):
        return None

    mnemonic = parts[idx].lower()
    if not MNEMONIC_PATTERN.match(mnemonic):
        return None

    if mnemonic == "call":
        return [mnemonic, "FUNC"]
    if mnemonic.startswith("j"):
        return [mnemonic, "JMP"]

    tokens = [mnemonic]
    operand_str = " ".join(parts[idx + 1:]).strip()
    if operand_str:
        tokens.extend(
            normalize_operand(op.strip(), keep_register=keep_register)
            for op in operand_str.split(",")
        )
    return tokens
54
+
55
+
56
def _split_functions(path: Path) -> dict[str, list[str]]:
    """
    Group the instruction lines of a disassembly file by function.

    Returns a mapping ``{function_name: [instruction_line, ...]}``.
    Lines appearing before the first ``<name>:`` header are ignored.
    """
    functions: dict[str, list[str]] = {}
    name = None

    with path.open() as handle:
        for raw in handle:
            match = FUNCTION_HEADER.search(raw)
            if match:
                name = match.group(1)
                functions[name] = []
            elif name and is_instruction_line(raw):
                functions[name].append(raw)

    return functions
72
+
73
+
74
+ def _extract_call_target(line: str) -> str | None:
75
+ if "call" not in line:
76
+ return None
77
+
78
+ if "<" in line and ">" in line:
79
+ return line.split("<")[1].split(">")[0]
80
+
81
+ return None
82
+
83
+
84
def _expand_function(
    func_name: str,
    functions: dict[str, list[str]],
    keep_register: bool,
    visited: set,
) -> list[str]:
    """
    Inline user-defined function bodies at call sites.

    *visited* guards against infinite recursion: a function already
    seen expands to nothing on subsequent encounters.
    """
    if func_name in visited:
        return []
    visited.add(func_name)

    expanded: list[str] = []

    for line in functions.get(func_name, []):
        tokens = tokenize_instruction(line, keep_register=keep_register)
        if not tokens:
            continue

        if tokens[0] == "call":
            callee = _extract_call_target(line)
            # Only inline calls that resolve to a user-defined function
            # in this file; library calls go through @plt and are dropped.
            is_local = (
                callee is not None
                and "@plt" not in callee
                and callee in functions
            )
            if is_local:
                expanded += _expand_function(
                    callee, functions, keep_register, visited
                )
            continue

        expanded.append(" ".join(tokens))

    return expanded
130
+
131
+
132
def tokenize(
    path: str,
    keep_register: bool = False,
    entry: str = "main",
) -> list[str]:
    """
    Parse file and inline user-defined function calls
    inside selected entry function.
    """
    functions = _split_functions(Path(path))

    if entry not in functions:
        raise ValueError(f"Function '{entry}' not found.")

    return _expand_function(entry, functions, keep_register, visited=set())
154
+
155
+
156
def tokenize_batch(
    asm_dir: str,
    keep_register: bool = False,
    entry: str = "main",
) -> dict[str, list[str]]:
    """
    Parse all .asm files in a folder.
    Each file processed independently.
    """
    folder = Path(asm_dir)

    if not folder.exists():
        raise FileNotFoundError(folder)

    asm_files = sorted(folder.glob("*.asm"))
    if not asm_files:
        raise ValueError(f"No .asm files found in {folder}")

    return {
        asm_file.name: tokenize(
            asm_file,
            keep_register=keep_register,
            entry=entry,
        )
        for asm_file in asm_files
    }
@@ -0,0 +1,26 @@
1
+ import re
2
+
3
REGISTER_PATTERN = re.compile(r"%([a-zA-Z0-9]+)")
IMMEDIATE_PATTERN = re.compile(r"\$0x[0-9a-fA-F]+|\$\d+")
MEMORY_PATTERN = re.compile(
    r"%[a-z]{2}:|"
    r"\([^)]+\)|"
    r"0x[0-9a-fA-F]+"
)
SYMBOL_PATTERN = re.compile(r"<.*?>")


def normalize_operand(operand: str, keep_register: bool = False) -> str:
    """
    Abstract a single AT&T-syntax operand to a coarse token.

    Returns "IMM" for immediates, "MEM" for memory references, "REG"
    (or the concrete register name when *keep_register* is True) for
    registers, and the operand unchanged otherwise.
    """
    # Strip symbol annotations such as "<main+0x4>" first.
    operand = SYMBOL_PATTERN.sub("", operand)

    # BUGFIX: immediates must be tested before memory operands. A hex
    # immediate like "$0x10" also contains "0x10", which MEMORY_PATTERN
    # would otherwise match, misclassifying the immediate as MEM.
    if IMMEDIATE_PATTERN.search(operand):
        return "IMM"

    if MEMORY_PATTERN.search(operand):
        return "MEM"

    m = REGISTER_PATTERN.search(operand)
    if m:
        return m.group(1).lower() if keep_register else "REG"

    return operand
@@ -0,0 +1,5 @@
1
+ from .base import VectorizerBase
2
+ from .tfidf import Tfidf
3
+ from .factory import get_vectorizer, load_vectorizer
4
+
5
+ __all__ = ["VectorizerBase", "Tfidf", "get_vectorizer", "load_vectorizer"]
@@ -0,0 +1,50 @@
1
+ from abc import ABC, abstractmethod
2
+ from typing import List, Union
3
+ from pathlib import Path
4
+
5
class VectorizerBase(ABC):
    """
    Abstract base class for all vectorizers.

    Concrete implementations must provide the fit / transform and
    persistence operations declared below.
    """

    @abstractmethod
    def fit(self, documents: List[List[str]]):
        """Learn vocabulary/statistics from *documents*."""

    @abstractmethod
    def transform(self, documents: List[List[str]]):
        """Map *documents* to their vector representations."""

    @abstractmethod
    def fit_transform(self, documents: List[List[str]]):
        """Fit on *documents*, then transform them in one step."""

    @abstractmethod
    def transform_one(self, document: List[str]):
        """Map a single document to its vector representation."""

    @abstractmethod
    def save(self, path: Union[str, Path]):
        """Persist the fitted vectorizer to *path*."""

    @abstractmethod
    def load(self, path: Union[str, Path]):
        """Restore a vectorizer previously written by :meth:`save`."""
@@ -0,0 +1,27 @@
1
+ import os
2
+
3
+ from .base import VectorizerBase
4
+ from .tfidf import Tfidf
5
+
6
_CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(_CURRENT_DIR)))
# NOTE(review): this resolves three directory levels above the package; for a
# wheel installed into site-packages the "models" folder will not exist there —
# confirm the default model is actually shipped/reachable in installs.
DEFAULT_MODEL_PATH = os.path.join(_PROJECT_ROOT, "models", "base_tfidf_asm.pkl")


def get_vectorizer(model_type: str = "tfidf", **kwargs) -> VectorizerBase:
    """
    Factory function to get a vectorizer instance.
    """
    if model_type == "tfidf":
        return Tfidf(**kwargs)
    raise ValueError(f"Unknown vectorizer type: {model_type}")


def load_vectorizer(path: str = DEFAULT_MODEL_PATH, model_type: str = "tfidf") -> VectorizerBase:
    """
    Load a vectorizer from a file.
    """
    if model_type == "tfidf":
        return Tfidf().load(path)
    raise ValueError(f"Unknown vectorizer type: {model_type}")
@@ -0,0 +1,123 @@
1
+ from typing import List, Iterable, Optional, Tuple, Union
2
+ import pickle
3
+ from pathlib import Path
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from .base import VectorizerBase
6
+
7
+
8
def identity(x):
    """Return *x* unchanged.

    Defined at module level (rather than as a lambda) so the fitted
    TfidfVectorizer that references it can be pickled.
    """
    return x
10
+
11
class Tfidf(VectorizerBase):
    """
    TF-IDF vectorizer for assembly instruction tokens.

    Input format
    ------------
    documents must be:
        List[List[str]]

    where:
        outer list = files
        inner list = instructions
    """

    def __init__(
        self,
        *,
        max_features: Optional[int] = None,
        ngram_range: Tuple[int, int] = (1, 2),
        min_df: int = 1,
        max_df: float | int = 1.0,
        use_idf: bool = True,
        norm: str | None = "l2",
    ):
        # Documents arrive pre-tokenized, so the identity function is
        # used for both tokenizer and preprocessor, and lowercasing /
        # the default token pattern are disabled.
        self.vectorizer = TfidfVectorizer(
            tokenizer=identity,
            preprocessor=identity,
            token_pattern=None,
            lowercase=False,
            max_features=max_features,
            ngram_range=ngram_range,
            min_df=min_df,
            max_df=max_df,
            use_idf=use_idf,
            norm=norm,
        )
        self._fitted = False

    def fit(self, documents: List[List[str]]):
        """Learn vocabulary and IDF weights from the corpus."""
        self._validate_docs(documents)
        self.vectorizer.fit(documents)
        self._fitted = True
        return self

    def transform(self, documents: List[List[str]]):
        """Vectorize *documents* with the already-fitted model."""
        self._check_fitted()
        self._validate_docs(documents)
        return self.vectorizer.transform(documents)

    def fit_transform(self, documents: List[List[str]]):
        """Fit on *documents* and return their vectors in one pass."""
        self._validate_docs(documents)
        matrix = self.vectorizer.fit_transform(documents)
        self._fitted = True
        return matrix

    def transform_one(self, document: List[str]):
        """Vectorize a single token list (one file)."""
        self._check_fitted()
        return self.vectorizer.transform([document])

    def features(self) -> List[str]:
        """Return the learned feature (n-gram) names."""
        self._check_fitted()
        return self.vectorizer.get_feature_names_out().tolist()

    def save(self, path: Union[str, Path]):
        """Pickle the fitted sklearn vectorizer to *path*."""
        self._check_fitted()
        with open(path, "wb") as fh:
            pickle.dump(self.vectorizer, fh)

    def load(self, path: Union[str, Path]):
        """Restore a vectorizer pickled by :meth:`save`; returns self.

        NOTE: pickle deserialization is only safe for trusted model
        files — never load models from untrusted sources.
        """
        with open(path, "rb") as fh:
            self.vectorizer = pickle.load(fh)
        self._fitted = True
        return self

    def _validate_docs(self, docs):
        # Cheap structural check: an iterable of token lists.
        if not isinstance(docs, Iterable):
            raise TypeError("documents must be iterable")
        for doc in docs:
            if not isinstance(doc, list):
                raise TypeError(
                    "Each document must be List[str]"
                )

    def _check_fitted(self):
        # Guard every transform/save against use before fit()/load().
        if not self._fitted:
            raise RuntimeError(
                "Vectorizer not fitted. Call fit() first."
            )
@@ -0,0 +1,84 @@
1
+ Metadata-Version: 2.4
2
+ Name: disasm2vec
3
+ Version: 0.1.0
4
+ Summary: disasm2vec is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
5
+ Author-email: Ahmad Nur Rohim <ahmadnurrohim2812@gmail.com>
6
+ Project-URL: Repository, https://github.com/Anro128/disasm2vec
7
+ Project-URL: Issues, https://github.com/Anro128/disasm2vec/issues
8
+ Project-URL: Changelog, https://github.com/Anro128/disasm2vec/blob/main/CHANGELOG.md
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: scikit-learn>=1.0.0
13
+ Requires-Dist: numpy>=1.20.0
14
+ Dynamic: license-file
15
+
16
+ # disasm2vec
17
+
18
+ **disasm2vec** is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
19
+
20
+ ## Features
21
+
22
+ - **Automated Compilation**: Seamlessly compiles C and C++ source files using GCC.
23
+ - **Disassembly Wrapper**: Extracts assembly instructions using `objdump`, supporting both full and function-specific disassembly.
24
+ - **Intelligent Tokenization**: Normalizes and cleans assembly instructions, with options to preserve or abstract register names.
25
+ - **Vectorization**: Implements TF-IDF vectorization with a flexible factory pattern for easy model management.
26
+ - **End-to-End Pipeline**: Orchestrates the entire process from source code to vector embedding.
27
+ - **Extensible Architecture**: Built with abstract base classes to easily support new compilers, disassemblers, or vectorizers.
28
+
29
+ ## Prerequisites
30
+
31
+ - **Python**: version 3.10 or higher.
32
+ - **Operating System**: Linux or Windows Subsystem for Linux (WSL).
33
+ - **GCC**: Required for compiling source files.
34
+ - **Objdump**: Required for disassembling binaries.
35
+
36
+ **Note**: The compilation (`gcc`) and disassembly (`objdump`) modules rely on system-level tools typically found in Linux environments. If you are on Windows, please use WSL.
37
+
38
+ Ensure both `gcc` and `objdump` are installed and available in your system's PATH.
39
+
40
+ ## Installation
41
+
42
+ Install directly from PyPI:
43
+
44
+ ```bash
45
+ pip install disasm2vec
46
+ ```
47
+
48
+ Or install from source:
49
+
50
+ ```bash
51
+ git clone https://github.com/Anro128/disasm2vec.git
52
+ cd disasm2vec
53
+ pip install .
54
+ ```
55
+
56
+ ## Usage
57
+
58
+ The core of the framework is the `run_pipeline` function, which processes a source file based on a `PipelineConfig` object.
59
+
60
+ ### Basic Example
61
+
62
+ ```python
63
+ from disasm2vec.pipeline import PipelineConfig, run_pipeline
64
+
65
+ # Configure the pipeline
66
+ config = PipelineConfig(
67
+ source_file="examples/sample.c",
68
+ build_dir="build",
69
+ asm_dir="asm",
70
+     model_path="models/base_tfidf_asm.pkl" # Path to a pre-trained model (required; the pipeline loads it)
71
+ )
72
+
73
+ # Run the pipeline
74
+ # returns:
75
+ # vector: The vector representation of the source file
76
+ # vectorizer: The fitted vectorizer instance
77
+ vector, vectorizer = run_pipeline(config)
78
+
79
+ print(f"Generated Vector Shape: {vector.shape}")
80
+ ```
81
+
82
+ ## License
83
+
84
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,23 @@
1
+ disasm2vec/__init__.py,sha256=Noj0HXMvVAx8pDKbFHjrGbwHMa4Nax6UF4PNhCQXBGA,280
2
+ disasm2vec/compiler/__init__.py,sha256=i0EbRV8qOxOS3WNF604UqA_oHq0y8NYfs7Bsm0jokB8,117
3
+ disasm2vec/compiler/errors.py,sha256=RVehQDn0v-mpERoyIsOzMOZBTvYVrn2zsVwH-05mr1o,88
4
+ disasm2vec/compiler/gcc.py,sha256=nrdqCgmAU0tSWJXVbZdZnnwwcsqrzHsUD06-Q0KTWJQ,2311
5
+ disasm2vec/disassembler/__init__.py,sha256=d2lRMPfwl4D4pMf1QdBU9F62BXORnqRLOrOF9ML7XJg,119
6
+ disasm2vec/disassembler/errors.py,sha256=Jjlj22kmSnAITo3RAxPHmSN7XedupeeB1ok9S7HiZyE,89
7
+ disasm2vec/disassembler/objdump.py,sha256=TOckxmwXKUbQ6kUxsvkpPEo9wwLPiXGehPHEK0m8IEA,3245
8
+ disasm2vec/pipeline/__init__.py,sha256=TY5yYVka9uVWPa4Bgbxii_Llf21aZ2d2hy9lVbAMnv0,131
9
+ disasm2vec/pipeline/config.py,sha256=SHjVjWcXS6GJPbn53CkQpUi-CkXsSeuk3ssNa2JjDVI,666
10
+ disasm2vec/pipeline/runner.py,sha256=ylBhaGZ-1XxEhLBdvzzYDdrYBaEA5H11ZiNj9IyIU8s,2011
11
+ disasm2vec/tokenizer/__init__.py,sha256=CMO7ZS1-_weaHmC7HWJkOn9sju3ZYjnV27lcT3YugSo,102
12
+ disasm2vec/tokenizer/cleaner.py,sha256=exyjO1Vbd2Tulzw4k1_jGM-QP6rz9I4u8z2uEaYvaNE,251
13
+ disasm2vec/tokenizer/core.py,sha256=7UvVP3W8sAHFV70cUqgti7ZuzLrU6Szgxy-UsiY1cHM,4288
14
+ disasm2vec/tokenizer/normalizer.py,sha256=4iG9bS8vIGkbNqpg32lCzgu8q4VozpfzvQ1LPRp59hE,653
15
+ disasm2vec/vectorizer/__init__.py,sha256=bxqYPOky_0LDWF2dDPazcDOykUp0WAgA_ImBIVvreBU,192
16
+ disasm2vec/vectorizer/base.py,sha256=zjwZxHcoweq52xvAmSBHWVT1GyzGyQ5RKJv9prmLbwo,1136
17
+ disasm2vec/vectorizer/factory.py,sha256=bUbJ3DmffdFhw-x8vwyKIQqzW4PKayZ6IL65MNjvNdM,889
18
+ disasm2vec/vectorizer/tfidf.py,sha256=ReWQKlnh0HLOhAY6f688ViCJjQkimwVX4OdJClX9AjU,3239
19
+ disasm2vec-0.1.0.dist-info/licenses/LICENSE,sha256=RalBa8eUTXV7G8B7ZD0rD4su69Pr55NFd65d6W0UQNc,1093
20
+ disasm2vec-0.1.0.dist-info/METADATA,sha256=dSld2Qy4LlDwXtfE65PTIhQSdkVjEuckLLiU5rY22-o,3335
21
+ disasm2vec-0.1.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
22
+ disasm2vec-0.1.0.dist-info/top_level.txt,sha256=3OV2TKP_vsNJzItCmKI9CRtoqdoq53QrWrLfL8TxNkQ,11
23
+ disasm2vec-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ahmad Nur Rohim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ disasm2vec