disasm2vec 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. disasm2vec-0.1.0/LICENSE +21 -0
  2. disasm2vec-0.1.0/MANIFEST.in +4 -0
  3. disasm2vec-0.1.0/PKG-INFO +84 -0
  4. disasm2vec-0.1.0/README.md +69 -0
  5. disasm2vec-0.1.0/models/base_tfidf_asm.json +27 -0
  6. disasm2vec-0.1.0/models/base_tfidf_asm.pkl +0 -0
  7. disasm2vec-0.1.0/models/base_tfidf_asm_keepreg.json +27 -0
  8. disasm2vec-0.1.0/models/base_tfidf_asm_keepreg.pkl +0 -0
  9. disasm2vec-0.1.0/pyproject.toml +22 -0
  10. disasm2vec-0.1.0/setup.cfg +4 -0
  11. disasm2vec-0.1.0/src/disasm2vec/__init__.py +17 -0
  12. disasm2vec-0.1.0/src/disasm2vec/compiler/__init__.py +3 -0
  13. disasm2vec-0.1.0/src/disasm2vec/compiler/errors.py +3 -0
  14. disasm2vec-0.1.0/src/disasm2vec/compiler/gcc.py +106 -0
  15. disasm2vec-0.1.0/src/disasm2vec/disassembler/__init__.py +6 -0
  16. disasm2vec-0.1.0/src/disasm2vec/disassembler/errors.py +2 -0
  17. disasm2vec-0.1.0/src/disasm2vec/disassembler/objdump.py +127 -0
  18. disasm2vec-0.1.0/src/disasm2vec/pipeline/__init__.py +7 -0
  19. disasm2vec-0.1.0/src/disasm2vec/pipeline/config.py +32 -0
  20. disasm2vec-0.1.0/src/disasm2vec/pipeline/runner.py +78 -0
  21. disasm2vec-0.1.0/src/disasm2vec/tokenizer/__init__.py +4 -0
  22. disasm2vec-0.1.0/src/disasm2vec/tokenizer/cleaner.py +10 -0
  23. disasm2vec-0.1.0/src/disasm2vec/tokenizer/core.py +184 -0
  24. disasm2vec-0.1.0/src/disasm2vec/tokenizer/normalizer.py +26 -0
  25. disasm2vec-0.1.0/src/disasm2vec/vectorizer/__init__.py +5 -0
  26. disasm2vec-0.1.0/src/disasm2vec/vectorizer/base.py +50 -0
  27. disasm2vec-0.1.0/src/disasm2vec/vectorizer/factory.py +27 -0
  28. disasm2vec-0.1.0/src/disasm2vec/vectorizer/tfidf.py +123 -0
  29. disasm2vec-0.1.0/src/disasm2vec.egg-info/PKG-INFO +84 -0
  30. disasm2vec-0.1.0/src/disasm2vec.egg-info/SOURCES.txt +36 -0
  31. disasm2vec-0.1.0/src/disasm2vec.egg-info/dependency_links.txt +1 -0
  32. disasm2vec-0.1.0/src/disasm2vec.egg-info/requires.txt +2 -0
  33. disasm2vec-0.1.0/src/disasm2vec.egg-info/top_level.txt +1 -0
  34. disasm2vec-0.1.0/tests/test_compiler.py +72 -0
  35. disasm2vec-0.1.0/tests/test_disassembler.py +75 -0
  36. disasm2vec-0.1.0/tests/test_pipeline.py +56 -0
  37. disasm2vec-0.1.0/tests/test_tokenizer.py +54 -0
  38. disasm2vec-0.1.0/tests/test_vectorizer.py +119 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Ahmad Nur Rohim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,4 @@
1
+ include LICENSE
2
+ include README.md
3
+ include pyproject.toml
4
+ recursive-include models *.json *.pkl
@@ -0,0 +1,84 @@
1
+ Metadata-Version: 2.4
2
+ Name: disasm2vec
3
+ Version: 0.1.0
4
+ Summary: disasm2vec is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
5
+ Author-email: Ahmad Nur Rohim <ahmadnurrohim2812@gmail.com>
6
+ Project-URL: Repository, https://github.com/Anro128/disasm2vec
7
+ Project-URL: Issues, https://github.com/Anro128/disasm2vec/issues
8
+ Project-URL: Changelog, https://github.com/Anro128/disasm2vec/blob/main/CHANGELOG.md
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+ License-File: LICENSE
12
+ Requires-Dist: scikit-learn>=1.0.0
13
+ Requires-Dist: numpy>=1.20.0
14
+ Dynamic: license-file
15
+
16
+ # disasm2vec
17
+
18
+ **disasm2vec** is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
19
+
20
+ ## Features
21
+
22
+ - **Automated Compilation**: Seamlessly compiles C and C++ source files using GCC.
23
+ - **Disassembly Wrapper**: Extracts assembly instructions using `objdump`, supporting both full and function-specific disassembly.
24
+ - **Intelligent Tokenization**: Normalizes and cleans assembly instructions, with options to preserve or abstract register names.
25
+ - **Vectorization**: Implements TF-IDF vectorization with a flexible factory pattern for easy model management.
26
+ - **End-to-End Pipeline**: Orchestrates the entire process from source code to vector embedding.
27
+ - **Extensible Architecture**: Built with abstract base classes to easily support new compilers, disassemblers, or vectorizers.
28
+
29
+ ## Prerequisites
30
+
31
+ - **Python**: version 3.10 or higher.
32
+ - **Operating System**: Linux or Windows Subsystem for Linux (WSL).
33
+ - **GCC**: Required for compiling source files.
34
+ - **Objdump**: Required for disassembling binaries.
35
+
36
+ **Note**: The compilation (`gcc`) and disassembly (`objdump`) modules rely on system-level tools typically found in Linux environments. If you are on Windows, please use WSL.
37
+
38
+ Ensure both `gcc` and `objdump` are installed and available in your system's PATH.
39
+
40
+ ## Installation
41
+
42
+ Install directly from PyPI:
43
+
44
+ ```bash
45
+ pip install disasm2vec
46
+ ```
47
+
48
+ Or install from source:
49
+
50
+ ```bash
51
+ git clone https://github.com/Anro128/disasm2vec.git
52
+ cd disasm2vec
53
+ pip install .
54
+ ```
55
+
56
+ ## Usage
57
+
58
+ The core of the framework is the `run_pipeline` function, which processes a single source file based on a `PipelineConfig` object.
59
+
60
+ ### Basic Example
61
+
62
+ ```python
63
+ from disasm2vec.pipeline import PipelineConfig, run_pipeline
64
+
65
+ # Configure the pipeline
66
+ config = PipelineConfig(
67
+ source_file="examples/sample.c",
68
+ build_dir="build",
69
+ asm_dir="asm",
70
+ model_path="models/base_tfidf_asm.pkl" # Path to a pre-trained model (required; the pipeline only loads it)
71
+ )
72
+
73
+ # Run the pipeline
74
+ # returns:
75
+ # vector: The vector representation of the source file
76
+ # vectorizer: The fitted vectorizer instance
77
+ vector, vectorizer = run_pipeline(config)
78
+
79
+ print(f"Generated Vector Shape: {vector.shape}")
80
+ ```
81
+
82
+ ## License
83
+
84
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,69 @@
1
+ # disasm2vec
2
+
3
+ **disasm2vec** is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features.
4
+
5
+ ## Features
6
+
7
+ - **Automated Compilation**: Seamlessly compiles C and C++ source files using GCC.
8
+ - **Disassembly Wrapper**: Extracts assembly instructions using `objdump`, supporting both full and function-specific disassembly.
9
+ - **Intelligent Tokenization**: Normalizes and cleans assembly instructions, with options to preserve or abstract register names.
10
+ - **Vectorization**: Implements TF-IDF vectorization with a flexible factory pattern for easy model management.
11
+ - **End-to-End Pipeline**: Orchestrates the entire process from source code to vector embedding.
12
+ - **Extensible Architecture**: Built with abstract base classes to easily support new compilers, disassemblers, or vectorizers.
13
+
14
+ ## Prerequisites
15
+
16
+ - **Python**: version 3.10 or higher.
17
+ - **Operating System**: Linux or Windows Subsystem for Linux (WSL).
18
+ - **GCC**: Required for compiling source files.
19
+ - **Objdump**: Required for disassembling binaries.
20
+
21
+ **Note**: The compilation (`gcc`) and disassembly (`objdump`) modules rely on system-level tools typically found in Linux environments. If you are on Windows, please use WSL.
22
+
23
+ Ensure both `gcc` and `objdump` are installed and available in your system's PATH.
24
+
25
+ ## Installation
26
+
27
+ Install directly from PyPI:
28
+
29
+ ```bash
30
+ pip install disasm2vec
31
+ ```
32
+
33
+ Or install from source:
34
+
35
+ ```bash
36
+ git clone https://github.com/Anro128/disasm2vec.git
37
+ cd disasm2vec
38
+ pip install .
39
+ ```
40
+
41
+ ## Usage
42
+
43
+ The core of the framework is the `run_pipeline` function, which processes a single source file based on a `PipelineConfig` object.
44
+
45
+ ### Basic Example
46
+
47
+ ```python
48
+ from disasm2vec.pipeline import PipelineConfig, run_pipeline
49
+
50
+ # Configure the pipeline
51
+ config = PipelineConfig(
52
+ source_file="examples/sample.c",
53
+ build_dir="build",
54
+ asm_dir="asm",
55
+ model_path="models/base_tfidf_asm.pkl" # Path to a pre-trained model (required; the pipeline only loads it)
56
+ )
57
+
58
+ # Run the pipeline
59
+ # returns:
60
+ # vector: The vector representation of the source file
61
+ # vectorizer: The fitted vectorizer instance
62
+ vector, vectorizer = run_pipeline(config)
63
+
64
+ print(f"Generated Vector Shape: {vector.shape}")
65
+ ```
66
+
67
+ ## License
68
+
69
+ This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for details.
@@ -0,0 +1,27 @@
1
+ {
2
+ "model_type": "tfidf",
3
+ "vectorizer": {
4
+ "max_features": 5000,
5
+ "ngram_range": [
6
+ 1,
7
+ 2
8
+ ],
9
+ "min_df": 2,
10
+ "max_df": 1.0,
11
+ "norm": "l2",
12
+ "use_idf": true
13
+ },
14
+ "dataset": {
15
+ "num_documents": 653,
16
+ "avg_length": 153.9770290964778,
17
+ "hash": "a1ac3063bb6cd0ac601cb6e5421402411b722c90fca288a03db2c518425bb284"
18
+ },
19
+ "environment": {
20
+ "python": "3.12.3",
21
+ "platform": "Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39"
22
+ },
23
+ "training": {
24
+ "timestamp": "2026-02-17T16:14:47.363169",
25
+ "source_dataset": "dataset_tokens.csv"
26
+ }
27
+ }
@@ -0,0 +1,27 @@
1
+ {
2
+ "model_type": "tfidf",
3
+ "vectorizer": {
4
+ "max_features": 5000,
5
+ "ngram_range": [
6
+ 1,
7
+ 2
8
+ ],
9
+ "min_df": 2,
10
+ "max_df": 1.0,
11
+ "norm": "l2",
12
+ "use_idf": true
13
+ },
14
+ "dataset": {
15
+ "num_documents": 653,
16
+ "avg_length": 153.9770290964778,
17
+ "hash": "2ad5da17cc369457374e2a4465e8bd5c753981f79fe9e8cd61cd2ad231be26fb"
18
+ },
19
+ "environment": {
20
+ "python": "3.12.3",
21
+ "platform": "Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39"
22
+ },
23
+ "training": {
24
+ "timestamp": "2026-02-17T16:14:02.894795",
25
+ "source_dataset": "dataset_tokens_keep_register.csv"
26
+ }
27
+ }
@@ -0,0 +1,22 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "disasm2vec"
7
+ version = "0.1.0"
8
+ description = "disasm2vec is a research framework designed to generate vector representations from disassembled C/C++ binaries. It provides a modular pipeline that handles compilation, disassembly, tokenization, and vectorization, enabling researchers and security analysts to transform raw code into machine-learning-ready features."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [
12
+ { name = "Ahmad Nur Rohim", email = "ahmadnurrohim2812@gmail.com" }
13
+ ]
14
+ dependencies = [
15
+ "scikit-learn>=1.0.0",
16
+ "numpy>=1.20.0",
17
+ ]
18
+
19
+ [project.urls]
20
+ Repository = "https://github.com/Anro128/disasm2vec"
21
+ Issues = "https://github.com/Anro128/disasm2vec/issues"
22
+ Changelog = "https://github.com/Anro128/disasm2vec/blob/main/CHANGELOG.md"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,17 @@
1
+ """disasm2vec"""
2
+
3
+ from . import compiler
4
+ from . import disassembler
5
+ from . import tokenizer
6
+ from . import vectorizer
7
+ from . import pipeline
8
+
9
+ __all__ = [
10
+ "compiler",
11
+ "disassembler",
12
+ "tokenizer",
13
+ "vectorizer",
14
+ "pipeline",
15
+ ]
16
+
17
+ __version__ = "0.1.0"
@@ -0,0 +1,3 @@
1
+ from .gcc import compile_c, compile_cpp, compile_folder
2
+
3
+ __all__ = ["compile_c", "compile_cpp", "compile_folder"]
@@ -0,0 +1,3 @@
1
class CompilationError(Exception):
    """Error raised when a gcc/g++ invocation exits unsuccessfully."""
@@ -0,0 +1,106 @@
1
+ import subprocess
2
+ from pathlib import Path
3
+ from .errors import CompilationError
4
+
5
def compile_c(
    source: str,
    output: str,
    flags: list[str] | None = None
):
    """
    Build a C source file into ``output`` by invoking ``gcc``.

    Parameters
    ----------
    source : str
        Path to the C source file.
    output : str
        Path of the binary to produce.
    flags : list[str] | None
        Extra command-line flags appended after the output path.

    Raises
    ------
    FileNotFoundError
        If ``source`` does not exist.
    CompilationError
        If gcc exits with a non-zero status.
    """
    _compile("gcc", source, output, flags)
19
+
20
+
21
def compile_cpp(
    source: str,
    output: str,
    flags: list[str] | None = None
):
    """
    Build a C++ source file into ``output`` by invoking ``g++``.

    Parameters
    ----------
    source : str
        Path to the C++ source file.
    output : str
        Path of the binary to produce.
    flags : list[str] | None
        Extra command-line flags appended after the output path.

    Raises
    ------
    FileNotFoundError
        If ``source`` does not exist.
    CompilationError
        If g++ exits with a non-zero status.
    """
    _compile("g++", source, output, flags)
35
+
36
+
37
def _compile(
    compiler: str,
    source: str,
    output: str,
    flags: list[str] | None = None,
):
    """
    Invoke ``compiler`` on ``source`` and write the result to ``output``.

    The command line is ``<compiler> <source> -o <output> [flags...]``.
    The compiler's stdout and stderr are captured as text; on failure the
    captured stderr is embedded in the raised error.

    Raises
    ------
    FileNotFoundError
        If ``source`` does not exist.
    CompilationError
        If the compiler exits with a non-zero status.
    """
    src_path = Path(source)
    out_path = Path(output)

    # Reject a missing source up front instead of letting the compiler report it.
    if not src_path.exists():
        raise FileNotFoundError(src_path)

    command = [compiler, str(src_path), "-o", str(out_path), *(flags or [])]

    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as exc:
        raise CompilationError(
            f"Compilation failed for {src_path}:\n{exc.stderr}"
        ) from exc
71
+
72
+
73
def compile_folder(
    src_dir: str,
    out_dir: str,
    optimize: str = "-O0",
    extra_flags: list[str] | None = None,
):
    """
    Compile every ``.c`` and ``.cpp`` file found under a folder (recursively).

    Each source is built into ``out_dir/<stem>``; because only the stem is
    used, sources that share a stem write to the same output path.

    Parameters
    ----------
    src_dir : str
        Folder searched recursively for sources.
    out_dir : str
        Folder receiving the binaries; created if missing.
    optimize : str
        Optimization flag placed first in the flag list.
    extra_flags : list[str] | None
        Additional flags appended after ``optimize``.

    Raises
    ------
    ValueError
        If no ``.c`` or ``.cpp`` file is found.
    CompilationError
        If any source fails to compile; processing stops at the first failure.
    """
    source_root = Path(src_dir)
    build_root = Path(out_dir)
    build_root.mkdir(parents=True, exist_ok=True)

    additional = extra_flags or []

    # All C sources come first, then all C++ sources.
    found = [*source_root.rglob("*.c"), *source_root.rglob("*.cpp")]
    if not found:
        raise ValueError(f"No C/C++ files found in {source_root}")

    for src_file in found:
        target = build_root / src_file.stem
        build = compile_c if src_file.suffix == ".c" else compile_cpp

        try:
            build(src_file, target, [optimize, *additional])
        except CompilationError as exc:
            raise CompilationError(
                f"Compilation failed for {src_file}:\n{exc}"
            ) from exc
@@ -0,0 +1,6 @@
1
+ from .objdump import disassemble, disassemble_folder
2
+
3
+ __all__ = [
4
+ "disassemble",
5
+ "disassemble_folder",
6
+ ]
@@ -0,0 +1,2 @@
1
class DisassemblyError(RuntimeError):
    """Raised when objdump exits with an error while disassembling a binary."""
@@ -0,0 +1,127 @@
1
+ import subprocess
2
+ from pathlib import Path
3
+ from .errors import DisassemblyError
4
+
5
+
6
def disassemble(
    binary: str,
    output: str,
    arch: str | None = None,
    full: bool = False,
):
    """
    Run ``objdump -d`` on one binary and save the listing to a file.

    Parameters
    ----------
    binary : str
        Path to the compiled binary to disassemble.
    output : str
        Destination .asm file; missing parent directories are created.
    arch : str | None
        Optional architecture passed to objdump via ``-m``
        (e.g. i386:x86-64).
    full : bool
        If True, keep the complete objdump listing.
        If False, strip builtin / PLT / runtime functions first.

    Raises
    ------
    FileNotFoundError
        If ``binary`` does not exist.
    DisassemblyError
        If objdump exits with a non-zero status.
    """
    binary_path = Path(binary)
    asm_path = Path(output)

    if not binary_path.exists():
        raise FileNotFoundError(binary_path)

    asm_path.parent.mkdir(parents=True, exist_ok=True)

    # Only the .text section is dumped; the architecture override is optional.
    arch_args = ["-m", arch] if arch else []
    command = ["objdump", "-d", "--section=.text", str(binary_path), *arch_args]

    try:
        completed = subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except subprocess.CalledProcessError as exc:
        raise DisassemblyError(
            f"objdump failed for {binary_path}:\n{exc.stderr}"
        ) from exc

    listing = completed.stdout
    if not full:
        listing = _filter_builtin_functions(listing)

    asm_path.write_text(listing)
60
+
61
+
62
def disassemble_folder(
    bin_dir: str,
    out_dir: str,
    full: bool = False,
):
    """
    Disassemble every regular file directly inside a folder.

    Each binary ``<name>`` produces ``out_dir/<name>.asm``. Subfolders of
    ``bin_dir`` are not visited.

    Parameters
    ----------
    bin_dir : str
        Folder containing compiled binaries.
    out_dir : str
        Folder receiving the .asm outputs; created if missing.
    full : bool
        If True, keep the complete objdump listing.
        If False, exclude builtin / PLT functions.

    Raises
    ------
    ValueError
        If ``bin_dir`` contains no regular files.
    DisassemblyError
        If any binary fails to disassemble; processing stops at the first
        failure.
    """
    source_root = Path(bin_dir)
    target_root = Path(out_dir)
    target_root.mkdir(parents=True, exist_ok=True)

    candidates = [entry for entry in source_root.iterdir() if entry.is_file()]
    if not candidates:
        raise ValueError(f"No binaries found in {source_root}")

    for candidate in candidates:
        destination = target_root / f"{candidate.name}.asm"

        try:
            disassemble(candidate, destination, full=full)
        except DisassemblyError as exc:
            raise DisassemblyError(
                f"Disassembly failed for {candidate}:\n{exc}"
            ) from exc
98
+
99
+
100
def _filter_builtin_functions(asm: str) -> str:
    """
    Drop builtin / PLT / runtime functions from an objdump listing.

    A function header is any line that contains ``<`` and ``>`` and ends
    with ``:`` once stripped. When the symbol between the first ``<`` and the
    following ``>`` ends with ``@plt`` or starts with one of the runtime
    prefixes, the header and every line up to the next header are removed.
    All other lines are kept, joined with ``\\n``.
    """
    runtime_prefixes = (
        "_start",
        "frame_dummy",
        "register_tm_clones",
        "deregister_tm_clones",
        "__",
    )

    kept = []
    dropping = False

    for row in asm.splitlines():
        is_header = "<" in row and ">" in row and row.strip().endswith(":")

        if is_header:
            symbol = row.split("<")[1].split(">")[0]
            # The decision made at a header applies until the next header.
            dropping = symbol.endswith("@plt") or symbol.startswith(runtime_prefixes)

        if not dropping:
            kept.append(row)

    return "\n".join(kept)
@@ -0,0 +1,7 @@
1
+ from .config import PipelineConfig
2
+ from .runner import run_pipeline
3
+
4
+ __all__ = [
5
+ "run_pipeline",
6
+ "PipelineConfig"
7
+ ]
@@ -0,0 +1,32 @@
1
+ from dataclasses import dataclass
2
+ from typing import Optional, Tuple
3
+
4
+
5
@dataclass
class PipelineConfig:
    """
    Settings consumed by ``run_pipeline`` for a single source file.
    """

    # Path to the source file; its stem names the binary and the .asm output.
    source_file: str

    # Directories receiving the compiled binary and the .asm listing.
    build_dir: str
    asm_dir: str

    # compiler
    # ``optimize`` is the first compiler flag; ``extra_flags`` are appended after it.
    optimize: str = "-O0"
    extra_flags: Optional[list[str]] = None

    # disassembler
    # ``arch`` is forwarded to the disassembler; ``full_disasm`` is forwarded as ``full``.
    arch: Optional[str] = None
    full_disasm: bool = False

    # tokenizer
    # Both values are forwarded unchanged to ``tokenize``.
    # NOTE(review): presumably ``entry`` is the function tokenization starts from — confirm.
    entry: str = "main"
    keep_register: bool = False

    # vectorizer
    # ``model_path`` defaults to None but ``run_pipeline`` raises ValueError without it.
    model_path: Optional[str] = None
    # The remaining values are passed to the ``Tfidf`` constructor.
    max_features: Optional[int] = None
    ngram_range: Tuple[int, int] = (1, 2)
    min_df: int = 1

    # switches
    # Set to False to skip the compile / disassemble stage.
    do_compile: bool = True
    do_disassemble: bool = True
@@ -0,0 +1,78 @@
1
+ from pathlib import Path
2
+
3
+ from disasm2vec.compiler import compile_c, compile_cpp
4
+ from disasm2vec.disassembler import disassemble
5
+ from disasm2vec.tokenizer import tokenize
6
+ from disasm2vec.vectorizer import Tfidf
7
+
8
+ from .config import PipelineConfig
9
+
10
+
11
def run_pipeline(config: PipelineConfig):
    """
    Run pipeline for single source file.

    Flow:
    source -> compile -> disassemble -> tokenizer -> vectorize

    The configuration is validated before any external tool is invoked, so
    an unsupported source type or a missing ``model_path`` fails immediately
    instead of after compiling and disassembling.

    Parameters
    ----------
    config : PipelineConfig
        Pipeline settings. The binary is written to
        ``build_dir/<source stem>`` and the listing to
        ``asm_dir/<source stem>.asm``.

    Returns
    -------
    tuple
        ``(X, vectorizer)`` where ``X`` is the result of
        ``vectorizer.transform_one`` on the tokenized assembly and
        ``vectorizer`` is the ``Tfidf`` instance loaded from
        ``config.model_path``.

    Raises
    ------
    FileNotFoundError
        If ``config.source_file`` does not exist.
    ValueError
        If compilation is requested for a source that is neither ``.c`` nor
        ``.cpp``, or if ``config.model_path`` is empty.
    """
    source = Path(config.source_file)

    if not source.exists():
        raise FileNotFoundError(source)

    # Pick the compiler up front so an unsupported suffix is rejected before
    # any directory is created or any external tool is run.
    compile_source = None
    if config.do_compile:
        if source.suffix == ".c":
            compile_source = compile_c
        elif source.suffix == ".cpp":
            compile_source = compile_cpp
        else:
            raise ValueError(
                f"Unsupported source type: {source.suffix}"
            )

    # The model is only loaded at the end, but a missing path makes the whole
    # run pointless, so fail before doing the expensive steps.
    if not config.model_path:
        raise ValueError("model_path is required for pipeline")

    stem = source.stem

    binary_path = Path(config.build_dir) / stem
    asm_path = Path(config.asm_dir) / f"{stem}.asm"

    binary_path.parent.mkdir(parents=True, exist_ok=True)
    asm_path.parent.mkdir(parents=True, exist_ok=True)

    # COMPILE
    if compile_source is not None:
        flags = [config.optimize]
        if config.extra_flags:
            flags.extend(config.extra_flags)

        compile_source(source, binary_path, flags)

    # DISASSEMBLE
    if config.do_disassemble:
        disassemble(
            binary=binary_path,
            output=asm_path,
            arch=config.arch,
            full=config.full_disasm,
        )

    # TOKENIZER
    corpus = tokenize(
        path=asm_path,
        entry=config.entry,
        keep_register=config.keep_register,
    )

    # VECTORIZE
    vectorizer = Tfidf(
        max_features=config.max_features,
        ngram_range=config.ngram_range,
        min_df=config.min_df,
    )
    vectorizer.load(config.model_path)

    X = vectorizer.transform_one(corpus)

    return X, vectorizer
@@ -0,0 +1,4 @@
1
+ from .core import tokenize, tokenize_batch
2
+
3
+ __all__ = ["tokenize",
4
+ "tokenize_batch",]
@@ -0,0 +1,10 @@
1
+ import re
2
+
3
INSTRUCTION_PATTERN = re.compile(r"\s*[0-9a-fA-F]+:\s+")


def is_instruction_line(line: str) -> bool:
    """
    Tell whether ``line`` starts like an objdump instruction row.

    A row qualifies when, after optional leading whitespace, it begins with
    a hexadecimal address immediately followed by a colon and at least one
    whitespace character.
    """
    return INSTRUCTION_PATTERN.match(line) is not None