varbert 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- varbert-2.0.0/LICENSE +24 -0
- varbert-2.0.0/PKG-INFO +98 -0
- varbert-2.0.0/README.md +80 -0
- varbert-2.0.0/pyproject.toml +41 -0
- varbert-2.0.0/setup.cfg +4 -0
- varbert-2.0.0/tests/tests.py +321 -0
- varbert-2.0.0/varbert/__init__.py +108 -0
- varbert-2.0.0/varbert/__main__.py +37 -0
- varbert-2.0.0/varbert/api.py +138 -0
- varbert-2.0.0/varbert/logger.py +98 -0
- varbert-2.0.0/varbert/model.py +414 -0
- varbert-2.0.0/varbert/models/__init__.py +0 -0
- varbert-2.0.0/varbert/text_processor.py +242 -0
- varbert-2.0.0/varbert.egg-info/PKG-INFO +98 -0
- varbert-2.0.0/varbert.egg-info/SOURCES.txt +17 -0
- varbert-2.0.0/varbert.egg-info/dependency_links.txt +1 -0
- varbert-2.0.0/varbert.egg-info/entry_points.txt +2 -0
- varbert-2.0.0/varbert.egg-info/requires.txt +5 -0
- varbert-2.0.0/varbert.egg-info/top_level.txt +1 -0
varbert-2.0.0/LICENSE
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
BSD 2-Clause License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023, BinSync
|
|
4
|
+
|
|
5
|
+
Redistribution and use in source and binary forms, with or without
|
|
6
|
+
modification, are permitted provided that the following conditions are met:
|
|
7
|
+
|
|
8
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
9
|
+
list of conditions and the following disclaimer.
|
|
10
|
+
|
|
11
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
12
|
+
this list of conditions and the following disclaimer in the documentation
|
|
13
|
+
and/or other materials provided with the distribution.
|
|
14
|
+
|
|
15
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
16
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
17
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
18
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
19
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
20
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
21
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
22
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
23
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
24
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
varbert-2.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: varbert
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: The VarBERT API for renaming variables in decompiled code.
|
|
5
|
+
License: BSD 2 Clause
|
|
6
|
+
Project-URL: Homepage, https://github.com/binsync/varbert_api
|
|
7
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
10
|
+
Requires-Python: >=3.8
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Requires-Dist: torch
|
|
14
|
+
Requires-Dist: transformers
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Requires-Dist: dailalib
|
|
17
|
+
Requires-Dist: libbs
|
|
18
|
+
|
|
19
|
+
# VarBERT API
|
|
20
|
+
The VarBERT API is a Python library to access and use the latest models from the S&P 2024 work
|
|
21
|
+
[""Len or index or count, anything but v1": Predicting Variable Names in Decompilation Output with Transfer Learning"](), featuring VarBERT.
|
|
22
|
+
VarBERT is a BERT-based model that predicts variable names for decompiled code.
|
|
23
|
+
To train new models and understand the pipeline, see the [VarBERT paper repo]().
|
|
24
|
+
Specialized models exist for IDA Pro and Ghidra, but they can be used with any decompiler.
|
|
25
|
+
|
|
26
|
+
<p align="center">
|
|
27
|
+
<img src="./assets/varbert_no_background.png" style="width: 50%;" alt="DAILA context menu"/>
|
|
28
|
+
</p>
|
|
29
|
+
|
|
30
|
+
The main focus of this project is to provide a library API and CLI access to VarBERT models, but it has
|
|
31
|
+
been designed to be used directly inside a decompiler through the [DAILA](https://github.com/mahaloz/DAILA) project.
|
|
32
|
+
DAILA comes with the VarBERT API bundled, so you do not need to install VarBERT if you are using DAILA.
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
```
|
|
36
|
+
pip3 install varbert && varbert --download-models
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
This will install the VarBERT API library and download the models to be stored inside the VarBERT package.
|
|
40
|
+
You can optionally provide a decompiler name to `--download-models` to only download the models for that decompiler.
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
The VarBERT API can be used in three ways:
|
|
44
|
+
- From the CLI, directly on decompiled text (without an attached decompiler)
|
|
45
|
+
- As a scripting library
|
|
46
|
+
- As a decompiler plugin (using [DAILA](https://github.com/mahaloz/DAILA))
|
|
47
|
+
|
|
48
|
+
### Command Line (without running a decompiler)
|
|
49
|
+
Note that VarBERT runs better when it is directly hooked up to a decompiler because it can use additional semantic information that the decompiler knows about the decompiled code.
|
|
50
|
+
However, we do have the ability to run VarBERT without a running decompiler, only operating on the text from the command line.
|
|
51
|
+
|
|
52
|
+
Running the following will cause VarBERT to read a function from standard input and output the function with predicted variable names to standard out:
|
|
53
|
+
```bash
|
|
54
|
+
varbert --predict --decompiler ida
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
You can select different decompilers that will use different models that are trained on the different decompilers.
|
|
58
|
+
If you do not specify a decompiler, the default is IDA Pro.
|
|
59
|
+
As an example, you can also give no decompiler:
|
|
60
|
+
```bash
|
|
61
|
+
echo "__int64 sub_400664(char *a1,char *a2)\n {}" | varbert -p
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Scripting
|
|
65
|
+
#### Without Decompiler
|
|
66
|
+
```python
|
|
67
|
+
from varbert import VariableRenamingAPI
|
|
68
|
+
api = VariableRenamingAPI(decompiler_name="ida", use_decompiler=False)
|
|
69
|
+
new_names, new_code = api.predict_variable_names(decompilation_text="__int64 sub_400664(char *a1,char *a2)\n {}", use_decompiler=False)
|
|
70
|
+
print(new_code)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
You can also find more examples in the [tests.py](./tests/tests.py) file.
|
|
74
|
+
|
|
75
|
+
#### Inside Decompiler
|
|
76
|
+
You can use VarBERT as a scripting library inside your decompiler, utilizing LibBS.
|
|
77
|
+
```python
|
|
78
|
+
from varbert import VariableRenamingAPI
|
|
79
|
+
from libbs.api import DecompilerInterface
|
|
80
|
+
dec = DecompilerInterface()
|
|
81
|
+
api = VariableRenamingAPI(decompiler_interface=dec)
|
|
82
|
+
for func_addr in dec.functions:
|
|
83
|
+
new_names, new_code = api.predict_variable_names(function=dec.functions[func_addr])
|
|
84
|
+
print(new_names)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### As a Decompiler Plugin
|
|
88
|
+
If you would like to use VarBERT as a decompiler plugin, you can use [DAILA](https://github.com/mahaloz/DAILA).
|
|
89
|
+
You should follow the instructions on the DAILA repo to install DAILA, but it's generally as simple as:
|
|
90
|
+
```bash
|
|
91
|
+
pip3 install dailalib && daila --install
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Citing
|
|
95
|
+
If you use VarBERT in your research, please cite our paper:
|
|
96
|
+
```
|
|
97
|
+
TODO
|
|
98
|
+
```
|
varbert-2.0.0/README.md
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# VarBERT API
|
|
2
|
+
The VarBERT API is a Python library to access and use the latest models from the S&P 2024 work
|
|
3
|
+
[""Len or index or count, anything but v1": Predicting Variable Names in Decompilation Output with Transfer Learning"](), featuring VarBERT.
|
|
4
|
+
VarBERT is a BERT-based model that predicts variable names for decompiled code.
|
|
5
|
+
To train new models and understand the pipeline, see the [VarBERT paper repo]().
|
|
6
|
+
Specialized models exist for IDA Pro and Ghidra, but they can be used with any decompiler.
|
|
7
|
+
|
|
8
|
+
<p align="center">
|
|
9
|
+
<img src="./assets/varbert_no_background.png" style="width: 50%;" alt="DAILA context menu"/>
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
The main focus of this project is to provide a library API and CLI access to VarBERT models, but it has
|
|
13
|
+
been designed to be used directly inside a decompiler through the [DAILA](https://github.com/mahaloz/DAILA) project.
|
|
14
|
+
DAILA comes with the VarBERT API bundled, so you do not need to install VarBERT if you are using DAILA.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
```
|
|
18
|
+
pip3 install varbert && varbert --download-models
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
This will install the VarBERT API library and download the models to be stored inside the VarBERT package.
|
|
22
|
+
You can optionally provide a decompiler name to `--download-models` to only download the models for that decompiler.
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
The VarBERT API can be used in three ways:
|
|
26
|
+
- From the CLI, directly on decompiled text (without an attached decompiler)
|
|
27
|
+
- As a scripting library
|
|
28
|
+
- As a decompiler plugin (using [DAILA](https://github.com/mahaloz/DAILA))
|
|
29
|
+
|
|
30
|
+
### Command Line (without running a decompiler)
|
|
31
|
+
Note that VarBERT runs better when it is directly hooked up to a decompiler because it can use additional semantic information that the decompiler knows about the decompiled code.
|
|
32
|
+
However, we do have the ability to run VarBERT without a running decompiler, only operating on the text from the command line.
|
|
33
|
+
|
|
34
|
+
Running the following will cause VarBERT to read a function from standard input and output the function with predicted variable names to standard out:
|
|
35
|
+
```bash
|
|
36
|
+
varbert --predict --decompiler ida
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
You can select different decompilers that will use different models that are trained on the different decompilers.
|
|
40
|
+
If you do not specify a decompiler, the default is IDA Pro.
|
|
41
|
+
As an example, you can also give no decompiler:
|
|
42
|
+
```bash
|
|
43
|
+
echo "__int64 sub_400664(char *a1,char *a2)\n {}" | varbert -p
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Scripting
|
|
47
|
+
#### Without Decompiler
|
|
48
|
+
```python
|
|
49
|
+
from varbert import VariableRenamingAPI
|
|
50
|
+
api = VariableRenamingAPI(decompiler_name="ida", use_decompiler=False)
|
|
51
|
+
new_names, new_code = api.predict_variable_names(decompilation_text="__int64 sub_400664(char *a1,char *a2)\n {}", use_decompiler=False)
|
|
52
|
+
print(new_code)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
You can also find more examples in the [tests.py](./tests/tests.py) file.
|
|
56
|
+
|
|
57
|
+
#### Inside Decompiler
|
|
58
|
+
You can use VarBERT as a scripting library inside your decompiler, utilizing LibBS.
|
|
59
|
+
```python
|
|
60
|
+
from varbert import VariableRenamingAPI
|
|
61
|
+
from libbs.api import DecompilerInterface
|
|
62
|
+
dec = DecompilerInterface()
|
|
63
|
+
api = VariableRenamingAPI(decompiler_interface=dec)
|
|
64
|
+
for func_addr in dec.functions:
|
|
65
|
+
new_names, new_code = api.predict_variable_names(function=dec.functions[func_addr])
|
|
66
|
+
print(new_names)
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### As a Decompiler Plugin
|
|
70
|
+
If you would like to use VarBERT as a decompiler plugin, you can use [DAILA](https://github.com/mahaloz/DAILA).
|
|
71
|
+
You should follow the instructions on the DAILA repo to install DAILA, but it's generally as simple as:
|
|
72
|
+
```bash
|
|
73
|
+
pip3 install dailalib && daila --install
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Citing
|
|
77
|
+
If you use VarBERT in your research, please cite our paper:
|
|
78
|
+
```
|
|
79
|
+
TODO
|
|
80
|
+
```
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.2"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "varbert"
|
|
7
|
+
classifiers = [
|
|
8
|
+
"License :: OSI Approved :: BSD License",
|
|
9
|
+
"Programming Language :: Python :: 3",
|
|
10
|
+
"Programming Language :: Python :: 3.8",
|
|
11
|
+
]
|
|
12
|
+
license = {text = "BSD 2 Clause"}
|
|
13
|
+
description = "The VarBERT API for renaming variables in decompiled code."
|
|
14
|
+
urls = {Homepage = "https://github.com/binsync/varbert_api"}
|
|
15
|
+
requires-python = ">= 3.8"
|
|
16
|
+
dependencies = [
|
|
17
|
+
"torch",
|
|
18
|
+
"transformers",
|
|
19
|
+
"tqdm",
|
|
20
|
+
"dailalib",
|
|
21
|
+
"libbs"
|
|
22
|
+
]
|
|
23
|
+
dynamic = ["version"]
|
|
24
|
+
|
|
25
|
+
[project.readme]
|
|
26
|
+
file = "README.md"
|
|
27
|
+
content-type = "text/markdown"
|
|
28
|
+
|
|
29
|
+
[project.scripts]
|
|
30
|
+
varbert = "varbert.__main__:main"
|
|
31
|
+
|
|
32
|
+
[tool.setuptools]
|
|
33
|
+
include-package-data = true
|
|
34
|
+
license-files = ["LICENSE"]
|
|
35
|
+
|
|
36
|
+
[tool.setuptools.packages]
|
|
37
|
+
find = {namespaces = false}
|
|
38
|
+
|
|
39
|
+
[tool.setuptools.dynamic]
|
|
40
|
+
version = {attr = "varbert.__version__"}
|
|
41
|
+
|
varbert-2.0.0/setup.cfg
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
import sys
|
|
3
|
+
from typing import Dict
|
|
4
|
+
|
|
5
|
+
from varbert import VariableRenamingAPI
|
|
6
|
+
from yodalib.data import Function, FunctionArgument, FunctionHeader, StackVariable
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def function_with_new_names(function: Function, new_names: Dict[str, str]):
    """Return a copy of ``function`` whose arguments and stack variables are renamed.

    ``new_names`` maps an old variable name to its replacement. Entries are
    applied one after another in the mapping's iteration order, so a variable
    renamed by an earlier entry can be matched again by a later one. The
    ``function`` passed in is left untouched; all changes land on the copy.
    """
    renamed: Function = function.copy()
    for old_name, new_name in new_names.items():
        # arguments first, then stack variables, for every mapping entry
        for variable in (*renamed.args.values(), *renamed.stack_vars.values()):
            if variable.name == old_name:
                variable.name = new_name
    return renamed
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class TestBinSyncRenaming(unittest.TestCase):
|
|
24
|
+
def test_renaming(self):
    """Predict names from raw IDA text and check that the arguments and one stack variable changed."""
    api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")

    # testing text
    function_text = "__int64 __fastcall sub_5E007(__int64 a1, __int64 a2, const char *a3)\n{\n __int64 v3; // rcx\n __int64 v4; // r8\n __int64 v5; // r9\n unsigned int v7; // [rsp+20h] [rbp-10h]\n int v8; // [rsp+24h] [rbp-Ch]\n __int64 v9; // [rsp+28h] [rbp-8h]\n\n v9 = qword_246250;\n v7 = 0;\n v8 = atoi(a3);\n if ( v8 >= 0 )\n {\n while ( v9 )\n {\n *(v9 + 3240) = v8;\n v9 = *(v9 + 21664);\n ++v7;\n }\n if ( a2 )\n sub_33178(\n 4,\n \"WARN: [%s] plugin name not supported for key 'telemetry_dump_kafka_topic_rr'. Globalized.\\n\",\n a1,\n v3,\n v4,\n v5);\n return v7;\n }\n else\n {\n sub_33178(4, \"WARN: [%s] 'telemetry_dump_kafka_topic_rr' has to be >= 0.\\n\", a1, v3, v4, v5);\n return 0xFFFFFFFFLL;\n }\n}\n// 5E064: variable 'v3' is possibly undefined\n// 5E064: variable 'v4' is possibly undefined\n// 5E064: variable 'v5' is possibly undefined\n// 246250: using guessed type __int64 qword_246250;\n"
    svar_name_data = ["v3", "v4", "v5", "v7", "v8", "v9"]
    args_name_data = ["a1", "a2", "a3"]

    # WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
    function = Function(0xdead, 0x1337, header=FunctionHeader("sub_5E007", 0xdead, args={}), stack_vars={})
    for i, name in enumerate(svar_name_data):
        function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
    for i, name in enumerate(args_name_data):
        function.args[i] = FunctionArgument(i, name, None, 8)

    new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
    new_function = function_with_new_names(function, new_names)

    # unittest assertions print both operands on failure and, unlike bare
    # `assert`, are not stripped when Python runs with -O
    self.assertNotEqual(new_function.args[0], function.args[0])
    self.assertNotEqual(new_function.args[1], function.args[1])
    self.assertNotEqual(new_function.args[2], function.args[2])

    self.assertNotEqual(new_function.stack_vars[3], function.stack_vars[3])
|
|
47
|
+
|
|
48
|
+
def test_renaming_1(self):
    """Predict names for a small ``main`` from raw IDA text and check arguments and two stack variables changed."""
    api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")

    # testing text
    function_text = "_int64 __fastcall main(int a1, char **a2, char **a3)\n{\n _BOOL4 v4; // [rsp+1Ch] [rbp-24h] BYREF\n char v5[16]; // [rsp+20h] [rbp-20h] BYREF\n char buf[16]; // [rsp+30h] [rbp-10h] BYREF\n\n buf[8] = 0;\n v5[8] = 0;\n puts(\"Username: \");\n read(0, buf, 8uLL);\n read(0, &v4, 1uLL);\n puts(\"Password: \");\n read(0, v5, 8uLL);\n read(0, &v4, 1uLL);\n v4 = sub_400664(buf, v5);\n if ( !v4 )\n sub_4006FD();\n return sub_4006ED(buf);\n}\n"
    svar_name_data = ['v4', 'v5', 'buf']
    args_name_data = ["a1", "a2", "a3"]

    # WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
    function = Function(0xdead, 0x1337, header=FunctionHeader("main", 0xdead, args={}), stack_vars={})
    for i, name in enumerate(svar_name_data):
        function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
    for i, name in enumerate(args_name_data):
        function.args[i] = FunctionArgument(i, name, None, 8)

    new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
    new_function = function_with_new_names(function, new_names)

    # unittest assertions print both operands on failure and, unlike bare
    # `assert`, are not stripped when Python runs with -O
    self.assertNotEqual(new_function.args[0], function.args[0])
    self.assertNotEqual(new_function.args[1], function.args[1])
    self.assertNotEqual(new_function.args[2], function.args[2])

    self.assertNotEqual(new_function.stack_vars[1], function.stack_vars[1])
    self.assertNotEqual(new_function.stack_vars[2], function.stack_vars[2])

|
|
72
|
+
def test_renaming_2(self):
|
|
73
|
+
api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")
|
|
74
|
+
|
|
75
|
+
# testing text
|
|
76
|
+
function_text = '''__int64 __fastcall main(int a1, char **a2, char **a3, char a4)
|
|
77
|
+
{
|
|
78
|
+
const char *v5; // rbp
|
|
79
|
+
|
|
80
|
+
if ( a1 != 2 )
|
|
81
|
+
return 0LL;
|
|
82
|
+
sub_2B30(*a2);
|
|
83
|
+
setlocale(6, "");
|
|
84
|
+
bindtextdomain("coreutils", "/usr/share/locale");
|
|
85
|
+
textdomain("coreutils");
|
|
86
|
+
sub_5550(sub_2A80);
|
|
87
|
+
v5 = a2[1];
|
|
88
|
+
if ( !strcmp(v5, "--help") )
|
|
89
|
+
sub_2700(0);
|
|
90
|
+
if ( !strcmp(v5, "--version") )
|
|
91
|
+
sub_4DC0(stdout, "true", &unk_6084, Version, "Jim Meyering", 0, a4);
|
|
92
|
+
return 0LL;
|
|
93
|
+
}
|
|
94
|
+
'''
|
|
95
|
+
svar_name_data = ['v5']
|
|
96
|
+
args_name_data = ["a1", "a2", "a3", "a4"]
|
|
97
|
+
# WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
|
|
98
|
+
function = Function(0xdead, 0x1337, header=FunctionHeader("main", 0xdead, args={}), stack_vars={})
|
|
99
|
+
for i, name in enumerate(svar_name_data):
|
|
100
|
+
function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
|
|
101
|
+
for i, name in enumerate(args_name_data):
|
|
102
|
+
function.args[i] = FunctionArgument(i, name, None, 8)
|
|
103
|
+
|
|
104
|
+
new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
|
|
105
|
+
new_function = function_with_new_names(function, new_names)
|
|
106
|
+
|
|
107
|
+
assert new_function.args[0] != function.args[0]
|
|
108
|
+
assert new_function.args[1] != function.args[1]
|
|
109
|
+
assert new_function.args[2] != function.args[2]
|
|
110
|
+
assert new_function.args[3] != function.args[3]
|
|
111
|
+
|
|
112
|
+
def test_renaming_3(self):
|
|
113
|
+
api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")
|
|
114
|
+
|
|
115
|
+
# testing text
|
|
116
|
+
function_text = ''' __int64 __fastcall sub_4760(__int64 a1, __int64 a2, __int64 a3, __int64 a4, __int64 a5)
|
|
117
|
+
{
|
|
118
|
+
__int128 v6[2]; // [rsp+0h] [rbp-48h] BYREF
|
|
119
|
+
__m128i si128; // [rsp+20h] [rbp-28h]
|
|
120
|
+
__int64 v8; // [rsp+30h] [rbp-18h]
|
|
121
|
+
unsigned __int64 v9; // [rsp+38h] [rbp-10h]
|
|
122
|
+
|
|
123
|
+
v9 = __readfsqword(0x28u);
|
|
124
|
+
v6[0] = _mm_load_si128(&xmmword_A1E0);
|
|
125
|
+
v8 = qword_A210;
|
|
126
|
+
LODWORD(v6[0]) = 10;
|
|
127
|
+
v6[1] = _mm_load_si128(&xmmword_A1F0);
|
|
128
|
+
si128 = _mm_load_si128(&xmmword_A200);
|
|
129
|
+
if ( !a2 || !a3 )
|
|
130
|
+
abort();
|
|
131
|
+
si128.m128i_i64[1] = a2;
|
|
132
|
+
v8 = a3;
|
|
133
|
+
return sub_3F20(a1, a4, a5, v6);
|
|
134
|
+
}'''
|
|
135
|
+
svar_name_data = ['v6', "s128", "v8", "v9"]
|
|
136
|
+
args_name_data = ["a1", "a2", "a3", "a4", "a5"]
|
|
137
|
+
# WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
|
|
138
|
+
function = Function(0xdead, 0x1337, header=FunctionHeader("sub_4760", 0xdead, args={}), stack_vars={})
|
|
139
|
+
for i, name in enumerate(svar_name_data):
|
|
140
|
+
function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
|
|
141
|
+
for i, name in enumerate(args_name_data):
|
|
142
|
+
function.args[i] = FunctionArgument(i, name, None, 8)
|
|
143
|
+
|
|
144
|
+
new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
|
|
145
|
+
new_function = function_with_new_names(function, new_names)
|
|
146
|
+
|
|
147
|
+
assert new_function.args[0] != function.args[0]
|
|
148
|
+
assert new_function.args[1] != function.args[1]
|
|
149
|
+
assert new_function.args[2] != function.args[2]
|
|
150
|
+
assert new_function.args[3] != function.args[3]
|
|
151
|
+
assert new_function.args[4] != function.args[4]
|
|
152
|
+
|
|
153
|
+
def test_renaming_4(self):
|
|
154
|
+
api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")
|
|
155
|
+
|
|
156
|
+
# testing text
|
|
157
|
+
function_text = '''__int64 __fastcall main(int a1, char **a2, char **a3)
|
|
158
|
+
{
|
|
159
|
+
char *v4; // r12
|
|
160
|
+
char *v5; // rax
|
|
161
|
+
char *v6; // rbp
|
|
162
|
+
const char *v7; // r13
|
|
163
|
+
const char *v8; // r13
|
|
164
|
+
char *v9; // rax
|
|
165
|
+
char *v10; // rax
|
|
166
|
+
FILE *v11; // r12
|
|
167
|
+
char *v12; // rax
|
|
168
|
+
FILE *v13; // r12
|
|
169
|
+
char *v14; // rax
|
|
170
|
+
char *v15; // rax
|
|
171
|
+
__int64 *v16; // rbp
|
|
172
|
+
const char *v17; // rsi
|
|
173
|
+
const char *v18; // r14
|
|
174
|
+
char *v19; // rax
|
|
175
|
+
char *v20; // rdi
|
|
176
|
+
char *v21; // rax
|
|
177
|
+
char *v22; // r12
|
|
178
|
+
char *v23; // rax
|
|
179
|
+
char *v24; // rax
|
|
180
|
+
char *v25; // rdi
|
|
181
|
+
char *v26; // rax
|
|
182
|
+
FILE *v27; // rbp
|
|
183
|
+
char *v28; // rax
|
|
184
|
+
__int64 v29[21]; // [rsp+0h] [rbp-A8h] BYREF
|
|
185
|
+
|
|
186
|
+
v29[15] = __readfsqword(0x28u);
|
|
187
|
+
if ( a1 != 2 )
|
|
188
|
+
return 0LL;
|
|
189
|
+
v4 = *a2;
|
|
190
|
+
if ( !*a2 )
|
|
191
|
+
{
|
|
192
|
+
fwrite("A NULL argv[0] was passed through an exec system call.\n", 1uLL, 0x37uLL, stderr);
|
|
193
|
+
abort();
|
|
194
|
+
}
|
|
195
|
+
v5 = strrchr(v4, 47);
|
|
196
|
+
v6 = v5;
|
|
197
|
+
if ( v5 )
|
|
198
|
+
{
|
|
199
|
+
v7 = v5 + 1;
|
|
200
|
+
if ( v5 + 1 - v4 > 6 && !strncmp(v5 - 6, "/.libs/", 7uLL) )
|
|
201
|
+
{
|
|
202
|
+
v4 = v7;
|
|
203
|
+
if ( !strncmp(v7, "lt-", 3uLL) )
|
|
204
|
+
{
|
|
205
|
+
v4 = v6 + 4;
|
|
206
|
+
program_invocation_short_name = v6 + 4;
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
qword_7028 = v4;
|
|
211
|
+
program_invocation_name = v4;
|
|
212
|
+
setlocale(6, "");
|
|
213
|
+
bindtextdomain("coreutils", "/usr/share/locale");
|
|
214
|
+
textdomain("coreutils");
|
|
215
|
+
sub_39D0(sub_3390);
|
|
216
|
+
v8 = a2[1];
|
|
217
|
+
if ( !strcmp(v8, "--help") )
|
|
218
|
+
{
|
|
219
|
+
v9 = dcgettext(0LL, "Usage: %s [ignored command line arguments]\n or: %s OPTION\n", 5);
|
|
220
|
+
__printf_chk(1LL, v9, v4, v4);
|
|
221
|
+
v10 = dcgettext(0LL, "Exit with a status code indicating success.", 5);
|
|
222
|
+
__printf_chk(1LL, "%s\n\n", v10);
|
|
223
|
+
v11 = stdout;
|
|
224
|
+
v12 = dcgettext(0LL, " --help display this help and exit\n", 5);
|
|
225
|
+
fputs_unlocked(v12, v11);
|
|
226
|
+
v13 = stdout;
|
|
227
|
+
v14 = dcgettext(0LL, " --version output version information and exit\n", 5);
|
|
228
|
+
fputs_unlocked(v14, v13);
|
|
229
|
+
v15 = dcgettext(
|
|
230
|
+
0LL,
|
|
231
|
+
"\n"
|
|
232
|
+
"NOTE: your shell may have its own version of %s, which usually supersedes\n"
|
|
233
|
+
"the version described here. Please refer to your shell's documentation\n"
|
|
234
|
+
"for details about the options it supports.\n",
|
|
235
|
+
5);
|
|
236
|
+
__printf_chk(1LL, v15, "true");
|
|
237
|
+
v29[2] = "coreutils";
|
|
238
|
+
v29[1] = "test invocation";
|
|
239
|
+
v16 = v29;
|
|
240
|
+
v17 = "[";
|
|
241
|
+
v29[3] = "Multi-call invocation";
|
|
242
|
+
v29[6] = "sha256sum";
|
|
243
|
+
v29[4] = "sha224sum";
|
|
244
|
+
v29[8] = "sha384sum";
|
|
245
|
+
v29[0] = "[";
|
|
246
|
+
v29[5] = "sha2 utilities";
|
|
247
|
+
v29[7] = "sha2 utilities";
|
|
248
|
+
v29[9] = "sha2 utilities";
|
|
249
|
+
v29[10] = "sha512sum";
|
|
250
|
+
v29[11] = "sha2 utilities";
|
|
251
|
+
v29[12] = 0LL;
|
|
252
|
+
v29[13] = 0LL;
|
|
253
|
+
do
|
|
254
|
+
{
|
|
255
|
+
if ( !strcmp("true", v17) )
|
|
256
|
+
break;
|
|
257
|
+
v17 = v16[2];
|
|
258
|
+
v16 += 2;
|
|
259
|
+
}
|
|
260
|
+
while ( v17 );
|
|
261
|
+
v18 = v16[1];
|
|
262
|
+
if ( v18 )
|
|
263
|
+
{
|
|
264
|
+
v19 = dcgettext(0LL, "\n%s online help: <%s>\n", 5);
|
|
265
|
+
__printf_chk(1LL, v19, &unk_4047, "https://www.gnu.org/software/coreutils/");
|
|
266
|
+
v20 = setlocale(5, 0LL);
|
|
267
|
+
if ( !v20 || !strncmp(v20, "en_", 3uLL) )
|
|
268
|
+
{
|
|
269
|
+
LABEL_19:
|
|
270
|
+
v21 = dcgettext(0LL, "Full documentation <%s%s>\n", 5);
|
|
271
|
+
v22 = " invocation";
|
|
272
|
+
__printf_chk(1LL, v21, "https://www.gnu.org/software/coreutils/", "true");
|
|
273
|
+
if ( v18 != "true" )
|
|
274
|
+
v22 = "";
|
|
275
|
+
LABEL_21:
|
|
276
|
+
v23 = dcgettext(0LL, "or available locally via: info '(coreutils) %s%s'\n", 5);
|
|
277
|
+
__printf_chk(1LL, v23, v18, v22);
|
|
278
|
+
exit(0);
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
else
|
|
282
|
+
{
|
|
283
|
+
v24 = dcgettext(0LL, "\n%s online help: <%s>\n", 5);
|
|
284
|
+
__printf_chk(1LL, v24, &unk_4047, "https://www.gnu.org/software/coreutils/");
|
|
285
|
+
v25 = setlocale(5, 0LL);
|
|
286
|
+
if ( !v25 || !strncmp(v25, "en_", 3uLL) )
|
|
287
|
+
{
|
|
288
|
+
v26 = dcgettext(0LL, "Full documentation <%s%s>\n", 5);
|
|
289
|
+
v18 = "true";
|
|
290
|
+
__printf_chk(1LL, v26, "https://www.gnu.org/software/coreutils/", "true");
|
|
291
|
+
v22 = " invocation";
|
|
292
|
+
goto LABEL_21;
|
|
293
|
+
}
|
|
294
|
+
v18 = "true";
|
|
295
|
+
}
|
|
296
|
+
v27 = stdout;
|
|
297
|
+
v28 = dcgettext(0LL, "Report any translation bugs to <https://translationproject.org/team/>\n", 5);
|
|
298
|
+
fputs_unlocked(v28, v27);
|
|
299
|
+
goto LABEL_19;
|
|
300
|
+
}
|
|
301
|
+
if ( !strcmp(v8, "--version") )
|
|
302
|
+
sub_3410(stdout, v29[0]);
|
|
303
|
+
return 0LL;
|
|
304
|
+
}
|
|
305
|
+
'''
|
|
306
|
+
svar_name_data = ['v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'v10', 'v11', 'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19',
|
|
307
|
+
'v20', 'v21', 'v22', 'v23', 'v24', 'v25', 'v26', 'v27', 'v28', 'v29']
|
|
308
|
+
args_name_data = ["a1", "a2", "a3"]
|
|
309
|
+
# WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
|
|
310
|
+
function = Function(0xdead, 0x1337, header=FunctionHeader("sub_4760", 0xdead, args={}), stack_vars={})
|
|
311
|
+
for i, name in enumerate(svar_name_data):
|
|
312
|
+
function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
|
|
313
|
+
for i, name in enumerate(args_name_data):
|
|
314
|
+
function.args[i] = FunctionArgument(i, name, None, 8)
|
|
315
|
+
|
|
316
|
+
new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
|
|
317
|
+
assert new_names != {}
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
# Run the test suite when this file is executed directly; the process's
# command-line arguments are handed to unittest unchanged.
if __name__ == "__main__":
    unittest.main(argv=sys.argv)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
__version__ = "2.0.0"
|
|
2
|
+
|
|
3
|
+
import importlib.resources
|
|
4
|
+
import tarfile
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import urllib.request
|
|
7
|
+
import hashlib
|
|
8
|
+
import math
|
|
9
|
+
import platform
|
|
10
|
+
import shutil
|
|
11
|
+
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
from libbs.decompilers import GHIDRA_DECOMPILER, IDA_DECOMPILER
|
|
14
|
+
|
|
15
|
+
# initialize logging for the entire project
|
|
16
|
+
import logging
|
|
17
|
+
logging.getLogger("varbert").addHandler(logging.NullHandler())
|
|
18
|
+
from .logger import Loggers
|
|
19
|
+
loggers = Loggers()
|
|
20
|
+
del Loggers
|
|
21
|
+
|
|
22
|
+
from .api import VariableRenamingAPI
|
|
23
|
+
|
|
24
|
+
# Locate the installed package directory. importlib.resources.files() was only
# added in Python 3.9, while this package declares support for 3.8, so fall
# back to this module's own directory when the function is missing.
try:
    _PACKAGE_DIR = Path(str(importlib.resources.files("varbert")))
except AttributeError:
    _PACKAGE_DIR = Path(__file__).parent

# Directory (inside the package) that downloaded models are stored in.
MODELS_PATH = Path(_PACKAGE_DIR / "models").absolute()
# Decompilers that have a dedicated model.
SUPPORTED_MODELS = {GHIDRA_DECOMPILER, IDA_DECOMPILER}
# Model used in place of a dedicated one when the requested decompiler is not supported.
SUBSTITUTE_DECOMPILER_MODEL = IDA_DECOMPILER
# Template for the folder name found inside a downloaded archive; "DECOMPILER"
# and "OPT" are substituted by install_model().
MODEL_FOLDER = "DECOMPILER-OPT-Function"
# all models are found here: https://www.dropbox.com/scl/fo/socl7rd5lsv926whylqpn/h?rlkey=i0x74bdipj41hys5rorflxawo
# Keys have the form "<decompiler>-<optimization level>".
MODEL_URLS = {
    # function based models:
    f"{GHIDRA_DECOMPILER}-O0": "https://www.dropbox.com/scl/fi/8xsmmlzypd45icn8csk6y/Ghidra-O0-Function.tar.gz?rlkey=1b92b9ejktoyewjztvo3ns8q1&dl=1",
    f"{IDA_DECOMPILER}-O0": "https://www.dropbox.com/scl/fi/dmmfqqwvwhkswiv48ltfs/IDA-O0-Function.tar.gz?rlkey=3unxmiydbm5si3n7jh5r43qjp&dl=1",
    f"{GHIDRA_DECOMPILER}-O2": "https://www.dropbox.com/scl/fi/x5ci28s0aw3i852kg9w1j/Ghidra-O2-Function.tar.gz?rlkey=wpe08afvxelcblgcqndrxmvtm&dl=1",
    f"{IDA_DECOMPILER}-O2": "https://www.dropbox.com/scl/fi/ku26eebbwvug5fu2pc4ek/IDA-O2-Function.tar.gz?rlkey=edlri604hhuohh8n5d7d02tnd&dl=1",
    # binary based models:
    #f"{GHIDRA_DECOMPILER}-O2": "https://www.dropbox.com/scl/fi/nbk5b068z6ffsdl0kgbuw/Ghidra-O2-Binary.tar.gz?rlkey=m83iit4jh5fg6icl5cf2z3yhq&dl=1",
    #f"{IDA_DECOMPILER}-O2": "https://www.dropbox.com/scl/fi/vk0ybwu4uoru4fl61yztw/IDA-O2-Binary.tar.gz?rlkey=9rt8js8qrhkqp2cvvttxrlwd2&dl=1",
}

_l = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def install_model(decompiler, opt_level="O0", reinstall=False):
    """Download and unpack the model for ``decompiler`` into ``MODELS_PATH``.

    :param decompiler: decompiler name; when it is not in ``SUPPORTED_MODELS``
        the model of ``SUBSTITUTE_DECOMPILER_MODEL`` is installed instead.
    :param opt_level: optimization level of the model, used together with the
        decompiler name to pick the entry in ``MODEL_URLS``.
    :param reinstall: when True, an already installed model is replaced;
        otherwise an existing model makes this call a no-op.
    :raises KeyError: no download URL is known for the decompiler/opt_level pair.
    :raises tarfile.TarError: the downloaded archive is unreadable or contains
        entries that would be written outside of ``MODELS_PATH``.
    """
    if decompiler not in SUPPORTED_MODELS:
        _l.warning("Model for decompiler is not supported yet, using model for %s", SUBSTITUTE_DECOMPILER_MODEL)
        decompiler = SUBSTITUTE_DECOMPILER_MODEL

    # check if the model exists
    installed_model = MODELS_PATH / decompiler
    if installed_model.exists() and not reinstall:
        _l.info("Model for %s already exists. Skipping download.", decompiler)
        return

    # Resolve the URL before anything is deleted or downloaded, so that an
    # unknown opt_level cannot destroy a model that is currently installed.
    model_key = f"{decompiler}-{opt_level}"
    if model_key not in MODEL_URLS:
        raise KeyError(f"No model available for {model_key}; known models: {sorted(MODEL_URLS)}")
    url = MODEL_URLS[model_key]

    # saved models on the remote side have some messed up names, so we have to do some
    # string matching here to make sure we download and move the correct stuff
    remote_name = {GHIDRA_DECOMPILER: "Ghidra", IDA_DECOMPILER: "IDA"}.get(decompiler, decompiler)
    dl_model_folder = MODELS_PATH / Path(MODEL_FOLDER.replace("DECOMPILER", remote_name).replace("OPT", opt_level))

    _l.info("Downloading model for %s now...", decompiler)
    MODELS_PATH.mkdir(parents=True, exist_ok=True)
    tar_file_path = _download_file(url, MODELS_PATH / "model.tar.gz")
    try:
        with tarfile.open(tar_file_path, "r:gz") as tar:
            _extract_model_archive(tar, MODELS_PATH)
    finally:
        # delete the old tar, also when extraction failed
        tar_file_path.unlink()

    # Only drop the previous model once its replacement is fully on disk.
    if installed_model.exists():
        shutil.rmtree(installed_model)
    # move the model folder to be in a compliant form
    dl_model_folder.rename(installed_model)


def _extract_model_archive(tar, destination):
    """Extract ``tar`` into ``destination``, refusing entries that would escape it."""
    if hasattr(tarfile, "data_filter"):
        # Interpreters that ship extraction filters reject absolute paths,
        # parent-directory escapes, outside links and device nodes themselves.
        tar.extractall(path=destination, filter="data")
        return

    # Older interpreters: validate every member by hand before extracting.
    root = Path(destination).resolve()
    for member in tar.getmembers():
        if member.isdev():
            raise tarfile.TarError(f"Refusing to extract device node {member.name!r}")
        targets = [root / member.name]
        if member.issym():
            # symlink targets are relative to the directory holding the link
            targets.append((root / member.name).parent / member.linkname)
        elif member.islnk():
            # hard link targets are relative to the archive root
            targets.append(root / member.linkname)
        for target in targets:
            resolved = target.resolve()
            if resolved != root and root not in resolved.parents:
                raise tarfile.TarError(f"Refusing to extract {member.name!r}: it resolves outside of {root}")
    tar.extractall(path=destination)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _download_file(url: str, save_location: Path, verify_hash=False) -> Path:
    """Stream the content at ``url`` into ``save_location`` and return that path.

    :param url: HTTP(S) address to download.
    :param save_location: file that receives the downloaded bytes; it is
        overwritten if present and removed again if the download fails.
    :param verify_hash: accepted for backward compatibility; no reference hash
        exists for the model archives, so the value is currently ignored.
    :raises Exception: the server answered with a status other than 200.
    :raises urllib.error.URLError: the connection could not be established.
    """
    import ssl
    import urllib.error

    try:
        response = urllib.request.urlopen(url)
    except urllib.error.URLError as err:
        # Some macOS Python installations ship without a usable CA bundle. Keep
        # the previous best-effort behavior for them, but only after normal
        # verification has failed and only for this single request, instead of
        # disabling certificate checks for the entire process.
        cert_failure = isinstance(err.reason, ssl.SSLCertVerificationError)
        if not (cert_failure and platform.system() == "Darwin"):
            raise
        logging.getLogger(__name__).warning(
            "TLS certificate verification failed for %s; retrying WITHOUT verification", url
        )
        # NOTE(security): an unverified download can be tampered with in transit.
        response = urllib.request.urlopen(url, context=ssl._create_unverified_context())

    with response:
        if response.status != 200:
            raise Exception(f"HTTP error {response.status}: {response.reason}")

        # length is None when the server does not announce a Content-Length
        total_size = response.length
        if total_size is None:
            description = "Downloading model..."
        else:
            description = f"Downloading model ~{int(total_size / 1000000)} MB..."

        chunk_size = 8192
        try:
            with open(save_location, 'wb') as f, \
                    tqdm(total=total_size, unit="B", unit_scale=True, desc=description) as progress:
                # read until the stream is exhausted rather than for a
                # precomputed number of chunks, so short reads cannot
                # truncate the file
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    f.write(chunk)
                    progress.update(len(chunk))
        except BaseException:
            # never leave a truncated archive behind for a later run to trip over
            Path(save_location).unlink(missing_ok=True)
            raise

    return save_location
|
|
107
|
+
|
|
108
|
+
|