varbert 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
varbert-2.0.0/LICENSE ADDED
@@ -0,0 +1,24 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2023, BinSync
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
varbert-2.0.0/PKG-INFO ADDED
@@ -0,0 +1,98 @@
1
+ Metadata-Version: 2.1
2
+ Name: varbert
3
+ Version: 2.0.0
4
+ Summary: The VarBERT API for renaming variables in decompiled code.
5
+ License: BSD 2 Clause
6
+ Project-URL: Homepage, https://github.com/binsync/varbert_api
7
+ Classifier: License :: OSI Approved :: BSD License
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.8
10
+ Requires-Python: >=3.8
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: torch
14
+ Requires-Dist: transformers
15
+ Requires-Dist: tqdm
16
+ Requires-Dist: dailalib
17
+ Requires-Dist: libbs
18
+
19
+ # VarBERT API
20
+ The VarBERT API is a Python library to access and use the latest models from the S&P 2024 work
21
+ [""Len or index or count, anything but v1": Predicting Variable Names in Decompilation Output with Transfer Learning"](), featuring VarBERT.
22
+ VarBERT is a BERT-based model that predicts variable names for decompiled code.
23
+ To train new models and understand the pipeline, see the [VarBERT paper repo]().
24
+ Specialized models exist for IDA Pro and Ghidra, but can be used on any decompiler.
25
+
26
+ <p align="center">
27
+ <img src="./assets/varbert_no_background.png" style="width: 50%;" alt="DAILA context menu"/>
28
+ </p>
29
+
30
+ The main focus of this project is to provide a library API and CLI access to VarBERT models, but it has
31
+ been designed to be used directly inside a decompiler through the [DAILA](https://github.com/mahaloz/DAILA) project.
32
+ DAILA comes with the VarBERT API bundled, so you do not need to install VarBERT if you are using DAILA.
33
+
34
+ ## Install
35
+ ```
36
+ pip3 install varbert && varbert --download-models
37
+ ```
38
+
39
+ This will install the VarBERT API library and download the models to be stored inside the VarBERT package.
40
+ You can optionally provide a decompiler name to `--download-models` to only download the models for that decompiler.
41
+
42
+ ## Usage
43
+ The VarBERT API can be used in three ways:
44
+ - From the CLI, directly on decompiled text (without an attached decompiler)
45
+ - As a scripting library
46
+ - As a decompiler plugin (using [DAILA](https://github.com/mahaloz/DAILA))
47
+
48
+ ### Command Line (without running a decompiler)
49
+ Note that VarBERT runs better when it is directly hooked up to a decompiler because it can use additional semantic information that the decompiler knows about the decompiled code.
50
+ However, we do have the ability to run VarBERT without a running decompiler, only operating on the text from the command line.
51
+
52
+ Running the following will cause VarBERT to read a function from standard input and output the function with predicted variable names to standard out:
53
+ ```bash
54
+ varbert --predict --decompiler ida
55
+ ```
56
+
57
+ You can select different decompilers that will use different models that are trained on the different decompilers.
58
+ If you do not specify a decompiler, the default is IDA Pro.
59
+ As an example, you can also give no decompiler:
60
+ ```bash
61
+ echo "__int64 sub_400664(char *a1,char *a2)\n {}" | varbert -p
62
+ ```
63
+
64
+ ### Scripting
65
+ #### Without Decompiler
66
+ ```python
67
+ from varbert import VariableRenamingAPI
68
+ api = VariableRenamingAPI(decompiler_name="ida", use_decompiler=False)
69
+ new_names, new_code = api.predict_variable_names(decompilation_text="__int64 sub_400664(char *a1,char *a2)\n {}", use_decompiler=False)
70
+ print(new_code)
71
+ ```
72
+
73
+ You can also find more examples in the [tests.py](./tests/tests.py) file.
74
+
75
+ #### Inside Decompiler
76
+ You can use VarBERT as a scripting library inside your decompiler, utilizing LibBS.
77
+ ```python
78
+ from varbert import VariableRenamingAPI
79
+ from libbs.api import DecompilerInterface
80
+ dec = DecompilerInterface()
81
+ api = VariableRenamingAPI(decompiler_interface=dec)
82
+ for func_addr in dec.functions:
83
+ new_names, new_code = api.predict_variable_names(function=dec.functions[func_addr])
84
+ print(new_names)
85
+ ```
86
+
87
+ ### As a Decompiler Plugin
88
+ If you would like to use VarBERT as a decompiler plugin, you can use [DAILA](https://github.com/mahaloz/DAILA).
89
+ You should follow the instructions on the DAILA repo to install DAILA, but it's generally as simple as:
90
+ ```bash
91
+ pip3 install dailalib && daila --install
92
+ ```
93
+
94
+ ## Citing
95
+ If you use VarBERT in your research, please cite our paper:
96
+ ```
97
+ TODO
98
+ ```
@@ -0,0 +1,80 @@
1
+ # VarBERT API
2
+ The VarBERT API is a Python library to access and use the latest models from the S&P 2024 work
3
+ [""Len or index or count, anything but v1": Predicting Variable Names in Decompilation Output with Transfer Learning"](), featuring VarBERT.
4
+ VarBERT is a BERT-based model that predicts variable names for decompiled code.
5
+ To train new models and understand the pipeline, see the [VarBERT paper repo]().
6
+ Specialized models exist for IDA Pro and Ghidra, but can be used on any decompiler.
7
+
8
+ <p align="center">
9
+ <img src="./assets/varbert_no_background.png" style="width: 50%;" alt="DAILA context menu"/>
10
+ </p>
11
+
12
+ The main focus of this project is to provide a library API and CLI access to VarBERT models, but it has
13
+ been designed to be used directly inside a decompiler through the [DAILA](https://github.com/mahaloz/DAILA) project.
14
+ DAILA comes with the VarBERT API bundled, so you do not need to install VarBERT if you are using DAILA.
15
+
16
+ ## Install
17
+ ```
18
+ pip3 install varbert && varbert --download-models
19
+ ```
20
+
21
+ This will install the VarBERT API library and download the models to be stored inside the VarBERT package.
22
+ You can optionally provide a decompiler name to `--download-models` to only download the models for that decompiler.
23
+
24
+ ## Usage
25
+ The VarBERT API can be used in three ways:
26
+ - From the CLI, directly on decompiled text (without an attached decompiler)
27
+ - As a scripting library
28
+ - As a decompiler plugin (using [DAILA](https://github.com/mahaloz/DAILA))
29
+
30
+ ### Command Line (without running a decompiler)
31
+ Note that VarBERT runs better when it is directly hooked up to a decompiler because it can use additional semantic information that the decompiler knows about the decompiled code.
32
+ However, we do have the ability to run VarBERT without a running decompiler, only operating on the text from the command line.
33
+
34
+ Running the following will cause VarBERT to read a function from standard input and output the function with predicted variable names to standard out:
35
+ ```bash
36
+ varbert --predict --decompiler ida
37
+ ```
38
+
39
+ You can select different decompilers that will use different models that are trained on the different decompilers.
40
+ If you do not specify a decompiler, the default is IDA Pro.
41
+ As an example, you can also give no decompiler:
42
+ ```bash
43
+ echo "__int64 sub_400664(char *a1,char *a2)\n {}" | varbert -p
44
+ ```
45
+
46
+ ### Scripting
47
+ #### Without Decompiler
48
+ ```python
49
+ from varbert import VariableRenamingAPI
50
+ api = VariableRenamingAPI(decompiler_name="ida", use_decompiler=False)
51
+ new_names, new_code = api.predict_variable_names(decompilation_text="__int64 sub_400664(char *a1,char *a2)\n {}", use_decompiler=False)
52
+ print(new_code)
53
+ ```
54
+
55
+ You can also find more examples in the [tests.py](./tests/tests.py) file.
56
+
57
+ #### Inside Decompiler
58
+ You can use VarBERT as a scripting library inside your decompiler, utilizing LibBS.
59
+ ```python
60
+ from varbert import VariableRenamingAPI
61
+ from libbs.api import DecompilerInterface
62
+ dec = DecompilerInterface()
63
+ api = VariableRenamingAPI(decompiler_interface=dec)
64
+ for func_addr in dec.functions:
65
+ new_names, new_code = api.predict_variable_names(function=dec.functions[func_addr])
66
+ print(new_names)
67
+ ```
68
+
69
+ ### As a Decompiler Plugin
70
+ If you would like to use VarBERT as a decompiler plugin, you can use [DAILA](https://github.com/mahaloz/DAILA).
71
+ You should follow the instructions on the DAILA repo to install DAILA, but it's generally as simple as:
72
+ ```bash
73
+ pip3 install dailalib && daila --install
74
+ ```
75
+
76
+ ## Citing
77
+ If you use VarBERT in your research, please cite our paper:
78
+ ```
79
+ TODO
80
+ ```
@@ -0,0 +1,41 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.2"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "varbert"
7
+ classifiers = [
8
+ "License :: OSI Approved :: BSD License",
9
+ "Programming Language :: Python :: 3",
10
+ "Programming Language :: Python :: 3.8",
11
+ ]
12
+ license = {text = "BSD 2 Clause"}
13
+ description = "The VarBERT API for renaming variables in decompiled code."
14
+ urls = {Homepage = "https://github.com/binsync/varbert_api"}
15
+ requires-python = ">= 3.8"
16
+ dependencies = [
17
+ "torch",
18
+ "transformers",
19
+ "tqdm",
20
+ "dailalib",
21
+ "libbs"
22
+ ]
23
+ dynamic = ["version"]
24
+
25
+ [project.readme]
26
+ file = "README.md"
27
+ content-type = "text/markdown"
28
+
29
+ [project.scripts]
30
+ varbert = "varbert.__main__:main"
31
+
32
+ [tool.setuptools]
33
+ include-package-data = true
34
+ license-files = ["LICENSE"]
35
+
36
+ [tool.setuptools.packages]
37
+ find = {namespaces = false}
38
+
39
+ [tool.setuptools.dynamic]
40
+ version = {attr = "varbert.__version__"}
41
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,321 @@
1
+ import unittest
2
+ import sys
3
+ from typing import Dict
4
+
5
+ from varbert import VariableRenamingAPI
6
+ from yodalib.data import Function, FunctionArgument, FunctionHeader, StackVariable
7
+
8
+
9
def function_with_new_names(function: Function, new_names: Dict[str, str]):
    """Return a copy of ``function`` whose variables are renamed per ``new_names``.

    The copy is obtained from ``function.copy()``; the renames are applied to
    the copy's ``args`` and ``stack_vars`` values only.

    :param function: object exposing ``copy()`` plus ``args`` and
        ``stack_vars`` mappings whose values carry a ``name`` attribute.
    :param new_names: mapping of current variable name -> replacement name.
    :return: the renamed copy.
    """
    new_func: Function = function.copy()

    # Apply the whole mapping in a single pass, keyed on the name each variable
    # had *before* any rename. Applying the entries one after another (as a
    # nested loop over the mapping would) lets an earlier rename be picked up
    # by a later entry, so {"a1": "a2", "a2": "a1"} would collapse both
    # variables onto the same name instead of swapping them.
    for var in (*new_func.args.values(), *new_func.stack_vars.values()):
        if var.name in new_names:
            var.name = new_names[var.name]

    return new_func
21
+
22
+
23
+ class TestBinSyncRenaming(unittest.TestCase):
24
+ def test_renaming(self):
25
+ api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")
26
+
27
+ # testing text
28
+ function_text = "__int64 __fastcall sub_5E007(__int64 a1, __int64 a2, const char *a3)\n{\n __int64 v3; // rcx\n __int64 v4; // r8\n __int64 v5; // r9\n unsigned int v7; // [rsp+20h] [rbp-10h]\n int v8; // [rsp+24h] [rbp-Ch]\n __int64 v9; // [rsp+28h] [rbp-8h]\n\n v9 = qword_246250;\n v7 = 0;\n v8 = atoi(a3);\n if ( v8 >= 0 )\n {\n while ( v9 )\n {\n *(v9 + 3240) = v8;\n v9 = *(v9 + 21664);\n ++v7;\n }\n if ( a2 )\n sub_33178(\n 4,\n \"WARN: [%s] plugin name not supported for key 'telemetry_dump_kafka_topic_rr'. Globalized.\\n\",\n a1,\n v3,\n v4,\n v5);\n return v7;\n }\n else\n {\n sub_33178(4, \"WARN: [%s] 'telemetry_dump_kafka_topic_rr' has to be >= 0.\\n\", a1, v3, v4, v5);\n return 0xFFFFFFFFLL;\n }\n}\n// 5E064: variable 'v3' is possibly undefined\n// 5E064: variable 'v4' is possibly undefined\n// 5E064: variable 'v5' is possibly undefined\n// 246250: using guessed type __int64 qword_246250;\n"
29
+ svar_name_data = ["v3", "v4", "v5", "v7", "v8", "v9"]
30
+ args_name_data = ["a1", "a2", "a3"]
31
+
32
+ # WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
33
+ function = Function(0xdead, 0x1337, header=FunctionHeader("sub_5E007", 0xdead, args={}), stack_vars={})
34
+ for i, name in enumerate(svar_name_data):
35
+ function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
36
+ for i, name in enumerate(args_name_data):
37
+ function.args[i] = FunctionArgument(i, name, None, 8)
38
+
39
+ new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
40
+ new_function = function_with_new_names(function, new_names)
41
+
42
+ assert new_function.args[0] != function.args[0]
43
+ assert new_function.args[1] != function.args[1]
44
+ assert new_function.args[2] != function.args[2]
45
+
46
+ assert new_function.stack_vars[3] != function.stack_vars[3]
47
+
48
+ def test_renaming_1(self):
49
+ api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")
50
+
51
+ # testing text
52
+ function_text = "_int64 __fastcall main(int a1, char **a2, char **a3)\n{\n _BOOL4 v4; // [rsp+1Ch] [rbp-24h] BYREF\n char v5[16]; // [rsp+20h] [rbp-20h] BYREF\n char buf[16]; // [rsp+30h] [rbp-10h] BYREF\n\n buf[8] = 0;\n v5[8] = 0;\n puts(\"Username: \");\n read(0, buf, 8uLL);\n read(0, &v4, 1uLL);\n puts(\"Password: \");\n read(0, v5, 8uLL);\n read(0, &v4, 1uLL);\n v4 = sub_400664(buf, v5);\n if ( !v4 )\n sub_4006FD();\n return sub_4006ED(buf);\n}\n"
53
+ svar_name_data = ['v4', 'v5', 'buf']
54
+ args_name_data = ["a1", "a2", "a3"]
55
+
56
+ # WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
57
+ function = Function(0xdead, 0x1337, header=FunctionHeader("main", 0xdead, args={}), stack_vars={})
58
+ for i, name in enumerate(svar_name_data):
59
+ function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
60
+ for i, name in enumerate(args_name_data):
61
+ function.args[i] = FunctionArgument(i, name, None, 8)
62
+
63
+ new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
64
+ new_function = function_with_new_names(function, new_names)
65
+
66
+ assert new_function.args[0] != function.args[0]
67
+ assert new_function.args[1] != function.args[1]
68
+ assert new_function.args[2] != function.args[2]
69
+
70
+ assert new_function.stack_vars[1] != function.stack_vars[1]
71
+ assert new_function.stack_vars[2] != function.stack_vars[2]
72
+ def test_renaming_2(self):
73
+ api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")
74
+
75
+ # testing text
76
+ function_text = '''__int64 __fastcall main(int a1, char **a2, char **a3, char a4)
77
+ {
78
+ const char *v5; // rbp
79
+
80
+ if ( a1 != 2 )
81
+ return 0LL;
82
+ sub_2B30(*a2);
83
+ setlocale(6, "");
84
+ bindtextdomain("coreutils", "/usr/share/locale");
85
+ textdomain("coreutils");
86
+ sub_5550(sub_2A80);
87
+ v5 = a2[1];
88
+ if ( !strcmp(v5, "--help") )
89
+ sub_2700(0);
90
+ if ( !strcmp(v5, "--version") )
91
+ sub_4DC0(stdout, "true", &unk_6084, Version, "Jim Meyering", 0, a4);
92
+ return 0LL;
93
+ }
94
+ '''
95
+ svar_name_data = ['v5']
96
+ args_name_data = ["a1", "a2", "a3", "a4"]
97
+ # WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
98
+ function = Function(0xdead, 0x1337, header=FunctionHeader("main", 0xdead, args={}), stack_vars={})
99
+ for i, name in enumerate(svar_name_data):
100
+ function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
101
+ for i, name in enumerate(args_name_data):
102
+ function.args[i] = FunctionArgument(i, name, None, 8)
103
+
104
+ new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
105
+ new_function = function_with_new_names(function, new_names)
106
+
107
+ assert new_function.args[0] != function.args[0]
108
+ assert new_function.args[1] != function.args[1]
109
+ assert new_function.args[2] != function.args[2]
110
+ assert new_function.args[3] != function.args[3]
111
+
112
+ def test_renaming_3(self):
113
+ api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")
114
+
115
+ # testing text
116
+ function_text = ''' __int64 __fastcall sub_4760(__int64 a1, __int64 a2, __int64 a3, __int64 a4, __int64 a5)
117
+ {
118
+ __int128 v6[2]; // [rsp+0h] [rbp-48h] BYREF
119
+ __m128i si128; // [rsp+20h] [rbp-28h]
120
+ __int64 v8; // [rsp+30h] [rbp-18h]
121
+ unsigned __int64 v9; // [rsp+38h] [rbp-10h]
122
+
123
+ v9 = __readfsqword(0x28u);
124
+ v6[0] = _mm_load_si128(&xmmword_A1E0);
125
+ v8 = qword_A210;
126
+ LODWORD(v6[0]) = 10;
127
+ v6[1] = _mm_load_si128(&xmmword_A1F0);
128
+ si128 = _mm_load_si128(&xmmword_A200);
129
+ if ( !a2 || !a3 )
130
+ abort();
131
+ si128.m128i_i64[1] = a2;
132
+ v8 = a3;
133
+ return sub_3F20(a1, a4, a5, v6);
134
+ }'''
135
+ svar_name_data = ['v6', "s128", "v8", "v9"]
136
+ args_name_data = ["a1", "a2", "a3", "a4", "a5"]
137
+ # WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
138
+ function = Function(0xdead, 0x1337, header=FunctionHeader("sub_4760", 0xdead, args={}), stack_vars={})
139
+ for i, name in enumerate(svar_name_data):
140
+ function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
141
+ for i, name in enumerate(args_name_data):
142
+ function.args[i] = FunctionArgument(i, name, None, 8)
143
+
144
+ new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
145
+ new_function = function_with_new_names(function, new_names)
146
+
147
+ assert new_function.args[0] != function.args[0]
148
+ assert new_function.args[1] != function.args[1]
149
+ assert new_function.args[2] != function.args[2]
150
+ assert new_function.args[3] != function.args[3]
151
+ assert new_function.args[4] != function.args[4]
152
+
153
+ def test_renaming_4(self):
154
+ api = VariableRenamingAPI(use_decompiler=False, decompiler_name="ida")
155
+
156
+ # testing text
157
+ function_text = '''__int64 __fastcall main(int a1, char **a2, char **a3)
158
+ {
159
+ char *v4; // r12
160
+ char *v5; // rax
161
+ char *v6; // rbp
162
+ const char *v7; // r13
163
+ const char *v8; // r13
164
+ char *v9; // rax
165
+ char *v10; // rax
166
+ FILE *v11; // r12
167
+ char *v12; // rax
168
+ FILE *v13; // r12
169
+ char *v14; // rax
170
+ char *v15; // rax
171
+ __int64 *v16; // rbp
172
+ const char *v17; // rsi
173
+ const char *v18; // r14
174
+ char *v19; // rax
175
+ char *v20; // rdi
176
+ char *v21; // rax
177
+ char *v22; // r12
178
+ char *v23; // rax
179
+ char *v24; // rax
180
+ char *v25; // rdi
181
+ char *v26; // rax
182
+ FILE *v27; // rbp
183
+ char *v28; // rax
184
+ __int64 v29[21]; // [rsp+0h] [rbp-A8h] BYREF
185
+
186
+ v29[15] = __readfsqword(0x28u);
187
+ if ( a1 != 2 )
188
+ return 0LL;
189
+ v4 = *a2;
190
+ if ( !*a2 )
191
+ {
192
+ fwrite("A NULL argv[0] was passed through an exec system call.\n", 1uLL, 0x37uLL, stderr);
193
+ abort();
194
+ }
195
+ v5 = strrchr(v4, 47);
196
+ v6 = v5;
197
+ if ( v5 )
198
+ {
199
+ v7 = v5 + 1;
200
+ if ( v5 + 1 - v4 > 6 && !strncmp(v5 - 6, "/.libs/", 7uLL) )
201
+ {
202
+ v4 = v7;
203
+ if ( !strncmp(v7, "lt-", 3uLL) )
204
+ {
205
+ v4 = v6 + 4;
206
+ program_invocation_short_name = v6 + 4;
207
+ }
208
+ }
209
+ }
210
+ qword_7028 = v4;
211
+ program_invocation_name = v4;
212
+ setlocale(6, "");
213
+ bindtextdomain("coreutils", "/usr/share/locale");
214
+ textdomain("coreutils");
215
+ sub_39D0(sub_3390);
216
+ v8 = a2[1];
217
+ if ( !strcmp(v8, "--help") )
218
+ {
219
+ v9 = dcgettext(0LL, "Usage: %s [ignored command line arguments]\n or: %s OPTION\n", 5);
220
+ __printf_chk(1LL, v9, v4, v4);
221
+ v10 = dcgettext(0LL, "Exit with a status code indicating success.", 5);
222
+ __printf_chk(1LL, "%s\n\n", v10);
223
+ v11 = stdout;
224
+ v12 = dcgettext(0LL, " --help display this help and exit\n", 5);
225
+ fputs_unlocked(v12, v11);
226
+ v13 = stdout;
227
+ v14 = dcgettext(0LL, " --version output version information and exit\n", 5);
228
+ fputs_unlocked(v14, v13);
229
+ v15 = dcgettext(
230
+ 0LL,
231
+ "\n"
232
+ "NOTE: your shell may have its own version of %s, which usually supersedes\n"
233
+ "the version described here. Please refer to your shell's documentation\n"
234
+ "for details about the options it supports.\n",
235
+ 5);
236
+ __printf_chk(1LL, v15, "true");
237
+ v29[2] = "coreutils";
238
+ v29[1] = "test invocation";
239
+ v16 = v29;
240
+ v17 = "[";
241
+ v29[3] = "Multi-call invocation";
242
+ v29[6] = "sha256sum";
243
+ v29[4] = "sha224sum";
244
+ v29[8] = "sha384sum";
245
+ v29[0] = "[";
246
+ v29[5] = "sha2 utilities";
247
+ v29[7] = "sha2 utilities";
248
+ v29[9] = "sha2 utilities";
249
+ v29[10] = "sha512sum";
250
+ v29[11] = "sha2 utilities";
251
+ v29[12] = 0LL;
252
+ v29[13] = 0LL;
253
+ do
254
+ {
255
+ if ( !strcmp("true", v17) )
256
+ break;
257
+ v17 = v16[2];
258
+ v16 += 2;
259
+ }
260
+ while ( v17 );
261
+ v18 = v16[1];
262
+ if ( v18 )
263
+ {
264
+ v19 = dcgettext(0LL, "\n%s online help: <%s>\n", 5);
265
+ __printf_chk(1LL, v19, &unk_4047, "https://www.gnu.org/software/coreutils/");
266
+ v20 = setlocale(5, 0LL);
267
+ if ( !v20 || !strncmp(v20, "en_", 3uLL) )
268
+ {
269
+ LABEL_19:
270
+ v21 = dcgettext(0LL, "Full documentation <%s%s>\n", 5);
271
+ v22 = " invocation";
272
+ __printf_chk(1LL, v21, "https://www.gnu.org/software/coreutils/", "true");
273
+ if ( v18 != "true" )
274
+ v22 = "";
275
+ LABEL_21:
276
+ v23 = dcgettext(0LL, "or available locally via: info '(coreutils) %s%s'\n", 5);
277
+ __printf_chk(1LL, v23, v18, v22);
278
+ exit(0);
279
+ }
280
+ }
281
+ else
282
+ {
283
+ v24 = dcgettext(0LL, "\n%s online help: <%s>\n", 5);
284
+ __printf_chk(1LL, v24, &unk_4047, "https://www.gnu.org/software/coreutils/");
285
+ v25 = setlocale(5, 0LL);
286
+ if ( !v25 || !strncmp(v25, "en_", 3uLL) )
287
+ {
288
+ v26 = dcgettext(0LL, "Full documentation <%s%s>\n", 5);
289
+ v18 = "true";
290
+ __printf_chk(1LL, v26, "https://www.gnu.org/software/coreutils/", "true");
291
+ v22 = " invocation";
292
+ goto LABEL_21;
293
+ }
294
+ v18 = "true";
295
+ }
296
+ v27 = stdout;
297
+ v28 = dcgettext(0LL, "Report any translation bugs to <https://translationproject.org/team/>\n", 5);
298
+ fputs_unlocked(v28, v27);
299
+ goto LABEL_19;
300
+ }
301
+ if ( !strcmp(v8, "--version") )
302
+ sub_3410(stdout, v29[0]);
303
+ return 0LL;
304
+ }
305
+ '''
306
+ svar_name_data = ['v4', 'v5', 'v6', 'v7', 'v8', 'v9', 'v10', 'v11', 'v12', 'v13', 'v14', 'v15', 'v16', 'v17', 'v18', 'v19',
307
+ 'v20', 'v21', 'v22', 'v23', 'v24', 'v25', 'v26', 'v27', 'v28', 'v29']
308
+ args_name_data = ["a1", "a2", "a3"]
309
+ # WARNING: the offsets that these stack variables have as BinSync objects are not real and are only for this testcase
310
+ function = Function(0xdead, 0x1337, header=FunctionHeader("sub_4760", 0xdead, args={}), stack_vars={})
311
+ for i, name in enumerate(svar_name_data):
312
+ function.stack_vars[i] = StackVariable(i, name, None, 8, function.addr)
313
+ for i, name in enumerate(args_name_data):
314
+ function.args[i] = FunctionArgument(i, name, None, 8)
315
+
316
+ new_names, _ = api.predict_variable_names(function, decompilation_text=function_text, use_decompiler=False)
317
+ assert new_names != {}
318
+
319
+
320
+ if __name__ == "__main__":
321
+ unittest.main(argv=sys.argv)
@@ -0,0 +1,108 @@
1
+ __version__ = "2.0.0"
2
+
3
+ import importlib.resources
4
+ import tarfile
5
+ from pathlib import Path
6
+ import urllib.request
7
+ import hashlib
8
+ import math
9
+ import platform
10
+ import shutil
11
+
12
+ from tqdm import tqdm
13
+ from libbs.decompilers import GHIDRA_DECOMPILER, IDA_DECOMPILER
14
+
15
+ # initialize logging for the entire project
16
+ import logging
17
+ logging.getLogger("varbert").addHandler(logging.NullHandler())
18
+ from .logger import Loggers
19
+ loggers = Loggers()
20
+ del Loggers
21
+
22
+ from .api import VariableRenamingAPI
23
+
24
+ MODELS_PATH = Path(Path(str(importlib.resources.files("varbert"))) / "models").absolute()
25
+ SUPPORTED_MODELS = {GHIDRA_DECOMPILER, IDA_DECOMPILER}
26
+ SUBSTITUTE_DECOMPILER_MODEL = IDA_DECOMPILER
27
+ MODEL_FOLDER = "DECOMPILER-OPT-Function"
28
+ # all models are found here: https://www.dropbox.com/scl/fo/socl7rd5lsv926whylqpn/h?rlkey=i0x74bdipj41hys5rorflxawo
29
+ MODEL_URLS = {
30
+ # function based models:
31
+ f"{GHIDRA_DECOMPILER}-O0": "https://www.dropbox.com/scl/fi/8xsmmlzypd45icn8csk6y/Ghidra-O0-Function.tar.gz?rlkey=1b92b9ejktoyewjztvo3ns8q1&dl=1",
32
+ f"{IDA_DECOMPILER}-O0": "https://www.dropbox.com/scl/fi/dmmfqqwvwhkswiv48ltfs/IDA-O0-Function.tar.gz?rlkey=3unxmiydbm5si3n7jh5r43qjp&dl=1",
33
+ f"{GHIDRA_DECOMPILER}-O2": "https://www.dropbox.com/scl/fi/x5ci28s0aw3i852kg9w1j/Ghidra-O2-Function.tar.gz?rlkey=wpe08afvxelcblgcqndrxmvtm&dl=1",
34
+ f"{IDA_DECOMPILER}-O2": "https://www.dropbox.com/scl/fi/ku26eebbwvug5fu2pc4ek/IDA-O2-Function.tar.gz?rlkey=edlri604hhuohh8n5d7d02tnd&dl=1",
35
+ # binary based models:
36
+ #f"{GHIDRA_DECOMPILER}-O2": "https://www.dropbox.com/scl/fi/nbk5b068z6ffsdl0kgbuw/Ghidra-O2-Binary.tar.gz?rlkey=m83iit4jh5fg6icl5cf2z3yhq&dl=1",
37
+ #f"{IDA_DECOMPILER}-O2": "https://www.dropbox.com/scl/fi/vk0ybwu4uoru4fl61yztw/IDA-O2-Binary.tar.gz?rlkey=9rt8js8qrhkqp2cvvttxrlwd2&dl=1",
38
+ }
39
+
40
+ _l = logging.getLogger(__name__)
41
+
42
+
43
def install_model(decompiler, opt_level="O0", reinstall=False):
    """Download and unpack the model for ``decompiler`` into ``MODELS_PATH``.

    The archive is fetched from ``MODEL_URLS``, extracted under ``MODELS_PATH``
    and the extracted folder is renamed to ``MODELS_PATH / decompiler``.

    :param decompiler: decompiler name; anything not in ``SUPPORTED_MODELS``
        falls back to ``SUBSTITUTE_DECOMPILER_MODEL`` (with a warning).
    :param opt_level: optimization level part of the model key (e.g. ``"O0"``).
    :param reinstall: when True, replace an already-installed model; when
        False, an existing model makes this function return without downloading.
    :raises KeyError: if there is no URL for the decompiler/opt-level pair.
    :raises tarfile.TarError: if the archive is malformed or contains a member
        that would be written outside ``MODELS_PATH``.
    """
    if decompiler not in SUPPORTED_MODELS:
        _l.warning("Model for decompiler is not supported yet, using model for %s", SUBSTITUTE_DECOMPILER_MODEL)
        decompiler = SUBSTITUTE_DECOMPILER_MODEL

    # check if the model exists
    decompiler_model = MODELS_PATH / decompiler
    if decompiler_model.exists() and not reinstall:
        _l.info(f"Model for {decompiler} already exists. Skipping download.")
        return

    # Resolve the URL before touching anything on disk, so that a bad
    # opt_level cannot cost the user an already-installed model.
    model_key = f"{decompiler}-{opt_level}"
    try:
        url = MODEL_URLS[model_key]
    except KeyError:
        raise KeyError(
            f"No model available for {model_key!r}; known models: {sorted(MODEL_URLS)}"
        ) from None

    # saved models on the remote side have some messed up names, so we have to do some
    # string matching here to make sure we download and move the correct stuff
    remote_name = {GHIDRA_DECOMPILER: "Ghidra", IDA_DECOMPILER: "IDA"}.get(decompiler, decompiler)
    dl_model_folder = MODELS_PATH / Path(MODEL_FOLDER.replace("DECOMPILER", remote_name).replace("OPT", opt_level))

    MODELS_PATH.mkdir(parents=True, exist_ok=True)
    _l.info(f"Downloading model for {decompiler} now...")
    tar_file_path = _download_file(url, MODELS_PATH / "model.tar.gz")
    try:
        with tarfile.open(tar_file_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters reject absolute paths, ".." traversal and
                # links pointing outside the destination.
                tar.extractall(path=MODELS_PATH, filter="data")
            else:
                # Older interpreters: at least refuse members whose path
                # resolves outside of the models directory.
                # NOTE(review): this fallback does not vet symlink targets.
                root = MODELS_PATH.resolve()
                for member in tar.getmembers():
                    target = (root / member.name).resolve()
                    if target != root and root not in target.parents:
                        raise tarfile.TarError(f"Unsafe path in model archive: {member.name!r}")
                tar.extractall(path=MODELS_PATH)
    finally:
        # delete the downloaded tar whether or not extraction succeeded
        tar_file_path.unlink()

    # Only now drop the previous install (reinstall case), then move the
    # extracted folder to its compliant name.
    if decompiler_model.exists():
        shutil.rmtree(decompiler_model)
    dl_model_folder.rename(decompiler_model)
76
+
77
+
78
def _download_file(url: str, save_location: Path, verify_hash=False) -> Path:
    """Stream ``url`` to ``save_location`` while showing a progress bar.

    :param url: address to download from.
    :param save_location: file path the response body is written to.
    :param verify_hash: accepted for backward compatibility; no hash
        verification is performed because no reference hash exists for the
        model archives.
    :return: ``save_location``.
    :raises Exception: if the response status is not 200.
    """
    ssl_context = None
    if platform.system() == "Darwin":
        # XXX(security): certificate verification is disabled on macOS as a
        # workaround, so the download is NOT authenticated there. The
        # unverified context is scoped to this single request instead of
        # being installed as the process-wide default for every later HTTPS
        # connection.
        import ssl
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE

    chunk_size = 8192
    with urllib.request.urlopen(url, context=ssl_context) as response:
        if response.status != 200:
            raise Exception(f"HTTP error {response.status}: {response.reason}")

        # ``length`` is None when the server sends no Content-Length
        # (e.g. chunked transfer); the progress bar then runs without a total.
        total_size = response.length
        if total_size:
            desc = f"Downloading model ~{int(total_size / 1000000)} MB..."
        else:
            desc = "Downloading model..."

        try:
            with open(save_location, 'wb') as f, \
                    tqdm(total=total_size, unit="B", unit_scale=True, desc=desc) as progress:
                # Read until EOF rather than for a precomputed number of
                # iterations: a short read must not truncate the file.
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break

                    f.write(chunk)
                    progress.update(len(chunk))
        except BaseException:
            # do not leave a partial archive behind for a later run to trip on
            partial = Path(save_location)
            if partial.exists():
                partial.unlink()
            raise

    return save_location
107
+
108
+