angr 9.2.146__py3-none-manylinux2014_x86_64.whl → 9.2.147__py3-none-manylinux2014_x86_64.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of angr might be problematic. Click here for more details.

angr/__init__.py CHANGED
@@ -2,7 +2,7 @@
2
2
  # pylint: disable=wrong-import-position
3
3
  from __future__ import annotations
4
4
 
5
- __version__ = "9.2.146"
5
+ __version__ = "9.2.147"
6
6
 
7
7
  if bytes is str:
8
8
  raise Exception(
angr/analyses/bindiff.py CHANGED
@@ -4,11 +4,13 @@ import math
4
4
  import types
5
5
  from collections import deque, defaultdict
6
6
  from typing import TYPE_CHECKING
7
+ from functools import partial
7
8
 
8
9
  import networkx
9
10
 
10
- from angr.analyses import AnalysesHub, Analysis, CFGEmulated
11
+ from angr.analyses import AnalysesHub, Analysis, CFGFast
11
12
  from angr.errors import SimEngineError, SimMemoryError
13
+ from angr.knowledge_plugins.cfg.memory_data import MemoryDataSort
12
14
 
13
15
 
14
16
  if TYPE_CHECKING:
@@ -140,6 +142,9 @@ def _is_better_match(x, y, matched_a, matched_b, attributes_dict_a, attributes_d
140
142
  :param attributes_dict_b: The attributes for each element in the second set.
141
143
  :returns: True/False
142
144
  """
145
+ if x not in attributes_dict_a or y not in attributes_dict_b:
146
+ return False
147
+
143
148
  attributes_x = attributes_dict_a[x]
144
149
  attributes_y = attributes_dict_b[y]
145
150
  if x in matched_a:
@@ -162,6 +167,11 @@ def differing_constants(block_a, block_b):
162
167
  :returns: Returns a list of differing constants in the form of ConstantChange, which has the offset in the
163
168
  block and the respective constants.
164
169
  """
170
+ if block_a.size == 0 or block_b.size == 0:
171
+ return []
172
+ if not block_a.instruction_addrs or not block_b.instruction_addrs:
173
+ return []
174
+
165
175
  statements_a = [s for s in block_a.vex.statements if s.tag != "Ist_IMark"] + [block_a.vex.next]
166
176
  statements_b = [s for s in block_b.vex.statements if s.tag != "Ist_IMark"] + [block_b.vex.next]
167
177
  if len(statements_a) != len(statements_b):
@@ -858,36 +868,20 @@ class BinDiff(Analysis):
858
868
  This class computes the a diff between two binaries represented by angr Projects
859
869
  """
860
870
 
861
- def __init__(self, other_project, enable_advanced_backward_slicing=False, cfg_a=None, cfg_b=None):
871
+ def __init__(self, other_project, cfg_a=None, cfg_b=None):
862
872
  """
863
873
  :param other_project: The second project to diff
864
874
  """
865
- l.debug("Computing cfg's")
866
-
867
- back_traversal = not enable_advanced_backward_slicing
868
-
869
875
  if cfg_a is None:
870
- # self.cfg_a = self.project.analyses.CFG(resolve_indirect_jumps=True)
871
- # self.cfg_b = other_project.analyses.CFG(resolve_indirect_jumps=True)
872
- self.cfg_a = self.project.analyses[CFGEmulated].prep()(
873
- context_sensitivity_level=1,
874
- keep_state=True,
875
- enable_symbolic_back_traversal=back_traversal,
876
- enable_advanced_backward_slicing=enable_advanced_backward_slicing,
877
- )
878
-
879
- self.cfg_b = other_project.analyses[CFGEmulated].prep()(
880
- context_sensitivity_level=1,
881
- keep_state=True,
882
- enable_symbolic_back_traversal=back_traversal,
883
- enable_advanced_backward_slicing=enable_advanced_backward_slicing,
884
- )
885
-
876
+ l.debug("Computing cfg's")
877
+ self.cfg_a = self.project.analyses[CFGFast].prep(fail_fast=self._fail_fast)().model
878
+ self.cfg_b = other_project.analyses[CFGFast].prep(fail_fast=self._fail_fast)().model
879
+ l.debug("Done computing cfg's")
886
880
  else:
887
881
  self.cfg_a = cfg_a
888
882
  self.cfg_b = cfg_b
889
-
890
- l.debug("Done computing cfg's")
883
+ self.funcs_a = self.kb.functions
884
+ self.funcs_b = other_project.kb.functions
891
885
 
892
886
  self._p2 = other_project
893
887
  self._attributes_a = {}
@@ -908,8 +902,8 @@ class BinDiff(Analysis):
908
902
  :param func_b_addr: The address of the second function (in the second binary).
909
903
  :returns: Whether or not the functions appear to be identical.
910
904
  """
911
- if self.cfg_a.project.is_hooked(func_a_addr) and self.cfg_b.project.is_hooked(func_b_addr):
912
- return self.cfg_a.project._sim_procedures[func_a_addr] == self.cfg_b.project._sim_procedures[func_b_addr]
905
+ if self.project.is_hooked(func_a_addr) and self._p2.is_hooked(func_b_addr):
906
+ return self.project._sim_procedures[func_a_addr] == self._p2._sim_procedures[func_b_addr]
913
907
 
914
908
  func_diff = self.get_function_diff(func_a_addr, func_b_addr)
915
909
  if check_consts:
@@ -992,36 +986,33 @@ class BinDiff(Analysis):
992
986
  """
993
987
  pair = (function_addr_a, function_addr_b)
994
988
  if pair not in self._function_diffs:
995
- function_a = self.cfg_a.kb.functions.function(function_addr_a)
996
- function_b = self.cfg_b.kb.functions.function(function_addr_b)
989
+ function_a = self.funcs_a.function(function_addr_a)
990
+ function_b = self.funcs_b.function(function_addr_b)
997
991
  self._function_diffs[pair] = FunctionDiff(function_a, function_b, self)
998
992
  return self._function_diffs[pair]
999
993
 
1000
994
  @staticmethod
1001
- def _compute_function_attributes(cfg):
995
+ def _compute_function_attributes(funcs, exclude_func_addrs: set[int] | None = None):
1002
996
  """
1003
- :param cfg: An angr CFG object
1004
997
  :returns: a dictionary of function addresses to tuples of attributes
1005
998
  """
1006
999
  # the attributes we use are the number of basic blocks, number of edges, and number of subfunction calls
1007
1000
  attributes = {}
1008
- all_funcs = set(cfg.kb.callgraph.nodes())
1009
- for function_addr in cfg.kb.functions:
1001
+ all_funcs = set(funcs.callgraph)
1002
+ for function_addr in funcs:
1003
+ if not funcs.contains_addr(function_addr):
1004
+ continue
1005
+ if exclude_func_addrs and function_addr in exclude_func_addrs:
1006
+ continue
1007
+ func = funcs.get_by_addr(function_addr)
1010
1008
  # skip syscalls and functions which are None in the cfg
1011
- if cfg.kb.functions.function(function_addr) is None or cfg.kb.functions.function(function_addr).is_syscall:
1009
+ if func.is_syscall or func.is_alignment or func.is_plt:
1012
1010
  continue
1013
- if cfg.kb.functions.function(function_addr) is not None:
1014
- normalized_function = NormalizedFunction(cfg.kb.functions.function(function_addr))
1015
- number_of_basic_blocks = len(normalized_function.graph.nodes())
1016
- number_of_edges = len(normalized_function.graph.edges())
1017
- else:
1018
- number_of_basic_blocks = 0
1019
- number_of_edges = 0
1020
- if function_addr in all_funcs:
1021
- number_of_subfunction_calls = len(list(cfg.kb.callgraph.successors(function_addr)))
1022
- else:
1023
- number_of_subfunction_calls = 0
1024
- attributes[function_addr] = (number_of_basic_blocks, number_of_edges, number_of_subfunction_calls)
1011
+ normalized_function = NormalizedFunction(func)
1012
+ number_of_basic_blocks = len(normalized_function.graph.nodes())
1013
+ number_of_edges = len(normalized_function.graph.edges())
1014
+ number_of_subfunction_calls = funcs.callgraph.out_degree[function_addr] if function_addr in all_funcs else 0
1015
+ attributes[function_addr] = number_of_basic_blocks, number_of_edges, number_of_subfunction_calls
1025
1016
 
1026
1017
  return attributes
1027
1018
 
@@ -1029,8 +1020,8 @@ class BinDiff(Analysis):
1029
1020
  possible_matches = set()
1030
1021
 
1031
1022
  # Make sure those functions are not SimProcedures
1032
- f_a = self.cfg_a.kb.functions.function(func_a)
1033
- f_b = self.cfg_b.kb.functions.function(func_b)
1023
+ f_a = self.funcs_a.function(func_a)
1024
+ f_b = self.funcs_b.function(func_b)
1034
1025
  if f_a.startpoint is None or f_b.startpoint is None:
1035
1026
  return possible_matches
1036
1027
 
@@ -1052,6 +1043,8 @@ class BinDiff(Analysis):
1052
1043
 
1053
1044
  def _get_plt_matches(self):
1054
1045
  plt_matches = []
1046
+ if not hasattr(self.project.loader.main_object, "plt") or not hasattr(self._p2.loader.main_object, "plt"):
1047
+ return []
1055
1048
  for name, addr in self.project.loader.main_object.plt.items():
1056
1049
  if name in self._p2.loader.main_object.plt:
1057
1050
  plt_matches.append((addr, self._p2.loader.main_object.plt[name]))
@@ -1072,18 +1065,18 @@ class BinDiff(Analysis):
1072
1065
  plt_matches.append((addr, func_to_addr_b[name]))
1073
1066
 
1074
1067
  # remove ones that aren't in the interfunction graph, because these seem to not be consistent
1075
- all_funcs_a = set(self.cfg_a.kb.callgraph.nodes())
1076
- all_funcs_b = set(self.cfg_b.kb.callgraph.nodes())
1068
+ all_funcs_a = set(self.funcs_a.callgraph.nodes())
1069
+ all_funcs_b = set(self.funcs_b.callgraph.nodes())
1077
1070
  return [x for x in plt_matches if x[0] in all_funcs_a and x[1] in all_funcs_b]
1078
1071
 
1079
1072
  def _get_name_matches(self):
1080
1073
  names_to_addrs_a = defaultdict(list)
1081
- for f in self.cfg_a.functions.values():
1074
+ for f in self.funcs_a.values():
1082
1075
  if not f.name.startswith("sub_"):
1083
1076
  names_to_addrs_a[f.name].append(f.addr)
1084
1077
 
1085
1078
  names_to_addrs_b = defaultdict(list)
1086
- for f in self.cfg_b.functions.values():
1079
+ for f in self.funcs_b.values():
1087
1080
  if not f.name.startswith("sub_"):
1088
1081
  names_to_addrs_b[f.name].append(f.addr)
1089
1082
 
@@ -1097,23 +1090,305 @@ class BinDiff(Analysis):
1097
1090
 
1098
1091
  return name_matches
1099
1092
 
1100
- def _compute_diff(self):
1101
- # get the attributes for all functions
1102
- self.attributes_a = self._compute_function_attributes(self.cfg_a)
1103
- self.attributes_b = self._compute_function_attributes(self.cfg_b)
1093
+ def _get_string_reference_matches(self) -> list[tuple[int, int]]:
1094
+ strs_main: dict[str, int | None] = {}
1095
+ strs_secondary: dict[str, int | None] = {}
1096
+
1097
+ for mem_data in self.cfg_a.memory_data.values():
1098
+ if mem_data.sort == MemoryDataSort.String:
1099
+ if mem_data.content not in strs_main:
1100
+ strs_main[mem_data.content] = mem_data.addr
1101
+ else:
1102
+ # unfortunately there are multiple strings with the same value...
1103
+ strs_main[mem_data.content] = None
1104
+
1105
+ for mem_data in self.cfg_b.memory_data.values():
1106
+ if mem_data.sort == MemoryDataSort.String:
1107
+ if mem_data.content not in strs_secondary:
1108
+ strs_secondary[mem_data.content] = mem_data.addr
1109
+ else:
1110
+ # unfortunately there are multiple strings with the same value...
1111
+ strs_secondary[mem_data.content] = None
1112
+
1113
+ shared_strs = set(strs_main.keys()) & set(strs_secondary.keys())
1114
+ matches = []
1115
+ # check cross-references
1116
+ for s in shared_strs:
1117
+ if strs_main[s] is None or strs_secondary[s] is None:
1118
+ continue
1119
+ addr_main = strs_main[s]
1120
+ addr_secondary = strs_secondary[s]
1121
+ xrefs_main = self.kb.xrefs.get_xrefs_by_dst(addr_main)
1122
+ xrefs_secondary = self._p2.kb.xrefs.get_xrefs_by_dst(addr_secondary)
1123
+ if len(xrefs_main) == len(xrefs_secondary) == 1:
1124
+ xref_main = next(iter(xrefs_main))
1125
+ xref_secondary = next(iter(xrefs_secondary))
1126
+ cfgnode_main = self.cfg_a.get_any_node(xref_main.block_addr)
1127
+ cfgnode_secondary = self.cfg_b.get_any_node(xref_secondary.block_addr)
1128
+ if cfgnode_main is not None and cfgnode_secondary is not None:
1129
+ matches.append((cfgnode_main.function_address, cfgnode_secondary.function_address))
1130
+
1131
+ return sorted(set(matches))
1132
+
1133
+ @staticmethod
1134
+ def _approximate_matcher_func_block_and_edge_count(
1135
+ main_funcs, secondary_funcs, new_matches: list, size_tolerance=0.1
1136
+ ) -> None:
1137
+ # functions likely match if they have the same number of blocks and the same number of edges
1138
+ main_funcs = sorted(main_funcs, key=lambda x: x.addr)
1139
+ secondary_funcs = sorted(secondary_funcs, key=lambda x: x.addr)
1140
+ m, s = 0, 0
1141
+ while m < len(main_funcs) and s < len(secondary_funcs):
1142
+ mf = main_funcs[m]
1143
+ sf = secondary_funcs[s]
1144
+ # best case: there is a direct match
1145
+ if len(mf.block_addrs_set) == len(sf.block_addrs_set) and len(mf.graph.edges) == len(sf.graph.edges):
1146
+ # ensure function sizes are roughly the same
1147
+ if abs(mf.size - sf.size) / max(mf.size, sf.size) < size_tolerance:
1148
+ l.info(
1149
+ "Approximate matcher (block&edge count) found %#x (%s) and %#x (%s).",
1150
+ mf.addr,
1151
+ mf.name,
1152
+ sf.addr,
1153
+ sf.name,
1154
+ )
1155
+ new_matches.append((mf.addr, sf.addr))
1156
+ m += 1
1157
+ s += 1
1158
+ else:
1159
+ if len(main_funcs) - m > len(secondary_funcs) - s:
1160
+ # more main funcs than secondary funcs; we increment m in case a function in the main binary
1161
+ # is removed
1162
+ m += 1
1163
+ elif len(main_funcs) - m < len(secondary_funcs) - s:
1164
+ # more secondary funcs than main funcs; we increment s in case a function in the secondary
1165
+ # binary is removed
1166
+ s += 1
1167
+ else:
1168
+ m += 1
1169
+ s += 1
1170
+
1171
+ @staticmethod
1172
+ def _get_function_max_addr(func) -> int | None:
1173
+ if not func.block_addrs_set:
1174
+ return None
1175
+ last_block_addr = max(func.block_addrs_set)
1176
+ block_size = func.get_block_size(last_block_addr)
1177
+ return last_block_addr + block_size
1178
+
1179
+ def _get_function_string_refs(self, proj, cfg, func) -> set[bytes]:
1180
+ strs = set()
1181
+ func_max_addr = self._get_function_max_addr(func)
1182
+ if func_max_addr is None:
1183
+ return strs
1184
+ xrefs = proj.kb.xrefs.get_xrefs_by_ins_addr_region(func.addr, func_max_addr)
1185
+ for xref in xrefs:
1186
+ if xref.dst in cfg.memory_data:
1187
+ md = cfg.memory_data[xref.dst]
1188
+ if md.sort == MemoryDataSort.String:
1189
+ strs.add(md.content)
1190
+ return strs
1191
+
1192
+ def _approximate_matcher_func_string_refs(self, main_funcs, secondary_funcs, new_matches: list) -> None:
1193
+ # functions likely match if they both refer to the same strings
1194
+ strs_to_funcs_main = defaultdict(list)
1195
+ strs_to_funcs_secondary = defaultdict(list)
1196
+
1197
+ for func in main_funcs:
1198
+ strs = self._get_function_string_refs(self.project, self.cfg_a, func)
1199
+ if strs:
1200
+ strs_to_funcs_main[frozenset(strs)].append(func)
1201
+
1202
+ for func in secondary_funcs:
1203
+ strs = self._get_function_string_refs(self._p2, self.cfg_b, func)
1204
+ if strs:
1205
+ strs_to_funcs_secondary[frozenset(strs)].append(func)
1206
+
1207
+ for strs_main, funcs_main in strs_to_funcs_main.items():
1208
+ if strs_main in strs_to_funcs_secondary and len(funcs_main) == 1:
1209
+ funcs_secondary = strs_to_funcs_secondary[strs_main]
1210
+ if len(funcs_secondary) == 1:
1211
+ # found a match!
1212
+ mf = funcs_main[0]
1213
+ sf = funcs_secondary[0]
1214
+ l.info(
1215
+ "Approximate matcher (string refs) found %#x (%s) and %#x (%s).",
1216
+ mf.addr,
1217
+ mf.name,
1218
+ sf.addr,
1219
+ sf.name,
1220
+ )
1221
+ new_matches.append((mf.addr, sf.addr))
1104
1222
 
1223
+ @staticmethod
1224
+ def _get_function_callees(
1225
+ proj, funcs, func, main2secondary: dict[int, int] | None = None, funcs_secondary=None
1226
+ ) -> tuple[str, ...]:
1227
+ callees = proj.kb.functions.callgraph.successors(func.addr)
1228
+ # convert callees to meaningful function names
1229
+ callee_names = []
1230
+ for callee in callees:
1231
+ if callee == func.addr:
1232
+ name = "!self"
1233
+ else:
1234
+ if main2secondary is not None and funcs_secondary is not None and callee in main2secondary:
1235
+ callee = main2secondary[callee]
1236
+ func = funcs_secondary.get_by_addr(callee)
1237
+ name = func.name
1238
+ else:
1239
+ func = funcs.get_by_addr(callee)
1240
+ name = None if func.is_default_name else func.name
1241
+ if name is not None:
1242
+ callee_names.append(name)
1243
+ else:
1244
+ # has at least one unknown/unmatched callee
1245
+ return ()
1246
+ return tuple(sorted(callee_names))
1247
+
1248
+ def _approximate_matcher_func_callees(
1249
+ self, main2secondary: dict[int, int], main_funcs, secondary_funcs, new_matches: list
1250
+ ) -> None:
1251
+ # functions likely match if they both call the same callees
1252
+ callees_to_funcs_main = defaultdict(list)
1253
+ callees_to_funcs_secondary = defaultdict(list)
1254
+
1255
+ for func in main_funcs:
1256
+ callees = self._get_function_callees(
1257
+ self.project, self.funcs_a, func, main2secondary=main2secondary, funcs_secondary=self.funcs_b
1258
+ )
1259
+ if callees:
1260
+ callees_to_funcs_main[callees].append(func)
1261
+
1262
+ for func in secondary_funcs:
1263
+ callees = self._get_function_callees(self._p2, self.funcs_b, func)
1264
+ if callees:
1265
+ callees_to_funcs_secondary[callees].append(func)
1266
+
1267
+ for callees_main, funcs_main in callees_to_funcs_main.items():
1268
+ if callees_main in callees_to_funcs_secondary and len(funcs_main) == 1:
1269
+ funcs_secondary = callees_to_funcs_secondary[callees_main]
1270
+ if len(funcs_secondary) == 1:
1271
+ # found a match!
1272
+ mf = funcs_main[0]
1273
+ sf = funcs_secondary[0]
1274
+ l.info(
1275
+ "Approximate matcher (callees) found %#x (%s) and %#x (%s).",
1276
+ mf.addr,
1277
+ mf.name,
1278
+ sf.addr,
1279
+ sf.name,
1280
+ )
1281
+ new_matches.append((mf.addr, sf.addr))
1282
+
1283
+ def _get_approximate_matches_between_matched_pairs(self, matches: list[tuple[int, int]], matcher):
1284
+ sorted_matches = sorted(matches, key=lambda x: x[0])
1285
+ new_matches = []
1286
+
1287
+ for idx, (addr1_main, addr1_secondary) in enumerate(sorted_matches):
1288
+ if idx == len(sorted_matches) - 1:
1289
+ break
1290
+ addr2_main, addr2_secondary = sorted_matches[idx + 1]
1291
+ if addr1_secondary >= addr2_secondary:
1292
+ continue
1293
+
1294
+ # either two main functions are named, or two secondary functions are named
1295
+ f1_main = self.funcs_a.get_by_addr(addr1_main)
1296
+ f2_main = self.funcs_a.get_by_addr(addr2_main)
1297
+ f1_secondary = self.funcs_b.get_by_addr(addr1_secondary)
1298
+ f2_secondary = self.funcs_b.get_by_addr(addr2_secondary)
1299
+ if not (
1300
+ (not f1_main.is_default_name and not f2_main.is_default_name)
1301
+ or (not f1_secondary.is_default_name and not f2_secondary.is_default_name)
1302
+ ):
1303
+ continue
1304
+
1305
+ # are there any functions in between?
1306
+ main_funcaddrs = list(self.funcs_a._function_map.irange(minimum=addr1_main + 1, maximum=addr2_main - 1))
1307
+ secondary_funcaddrs = list(
1308
+ self.funcs_b._function_map.irange(minimum=addr1_secondary + 1, maximum=addr2_secondary - 1)
1309
+ )
1310
+ # eliminate bad funcs
1311
+ main_funcs = [self.funcs_a.get_by_addr(addr) for addr in main_funcaddrs]
1312
+ main_funcs = [
1313
+ f
1314
+ for f in main_funcs
1315
+ if not f.is_syscall and not f.is_simprocedure and not f.is_alignment and not f.is_plt
1316
+ ]
1317
+ secondary_funcs = [self.funcs_b.get_by_addr(addr) for addr in secondary_funcaddrs]
1318
+ secondary_funcs = [
1319
+ f
1320
+ for f in secondary_funcs
1321
+ if not f.is_syscall and not f.is_simprocedure and not f.is_alignment and not f.is_plt
1322
+ ]
1323
+ if (
1324
+ main_funcs
1325
+ and secondary_funcs
1326
+ and len(main_funcs) > 0
1327
+ and len(secondary_funcs) > 0
1328
+ and len(main_funcs) < 100
1329
+ and len(secondary_funcs) < 100
1330
+ ):
1331
+ # more checks
1332
+ matcher(main_funcs, secondary_funcs, new_matches)
1333
+
1334
+ return new_matches
1335
+
1336
+ def _compute_diff(self):
1105
1337
  # get the initial matches
1338
+ l.info("Getting PLT-based matches...")
1106
1339
  initial_matches = self._get_plt_matches()
1340
+ l.info("... initial matches: %d", len(initial_matches))
1341
+
1342
+ l.info("Getting function name-based matches...")
1107
1343
  initial_matches += self._get_name_matches()
1108
- initial_matches += self._get_function_matches(self.attributes_a, self.attributes_b)
1109
- for a, b in initial_matches:
1110
- l.debug("Initially matched (%#x, %#x)", a, b)
1344
+ l.info("... initial matches: %d", len(initial_matches))
1345
+
1346
+ l.info("Getting string reference-based matches...")
1347
+ initial_matches += self._get_string_reference_matches()
1348
+ l.info("... initial matches: %d", len(initial_matches))
1349
+
1350
+ l.info("Getting adjacent function matches based on function block and edge counts...")
1351
+ initial_matches += self._get_approximate_matches_between_matched_pairs(
1352
+ initial_matches, self._approximate_matcher_func_block_and_edge_count
1353
+ )
1354
+ l.info("... initial matches: %d", len(initial_matches))
1355
+
1356
+ l.info("Getting adjacent function matches based on string references...")
1357
+ initial_matches += self._get_approximate_matches_between_matched_pairs(
1358
+ initial_matches, self._approximate_matcher_func_string_refs
1359
+ )
1360
+ l.info("... initial matches: %d", len(initial_matches))
1361
+
1362
+ l.info("Getting adjacent function matches based on callees...")
1363
+ main2secondary = dict(initial_matches)
1364
+ initial_matches += self._get_approximate_matches_between_matched_pairs(
1365
+ initial_matches, partial(self._approximate_matcher_func_callees, main2secondary)
1366
+ )
1367
+ l.info("... initial matches: %d", len(initial_matches))
1368
+
1369
+ # dedup
1370
+ initial_matches = sorted(set(initial_matches))
1371
+ l.info("We got %d initial matches so far. Time to get busy...", len(initial_matches))
1372
+
1373
+ # get the attributes for all functions
1374
+ l.info("Computing function attributes for main project...")
1375
+ self.attributes_a = self._compute_function_attributes(
1376
+ self.funcs_a, exclude_func_addrs={a for a, _ in initial_matches}
1377
+ )
1378
+ l.info("Computing function attributes for secondary project...")
1379
+ self.attributes_b = self._compute_function_attributes(
1380
+ self.funcs_b, exclude_func_addrs={a for _, a in initial_matches}
1381
+ )
1382
+
1383
+ l.info("Getting function attribute-based matches...")
1384
+ attribute_based_matches = self._get_function_matches(self.attributes_a, self.attributes_b)
1385
+ l.info("Got %d attribute-based matches.", len(attribute_based_matches))
1111
1386
 
1112
1387
  # Use a queue so we process matches in the order that they are found
1113
- to_process = deque(initial_matches)
1388
+ to_process = deque(attribute_based_matches)
1114
1389
 
1115
1390
  # Keep track of which matches we've already added to the queue
1116
- processed_matches = set(initial_matches)
1391
+ processed_matches = set(initial_matches + attribute_based_matches)
1117
1392
 
1118
1393
  # Keep a dict of current matches, which will be updated if better matches are found
1119
1394
  matched_a = {}
@@ -1122,8 +1397,8 @@ class BinDiff(Analysis):
1122
1397
  matched_a[x] = y
1123
1398
  matched_b[y] = x
1124
1399
 
1125
- callgraph_a_nodes = set(self.cfg_a.kb.callgraph.nodes())
1126
- callgraph_b_nodes = set(self.cfg_b.kb.callgraph.nodes())
1400
+ callgraph_a_nodes = set(self.funcs_a.callgraph.nodes())
1401
+ callgraph_b_nodes = set(self.funcs_b.callgraph.nodes())
1127
1402
 
1128
1403
  # while queue is not empty
1129
1404
  while to_process:
@@ -1136,10 +1411,10 @@ class BinDiff(Analysis):
1136
1411
  if not self._p2.loader.main_object.contains_addr(func_b):
1137
1412
  continue
1138
1413
 
1139
- func_a_succ = self.cfg_a.kb.callgraph.successors(func_a) if func_a in callgraph_a_nodes else []
1140
- func_b_succ = self.cfg_b.kb.callgraph.successors(func_b) if func_b in callgraph_b_nodes else []
1141
- func_a_pred = self.cfg_a.kb.callgraph.predecessors(func_a) if func_a in callgraph_a_nodes else []
1142
- func_b_pred = self.cfg_b.kb.callgraph.predecessors(func_b) if func_b in callgraph_b_nodes else []
1414
+ func_a_succ = self.funcs_a.callgraph.successors(func_a) if func_a in callgraph_a_nodes else []
1415
+ func_b_succ = self.funcs_b.callgraph.successors(func_b) if func_b in callgraph_b_nodes else []
1416
+ func_a_pred = self.funcs_a.callgraph.predecessors(func_a) if func_a in callgraph_a_nodes else []
1417
+ func_b_pred = self.funcs_b.callgraph.predecessors(func_b) if func_b in callgraph_b_nodes else []
1143
1418
 
1144
1419
  # get possible new matches
1145
1420
  new_matches = set(
@@ -1155,10 +1430,10 @@ class BinDiff(Analysis):
1155
1430
  # for each of the possible new matches add it if it improves the matching
1156
1431
  for x, y in new_matches:
1157
1432
  # skip none functions and syscalls
1158
- func_a = self.cfg_a.kb.functions.function(x)
1433
+ func_a = self.funcs_a.function(x)
1159
1434
  if func_a is None or func_a.is_simprocedure or func_a.is_syscall:
1160
1435
  continue
1161
- func_b = self.cfg_b.kb.functions.function(y)
1436
+ func_b = self.funcs_b.function(y)
1162
1437
  if func_b is None or func_b.is_simprocedure or func_b.is_syscall:
1163
1438
  continue
1164
1439
 
@@ -25,16 +25,20 @@ class CFGArchOptions:
25
25
  "switch_mode_on_nodecode": (bool, True),
26
26
  # Whether we should use byte-based pattern-matching to identify ifuncs
27
27
  "pattern_match_ifuncs": (bool, True),
28
+ # Do we consider ARM-mode code at all
29
+ "has_arm_code": (bool, True),
28
30
  },
29
31
  "ARMHF": {
30
32
  "ret_jumpkind_heuristics": (bool, True),
31
33
  "switch_mode_on_nodecode": (bool, True),
32
34
  "pattern_match_ifuncs": (bool, True),
35
+ "has_arm_code": (bool, True),
33
36
  },
34
37
  "ARMCortexM": {
35
38
  "ret_jumpkind_heuristics": (bool, True),
36
39
  "switch_mode_on_nodecode": (bool, False),
37
40
  "pattern_match_ifuncs": (bool, True),
41
+ "has_arm_code": (bool, False),
38
42
  },
39
43
  }
40
44
 
@@ -83,3 +87,9 @@ class CFGArchOptions:
83
87
 
84
88
  else:
85
89
  super().__setattr__(option_name, option_value)
90
+
91
+ def __getitem__(self, option_name: str):
92
+ return self._options[option_name]
93
+
94
+ def __contains__(self, option_name: str) -> bool:
95
+ return option_name in self._options
@@ -1550,7 +1550,9 @@ class CFGBase(Analysis):
1550
1550
  block = next((b for b in function.blocks), None)
1551
1551
  if block is None:
1552
1552
  continue
1553
- if all(self._is_noop_insn(insn) for insn in block.capstone.insns):
1553
+ if self._is_noop_block(self.project.arch, block) or all(
1554
+ self._is_noop_insn(insn) for insn in block.capstone.insns
1555
+ ):
1554
1556
  # all nops. mark this function as a function alignment
1555
1557
  l.debug("Function chunk %#x is probably used as a function alignment (all nops).", func_addr)
1556
1558
  self.kb.functions[func_addr].alignment = True
@@ -2205,10 +2207,6 @@ class CFGBase(Analysis):
2205
2207
  out_edges = [e for e in g.out_edges(node_) if g.get_edge_data(*e)["jumpkind"] != "Ijk_FakeRet"]
2206
2208
  return len(out_edges) > 1
2207
2209
 
2208
- if len(src_function.block_addrs_set) > 10:
2209
- # ignore functions unless they are extremely small
2210
- return False
2211
-
2212
2210
  if len(all_edges) == 1 and dst_addr != src_addr:
2213
2211
  the_edge = next(iter(all_edges))
2214
2212
  _, dst, data = the_edge
@@ -2251,15 +2249,41 @@ class CFGBase(Analysis):
2251
2249
  candidate = True
2252
2250
 
2253
2251
  if candidate:
2254
- regs = {self.project.arch.sp_offset}
2255
- if hasattr(self.project.arch, "bp_offset") and self.project.arch.bp_offset is not None:
2256
- regs.add(self.project.arch.bp_offset)
2257
- sptracker = self.project.analyses[StackPointerTracker].prep()(
2258
- src_function, regs, track_memory=self._sp_tracking_track_memory
2259
- )
2260
- sp_delta = sptracker.offset_after_block(src_addr, self.project.arch.sp_offset)
2261
- if sp_delta == 0:
2262
- return True
2252
+ # we have two strategies; for small functions, we run SPTracker on the entire function and see if the
2253
+ # stack pointer changes or not; for large functions, we simply detect how far away we jump as well as
2254
+ # if there are any other functions identified between the source and the destination.
2255
+ if len(src_function.block_addrs_set) <= 10:
2256
+ regs = {self.project.arch.sp_offset}
2257
+ if hasattr(self.project.arch, "bp_offset") and self.project.arch.bp_offset is not None:
2258
+ regs.add(self.project.arch.bp_offset)
2259
+ sptracker = self.project.analyses[StackPointerTracker].prep()(
2260
+ src_function, regs, track_memory=self._sp_tracking_track_memory
2261
+ )
2262
+ sp_delta = sptracker.offset_after_block(src_addr, self.project.arch.sp_offset)
2263
+ if sp_delta == 0:
2264
+ return True
2265
+ else:
2266
+ # large function; to speed things up, we don't track sp
2267
+ minaddr, maxaddr = None, None
2268
+ if dst_addr - src_addr >= 0x100:
2269
+ minaddr = src_addr
2270
+ maxaddr = dst_addr
2271
+ elif dst_addr < src_addr:
2272
+ # jumping back; is it jumping beyond the function header?
2273
+ src_func = blockaddr_to_function[src_addr]
2274
+ if dst_addr < src_func.addr and src_func.addr - dst_addr >= 0x100:
2275
+ minaddr = dst_addr
2276
+ maxaddr = src_func.addr
2277
+
2278
+ if minaddr is not None and maxaddr is not None:
2279
+ # are there other function in between?
2280
+ funcaddrs_in_between = list(
2281
+ known_functions._function_map.irange(minimum=minaddr + 1, maximum=maxaddr - 1)
2282
+ )
2283
+ funcs_in_between = [known_functions.get_by_addr(a) for a in funcaddrs_in_between]
2284
+ funcs_in_between = [func for func in funcs_in_between if not func.is_alignment]
2285
+ if len(funcs_in_between) >= 3:
2286
+ return True
2263
2287
 
2264
2288
  return False
2265
2289
 
@@ -2639,7 +2663,7 @@ class CFGBase(Analysis):
2639
2663
  :return: True if the instruction does no-op, False otherwise.
2640
2664
  """
2641
2665
 
2642
- insn_name = insn.insn_name()
2666
+ insn_name = insn.mnemonic
2643
2667
 
2644
2668
  if insn_name == "nop":
2645
2669
  # nops