PyPI - angr - Versions diffs - 9.2.112__py3-none-manylinux2014_aarch64.whl → 9.2.113__py3-none-manylinux2014_aarch64.whl - Mend

angr 9.2.112__py3-none-manylinux2014_aarch64.whl → 9.2.113__py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of angr might be problematic. Click here for more details.

Files changed (31) hide show

angr/analyses/decompiler/optimization_passes/optimization_pass.py CHANGED Viewed

@@ -1,4 +1,5 @@
 # pylint:disable=unused-argument
+import logging
 from typing import TYPE_CHECKING
 from collections.abc import Generator
 from enum import Enum
@@ -11,10 +12,13 @@ from angr.analyses.decompiler.condition_processor import ConditionProcessor
 from angr.analyses.decompiler.goto_manager import GotoManager
 from angr.analyses.decompiler.structuring import RecursiveStructurer, PhoenixStructurer
 from angr.analyses.decompiler.utils import add_labels
+from angr.analyses.decompiler.seq_cf_structure_counter import ControlFlowStructureCounter
 if TYPE_CHECKING:
     from angr.knowledge_plugins.functions import Function
+_l = logging.getLogger(__name__)
 class MultipleBlocksException(Exception):
     """
@@ -274,6 +278,7 @@ class StructuringOptimizationPass(OptimizationPass):
         prevent_new_gotos=True,
         strictly_less_gotos=False,
         recover_structure_fails=True,
+        must_improve_rel_quality=True,
         max_opt_iters=1,
         simplify_ail=True,
         require_gotos=True,
@@ -286,10 +291,15 @@ class StructuringOptimizationPass(OptimizationPass):
         self._max_opt_iters = max_opt_iters
         self._simplify_ail = simplify_ail
         self._require_gotos = require_gotos
+        self._must_improve_rel_quality = must_improve_rel_quality
         self._goto_manager: GotoManager | None = None
         self._prev_graph: networkx.DiGraph | None = None
+        # relative quality metrics (excludes gotos)
+        self._initial_structure_counter = None
+        self._current_structure_counter = None
     def _analyze(self, cache=None) -> bool:
         raise NotImplementedError()
@@ -297,7 +307,7 @@ class StructuringOptimizationPass(OptimizationPass):
         """
         Wrapper for _analyze() that verifies the graph is structurable before and after the optimization.
         """
-        if not self._graph_is_structurable(self._graph):
+        if not self._graph_is_structurable(self._graph, initial=True):
             return
         initial_gotos = self._goto_manager.gotos.copy()
@@ -340,6 +350,10 @@ class StructuringOptimizationPass(OptimizationPass):
                 self.out_graph = None
                 return
+        if self._must_improve_rel_quality and not self._improves_relative_quality():
+            self.out_graph = None
+            return
     def _fixed_point_analyze(self, cache=None):
         for _ in range(self._max_opt_iters):
             if self._require_gotos and not self._goto_manager.gotos:
@@ -359,7 +373,7 @@ class StructuringOptimizationPass(OptimizationPass):
                 self.out_graph = self._prev_graph if self._recover_structure_fails else None
                 break
-    def _graph_is_structurable(self, graph, readd_labels=False) -> bool:
+    def _graph_is_structurable(self, graph, readd_labels=False, initial=False) -> bool:
         """
         Checks whether the input graph is structurable under the Phoenix schema-matching structuring algorithm.
         As a side effect, this will also update the region identifier and goto manager of this optimization pass.
@@ -380,18 +394,74 @@ class StructuringOptimizationPass(OptimizationPass):
         if self._ri is None:
             return False
-        rs = self.project.analyses[RecursiveStructurer].prep(kb=self.kb)(
-            self._ri.region,
-            cond_proc=self._ri.cond_proc,
-            func=self._func,
-            structurer_cls=PhoenixStructurer,
-        )
+        # we should try-catch structuring here because we can often pass completely invalid graphs
+        # that break the assumptions of the structuring algorithm
+        try:
+            rs = self.project.analyses[RecursiveStructurer].prep(kb=self.kb)(
+                self._ri.region,
+                cond_proc=self._ri.cond_proc,
+                func=self._func,
+                structurer_cls=PhoenixStructurer,
+            )
+        # pylint:disable=broad-except
+        except Exception:
+            _l.warning("Internal structuring failed for OptimizationPass on %s", self._func.name)
+            rs = None
         if not rs or not rs.result or not rs.result.nodes or rs.result_incomplete:
             return False
         rs = self.project.analyses.RegionSimplifier(self._func, rs.result, kb=self.kb, variable_kb=self._variable_kb)
-        if not rs or rs.goto_manager is None:
+        if not rs or rs.goto_manager is None or rs.result is None:
             return False
+        self._analyze_simplified_region(rs.result, initial=initial)
         self._goto_manager = rs.goto_manager
         return True
+    # pylint:disable=no-self-use
+    def _analyze_simplified_region(self, region, initial=False):
+        """
+        Analyze the simplified regions after a successful structuring pass.
+        This should be overridden by the subclass if it needs to do anything with the simplified regions for making
+        optimization decisions.
+        """
+        if region is None:
+            return
+        # record quality metrics
+        if self._must_improve_rel_quality:
+            if initial:
+                self._initial_structure_counter = ControlFlowStructureCounter(region)
+            else:
+                self._current_structure_counter = ControlFlowStructureCounter(region)
+    def _improves_relative_quality(self) -> bool:
+        """
+        Checks if the new structured output improves (or maintains) the relative quality of the control flow structures
+        present in the function.
+        For now, this only involves loops
+        """
+        if self._initial_structure_counter is None or self._current_structure_counter is None:
+            _l.warning("Relative quality check failed due to missing structure counters")
+            return True
+        prev_wloops = self._initial_structure_counter.while_loops
+        curr_wloops = self._current_structure_counter.while_loops
+        prev_dloops = self._initial_structure_counter.do_while_loops
+        curr_dloops = self._current_structure_counter.do_while_loops
+        prev_floops = self._initial_structure_counter.for_loops
+        curr_floops = self._current_structure_counter.for_loops
+        total_prev_loops = prev_wloops + prev_dloops + prev_floops
+        total_curr_loops = curr_wloops + curr_dloops + curr_floops
+        # Sometimes, if we mess up structuring you can easily tell because we traded "good" loops for "bad" loops.
+        # Generally, loops are ordered good -> bad as follows: for, while, do-while.
+        # Note: this check is only for _trading_, meaning the total number of loops must be the same.
+        #
+        # 1. We traded to remove a for-loop
+        if curr_floops < prev_floops and total_curr_loops == total_prev_loops:
+            return False
+        return True

angr/analyses/decompiler/optimization_passes/return_duplicator_base.py CHANGED Viewed

@@ -38,6 +38,7 @@ class ReturnDuplicatorBase:
         self.node_idx = count(start=node_idx_start)
         self._max_calls_in_region = max_calls_in_regions
         self._minimize_copies_for_regions = minimize_copies_for_regions
+        self._supergraph = None
         # this should also be set by the optimization passes initer
         self._func = func
@@ -71,6 +72,8 @@ class ReturnDuplicatorBase:
             # for connected in_edges that form a region
             endnode_regions = self._copy_connected_edge_components(endnode_regions, graph)
+        # refresh the supergraph
+        self._supergraph = to_ail_supergraph(graph)
         for region_head, (in_edges, region) in endnode_regions.items():
             is_single_const_ret_region = self._is_simple_return_graph(region)
             for in_edge in in_edges:
@@ -150,6 +153,7 @@ class ReturnDuplicatorBase:
             else:
                 node_copy = copy.deepcopy(node)
                 node_copy.idx = next(self.node_idx)
+                self._fix_copied_node_labels(node_copy)
                 copies[node] = node_copy
             # modify Jump.target_idx and ConditionalJump.{true,false}_target_idx accordingly
@@ -446,3 +450,20 @@ class ReturnDuplicatorBase:
         all_region_block_sets = {}
         _unpack_every_region(top_region, all_region_block_sets)
         return all_region_block_sets
+    @staticmethod
+    def _fix_copied_node_labels(block: Block):
+        for i in range(len(block.statements)):  # pylint:disable=consider-using-enumerate
+            stmt = block.statements[i]
+            if isinstance(stmt, Label):
+                # fix the default name by suffixing it with the new block ID
+                new_name = stmt.name if stmt.name else f"Label_{stmt.ins_addr:x}"
+                if stmt.block_idx is not None:
+                    suffix = f"__{stmt.block_idx}"
+                    if new_name.endswith(suffix):
+                        new_name = new_name[: -len(suffix)]
+                else:
+                    new_name = stmt.name
+                new_name += f"__{block.idx}"
+                block.statements[i] = Label(stmt.idx, new_name, stmt.ins_addr, block_idx=block.idx, **stmt.tags)

angr/analyses/decompiler/optimization_passes/return_duplicator_low.py CHANGED Viewed

@@ -4,7 +4,7 @@ import inspect
 import networkx
 from ailment import Block
-from ailment.statement import ConditionalJump
+from ailment.statement import ConditionalJump, Label
 from .return_duplicator_base import ReturnDuplicatorBase
 from .optimization_pass import StructuringOptimizationPass
@@ -71,23 +71,29 @@ class ReturnDuplicatorLow(StructuringOptimizationPass, ReturnDuplicatorBase):
         return ReturnDuplicatorBase._check(self)
     def _should_duplicate_dst(self, src, dst, graph, dst_is_const_ret=False):
-        return self._is_goto_edge(src, dst, graph=graph, check_for_ifstmts=True)
+        return self._is_goto_edge(src, dst, graph=graph)
     def _is_goto_edge(
         self,
         src: Block,
         dst: Block,
         graph: networkx.DiGraph = None,
-        check_for_ifstmts=True,
         max_level_check=1,
     ):
         """
-        TODO: correct how goto edge addressing works
+        TODO: Implement a more principled way of checking if an edge is a goto edge with Phoenix's structuring info
         This function only exists because a long-standing bug that sometimes reports the if-stmt addr
-        above a goto edge as the goto src. Because of this, we need to check for predecessors above the goto and
-        see if they are a goto. This needs to include Jump to deal with loops.
+        above a goto edge as the goto src.
         """
-        if check_for_ifstmts and graph is not None:
+        # Do a simple and fast check first
+        is_simple_goto = self._goto_manager.is_goto_edge(src, dst)
+        if is_simple_goto:
+            return True
+        if graph is not None:
+            # Special case 1:
+            # We need to check for predecessors above the goto and see if they are a goto.
+            # This needs to include Jump to deal with loops.
             blocks = [src]
             level_blocks = [src]
             for _ in range(max_level_check):
@@ -109,8 +115,104 @@ class ReturnDuplicatorLow(StructuringOptimizationPass, ReturnDuplicatorBase):
                 if self._goto_manager.is_goto_edge(block, dst):
                     return True
-        else:
-            return self._goto_manager.is_goto_edge(src, dst)
+            # Special case 2: A "goto edge" that ReturnDuplicator wants to test might be an edge that Phoenix
+            # includes in its loop region (during the cyclic refinement). In fact, Phoenix tends to include as many
+            # nodes as possible into the loop region, and generate a goto edge (which ends up in the structured code)
+            # from `dst` to the loop successor.
+            # an example of this is captured by the test case `TestDecompiler.test_stty_recover_mode_ret_dup_region`.
+            # until someone (ideally @mahaloz) implements a more principled way of translating "goto statements" that
+            # Phoenix generates and "goto edges" that ReturnDuplicator tests, we rely on the following stopgap to
+            # handle this case.
+            node = dst
+            while True:
+                succs = list(graph.successors(node))
+                if len(succs) != 1:
+                    break
+                succ = succs[0]
+                if succ is node:
+                    # loop!
+                    break
+                succ_preds = list(graph.predecessors(succ))
+                if len(succ_preds) != 1:
+                    break
+                if self._goto_manager.is_goto_edge(node, succ):
+                    return True
+                # keep testing the next edge
+                node = succ
+            # Special case 3: In Phoenix, regions full of only if-stmts can be collapsed and moved. This causes
+            # the goto manager to report gotos that are at the top of the region instead of ones in the middle of it.
+            # Because of this, we need to gather all the nodes above the original src and check if any of them
+            # go to the destination. Additionally, we need to do this on the supergraph to get rid of
+            # goto edges that are removed by Phoenix.
+            # This case is observed in the test case `TestDecompiler.test_tail_tail_bytes_ret_dup`.
+            if self._supergraph is None:
+                return False
+            super_to_og_nodes = {n: self._supergraph.nodes[n]["original_nodes"] for n in self._supergraph.nodes}
+            og_to_super_nodes = {og: super_n for super_n, ogs in super_to_og_nodes.items() for og in ogs}
+            super_src = og_to_super_nodes.get(src, None)
+            super_dst = og_to_super_nodes.get(dst, None)
+            if super_src is None or super_dst is None:
+                return False
+            # collect all nodes which have only an if-stmt in them that are ancestors of super_src
+            check_blks = {super_src}
+            level_blocks = {super_src}
+            for _ in range(10):
+                done = False
+                if_blks = set()
+                for lblock in level_blocks:
+                    preds = list(self._supergraph.predecessors(lblock))
+                    for pred in preds:
+                        only_cond_jump = all(isinstance(s, (ConditionalJump, Label)) for s in pred.statements)
+                        if only_cond_jump:
+                            if_blks.add(pred)
+                    done = len(if_blks) == 0
+                if done:
+                    break
+                check_blks |= if_blks
+                level_blocks = if_blks
+            # convert all the found if-only super-blocks back into their original blocks
+            og_check_blocks = set()
+            for blk in check_blks:
+                og_check_blocks |= set(super_to_og_nodes[blk])
+            # check if any of the original blocks are gotos to the destination
+            goto_hits = 0
+            for block in og_check_blocks:
+                if self._goto_manager.is_goto_edge(block, dst):
+                    goto_hits += 1
+            # Although it is good to find a goto in the if-only block region, having more than a single goto
+            # existing that goes to the same dst is a bad sign. This can be seen in the following test:
+            # TestDecompiler.test_dd_iread_ret_dup_region
+            #
+            # It occurs when you have something like:
+            # ```
+            # if (a || c)
+            #     goto target;
+            # target:
+            # return 0;
+            # ```
+            #
+            #
+            # This looks like an edge from (a, target) and (c, target) but it is actually a single edge.
+            # If you allow both to duplicate you get the following:
+            # ```
+            # if (a):
+            #    return
+            # if (c):
+            #    return
+            # ```
+            # This is not the desired behavior.
+            # So we need to check if there is only a single goto that goes to the destination.
+            return goto_hits == 1
         return False

angr/analyses/decompiler/redundant_label_remover.py CHANGED Viewed

@@ -30,6 +30,9 @@ class RedundantLabelRemover:
         self._walker0 = SequenceWalker(handlers=handlers0)
         self._walker0.walk(self.root)
+        # update jump targets
+        self._update_jump_targets()
         handlers1 = {
             ailment.Block: self._handle_Block,
         }
@@ -37,6 +40,20 @@ class RedundantLabelRemover:
         self._walker1.walk(self.root)
         self.result = self.root
+    def _update_jump_targets(self) -> None:
+        """
+        Update self._jump_targets after the first pass fills in self._new_jump_target.
+        """
+        if self._new_jump_target:
+            jump_targets = set()
+            for jt in self._jump_targets:
+                if jt in self._new_jump_target:
+                    jump_targets.add(self._new_jump_target[jt])
+                else:
+                    jump_targets.add(jt)
+            self._jump_targets = jump_targets
     #
     # Handlers
     #

angr/analyses/decompiler/seq_cf_structure_counter.py ADDED Viewed

@@ -0,0 +1,37 @@
+from angr.analyses.decompiler.sequence_walker import SequenceWalker
+from angr.analyses.decompiler.structuring.structurer_nodes import SwitchCaseNode, LoopNode
+class ControlFlowStructureCounter(SequenceWalker):
+    """
+    Counts the number of different types of control flow structures found in a sequence of nodes.
+    This should be used after the sequence has been simplified.
+    """
+    def __init__(self, node):
+        handlers = {
+            LoopNode: self._handle_Loop,
+        }
+        super().__init__(handlers)
+        self.while_loops = 0
+        self.do_while_loops = 0
+        self.for_loops = 0
+        self.walk(node)
+    def _handle_Loop(self, node: LoopNode, **kwargs):
+        if node.sort == "while":
+            self.while_loops += 1
+        elif node.sort == "do-while":
+            self.do_while_loops += 1
+        elif node.sort == "for":
+            self.for_loops += 1
+        return super()._handle_Loop(node, **kwargs)
+    def _handle_Condition(self, node, parent=None, **kwargs):
+        return super()._handle_Condition(node, parent=parent, **kwargs)
+    def _handle_SwitchCase(self, node: SwitchCaseNode, parent=None, **kwargs):
+        return super()._handle_SwitchCase(node, parent=parent, **kwargs)

angr/analyses/decompiler/structured_codegen/c.py CHANGED Viewed

@@ -2769,9 +2769,7 @@ class CStructuredCodeGenerator(BaseStructuredCodeGenerator, Analysis):
         if offset == 0:
             data_type = renegotiate_type(data_type, base_type)
             if base_type == data_type or (
-                not isinstance(base_type, SimTypeBottom)
-                and not isinstance(data_type, SimTypeBottom)
-                and base_type.size < data_type.size
+                base_type.size is not None and data_type.size is not None and base_type.size < data_type.size
             ):
                 # case 1: we're done because we found it
                 # case 2: we're done because we can never find it and we might as well stop early
@@ -2784,7 +2782,7 @@ class CStructuredCodeGenerator(BaseStructuredCodeGenerator, Analysis):
                     return _force_type_cast(base_type, data_type, expr)
                 return CUnaryOp("Dereference", expr, codegen=self)
-        if isinstance(base_type, SimTypeBottom):
+        if base_type.size is None:
             stride = 1
         else:
             stride = base_type.size // self.project.arch.byte_width or 1
@@ -2968,7 +2966,7 @@ class CStructuredCodeGenerator(BaseStructuredCodeGenerator, Analysis):
             kernel_type = unpack_typeref(unpack_pointer(kernel.type))
             assert kernel_type
-            if isinstance(kernel_type, SimTypeBottom):
+            if kernel_type.size is None:
                 return bail_out()
             kernel_stride = kernel_type.size // self.project.arch.byte_width
@@ -3699,6 +3697,7 @@ class MakeTypecastsImplicit(CStructuredCodeWalker):
                 and isinstance(intermediate_ty, (SimTypeChar, SimTypeInt, SimTypeNum))
                 and isinstance(start_ty, (SimTypeChar, SimTypeInt, SimTypeNum))
             ):
+                assert dst_ty.size and start_ty.size and intermediate_ty.size
                 if dst_ty.size <= start_ty.size and dst_ty.size <= intermediate_ty.size:
                     # this is a down- or neutral-cast with an intermediate step that doesn't matter
                     result = child.expr

angr/analyses/decompiler/structuring/phoenix.py CHANGED Viewed

@@ -719,7 +719,7 @@ class PhoenixStructurer(StructurerBase):
                             break_stmt = Jump(
                                 None,
                                 Const(None, None, successor.addr, self.project.arch.bits),
-                                None,
+                                target_idx=successor.idx if isinstance(successor, Block) else None,
                                 ins_addr=last_src_stmt.ins_addr,
                             )
                             break_node = Block(last_src_stmt.ins_addr, None, statements=[break_stmt])
@@ -727,7 +727,7 @@ class PhoenixStructurer(StructurerBase):
                             break_stmt = Jump(
                                 None,
                                 Const(None, None, successor.addr, self.project.arch.bits),
-                                None,
+                                target_idx=successor.idx if isinstance(successor, Block) else None,
                                 ins_addr=last_src_stmt.ins_addr,
                             )
                             break_node_inner = Block(last_src_stmt.ins_addr, None, statements=[break_stmt])
@@ -744,7 +744,7 @@ class PhoenixStructurer(StructurerBase):
                                 break_stmt = Jump(
                                     None,
                                     Const(None, None, successor.addr, self.project.arch.bits),
-                                    None,
+                                    target_idx=successor.idx if isinstance(successor, Block) else None,
                                     ins_addr=last_src_stmt.ins_addr,
                                 )
                                 break_node = Block(last_src_stmt.ins_addr, None, statements=[break_stmt])

angr/analyses/reaching_definitions/rd_state.py CHANGED Viewed

@@ -93,6 +93,7 @@ class ReachingDefinitionsState:
         all_definitions: set[Definition] | None = None,
         initializer: Optional["RDAStateInitializer"] = None,
         element_limit: int = 5,
+        merge_into_tops: bool = True,
     ):
         # handy short-hands
         self.codeloc = codeloc
@@ -130,6 +131,7 @@ class ReachingDefinitionsState:
                 track_tmps=self._track_tmps,
                 canonical_size=canonical_size,
                 element_limit=element_limit,
+                merge_into_tops=merge_into_tops,
             )
             if self.analysis is not None:
                 self.live_definitions.project = self.analysis.project

angr/analyses/reaching_definitions/reaching_definitions.py CHANGED Viewed

@@ -76,6 +76,7 @@ class ReachingDefinitionsAnalysis(
         track_liveness: bool = True,
         func_addr: int | None = None,
         element_limit: int = 5,
+        merge_into_tops: bool = True,
     ):
         """
         :param subject:                         The subject of the analysis: a function, or a single basic block
@@ -110,6 +111,10 @@ class ReachingDefinitionsAnalysis(
         :param track_liveness:                  Whether to track liveness information. This can consume
                                                 sizeable amounts of RAM on large functions. (e.g. ~15GB for a function
                                                 with 4k nodes)
+        :param merge_into_tops:                 Merge known values into TOP if TOP is present.
+                                                If True: {TOP} V {0xabc} = {TOP}
+                                                If False: {TOP} V {0xabc} = {TOP, 0xabc}
         """
@@ -134,6 +139,7 @@ class ReachingDefinitionsAnalysis(
         self._use_callee_saved_regs_at_return = use_callee_saved_regs_at_return
         self._func_addr = func_addr
         self._element_limit = element_limit
+        self._merge_into_tops = merge_into_tops
         if dep_graph is None or dep_graph is False:
             self._dep_graph = None
@@ -473,6 +479,7 @@ class ReachingDefinitionsAnalysis(
                 canonical_size=self._canonical_size,
                 initializer=self._state_initializer,
                 element_limit=self._element_limit,
+                merge_into_tops=self._merge_into_tops,
             )
     # pylint: disable=no-self-use,arguments-differ