PyPI - angr - Versions diffs - 9.2.114__py3-none-macosx_11_0_arm64.whl → 9.2.116__py3-none-macosx_11_0_arm64.whl - Mend

angr 9.2.114__py3-none-macosx_11_0_arm64.whl → 9.2.116__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of angr might be problematic. Click here for more details.

Files changed (50) hide show

angr/analyses/decompiler/structuring/sailr.py ADDED Viewed

@@ -0,0 +1,111 @@
+from typing import Any
+import networkx
+from ..utils import structured_node_is_simple_return
+from ....utils.graph import PostDominators, TemporaryNode
+from .phoenix import PhoenixStructurer
+class SAILRStructurer(PhoenixStructurer):
+    """
+    The SAILR structuring algorithm is the phoenix-based algorithm from the USENIX 2024 paper SAILR.
+    The entirety of the algorithm is implemented across this class and various optimization passes in the decompiler.
+    To find each optimization class, simply search for optimizations which reference this class.NAME.
+    At a high level, SAILR does three things differently from the traditional Phoenix schema-based algorithm:
+    1. It recursively structures the graph, rather than doing it in a single pass. This allows decisions to be made
+        based on the current state of what the decompilation would look like.
+    2. It performs deoptimizations targeting specific optimizations that introduce gotos and mis-structured code.
+        It can only do this because of the recursive nature of the algorithm.
+    3. It uses a more advanced heuristic for virtualizing edges, which is implemented in this class.
+    Additionally, some changes in Phoenix are only activated when SAILR is used.
+    """
+    NAME = "sailr"
+    def __init__(self, region, improve_phoenix=True, **kwargs):
+        """
+        :param region:          The region to structure; passed positionally to PhoenixStructurer.
+        :param improve_phoenix: Forwarded to PhoenixStructurer as its ``improve_algorithm`` keyword argument.
+        :param kwargs:          All remaining keyword arguments, forwarded unchanged to PhoenixStructurer.
+        """
+        super().__init__(
+            region,
+            improve_algorithm=improve_phoenix,
+            **kwargs,
+        )
+    def _order_virtualizable_edges(self, graph: networkx.DiGraph, edges: list, node_seq: dict[Any, int]) -> list:
+        """
+        Pick the "best" edges to virtualize among the candidates.
+        The criteria for "best" is defined by a variety of heuristics (H1, H2, H3) described below. Each heuristic
+        narrows down the candidates left by the previous one; as soon as exactly one edge remains it is returned.
+        If the candidates are still tied after all heuristics, or the graph has no node without predecessors (in
+        which case the heuristics are skipped), the remaining candidates are ordered by the parent implementation.
+        :param graph:    The graph that the candidate edges belong to.
+        :param edges:    The candidate edges, as (source, destination) pairs.
+        :param node_seq: Passed through unchanged to the parent implementation.
+        :return:         ``edges`` itself if it has at most one element, a single-element list if a heuristic
+                         selects a unique edge, or otherwise the parent's ordering of the remaining candidates.
+        """
+        if len(edges) <= 1:
+            return edges
+        # TODO: the graph we have here is not an accurate graph and can have no "entry node". We need a better graph.
+        entry_node = next((node for node in graph.nodes if graph.in_degree(node) == 0), None)
+        best_edges = edges
+        if entry_node is not None:
+            # the first few heuristics are based on the post-dominator count of the edge
+            # so we collect them for each candidate edge
+            edge_postdom_count = {}
+            edge_sibling_count = {}
+            for edge in edges:
+                _, dst = edge
+                graph_copy = networkx.DiGraph(graph)
+                graph_copy.remove_edge(*edge)
+                sibling_cnt = graph_copy.in_degree(dst)
+                if sibling_cnt == 0:
+                    # the candidate is the only edge entering dst; it takes no part in H1/H2
+                    continue
+                edge_sibling_count[edge] = sibling_cnt
+                post_dom_graph = PostDominators(graph_copy, entry_node).post_dom
+                post_doms = {
+                    (postdom_node, dominatee)
+                    for postdom_node, dominatee in post_dom_graph.edges()
+                    if not isinstance(postdom_node, TemporaryNode) and not isinstance(dominatee, TemporaryNode)
+                }
+                edge_postdom_count[edge] = len(post_doms)
+            # NOTE: the heuristics below must only run once every candidate has been measured. Evaluating them
+            # inside the loop above would always select the first measured edge without comparing it to the rest.
+            # H1: the edge that has the least amount of sibling edges should be virtualized first
+            # this is believed to reduce the amount of virtualization needed in future rounds and increase
+            # the edges that enter a single outer-scope if-stmt
+            if edge_sibling_count:
+                min_sibling_count = min(edge_sibling_count.values())
+                best_edges = [edge for edge, cnt in edge_sibling_count.items() if cnt == min_sibling_count]
+                if len(best_edges) == 1:
+                    return best_edges
+                # create the next heuristic based on the best edges from the previous heuristic
+                filtered_edge_postdom_count = {
+                    edge: cnt for edge, cnt in edge_postdom_count.items() if edge in best_edges
+                }
+                if filtered_edge_postdom_count:
+                    edge_postdom_count = filtered_edge_postdom_count
+            # H2: the edge, when removed, that causes the most post-dominators of the graph should be virtualized
+            # first. this is believed to make the code more linear looking by reducing the amount of scopes.
+            # informally, we believe post-dominators to be an inverse indicator of the number of scopes present
+            if edge_postdom_count:
+                max_postdom_count = max(edge_postdom_count.values())
+                best_edges = [edge for edge, cnt in edge_postdom_count.items() if cnt == max_postdom_count]
+                if len(best_edges) == 1:
+                    return best_edges
+            # H3: the edge that goes directly to a return statement should be virtualized first
+            # this is believed to be good because it can be corrected in later optimization by duplicating
+            # the return
+            candidate_edges = best_edges
+            best_edges = [
+                (src, dst)
+                for src, dst in candidate_edges
+                if graph.has_node(dst) and structured_node_is_simple_return(dst, graph)
+            ]
+            if len(best_edges) == 1:
+                return best_edges
+            if not best_edges:
+                # no candidate targets a simple return; keep the tie from the previous heuristics
+                best_edges = candidate_edges
+        # if we have another tie, or we never used improved heuristics, then we do the default ordering.
+        return super()._order_virtualizable_edges(graph, best_edges, node_seq)

angr/analyses/decompiler/structuring/structurer_base.py CHANGED Viewed

@@ -1,6 +1,5 @@
 # pylint:disable=unused-argument
 from typing import Optional, Any, TYPE_CHECKING
-from collections import OrderedDict as ODict
 from collections import defaultdict, OrderedDict
 import logging
@@ -53,7 +52,6 @@ class StructurerBase(Analysis):
         func: Optional["Function"] = None,
         case_entry_to_switch_head: dict[int, int] | None = None,
         parent_region=None,
-        improve_structurer=True,
         **kwargs,
     ):
         self._region: "GraphRegion" = region
@@ -61,7 +59,6 @@ class StructurerBase(Analysis):
         self.function = func
         self._case_entry_to_switch_head = case_entry_to_switch_head
         self._parent_region = parent_region
-        self._improve_structurer = improve_structurer
         self.cond_proc = (
             condition_processor if condition_processor is not None else ConditionProcessor(self.project.arch)
@@ -741,8 +738,8 @@ class StructurerBase(Analysis):
     #
     def _reorganize_switch_cases(
-        self, cases: ODict[int | tuple[int, ...], SequenceNode]
-    ) -> ODict[int | tuple[int, ...], SequenceNode]:
+        self, cases: OrderedDict[int | tuple[int, ...], SequenceNode]
+    ) -> OrderedDict[int | tuple[int, ...], SequenceNode]:
         new_cases = OrderedDict()
         caseid2gotoaddrs = {}

angr/analyses/decompiler/structuring/structurer_nodes.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # pylint:disable=missing-class-docstring
 from typing import Any
-from collections import OrderedDict as ODict
+from collections import OrderedDict
 import claripy
 import ailment
@@ -358,9 +358,9 @@ class SwitchCaseNode(BaseNode):
         "addr",
     )
-    def __init__(self, switch_expr, cases: ODict[int | tuple[int, ...], SequenceNode], default_node, addr=None):
+    def __init__(self, switch_expr, cases: OrderedDict[int | tuple[int, ...], SequenceNode], default_node, addr=None):
         self.switch_expr = switch_expr
-        self.cases: ODict[int | tuple[int, ...], SequenceNode] = cases
+        self.cases: OrderedDict[int | tuple[int, ...], SequenceNode] = cases
         self.default_node = default_node
         self.addr = addr

angr/analyses/reaching_definitions/dep_graph.py CHANGED Viewed

@@ -213,6 +213,68 @@ class DepGraph:
             self.graph.add_edge(memory_location_definition, definition)
+    @overload
+    def find_definitions(
+        self,
+        *,
+        kind: type[A],
+        **kwargs: Any,
+    ) -> list[Definition[A]]: ...
+    @overload
+    def find_definitions(
+        self,
+        *,
+        kind: Literal[AtomKind.REGISTER] = AtomKind.REGISTER,
+        **kwargs: Any,
+    ) -> list[Definition[Register]]: ...
+    @overload
+    def find_definitions(
+        self,
+        *,
+        kind: Literal[AtomKind.MEMORY] = AtomKind.MEMORY,
+        **kwargs: Any,
+    ) -> list[Definition[MemoryLocation]]: ...
+    @overload
+    def find_definitions(
+        self,
+        *,
+        kind: Literal[AtomKind.TMP] = AtomKind.TMP,
+        **kwargs: Any,
+    ) -> list[Definition[Tmp]]: ...
+    @overload
+    def find_definitions(
+        self,
+        *,
+        kind: Literal[AtomKind.CONSTANT] = AtomKind.CONSTANT,
+        **kwargs: Any,
+    ) -> list[Definition[ConstantSrc]]: ...
+    @overload
+    def find_definitions(
+        self,
+        *,
+        kind: Literal[AtomKind.GUARD] = AtomKind.GUARD,
+        **kwargs: Any,
+    ) -> list[Definition[GuardUse]]: ...
+    @overload
+    def find_definitions(
+        self,
+        *,
+        reg_name: int | str = ...,
+        **kwargs: Any,
+    ) -> list[Definition[Register]]: ...
+    @overload
+    def find_definitions(self, *, stack_offset: int = ..., **kwargs: Any) -> list[Definition[MemoryLocation]]: ...
+    @overload
+    def find_definitions(self, *, const_val: int = ..., **kwargs: Any) -> list[Definition[ConstantSrc]]: ...
     def find_definitions(self, **kwargs) -> list[Definition]:
         """
         Filter the definitions present in the graph based on various criteria.
@@ -299,11 +361,6 @@ class DepGraph:
         self, starts: Definition[Atom] | Iterable[Definition[Atom]], *, const_val: int = ..., **kwargs: Any
     ) -> list[Definition[ConstantSrc]]: ...
-    @overload
-    def find_all_predecessors(
-        self, starts: Definition[Atom] | Iterable[Definition[Atom]], **kwargs: Any
-    ) -> list[Definition[Atom]]: ...
     def find_all_predecessors(self, starts, **kwargs):
         """
         Filter the ancestors of the given start node or nodes that match various criteria.

angr/analyses/reaching_definitions/function_handler.py CHANGED Viewed

@@ -254,15 +254,25 @@ class FunctionCallDataUnwrapped(FunctionCallData):
         return inner
+def _mk_wrapper(func, iself):
+    """
+    Build a callable that invokes ``func`` with ``iself`` prepended to whatever arguments it is called with.
+    :param func:  The callable to wrap.
+    :param iself: The object passed as the first positional argument on every call.
+    :return:      A callable accepting arbitrary positional and keyword arguments.
+    """
+    def _bound(*args, **kwargs):
+        return func(iself, *args, **kwargs)
+    return _bound
 # pylint: disable=unused-argument, no-self-use
 class FunctionHandler:
     """
     A mechanism for summarizing a function call's effect on a program for ReachingDefinitionsAnalysis.
     """
-    def __init__(self, interfunction_level: int = 0):
+    def __init__(self, interfunction_level: int = 0, extra_impls: Iterable["FunctionHandler"] | None = None):
         self.interfunction_level: int = interfunction_level
+        if extra_impls is not None:
+            for extra_handler in extra_impls:
+                for name, func in vars(extra_handler).items():
+                    if name.startswith("handle_impl_"):
+                        setattr(self, name, _mk_wrapper(func, self))
     def hook(self, analysis: "ReachingDefinitionsAnalysis") -> "FunctionHandler":
         """
         Attach this instance of the function handler to an instance of RDA.

angr/analyses/reaching_definitions/function_handler_library/__init__.py ADDED Viewed

@@ -0,0 +1,11 @@
+from .stdlib import LibcStdlibHandlers, EnvironAtom, SystemAtom, ExecveAtom
+from .stdio import LibcStdioHandlers, StdoutAtom, StdinAtom
+from .unistd import LibcUnistdHandlers
+from .string import LibcStringHandlers
+class LibcHandlers(LibcStdlibHandlers, LibcStdioHandlers, LibcUnistdHandlers, LibcStringHandlers):
+    """
+    Aggregate handler class that inherits from the stdlib, stdio, unistd and string handler classes of this
+    package; it defines nothing of its own.
+    """
+    pass
+__all__ = ["EnvironAtom", "SystemAtom", "ExecveAtom", "StdoutAtom", "StdinAtom", "LibcHandlers"]

angr/analyses/reaching_definitions/function_handler_library/stdio.py ADDED Viewed

@@ -0,0 +1,262 @@
+import re
+import random
+import logging
+from collections.abc import Iterable
+import archinfo
+import claripy
+from angr.analyses.reaching_definitions.function_handler import FunctionCallDataUnwrapped, FunctionHandler
+from angr.analyses.reaching_definitions.rd_state import ReachingDefinitionsState
+from angr.knowledge_plugins.key_definitions.atoms import Atom
+from angr.knowledge_plugins.key_definitions.live_definitions import DerefSize
+from angr.sim_type import SimType, SimTypeBottom, SimTypeChar, SimTypeFunction, SimTypeInt, SimTypePointer
+from angr.storage.memory_mixins.paged_memory.pages.multi_values import MultiValues
+# pylint: disable=no-self-use,missing-class-docstring,unused-argument
+_l = logging.getLogger(__name__)
+class StdoutAtom(Atom):
+    """
+    Atom used by the stdio handlers as the destination of data written by an output function.
+    Its identity consists solely of a random nonce drawn at construction time.
+    """
+    def __init__(self, sink: str, size: int | None):
+        """
+        :param sink: Name of the function that wrote the data; only shown in ``repr``.
+        :param size: Size handed to the Atom base class; 1 is used when it is None.
+        """
+        self.nonce = random.randint(0, 999999999999)
+        self.sink = sink
+        atom_size = 1 if size is None else size
+        super().__init__(atom_size)
+    def _identity(self):
+        return (self.nonce,)
+    def __repr__(self):
+        return "<StdoutAtom {}>".format(self.sink)
+class StdinAtom(Atom):
+    """
+    Atom used by the stdio handlers as the source of data read by an input function.
+    Its identity consists solely of a random nonce drawn at construction time.
+    """
+    def __init__(self, source: str, size: int | None):
+        """
+        :param source: Name of the function that read the data; only shown in ``repr``.
+        :param size:   Size handed to the Atom base class; 1 is used when it is None.
+        """
+        self.nonce = random.randint(0, 999999999999)
+        self.source = source
+        atom_size = 1 if size is None else size
+        super().__init__(atom_size)
+    def _identity(self):
+        return (self.nonce,)
+    def __repr__(self):
+        return "<StdinAtom {}>".format(self.source)
+def parse_format_string(format_string: str) -> tuple[list[str | int], list[SimType], list[str]]:
+    """
+    Split a printf/scanf-style format string into literal text and conversion specifications.
+    An escaped percent sign (``%%``) is decoded into a single literal ``%`` and merged into the surrounding text.
+    :param format_string: The format string to parse.
+    :return: A tuple ``(pieces, types, specs)``. ``pieces`` lists, in order of appearance, non-empty literal text
+             (``str``) and conversion specifications (``int``, the index of the specification within ``types``
+             and ``specs``). ``types`` holds one SimType per specification: ``%s``, ``%d``, ``%u`` and ``%c``
+             are mapped to concrete types, every other specification to SimTypeBottom. ``specs`` holds the text
+             of each specification.
+    """
+    result_pieces: list[str | int] = []
+    result_types: list[SimType] = []
+    result_specs: list[str] = []
+    # literal text collected since the last conversion specification
+    pending_literal: list[str] = []
+    last_piece = 0
+    for argspec in re.finditer(r"\%([0 #+-]?[0-9*]*\.?\d*([hl]{0,2}|[jztL])?[diuoxXeEfgGaAcpsSn%])", format_string):
+        start, end = argspec.span()
+        fmt = format_string[start:end]
+        if fmt[-1] == "%":
+            if fmt == "%%":
+                # an escaped percent sign consumes no argument and stands for one literal "%"
+                pending_literal.append(format_string[last_piece:start])
+                pending_literal.append("%")
+                last_piece = end
+            # any other match ending in "%" is left untouched as part of the literal text
+            continue
+        pending_literal.append(format_string[last_piece:start])
+        literal = "".join(pending_literal)
+        pending_literal.clear()
+        if literal:
+            result_pieces.append(literal)
+        result_pieces.append(len(result_specs))
+        if fmt == "%s":
+            arg = SimTypePointer(SimTypeChar())
+        elif fmt == "%d":
+            arg = SimTypeInt(signed=True)
+        elif fmt == "%u":
+            arg = SimTypeInt(signed=False)
+        elif fmt == "%c":
+            arg = SimTypeChar(signed=True)
+        else:
+            arg = SimTypeBottom()
+        result_types.append(arg)
+        result_specs.append(fmt)
+        last_piece = end
+    pending_literal.append(format_string[last_piece:])
+    literal = "".join(pending_literal)
+    if literal:
+        result_pieces.append(literal)
+    return result_pieces, result_types, result_specs
+class LibcStdioHandlers(FunctionHandler):
+    """
+    Function handlers describing the data dependencies introduced by libc stdio functions.
+    Wherever the size of a buffer cannot be resolved to a concrete value, a small fallback size is used.
+    """
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_printf(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """printf(fmt, ...): the formatted output flows into a StdoutAtom."""
+        result, source_atoms = handle_printf(state, data, 0)
+        dst_atoms = StdoutAtom("printf", len(result) if result is not None else None)
+        data.depends(dst_atoms, source_atoms, value=result)
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_dprintf(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """dprintf(fd, fmt, ...): the formatted output flows into a StdoutAtom."""
+        result, source_atoms = handle_printf(state, data, 1)
+        dst_atoms = StdoutAtom("dprintf", len(result) if result is not None else None)
+        data.depends(dst_atoms, source_atoms, value=result)
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_fprintf(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """fprintf(stream, fmt, ...): the formatted output flows into a StdoutAtom."""
+        result, source_atoms = handle_printf(state, data, 1)
+        dst_atoms = StdoutAtom("fprintf", len(result) if result is not None else None)
+        data.depends(dst_atoms, source_atoms, value=result)
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_sprintf(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """sprintf(dst, fmt, ...): the formatted output flows into the buffer pointed to by the first argument."""
+        result, source_atoms = handle_printf(state, data, 1)
+        dst_atoms = state.deref(data.args_atoms[0], size=len(result) // 8 if result is not None else 1)
+        data.depends(dst_atoms, source_atoms, value=result)
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_snprintf(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """snprintf(dst, size, fmt, ...): like sprintf, with the destination size capped by the second argument."""
+        result, source_atoms = handle_printf(state, data, 2)
+        size = state.get_concrete_value(data.args_atoms[1]) or 2
+        if result is not None:
+            size = min(size, len(result) // 8)
+        dst_atoms = state.deref(data.args_atoms[0], size=size)
+        data.depends(dst_atoms, source_atoms, value=result)
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl___sprintf_chk(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """__sprintf_chk: like sprintf, with the format string as the fourth argument."""
+        result, source_atoms = handle_printf(state, data, 3)
+        dst_atoms = state.deref(data.args_atoms[0], size=len(result) // 8 if result is not None else 1)
+        data.depends(dst_atoms, source_atoms, value=result)
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl___snprintf_chk(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """__snprintf_chk: like snprintf, with the format string as the fifth argument."""
+        result, source_atoms = handle_printf(state, data, 4)
+        size = state.get_concrete_value(data.args_atoms[1]) or 2
+        dst_atoms = state.deref(data.args_atoms[0], size=size)
+        data.depends(dst_atoms, source_atoms, value=result)
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_scanf(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """scanf(fmt, ...): every recognized output argument depends on a fresh StdinAtom."""
+        handle_scanf(state, data, 0, {StdinAtom("scanf", None)})
+    handle_impl___isoc99_scanf = handle_impl_scanf
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_sscanf(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """sscanf(src, fmt, ...): every recognized output argument depends on the NUL-terminated source buffer."""
+        handle_scanf(state, data, 1, state.deref(data.args_atoms[0], DerefSize.NULL_TERMINATE))
+    handle_impl___isoc99_sscanf = handle_impl_sscanf
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_fgets(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """fgets(dst, size, stream): fills the buffer with symbolic input followed by a NUL byte; returns dst."""
+        size = state.get_concrete_value(data.args_atoms[1]) or 2
+        dst_atom = state.deref(data.args_atoms[0], size)
+        # size - 1 unconstrained input bytes followed by the terminating NUL byte
+        input_value = claripy.BVS("weh", (size - 1) * 8).concat(claripy.BVV(0, 8))
+        data.depends(dst_atom, StdinAtom("fgets", size), value=input_value)
+        data.depends(data.ret_atoms, data.args_atoms[0])
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_fgetc(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """fgetc/getc/getchar: the return value depends on a StdinAtom named after the called function."""
+        data.depends(data.ret_atoms, StdinAtom(data.function.name, 1))
+    handle_impl_getchar = handle_impl_getc = handle_impl_fgetc
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_fread(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """fread(ptr, size, nmemb, stream): the size * nmemb bytes at ptr depend on a StdinAtom."""
+        size = state.get_concrete_value(data.args_atoms[1]) or 1
+        # nmemb is the third argument; the second one is the element size
+        nmemb = state.get_concrete_value(data.args_atoms[2]) or 2
+        dst_atom = state.deref(data.args_atoms[0], size * nmemb)
+        data.depends(dst_atom, StdinAtom("fread", size * nmemb))
+    @FunctionCallDataUnwrapped.decorate
+    def handle_impl_fwrite(self, state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped):
+        """fwrite(ptr, size, nmemb, stream): a StdoutAtom depends on the size * nmemb bytes at ptr."""
+        size = state.get_concrete_value(data.args_atoms[1]) or 1
+        # nmemb is the third argument; the second one is the element size
+        nmemb = state.get_concrete_value(data.args_atoms[2]) or 2
+        src_atom = state.deref(data.args_atoms[0], size * nmemb)
+        data.depends(StdoutAtom("fwrite", size * nmemb), src_atom, value=state.get_values(src_atom))
+def handle_printf(
+    state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped, fmt_idx: int
+) -> tuple[MultiValues | None, Iterable[Atom]]:
+    """
+    Model the output of a printf-family call whose format string is the argument at position ``fmt_idx``.
+    The prototype of the call is extended by one argument per conversion specification found in the format
+    string. Values can only be rendered for ``%s``, ``%u``, ``%d`` and ``%c``; a warning is logged for every
+    other specification.
+    :param state:   The reaching-definitions state to read argument values from.
+    :param data:    The call being handled.
+    :param fmt_idx: Index of the format-string argument; variadic arguments start right after it.
+    :return:        ``(None, set())`` if the format string has no concrete value. Otherwise the rendered output,
+                    terminated by a NUL byte, and the atoms that the output was built from.
+    """
+    fmt_buf = state.deref(data.args_atoms[fmt_idx], DerefSize.NULL_TERMINATE)
+    format_bytes = state.get_concrete_value(fmt_buf, cast_to=bytes)
+    if format_bytes is None:
+        _l.info("Hmmm.... non-constant format string")
+        return None, set()
+    arg_pieces, arg_types, formats = parse_format_string(format_bytes.strip(b"\0").decode())
+    extended_proto = SimTypeFunction(data.prototype.args + tuple(arg_types), data.prototype.returnty)
+    data.reset_prototype(extended_proto, state)
+    first_vararg = fmt_idx + 1
+    output = MultiValues(claripy.BVV(b""))
+    used_atoms: set[Atom] = set()
+    for piece in arg_pieces:
+        if isinstance(piece, str):
+            # literal text is copied to the output verbatim
+            if output is not None:
+                output = output.concat(piece.encode())
+            continue
+        arg_atom = data.args_atoms[first_vararg + piece]
+        spec = formats[piece]
+        # unless overridden below, the argument itself is the source and nothing could be rendered
+        piece_atoms = arg_atom
+        rendered = None
+        if spec == "%s":
+            piece_atoms = state.deref(arg_atom, DerefSize.NULL_TERMINATE)
+            rendered = state.get_values(piece_atoms)
+            if rendered is not None:
+                # drops the last byte of the buffer (presumably the NUL terminator -- TODO confirm)
+                rendered = rendered.extract(0, len(rendered) // 8 - 1, archinfo.Endness.BE)
+        elif spec in ("%u", "%d"):
+            number = state.get_concrete_value(arg_atom)
+            if number is not None:
+                if spec == "%d" and number >= 2**31:
+                    # reinterpret the value as a signed 32-bit integer
+                    number -= 2**32
+                rendered = str(number).encode()
+        elif spec == "%c":
+            char_code = state.get_concrete_value(arg_atom)
+            if char_code is not None:
+                rendered = chr(char_code).encode()
+        else:
+            _l.warning("Unimplemented printf format string %s", spec)
+            piece_atoms = set()
+        if output is not None and rendered is not None:
+            output = output.concat(rendered)
+        used_atoms.update(piece_atoms)
+    if output is not None:
+        output = output.concat(b"\0")
+    return output, used_atoms
+def handle_scanf(
+    state: ReachingDefinitionsState, data: FunctionCallDataUnwrapped, fmt_idx: int, source_atoms: Iterable[Atom]
+):
+    """
+    Model a scanf-family call whose format string is the argument at position ``fmt_idx``.
+    The prototype of the call is extended by one argument per conversion specification found in the format
+    string, and the memory behind every ``%s``, ``%u``, ``%d`` and ``%c`` argument is made to depend on
+    ``source_atoms``. A warning is logged for every other specification. Nothing happens if the format string
+    has no concrete value.
+    :param state:        The reaching-definitions state to read argument values from.
+    :param data:         The call being handled.
+    :param fmt_idx:      Index of the format-string argument; variadic arguments start right after it.
+    :param source_atoms: The atoms that the scanned input originates from.
+    """
+    fmt_buf = state.deref(data.args_atoms[fmt_idx], DerefSize.NULL_TERMINATE)
+    format_bytes = state.get_concrete_value(fmt_buf, cast_to=bytes)
+    if format_bytes is None:
+        _l.info("Hmmm.... non-constant format string")
+        return
+    arg_pieces, arg_types, formats = parse_format_string(format_bytes.strip(b"\0").decode())
+    extended_proto = SimTypeFunction(data.prototype.args + tuple(arg_types), data.prototype.returnty)
+    data.reset_prototype(extended_proto, state)
+    first_vararg = fmt_idx + 1
+    for piece in arg_pieces:
+        if isinstance(piece, str):
+            # literal text writes nothing
+            continue
+        arg_atom = data.args_atoms[first_vararg + piece]
+        spec = formats[piece]
+        written_value = None
+        if spec == "%s":
+            # only one byte of the destination is modeled, holding a NUL byte
+            target = state.deref(arg_atom, 1)
+            written_value = b"\0"
+        elif spec in ("%u", "%d"):
+            target = state.deref(arg_atom, 4, state.arch.memory_endness)
+        elif spec == "%c":
+            target = state.deref(arg_atom, 1, state.arch.memory_endness)
+        else:
+            _l.warning("Unimplemented scanf format string %s", spec)
+            continue
+        data.depends(target, source_atoms, value=written_value)