phantomrt 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. atlas/__init__.py +3 -0
  2. atlas/agents/__init__.py +8 -0
  3. atlas/agents/command_space.py +227 -0
  4. atlas/analysis/__init__.py +3 -0
  5. atlas/analysis/binary_agent.py +488 -0
  6. atlas/analysis/binary_fuzz.py +389 -0
  7. atlas/analysis/frida_live.py +261 -0
  8. atlas/analysis/graph_annotator.py +147 -0
  9. atlas/analysis/spectrida_bridge.py +84 -0
  10. atlas/analysis/unicorn_harness.py +337 -0
  11. atlas/core/__init__.py +14 -0
  12. atlas/core/decoder.py +65 -0
  13. atlas/core/dynamics.py +217 -0
  14. atlas/core/encoder.py +120 -0
  15. atlas/core/surprise.py +145 -0
  16. atlas/core/world_model.py +334 -0
  17. atlas/environments/__init__.py +5 -0
  18. atlas/environments/base.py +51 -0
  19. atlas/environments/grid_world.py +219 -0
  20. atlas/environments/physics_2d.py +283 -0
  21. atlas/environments/vm_world.py +168 -0
  22. atlas/knowledge/__init__.py +3 -0
  23. atlas/knowledge/instruction_vocab.py +534 -0
  24. atlas/monitor/__init__.py +5 -0
  25. atlas/monitor/execution_monitor.py +518 -0
  26. atlas/optimization/__init__.py +6 -0
  27. atlas/optimization/speed.py +457 -0
  28. atlas/planning/__init__.py +4 -0
  29. atlas/planning/goal.py +100 -0
  30. atlas/planning/mcts.py +228 -0
  31. atlas/training/__init__.py +4 -0
  32. atlas/training/continual.py +392 -0
  33. atlas/training/growth.py +213 -0
  34. atlas/training/loop.py +306 -0
  35. atlas/training/losses.py +101 -0
  36. atlas/training/self_train.py +307 -0
  37. atlas/utils/__init__.py +4 -0
  38. atlas/utils/logging.py +33 -0
  39. atlas/utils/math_helpers.py +30 -0
  40. atlas/utils/viz.py +136 -0
  41. atlas/vm/__init__.py +4 -0
  42. atlas/vm/wsl_vm.py +249 -0
  43. phantomrt-0.1.0.dist-info/METADATA +75 -0
  44. phantomrt-0.1.0.dist-info/RECORD +48 -0
  45. phantomrt-0.1.0.dist-info/WHEEL +5 -0
  46. phantomrt-0.1.0.dist-info/entry_points.txt +3 -0
  47. phantomrt-0.1.0.dist-info/licenses/LICENSE +21 -0
  48. phantomrt-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,147 @@
1
+ """
2
+ Atlas -> spectrIDA graph: the annotation edge.
3
+
4
+ spectrIDA builds the static map (Function/Binary nodes in Neo4j). Atlas runs the
5
+ code and learns what actually happens. This writes Atlas's *runtime* findings back
6
+ onto the matching Function node as ``atlas_*`` properties, so anything reading the
7
+ graph (an LLM, the spectrIDA UI) instantly sees "this function is reachable and
8
+ crashes on long input".
9
+
10
+ Schema it targets (confirmed live):
11
+ (:Function {name, addr, binary, size, pseudocode, disasm, id})
12
+ (:Binary {tag, i64_path})
13
+
14
+ Design choices:
15
+ * Match by (binary, addr) primarily — addresses are stable; names get renamed.
16
+ * Only ever SET on EXISTING nodes (never MERGE) — Atlas annotates spectrIDA's
17
+ map, it does not invent functions. A miss returns 0 and is reported, not hidden.
18
+ * All properties are namespaced ``atlas_`` so they never collide with spectrIDA's,
19
+ and can be cleared wholesale.
20
+ * If the graph is unreachable, findings can still be dumped to JSON as a fallback.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import json
26
+ import time
27
+ from pathlib import Path
28
+ from typing import Optional
29
+
30
+
31
+ _PRIM = (str, int, float, bool)
32
+
33
+
34
+ def _coerce(v):
35
+ """Neo4j properties must be primitives or lists of primitives; JSON-encode
36
+ anything richer (dicts, mixed lists) so nothing is silently dropped."""
37
+ if isinstance(v, _PRIM) or v is None:
38
+ return v
39
+ if isinstance(v, (list, tuple)) and all(isinstance(x, _PRIM) for x in v):
40
+ return list(v)
41
+ return json.dumps(v, default=str)
42
+
43
+
44
class GraphAnnotator:
    """Write, read and clear prefixed runtime facts on existing Function nodes.

    Nodes are only ever matched (``MATCH`` + ``SET``), never created: a miss
    is logged and reported as ``0`` matched nodes. Every property written is
    namespaced with ``prefix`` so it can be read back or cleared as a group.
    """

    def __init__(self, uri: str = "bolt://localhost:7687", user: str = "neo4j",
                 password: Optional[str] = None, driver=None, log=print,
                 prefix: str = "atlas_"):
        """Use ``driver`` if given, otherwise open a Neo4j driver for ``uri``.

        Args:
            uri, user, password: connection settings, used only when
                ``driver`` is None.
            driver: an already-built driver object exposing ``session()`` and
                ``close()``; when supplied the ``neo4j`` package is not imported.
            log: callable taking one message string.
            prefix: property namespace: "atlas_" standalone, "dyn_" when
                driven from spectrIDA.
        """
        self.log = log
        self.prefix = prefix
        if driver is not None:
            self._driver = driver
        else:
            from neo4j import GraphDatabase
            self._driver = GraphDatabase.driver(uri, auth=(user, password))

    @classmethod
    def from_spectrida_config(cls, path: Optional[str] = None, log=print):
        """Build from ~/.spectrida/config.toml (``[graph]`` uri/user/password).

        Missing keys fall back to ``bolt://localhost:7687`` / ``neo4j`` /
        no password.
        """
        import tomllib
        p = Path(path) if path else Path.home() / ".spectrida" / "config.toml"
        # TOML is defined as UTF-8; parse the raw bytes rather than decoding
        # with the platform's locale encoding.
        with p.open("rb") as fh:
            cfg = tomllib.load(fh)
        graph = cfg.get("graph", {})
        return cls(uri=graph.get("uri", "bolt://localhost:7687"),
                   user=graph.get("user", "neo4j"),
                   password=graph.get("password"), log=log)

    def close(self):
        """Close the underlying driver."""
        self._driver.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.close()

    # ── matching helper ──────────────────────────────────────────────────────
    @staticmethod
    def _match(addr, name):
        """Return ``(MATCH clause, extra query params)`` for one function.

        ``addr`` wins over ``name`` when both are given.

        Raises:
            ValueError: if neither ``addr`` nor ``name`` is provided.
        """
        if addr is not None:
            return "MATCH (f:Function {binary:$binary, addr:$addr})", {"addr": addr}
        if name is not None:
            return "MATCH (f:Function {binary:$binary, name:$name})", {"name": name}
        raise ValueError("need addr or name to identify the function")

    @staticmethod
    def _crash_kind(key: str) -> str:
        """Extract the crash kind from a crash-input key.

        Handles both ``kind:detail`` keys and the ``kind@0xaddr`` keys that
        EmulatedFuzzEnv produces.
        NOTE(review): the ``kind:detail`` shape is inferred from the original
        ``split(":")`` -- confirm against BinaryFuzzEnv.
        """
        return key.split(":", 1)[0].split("@", 1)[0]

    # ── write ────────────────────────────────────────────────────────────────
    def annotate(self, binary: str, facts: dict, *, addr=None, name=None) -> int:
        """Stamp prefixed runtime facts onto a Function node. Returns #matched.

        Keys that do not already start with the prefix get it prepended, values
        go through ``_coerce``, and a ``<prefix>updated`` local-time timestamp
        is added. A miss (0 matched) is logged, not raised.
        """
        match, key = self._match(addr, name)
        p = self.prefix
        props = {(k if k.startswith(p) else f"{p}{k}"): _coerce(v)
                 for k, v in facts.items()}
        props[f"{p}updated"] = time.strftime("%Y-%m-%dT%H:%M:%S")
        with self._driver.session() as s:
            n = s.run(f"{match} SET f += $props RETURN count(f) AS n",
                      binary=binary, props=props, **key).single()["n"]
        if n == 0:
            self.log(f"[annotate] no match for binary={binary} {key} — nothing written")
        else:
            self.log(f"[annotate] wrote {len(props)} {p}* props to {n} node(s) "
                     f"(binary={binary} {key})")
        return n

    def annotate_fuzz_run(self, binary: str, fuzz_env, *, addr=None, name=None) -> int:
        """Convenience: turn a fuzz env's ``summary()`` into function facts.

        ``fuzz_env.summary()`` must provide ``unique_crashes``,
        ``crash_inputs`` (a dict keyed by crash signature) and
        ``functions_covered``.
        """
        s = fuzz_env.summary()
        crashed = s["unique_crashes"] > 0
        kinds = sorted({self._crash_kind(k) for k in s["crash_inputs"]})
        facts = {
            "reachable": True,
            "crashes": crashed,
            "crash_kinds": kinds,
            "functions_covered": s["functions_covered"],
            "unique_crashes": s["unique_crashes"],
            "sample_crash_input": next(iter(s["crash_inputs"].values()), None),
            "verdict": ("crashes on fuzzed input" if crashed
                        else "exercised, no crash found"),
        }
        return self.annotate(binary, facts, addr=addr, name=name)

    # ── read / clear ─────────────────────────────────────────────────────────
    def read(self, binary: str, *, addr=None, name=None) -> dict:
        """Return the node's prefixed properties, or ``{}`` when nothing matches."""
        match, key = self._match(addr, name)
        with self._driver.session() as s:
            rec = s.run(f"{match} RETURN properties(f) AS p",
                        binary=binary, **key).single()
        if not rec:
            return {}
        return {k: v for k, v in rec["p"].items() if k.startswith(self.prefix)}

    def clear(self, binary: str, *, addr=None, name=None) -> int:
        """Remove all prefixed props (leaves spectrIDA's own data untouched).

        Returns the number of nodes updated, or 0 when there is no match or
        nothing to remove.
        """
        match, key = self._match(addr, name)
        with self._driver.session() as s:
            rec = s.run(f"{match} RETURN [k IN keys(f) WHERE k STARTS WITH $p] "
                        f"AS ks", binary=binary, p=self.prefix, **key).single()
            if not rec or not rec["ks"]:
                return 0
            # Setting a property to null removes it. Passing the names as a
            # parameter map keeps property names read from the graph out of
            # the query text, so odd names (e.g. containing a backtick)
            # cannot break or alter the statement.
            nulls = {k: None for k in rec["ks"]}
            n = s.run(f"{match} SET f += $nulls RETURN count(f) AS n",
                      binary=binary, nulls=nulls, **key).single()["n"]
        return n

    # ── fallback when the graph is down ──────────────────────────────────────
    @staticmethod
    def export_json(path: str, findings: dict) -> None:
        """Dump ``findings`` to ``path`` as indented JSON (non-JSON values via ``str``)."""
        Path(path).write_text(json.dumps(findings, indent=2, default=str))
@@ -0,0 +1,84 @@
1
+ """
2
+ The connector: spectrIDA's FormatHandler -> Atlas's emulation harness.
3
+
4
+ Reuses spectrIDA's own format plugins (`spectrida.analysis.formats`) to load ANY
5
+ supported binary (PE/Windows, NSO/Switch, ELF, .so), decompress if needed, and
6
+ hand back the real section bytes + arch. Atlas then maps the whole image (so
7
+ internal calls resolve = "chain emulation") and fuzzes a chosen function.
8
+
9
+ This is the honest edge: real bytes, real arch, from the same plugin spectrIDA
10
+ already trusts — no re-implementing loaders, no guessing addresses.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import sys
16
+ import tempfile
17
+ from pathlib import Path
18
+
19
+ from .unicorn_harness import EmulationHarness, EmuResult
20
+
21
+ DEFAULT_SPECTRIDA = r"C:\Users\Administrator\Desktop\scrape\mini-mythos\spectrIDA"
22
+
23
+ _PE_MACHINE = {0x8664: "x86_64", 0xAA64: "arm64", 0x14C: "x86", 0x1C0: "arm"}
24
+ _ELF_MACHINE = {0x3E: "x86_64", 0xB7: "arm64", 0x28: "arm", 0x03: "x86"}
25
+
26
+
27
+ def _detect_arch(path: str) -> str:
28
+ """Read the true CPU from the PE/ELF header (the handler may hand back None)."""
29
+ import struct
30
+ with open(path, "rb") as f:
31
+ head = f.read(0x40)
32
+ if head[:2] == b"MZ": # PE
33
+ e_lfanew = struct.unpack_from("<I", head, 0x3C)[0]
34
+ f.seek(e_lfanew + 4) # skip "PE\0\0" signature
35
+ machine = struct.unpack("<H", f.read(2))[0]
36
+ return _PE_MACHINE.get(machine, "x86_64")
37
+ if head[:4] == b"\x7fELF": # ELF
38
+ return _ELF_MACHINE.get(struct.unpack_from("<H", head, 18)[0], "x86_64")
39
+ return "x86_64"
40
+
41
+
42
class EmulatedBinary:
    """A binary loaded via spectrIDA's FormatHandler, ready to emulate functions.

    The handler prepares the image into a private temporary directory. Call
    ``close()`` (or use the instance as a context manager) to remove that
    directory when done.
    """

    def __init__(self, path: str, spectrida_path: str = DEFAULT_SPECTRIDA):
        """Detect the format of ``path`` and prepare its image.

        Args:
            path: binary to load.
            spectrida_path: directory put at the front of ``sys.path`` (if not
                already present) so the ``spectrida`` package can be imported;
                pass a falsy value to rely on the existing import path.
        """
        if spectrida_path and spectrida_path not in sys.path:
            sys.path.insert(0, spectrida_path)
        from spectrida.analysis.formats.registry import detect

        self.path = path
        self.handler = detect(path)
        # Keep the scratch directory so it can be removed later; mkdtemp()
        # never cleans up after itself.
        self._workdir = tempfile.mkdtemp()
        try:
            self.image = self.handler.prepare(path, self._workdir)
        except Exception:
            self.close()
            raise
        # The handler's arch hint can be None (PE/ELF: it lets IDA decide). Never
        # guess — read the real machine type from the header, else emulation runs
        # the wrong CPU (e.g. x86-64 droid.exe mis-run as arm64 = garbage).
        self.arch = self.image.arch or _detect_arch(path)
        self._regions = None

    def close(self) -> None:
        """Remove the temporary working directory (safe to call repeatedly).

        NOTE(review): the handler may read from that directory lazily — call
        ``regions()`` before closing if the section bytes are still needed.
        """
        import shutil

        workdir, self._workdir = getattr(self, "_workdir", None), None
        if workdir:
            shutil.rmtree(workdir, ignore_errors=True)

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.close()

    @property
    def format(self) -> str:
        """Name of the format handler that recognised the file."""
        return self.handler.name

    def regions(self):
        """Whole-image sections as (va, bytes), cached — real bytes so internal
        calls land on real code; .bss/short sections zero-padded to full size."""
        if self._regions is None:
            regs = []
            for s in self.image.sections:
                va = self.image.image_base + s.va
                data = self.handler.read_bytes(self.image, va, va + s.vsize)
                if len(data) < s.vsize:
                    data = data + b"\x00" * (s.vsize - len(data))
                regs.append((va, data))
            self._regions = regs
        return self._regions

    def emulate(self, entry_addr: int, input_bytes: bytes = b"",
                max_insns: int = 20000) -> EmuResult:
        """Emulate the function at ``entry_addr`` with the whole image mapped and
        out-of-chain calls / syscalls stubbed. arg0 points at the fuzzable input
        buffer (so a C++ ``this`` reads fuzzed object memory)."""
        h = EmulationHarness(self.arch, max_insns=max_insns)
        return h.run(regions=self.regions(), entry=entry_addr,
                     input_bytes=input_bytes, stub_calls=True)
@@ -0,0 +1,337 @@
1
+ """
2
+ The hard edge: exercise ONE function by CPU emulation — no OS, any architecture.
3
+
4
+ spectrIDA's targets are ARM64 (Switch NSO) and Android .so — they can't run in
5
+ the WSL VM. So instead of *running the program*, we emulate the function's raw
6
+ machine code with Unicorn: map its bytes, set up a stack + an input buffer, point
7
+ the first argument at the input, run for a bounded number of instructions, and
8
+ catch the faults. A function that dereferences an input-derived bad pointer, walks
9
+ off a buffer, etc. shows up as an unmapped-memory fault — a real crash signal,
10
+ found without a Switch in sight.
11
+
12
+ Arch-agnostic: pass ``arch`` ("arm64", "x86_64" or "arm"; 32-bit "x86" is not handled by ``_arch_config`` yet) — which is exactly
13
+ what spectrIDA's FormatHandler hands us via ``PreparedImage.arch``. So the same
14
+ harness fuzzes a Windows PE function (x86-64) and an Odyssey NSO function (arm64).
15
+
16
+ Honest limits: this emulates a function *in a vacuum*. Pure-computation functions
17
+ (parsers, crypto, validation, string/math) work great. Functions that call into
18
+ the OS / other libs will fetch-fault at the call unless those are stubbed — that's
19
+ standard harness work (or graduate to Qiling for full OS emulation).
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from dataclasses import dataclass
25
+
26
+ from unicorn import (
27
+ Uc, UcError,
28
+ UC_ARCH_ARM64, UC_ARCH_ARM, UC_ARCH_X86,
29
+ UC_MODE_ARM, UC_MODE_32, UC_MODE_64, UC_MODE_LITTLE_ENDIAN,
30
+ UC_HOOK_BLOCK, UC_HOOK_MEM_UNMAPPED, UC_HOOK_INSN_INVALID, UC_HOOK_INTR,
31
+ UC_PROT_ALL,
32
+ UC_MEM_READ_UNMAPPED, UC_MEM_WRITE_UNMAPPED, UC_MEM_FETCH_UNMAPPED,
33
+ )
34
+
35
+ _PAGE = 0x1000
36
+ # a single "return" instruction per arch — used to stub out-of-chain calls
37
+ _RET = {"arm64": b"\xc0\x03\x5f\xd6", "x86_64": b"\xc3", "arm": b"\x1e\xff\x2f\xe1"}
38
+
39
+
40
+ def _align_down(x): return x & ~(_PAGE - 1)
41
+ def _align_up(x): return (x + _PAGE - 1) & ~(_PAGE - 1)
42
+
43
# memory layout (well-separated so a wild pointer lands in unmapped space = fault)
# Each *_BASE/*_SIZE pair is mapped by EmulationHarness.run; CODE_* is only
# mapped in single-function mode.
CODE_BASE = 0x0100_0000
CODE_SIZE = 0x0010_0000
STACK_BASE = 0x0200_0000
STACK_SIZE = 0x0010_0000
INPUT_BASE = 0x0300_0000
INPUT_SIZE = 0x0010_0000
RET_MAGIC = 0x0AAA_0000 # unmapped sentinel: function "returns" here → clean stop

# Unicorn unmapped-access type -> the crash_kind string reported in EmuResult.
_FAULT = {
    UC_MEM_READ_UNMAPPED: "read_unmapped",
    UC_MEM_WRITE_UNMAPPED: "write_unmapped",
    UC_MEM_FETCH_UNMAPPED: "fetch_unmapped",
}
57
+
58
+
59
@dataclass
class EmuResult:
    """Outcome of one emulated run of a function."""

    crashed: bool
    crash_kind: str        # read_unmapped / write_unmapped / fetch_unmapped / invalid_insn / ""
    fault_addr: int
    cov_ids: frozenset     # basic-block addresses hit (as hex strings)
    blocks: int
    returned: bool         # reached the return sentinel cleanly
    timed_out: bool        # hit instruction budget without returning
    new_coverage: int = 0  # blocks not seen before — the env fills this in
    stubbed_calls: int = 0 # out-of-chain calls stubbed (return 0) instead of faulting
    syscalls: int = 0      # svc/syscall instructions skipped

    @property
    def status(self) -> str:
        """Classify the run: ``needs_state`` / ``crash`` / ``clean`` / ``inconclusive``."""
        if self.crashed:
            # a fault on a near-null address is almost always an uninitialized
            # global/`this` pointer (missing live state), NOT a real bug — say so
            # honestly instead of crying crash. Only unmapped-memory faults carry
            # a meaningful access address: for other kinds (e.g. "cpu_fault")
            # fault_addr is just the default 0 and must not be read as a null deref.
            near_null = self.fault_addr < 0x10000
            memory_fault = self.crash_kind.endswith("_unmapped")
            return "needs_state" if (near_null and memory_fault) else "crash"
        if self.returned:
            return "clean"
        return "inconclusive"   # ran out of budget / couldn't complete faithfully

    @property
    def note(self) -> str:
        """One-line human-readable explanation matching ``status``."""
        return {
            "needs_state": f"null-ish deref @+{hex(self.fault_addr)} — uninitialized "
                           f"global/this; needs live engine state (LLM reasons statically)",
            "crash": f"{self.crash_kind} @ {hex(self.fault_addr)} on a wild address — "
                     f"candidate bug (verify: is that field input-controlled?)",
            "clean": f"returned cleanly; {self.stubbed_calls} calls stubbed",
            "inconclusive": "hit instruction budget without returning",
        }[self.status]

    # duck-typed shims so the fuzz/annotate reuse works unchanged
    @property
    def exit_code(self) -> int:
        """Process-style code: 139 when crashed, 124 when timed out, else 0."""
        return 139 if self.crashed else (124 if self.timed_out else 0)

    @property
    def stdout(self) -> str:
        """Always empty — an emulated function produces no captured output."""
        return ""
102
+
103
+
104
class EmulationHarness:
    """Emulates a single function's machine code with fuzzed inputs."""

    def __init__(self, arch: str = "arm64", max_insns: int = 20000):
        """
        Args:
            arch: architecture name accepted by ``_arch_config``.
            max_insns: instruction budget passed to Unicorn for each run.

        Raises:
            ValueError: if ``arch`` is not supported.
        """
        self.arch = arch
        self.max_insns = max_insns
        self._cfg = self._arch_config(arch)
        self.arch_key = self._cfg["key"]          # normalized: arm64 / x86_64 / arm

    @staticmethod
    def _arch_config(arch: str) -> dict:
        """Map an arch name to Unicorn settings and the registers the harness uses.

        The returned dict holds: ``key`` (normalized name), ``uc`` (arch, mode),
        ``sp``/``pc``/``lr`` register ids (``lr`` is None on x86_64), ``args``
        (argument registers in order), ``ret`` (return-value register) and
        ``push_ret`` (True when the return address lives on the stack).

        NOTE(review): the x86_64 argument registers are in System V order;
        Windows x64 code takes arg0 in RCX — confirm for PE targets.
        NOTE(review): the "thumb" alias is configured with UC_MODE_ARM — confirm.

        Raises:
            ValueError: for any other architecture name.
        """
        a = arch.lower().replace("-", "").replace("_", "")
        if a in ("arm64", "aarch64"):
            from unicorn import arm64_const as C
            return {"key": "arm64", "uc": (UC_ARCH_ARM64, UC_MODE_ARM), "sp": C.UC_ARM64_REG_SP,
                    "pc": C.UC_ARM64_REG_PC, "lr": C.UC_ARM64_REG_LR,
                    "args": [C.UC_ARM64_REG_X0, C.UC_ARM64_REG_X1, C.UC_ARM64_REG_X2,
                             C.UC_ARM64_REG_X3],
                    "ret": C.UC_ARM64_REG_X0, "push_ret": False}
        if a in ("x8664", "x64", "amd64"):
            from unicorn import x86_const as C
            return {"key": "x86_64", "uc": (UC_ARCH_X86, UC_MODE_64), "sp": C.UC_X86_REG_RSP,
                    "pc": C.UC_X86_REG_RIP, "lr": None,
                    "args": [C.UC_X86_REG_RDI, C.UC_X86_REG_RSI, C.UC_X86_REG_RDX,
                             C.UC_X86_REG_RCX],
                    "ret": C.UC_X86_REG_RAX, "push_ret": True}
        if a in ("arm", "arm32", "thumb"):
            from unicorn import arm_const as C
            return {"key": "arm", "uc": (UC_ARCH_ARM, UC_MODE_ARM), "sp": C.UC_ARM_REG_SP,
                    "pc": C.UC_ARM_REG_PC, "lr": C.UC_ARM_REG_LR,
                    "args": [C.UC_ARM_REG_R0, C.UC_ARM_REG_R1, C.UC_ARM_REG_R2,
                             C.UC_ARM_REG_R3],
                    "ret": C.UC_ARM_REG_R0, "push_ret": False}
        raise ValueError(f"unsupported arch: {arch}")

    def run(self, code: bytes | None = None, input_bytes: bytes = b"", *,
            regions=None, entry: int | None = None, stub_calls: bool = False,
            extra_args=()) -> EmuResult:
        """Emulate a function with fuzzed input.

        Single-function mode (default): pass ``code`` — it's mapped at CODE_BASE.
        Chain mode: pass ``regions`` = [(va, bytes), ...] (whole sections, so
        internal calls land on real code) and ``entry`` (the function's VA).
        ``stub_calls=True`` turns out-of-chain calls into RET-stubs (return 0) and
        skips syscalls, so only real data faults count as crashes.

        arg0 is the address of the input buffer and arg1 the number of input
        bytes actually written there (input is truncated to INPUT_SIZE);
        ``extra_args`` fill the remaining argument registers.

        Raises:
            ValueError: in chain mode when ``entry`` is missing.
        """
        if regions is not None and entry is None:
            raise ValueError("chain mode needs entry (the function's VA)")

        cfg = self._cfg
        # configs built before "ret" existed fall back to the old behaviour
        ret_reg = cfg.get("ret", cfg["args"][0])
        uc = Uc(*cfg["uc"])
        uc.mem_map(STACK_BASE, STACK_SIZE, UC_PROT_ALL)
        uc.mem_map(INPUT_BASE, INPUT_SIZE, UC_PROT_ALL)

        if regions is None:                       # single-function mode
            uc.mem_map(CODE_BASE, CODE_SIZE, UC_PROT_ALL)
            uc.mem_write(CODE_BASE, code or b"")
            start = CODE_BASE
        else:                                     # chain mode: map sections
            regions = list(regions)               # iterated twice below
            # merge page ranges first — adjacent sections can round into the same
            # page and mapping them separately raises UC_ERR_MAP.
            ranges = sorted((_align_down(va), _align_up(va + len(d))) for va, d in regions)
            merged: list[list[int]] = []
            for st, en in ranges:
                if merged and st <= merged[-1][1]:
                    merged[-1][1] = max(merged[-1][1], en)
                else:
                    merged.append([st, en])
            for st, en in merged:
                uc.mem_map(st, en - st, UC_PROT_ALL)
            for va, data in regions:
                uc.mem_write(va, data)
            start = entry

        payload = input_bytes[:INPUT_SIZE]
        if payload:
            uc.mem_write(INPUT_BASE, payload)

        sp = STACK_BASE + STACK_SIZE // 2
        # arg0=input ptr, arg1=len of what is really in the buffer
        argvals = [INPUT_BASE, len(payload), *extra_args]
        for reg, val in zip(cfg["args"], argvals):
            uc.reg_write(reg, val & 0xFFFFFFFFFFFFFFFF)
        if cfg["push_ret"]:
            sp -= 8
            uc.mem_write(sp, RET_MAGIC.to_bytes(8, "little"))
        else:
            uc.reg_write(cfg["lr"], RET_MAGIC)
        uc.reg_write(cfg["sp"], sp)

        cov: set[int] = set()
        fault = {"kind": "", "addr": 0}
        stats = {"stubbed": 0, "syscalls": 0}
        ret_bytes = _RET.get(self.arch_key, b"\xc3")

        uc.hook_add(UC_HOOK_BLOCK, lambda u, a, s, d: cov.add(a))

        def on_bad_mem(u, access, address, size, value, data):
            # a call/branch to unmapped code = an out-of-chain call → stub it
            if stub_calls and access == UC_MEM_FETCH_UNMAPPED:
                page = _align_down(address)
                try:
                    u.mem_map(page, _PAGE, UC_PROT_ALL)
                    u.mem_write(page, ret_bytes * (_PAGE // len(ret_bytes)))
                except UcError:
                    # could not build the stub: report the fetch fault rather
                    # than resuming onto memory that is still unmapped
                    fault["kind"] = _FAULT.get(access, "mem_unmapped")
                    fault["addr"] = address
                    return False
                u.reg_write(ret_reg, 0)              # stubbed call returns 0
                stats["stubbed"] += 1
                return True                          # resume → executes RET → returns
            fault["kind"] = _FAULT.get(access, "mem_unmapped")
            fault["addr"] = address
            return False                             # real data fault → crash
        uc.hook_add(UC_HOOK_MEM_UNMAPPED, on_bad_mem)

        def on_bad_insn(u, data):
            fault["kind"] = "invalid_insn"
            fault["addr"] = u.reg_read(cfg["pc"])
            return False
        uc.hook_add(UC_HOOK_INSN_INVALID, on_bad_insn)

        if stub_calls:                               # skip syscalls (svc/int)
            def on_intr(u, intno, data):
                stats["syscalls"] += 1
                u.reg_write(ret_reg, 0)              # skipped syscall "returns" 0
            uc.hook_add(UC_HOOK_INTR, on_intr)

        crashed = False
        try:
            uc.emu_start(start, RET_MAGIC, timeout=0, count=self.max_insns)
        except UcError:
            crashed = True
            if not fault["kind"]:
                fault["kind"] = "cpu_fault"

        pc = uc.reg_read(cfg["pc"])
        returned = (not crashed) and (pc == RET_MAGIC)
        return EmuResult(
            crashed=crashed, crash_kind=fault["kind"], fault_addr=fault["addr"],
            cov_ids=frozenset(hex(a) for a in cov), blocks=len(cov),
            returned=returned, timed_out=(not crashed and not returned),
            stubbed_calls=stats["stubbed"], syscalls=stats["syscalls"],
        )
237
+
238
+
239
+ # ── Atlas environment: fuzz an emulated function with the curiosity loop ──────
240
EMU_STATE_DIM = 14


class EmulatedFuzzEnv:
    """Wraps one emulated function as an Atlas environment.

    Exposes the reset/step/run_probe/featurize/summary methods so a curiosity
    loop can learn which INPUTS crash the function. The interface mirrors
    BinaryFuzzEnv so the trainer/proposer/annotator can be reused.
    """

    def __init__(self, code: bytes, arch: str = "arm64", max_insns: int = 20000,
                 log=print, binary: str = "emulated", addr: int = 0):
        """Create the harness and zeroed bookkeeping for a fresh fuzzing session."""
        import numpy as np
        from collections import Counter
        self._np = np
        self.harness = EmulationHarness(arch, max_insns=max_insns)
        self.code, self.arch = code, arch
        self.log = log
        self.binary, self.addr = binary, addr
        self.covered_global: set = set()            # every block id seen so far
        self.crash_inputs: dict[str, bytes] = {}    # "kind@addr" -> first payload
        self.seen = Counter()                       # outcome-signature counts
        self._last = np.zeros(EMU_STATE_DIM, dtype=np.float32)
        self.recoveries = 0                         # emulation is sandboxed; nothing to recover
        self.steps = 0

    def get_action_dim(self):
        """Action dimension, shared with the binary fuzzer."""
        from .binary_fuzz import FUZZ_ACTION_DIM
        return FUZZ_ACTION_DIM

    def get_observation_dim(self):
        """Length of the vectors produced by ``featurize``."""
        return EMU_STATE_DIM

    def reset(self):
        """Zero the last observation and return a copy of it."""
        self._last = self._np.zeros(EMU_STATE_DIM, dtype=self._np.float32)
        return self._last.copy()

    def render(self):
        """Nothing to render."""
        return None

    def _run(self, token: str, record: bool):
        """Decode ``token`` into a payload, emulate it, and score new coverage.

        Global coverage is only updated when ``record`` is true.
        """
        from .binary_fuzz import parse_token
        payload = parse_token(token)[1]
        result = self.harness.run(self.code, payload)
        result.new_coverage = len(result.cov_ids - self.covered_global)
        if record:
            self.covered_global |= result.cov_ids
        return payload, result

    def step(self, token: str):
        """Run one fuzz input; return ``(obs, reward, done, info)``.

        Reward is the number of newly covered blocks, plus 3.0 on a crash.
        ``done`` is always False.
        """
        from .binary_fuzz import input_family
        self.steps += 1
        payload, res = self._run(token, record=True)
        obs = self.featurize(token, res)

        if res.crash_kind:
            outcome = res.crash_kind
        elif res.timed_out:
            outcome = "timeout"
        else:
            outcome = "ok"
        self.seen[(outcome, min(res.blocks, 16))] += 1

        reward = float(res.new_coverage)
        if res.crashed:
            reward += 3.0
            key = f"{res.crash_kind}@{hex(res.fault_addr)}"
            if key not in self.crash_inputs:
                # keep only the first input for each distinct crash signature
                self.crash_inputs[key] = payload
                self.log(f"[emu] CRASH {res.crash_kind} @ {hex(res.fault_addr)} "
                         f"on {payload[:24]!r} — {len(self.crash_inputs)} unique")

        self._last = obs
        info = {"command": token, "result": res, "family": input_family(token),
                "recovered": False, "coverage": len(self.covered_global),
                "crashed": res.crashed}
        return obs, reward, False, info

    def run_probe(self, token: str):
        """Emulate ``token`` without recording coverage; return its features."""
        res = self._run(token, record=False)[1]
        return self.featurize(token, res)

    def featurize(self, token: str, res: EmuResult):
        """Encode a result as a float32 vector of length EMU_STATE_DIM.

        Slot 12 is left at 0.0 and slot 13 is a constant 1.0.
        """
        kind = res.crash_kind

        def flag(cond) -> float:
            return 1.0 if cond else 0.0

        features = [
            flag(res.crashed),                              # 0
            flag(res.returned),                             # 1
            flag(res.timed_out),                            # 2
            flag(kind == "read_unmapped"),                  # 3
            flag(kind == "write_unmapped"),                 # 4
            flag(kind == "fetch_unmapped"),                 # 5
            flag(kind == "invalid_insn"),                   # 6
            min(res.blocks / 16.0, 1.0),                    # 7
            min(res.new_coverage / 4.0, 1.0),               # 8
            min(len(self.covered_global) / 32.0, 1.0),      # 9
            flag(res.fault_addr),                           # 10
            flag(kind.startswith("write")),                 # 11
            0.0,                                            # 12 (unused)
            1.0,                                            # 13
        ]
        return self._np.array(features, dtype=self._np.float32)

    def summary(self) -> dict:
        """Coverage/crash totals, with crash inputs hex-encoded."""
        hex_inputs = {}
        for key, payload in self.crash_inputs.items():
            hex_inputs[key] = payload.hex()
        return {"functions_covered": len(self.covered_global),
                "unique_crashes": len(self.crash_inputs),
                "crash_inputs": hex_inputs}
337
+
atlas/core/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from .world_model import WorldModel
2
+ from .encoder import Encoder
3
+ from .decoder import Decoder
4
+ from .dynamics import DynamicsFunction, NeuralODE
5
+ from .surprise import SurpriseDetector
6
+
7
+ __all__ = [
8
+ "WorldModel",
9
+ "Encoder",
10
+ "Decoder",
11
+ "DynamicsFunction",
12
+ "NeuralODE",
13
+ "SurpriseDetector",
14
+ ]
atlas/core/decoder.py ADDED
@@ -0,0 +1,65 @@
1
+ """
2
+ Decoder: Latent State → Predicted Observation
3
+
4
+ The mirror of the encoder. Takes the internal
5
+ representation and reconstructs what the
6
+ observation SHOULD look like.
7
+
8
+ If the decoder can reconstruct reality from the
9
+ latent state, it means the latent state captured
10
+ the important information — the model UNDERSTOOD
11
+ what it was seeing.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+
17
+
18
class Decoder(nn.Module):
    """
    Maps a latent state back into observation space.

    Input:  latent_state [batch, latent_dim]
    Output: predicted_observation [batch, obs_dim]
    """

    def __init__(self, latent_dim: int, obs_dim: int, hidden_dims: list = None):
        super().__init__()

        self.latent_dim = latent_dim
        self.obs_dim = obs_dim

        # Default trunk: two hidden layers of width 512.
        widths = [512, 512] if hidden_dims is None else hidden_dims

        # Each hidden stage is Linear -> LayerNorm -> SiLU.
        stack = []
        fan_in = latent_dim
        for width in widths:
            stack += [nn.Linear(fan_in, width), nn.LayerNorm(width), nn.SiLU()]
            fan_in = width

        # Output projection back to observation space.
        head = nn.Linear(fan_in, obs_dim)
        stack.append(head)

        self.net = nn.Sequential(*stack)

        # Small-gain Xavier weights and zero bias on the output layer, for
        # reasonable initial reconstructions.
        nn.init.xavier_uniform_(head.weight, gain=0.1)
        nn.init.zeros_(head.bias)

    def forward(self, latent_state: torch.Tensor) -> torch.Tensor:
        """
        Run the latent state through the network.

        Args:
            latent_state: [batch, latent_dim]

        Returns:
            predicted_obs: [batch, obs_dim]
        """
        predicted_obs = self.net(latent_state)
        return predicted_obs