deaced 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deaced/__init__.py ADDED
@@ -0,0 +1,49 @@
1
+ """DeACED - dump and inspect Java Object Serialization (``0xAC 0xED``) streams.
2
+
3
+ A small, dependency-free library and CLI that turns a Java serialization stream
4
+ (and Java RMI packet contents) into a human-readable, hierarchical text dump.
5
+
6
+ This is a Python port of SerializationDumper by Nicky Bloor (MIT); see NOTICE.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from .model import Stream
12
+ from .parser import Parser, parse, run_deep
13
+ from .render import render_json, render_pretty, render_text
14
+
15
+ __all__ = ["dump", "parse", "__version__"]
16
+ __version__ = "0.1.0"
17
+
18
+ _FORMATS = ("text", "json", "pretty")
19
+
20
+
21
def dump(data: bytes, *, format: str = "text", offsets: bool = False) -> str:
    """Parse a serialization stream and return it rendered as text.

    Args:
        data: The raw serialization stream.
        format: Which renderer to use -- ``"text"`` (the default hierarchical
            dump, byte-for-byte compatible with the patched upstream jar),
            ``"json"`` (a structured machine-readable view), or ``"pretty"``
            (a compact human-readable data tree).
        offsets: When true (``text`` only), prefix every line with
            ``@<byte-offset>|``.

    Returns:
        The rendered output as a single string.

    Raises:
        ValueError: If ``format`` is not one of the known formats, or if
            ``offsets`` is requested together with a non-``text`` format.
    """
    # Validate the arguments up front so bad options fail before any parsing.
    if format not in _FORMATS:
        raise ValueError(f"unknown format: {format!r}")
    if format != "text" and offsets:
        raise ValueError("offsets are only supported for the 'text' format")

    def _parse_and_render() -> str:
        tree: Stream = Parser(data).parse()
        if format == "json":
            return render_json(tree)
        if format == "pretty":
            return render_pretty(tree)
        # Only "text" remains after the validation above.
        return render_text(tree, offsets=offsets)

    # Parsing and rendering are both executed through run_deep.
    return run_deep(_parse_and_render)
deaced/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Enable ``python -m deaced``."""
2
+
3
+ from .cli import main
4
+
5
if __name__ == "__main__":  # pragma: no cover
    # Hand main()'s return value to the interpreter as the exit status.
    exit_status = main()
    raise SystemExit(exit_status)
deaced/cli.py ADDED
@@ -0,0 +1,94 @@
1
+ """Command-line interface for DeACED."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import sys
7
+
8
+ from . import __version__, dump
9
+
10
+
11
def _read_input(args: argparse.Namespace) -> bytes:
    """Return the input bytes selected by the parsed command-line options.

    The sources are checked in this order: inline hex (``args.hex``), a file
    of hex text (``args.hex_file``), then a raw binary file (``args.raw``,
    where ``"-"`` means standard input).

    Raises:
        OSError: If a file cannot be opened or read.
        ValueError: If the hex text does not form whole bytes.
    """
    inline_hex = args.hex
    if inline_hex is not None:
        # Remove every whitespace run so spaced or wrapped hex is accepted.
        compact = "".join(inline_hex.split())
        return bytes.fromhex(compact)

    hex_path = args.hex_file
    if hex_path is not None:
        with open(hex_path, "rb") as fh:
            # latin-1 maps every byte to a character, so decoding cannot fail.
            text = fh.read().decode("latin-1")
        # Keep only hex digits; any other character is discarded.
        hex_digits = [ch for ch in text if ch in "0123456789abcdefABCDEF"]
        return bytes.fromhex("".join(hex_digits))

    # raw bytes ('-' = stdin)
    raw_path = args.raw
    if raw_path == "-":
        return sys.stdin.buffer.read()
    with open(raw_path, "rb") as fh:
        return fh.read()
23
+
24
+
25
def main(argv: list[str] | None = None) -> int:
    """Run the ``deaced`` command-line interface.

    Args:
        argv: Arguments to parse. ``None`` lets argparse fall back to the
            process arguments.

    Returns:
        ``0`` on success, ``1`` if the input could not be parsed or rendered,
        and ``2`` if the input could not be read or the output file could not
        be written. Usage errors are reported by argparse, which exits on its
        own.
    """
    p = argparse.ArgumentParser(
        prog="deaced",
        description="Dump and inspect Java Object Serialization (0xAC 0xED) streams "
        "and RMI packets in human-readable form.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "examples:\n"
            " deaced -r dump.bin dump a raw serialized file\n"
            " cat dump.bin | deaced -r - read from stdin\n"
            " deaced -x aced0005740004414243... decode hex from the command line\n"
            " deaced -f hexdump.txt read a file of hex-ascii bytes\n"
            " deaced -r dump.bin -F json emit structured JSON\n"
            " deaced -r dump.bin -F pretty emit a compact data tree\n"
            " deaced -r dump.bin --offsets annotate each line with its byte offset\n"
            " deaced -r dump.bin -o out.txt write the dump to a file"
        ),
    )
    # Exactly one input source must be given.
    src = p.add_mutually_exclusive_group(required=True)
    src.add_argument(
        "-r", "--raw", metavar="FILE", help="raw binary serialization file ('-' for stdin)"
    )
    src.add_argument(
        "-f", "--hex-file", metavar="FILE", dest="hex_file", help="file of hex-ascii bytes"
    )
    src.add_argument("-x", "--hex", metavar="HEX", help="hex-ascii bytes on the command line")
    p.add_argument("-o", "--output", metavar="FILE", help="write the dump to FILE (default stdout)")
    p.add_argument(
        "-F",
        "--format",
        choices=["text", "json", "pretty"],
        default="text",
        help="output format (default: text)",
    )
    p.add_argument(
        "--offsets",
        action="store_true",
        help="prefix each line with '@<byte-offset>|' (text format only)",
    )
    p.add_argument("-V", "--version", action="version", version=f"deaced {__version__}")
    args = p.parse_args(argv)

    # Reject the invalid combination as a usage error before doing any work.
    if args.offsets and args.format != "text":
        p.error("--offsets is only valid with -F text")

    try:
        data = _read_input(args)
    except (OSError, ValueError) as e:
        print(f"deaced: cannot read input: {e}", file=sys.stderr)
        return 2

    try:
        text = dump(data, format=args.format, offsets=args.offsets)
    except Exception as e:  # parser errors carry an offset in the message
        print(f"deaced: parse error: {e}", file=sys.stderr)
        return 1

    if args.output:
        # An unwritable destination is reported like an unreadable input,
        # instead of escaping as a traceback.
        try:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(text)
        except OSError as e:
            print(f"deaced: cannot write output: {e}", file=sys.stderr)
            return 2
    else:
        # Force UTF-8 on stdout where the stream supports reconfiguration.
        reconfigure = getattr(sys.stdout, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8")
        sys.stdout.write(text)
    return 0
91
+
92
+
93
if __name__ == "__main__":  # pragma: no cover
    # Use main()'s return value as the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
deaced/errors.py ADDED
@@ -0,0 +1,27 @@
1
+ """Exception types for DeACED.
2
+
3
+ Every error carries the byte offset in the serialization stream where it was
4
+ detected, which makes truncated or malformed streams much easier to diagnose.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+
10
class SerDumpError(Exception):
    """Root of the DeACED parse-error hierarchy.

    The optional byte ``offset`` is stored on the instance and, when given,
    appended to the message as ``(at offset N)``.
    """

    def __init__(self, message: str, offset: int | None = None) -> None:
        self.offset = offset
        # An offset of 0 is valid, so compare against None rather than truthiness.
        if offset is None:
            text = message
        else:
            text = f"{message} (at offset {offset})"
        super().__init__(text)
16
+
17
+
18
class TruncatedStreamError(SerDumpError):
    """Raised when the stream ends before the expected bytes are available."""
20
+
21
+
22
class UnknownTagError(SerDumpError):
    """Raised when an unexpected or illegal type-code byte is encountered."""
24
+
25
+
26
class IllegalStateError(SerDumpError):
    """Raised when the stream breaks a structural rule of the serialization protocol."""
deaced/jfloat.py ADDED
@@ -0,0 +1,86 @@
1
+ """Format floats/doubles exactly as Java's ``Double.toString`` / ``Float.toString``.
2
+
3
+ The reference dumper is Java, so to reproduce its output we must match Java's
4
+ notation: decimal form when ``1e-3 <= |x| < 1e7`` and "computerized scientific
5
+ notation" (``d.dddEexp``) otherwise, always with at least one fractional digit,
6
+ an uppercase ``E`` and no ``+`` on the exponent.
7
+
8
+ The shortest round-tripping digits are found by formatting with increasing
9
+ precision until the value reparses to the identical bit pattern (checked via
10
+ :mod:`struct`), which matches the shortest-representation algorithm Java uses
11
+ (JDK 19+). Java itself does not emit the shortest digits for the smallest
12
+ subnormals (a small cluster near ``Double``/``Float.MIN_VALUE``), so DeACED
13
+ diverges from Java there; every value it prints still round-trips exactly.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import math
19
+ import struct
20
+
21
+
22
def _shortest_sci(m: float, fmt: str, max_prec: int) -> str:
    """Format ``m`` in Python ``e`` notation using the fewest round-tripping digits.

    Args:
        m: The value to format.
        fmt: The :mod:`struct` format whose packed bytes define "round-trips".
        max_prec: The largest number of fractional digits to try.

    Returns:
        The first candidate, from 0 up to ``max_prec`` fractional digits, that
        packs to the same bytes as ``m``; otherwise ``m`` at ``max_prec``
        digits.
    """
    wanted = struct.pack(fmt, m)
    precision = 0
    while precision <= max_prec:
        candidate = f"{m:.{precision}e}"
        precision += 1
        try:
            repacked = struct.pack(fmt, float(candidate))
        except OverflowError:
            # rounding pushed the value just past the type's max; try more digits
            continue
        if repacked == wanted:
            return candidate
    return f"{m:.{max_prec}e}"
35
+
36
+
37
def _parts(sci: str) -> tuple[str, int]:
    """Break Python ``e`` notation into its significant digits and exponent.

    Returns:
        ``(digits, exponent)``. ``digits`` is the mantissa without sign,
        decimal point or trailing zeros (``"0"`` if nothing remains), and
        ``exponent`` is the decimal exponent of its first digit.
    """
    mantissa, _sep, exponent_text = sci.partition("e")
    exponent = int(exponent_text)
    significant = mantissa.lstrip("+-").replace(".", "").rstrip("0")
    if not significant:
        # An all-zero mantissa still needs one digit.
        significant = "0"
    return significant, exponent
43
+
44
+
45
def _format(neg: bool, digits: str, sci_exp: int) -> str:
    """Lay out significant digits in Java's float notation.

    Exponents from -3 to 6 inclusive give plain decimal form. Anything else
    gives ``d.dddEexp`` with an uppercase ``E`` and no ``+`` sign. Both forms
    always have at least one fractional digit.

    Args:
        neg: Whether to prefix a minus sign.
        digits: Significant digits with no sign or decimal point.
        sci_exp: Decimal exponent of the first digit.
    """
    prefix = "-" if neg else ""
    count = len(digits)

    if sci_exp < -3 or sci_exp > 6:
        # Scientific form: one digit, the point, then the remaining digits.
        if count == 1:
            mantissa = digits + ".0"
        else:
            mantissa = digits[0] + "." + digits[1:]
        return f"{prefix}{mantissa}E{sci_exp}"

    if sci_exp < 0:
        # Pure fraction: zero-pad between the point and the first digit.
        padding = "0" * (-sci_exp - 1)
        return f"{prefix}0.{padding}{digits}"

    whole_len = sci_exp + 1
    if count > whole_len:
        whole, fraction = digits[:whole_len], digits[whole_len:]
    else:
        # All digits sit left of the point; pad with zeros and emit ".0".
        whole, fraction = digits + "0" * (whole_len - count), "0"
    return f"{prefix}{whole}.{fraction}"
63
+
64
+
65
def _to_string(v: float, fmt: str, max_prec: int) -> str:
    """Render ``v`` in Java notation for the float width given by ``fmt``.

    NaN, the infinities and signed zero are handled here. Every other value
    gets its shortest round-tripping digits from ``_shortest_sci`` and its
    layout from ``_format``.
    """
    if v != v:  # only NaN compares unequal to itself
        return "NaN"
    if v in (math.inf, -math.inf):
        return "Infinity" if v > 0 else "-Infinity"

    # copysign exposes the sign bit, which distinguishes -0.0 from 0.0.
    negative = math.copysign(1.0, v) < 0
    if v == 0.0:
        return "-0.0" if negative else "0.0"

    sci = _shortest_sci(abs(v), fmt, max_prec)
    significant, exponent = _parts(sci)
    return _format(negative, significant, exponent)
77
+
78
+
79
def double_to_string(v: float) -> str:
    """Return ``v`` formatted the way Java's ``Double.toString`` prints it."""
    # 16 fractional digits in ``e`` notation means at most 17 significant digits.
    return _to_string(v, fmt=">d", max_prec=16)
82
+
83
+
84
def float_to_string(v: float) -> str:
    """Return the 32-bit float value ``v`` as Java's ``Float.toString`` prints it."""
    # 8 fractional digits in ``e`` notation means at most 9 significant digits.
    return _to_string(v, fmt=">f", max_prec=8)
deaced/model.py ADDED
@@ -0,0 +1,209 @@
1
+ """Semantic AST for a parsed Java Object Serialization stream (ADR-0001).
2
+
3
+ Nodes mirror the entities of the serialization protocol, not the text dump:
4
+ there are no presentation-only grouping nodes here -- the text renderer adds
5
+ lines like ``Contents``/``values``/``(object)``. Each node keeps the raw bytes
6
+ of its primitives -- the renderers use them to reproduce the hex columns and, in
7
+ ``--offsets`` mode, to advance their own byte cursor -- and records its source
8
+ ``offset`` (start position in the stream) so callers can map a node back to the
9
+ input.
10
+
11
+ :class:`Stream` is the root; :attr:`Stream.handles` maps wire handles to the
12
+ node that owns them, so :class:`Reference` targets can be resolved.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass, field
18
+
19
+
20
@dataclass
class Utf:
    """A bare modified-UTF-8 string with no handle.

    Used for class, field and interface names.
    """

    # Decoded text.
    value: str
    # Raw bytes of the string, kept for the renderers.
    raw: bytes
26
+
27
+
28
@dataclass
class Node:
    """Common base of all AST nodes."""

    # Byte position in the stream where this node starts.
    offset: int
33
+
34
+
35
+ # --- values (may appear as stream content, field values or array elements) ---
36
+
37
+
38
@dataclass
class Null(Node):
    """A null reference (``TC_NULL``)."""
41
+
42
+
43
@dataclass
class Reference(Node):
    """A back-reference to a previously seen handle (``TC_REFERENCE``)."""

    # Wire handle being referred to.
    handle: int
    # Node the handle refers to, or None when it has not been resolved.
    target: Node | None = None
49
+
50
+
51
@dataclass
class StringVal(Node):
    """A string object that owns a handle (``TC_STRING`` / ``TC_LONGSTRING``)."""

    # Wire handle assigned to this string.
    handle: int
    # Decoded text.
    value: str
    # Raw bytes of the string, kept for the renderers.
    raw: bytes
    # NOTE(review): presumably True for the TC_LONGSTRING form -- confirm in the parser.
    long: bool = False
59
+
60
+
61
@dataclass
class Primitive(Node):
    """A primitive value held by a field or an array element."""

    # Type code, one of ``BCDFIJSZ``.
    tc: str
    # Decoded value.
    value: int | float | bool
    # Raw bytes of the value, kept for the renderers.
    raw: bytes
68
+
69
+
70
@dataclass
class BlockData(Node):
    """An opaque block of bytes (``TC_BLOCKDATA`` / ``TC_BLOCKDATALONG``)."""

    # The block's payload.
    data: bytes
    # NOTE(review): presumably True for the TC_BLOCKDATALONG form -- confirm in the parser.
    long: bool = False
76
+
77
+
78
@dataclass
class Reset(Node):
    """A ``TC_RESET`` marker, which resets the stream's handle table to the base handle."""
81
+
82
+
83
@dataclass
class ExceptionObj(Node):
    """A serialized Throwable describing a serialization abort (``TC_EXCEPTION``)."""

    # Node holding the serialized Throwable.
    throwable: Node
88
+
89
+
90
@dataclass
class ClassObj(Node):
    """A Class object (``TC_CLASS``)."""

    # Wire handle assigned to this object.
    handle: int
    # Content of the classDesc slot.
    class_desc: ClassDescLike
96
+
97
+
98
@dataclass
class EnumObj(Node):
    """An enum constant (``TC_ENUM``)."""

    # Wire handle assigned to this object.
    handle: int
    # Content of the classDesc slot.
    class_desc: ClassDescLike
    # The constant, given as a string object or a back-reference.
    # NOTE(review): presumably the constant's name -- confirm in the parser.
    constant: StringVal | Reference
105
+
106
+
107
@dataclass
class ArrayObj(Node):
    """An array object (``TC_ARRAY``).

    The element type code lives in ``component``; it is the second character
    of the array class name. A ``byte[]`` stores its payload in
    ``byte_values``. Every other array stores one node per element in
    ``elements``.
    """

    # Wire handle assigned to this array.
    handle: int
    # Content of the classDesc slot.
    class_desc: ClassDescLike
    # Element type code.
    component: str
    # NOTE(review): presumably the element count read from the stream -- confirm.
    size: int
    # Element nodes (for anything other than byte[]).
    elements: list[Node] = field(default_factory=list)
    # Payload of a byte[].
    byte_values: bytes = b""
122
+
123
+
124
@dataclass
class ObjectInstance(Node):
    """A serialized object instance (``TC_OBJECT``)."""

    # Wire handle assigned to this object.
    handle: int
    # Content of the classDesc slot.
    class_desc: ClassDescLike
    # One ClassData entry per class contributing to the object's data.
    data: list[ClassData] = field(default_factory=list)
    # NOTE(review): the meaning of this flag is not evident from this module -- confirm.
    na: bool = False
132
+
133
+
134
+ # --- class descriptions (the classDesc slot: one of these or Null/Reference) ---
135
+
136
+
137
@dataclass
class FieldDesc:
    """A field declaration that appears inside a class description."""

    # The field's type code.
    tc: str
    # The field's name.
    name: Utf
    # NOTE(review): presumably the type name carried by object/array fields -- confirm.
    class_name1: StringVal | Reference | None = None
144
+
145
+
146
@dataclass
class ClassDesc(Node):
    """A concrete class description (``TC_CLASSDESC``)."""

    # Class name.
    name: Utf
    # NOTE(review): presumably the raw serialVersionUID bytes -- confirm in the parser.
    svuid: bytes
    # Wire handle assigned to this description.
    handle: int
    # Class description flags.
    flags: int
    # Declared fields.
    fields: list[FieldDesc] = field(default_factory=list)
    # Annotation nodes attached to the class.
    annotations: list[Node] = field(default_factory=list)
    # Content of the superclass classDesc slot, if any.
    super_desc: ClassDescLike | None = None
157
+
158
+
159
@dataclass
class ProxyClassDesc(Node):
    """A dynamic proxy class description (``TC_PROXYCLASSDESC``)."""

    # Wire handle assigned to this description.
    handle: int
    # Interface names.
    interfaces: list[Utf] = field(default_factory=list)
    # Annotation nodes attached to the class.
    annotations: list[Node] = field(default_factory=list)
    # Content of the superclass classDesc slot, if any.
    super_desc: ClassDescLike | None = None
167
+
168
+
169
+ # --- object-instance class data ---
170
+
171
+
172
@dataclass
class FieldValue:
    """The value of a single field within one class's share of an object's data."""

    # Field name.
    name: str
    # Type code the field was declared with.
    declared_tc: str
    # Node holding the value.
    value: Node
179
+
180
+
181
@dataclass
class ClassData:
    """What one class contributes to an object's data: field values and annotations."""

    # Name of the contributing class.
    class_name: str
    # NOTE(review): presumably derived from the class description flags -- confirm.
    serializable: bool
    # Field values, one entry per field.
    values: list[FieldValue] = field(default_factory=list)
    # NOTE(review): presumably whether an annotation section is present -- confirm.
    has_annotation: bool = False
    # Annotation nodes.
    annotations: list[Node] = field(default_factory=list)
190
+
191
+
192
+ # --- root ---
193
+
194
+
195
@dataclass
class Stream(Node):
    """Root node: the optional RMI prefix, the header, and the top-level contents."""

    # Magic bytes as read from the stream.
    magic: bytes
    # Whether the magic bytes were valid.
    magic_valid: bool
    # Version bytes as read from the stream, if any.
    version: bytes | None = None
    # Whether the version bytes were valid.
    version_valid: bool = False
    # NOTE(review): value of the optional RMI prefix; exact encoding not shown here -- confirm.
    rmi: int | None = None
    # Top-level content nodes.
    contents: list[Node] = field(default_factory=list)
    # Maps each wire handle to the node that owns it, for resolving Reference targets.
    handles: dict[int, Node] = field(default_factory=dict)
206
+
207
+
208
+ #: Anything that can occupy a ``classDesc`` slot.
209
+ ClassDescLike = ClassDesc | ProxyClassDesc | Null | Reference
deaced/mutf8.py ADDED
@@ -0,0 +1,70 @@
1
+ """Java "modified UTF-8" -- the string encoding used by serialization.
2
+
3
+ ``DataOutput.writeUTF`` / ``DataInput.readUTF`` (and therefore the Object
4
+ Serialization Stream Protocol) encode strings in a variant of UTF-8 that differs
5
+ from the standard in exactly two ways:
6
+
7
+ * ``U+0000`` is written as the two bytes ``C0 80`` (never a bare ``00``);
8
+ * characters outside the Basic Multilingual Plane are written as a UTF-16
9
+ surrogate pair, each surrogate emitted in its 3-byte form (i.e. CESU-8), rather
10
+ than as one 4-byte UTF-8 sequence.
11
+
12
+ For every other (BMP, non-zero) character it is identical to standard UTF-8.
13
+ Decoding is tolerant: malformed sequences yield U+FFFD rather than raising.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ _REPLACEMENT = 0xFFFD
19
+
20
+
21
def decode(data: bytes) -> str:
    """Decode modified-UTF-8 ``data`` into a string.

    Decoding is tolerant. A byte that does not start a well-formed one-, two-
    or three-byte sequence becomes U+FFFD and is skipped on its own. Surrogate
    pairs are combined into one character, and lone surrogates are kept.
    """
    code_units = bytearray()  # UTF-16 code units, big-endian
    pos, end = 0, len(data)
    while pos < end:
        lead = data[pos]
        if lead < 0x80:
            # Single-byte (ASCII) form.
            width, unit = 1, lead
        elif (
            (lead & 0xE0) == 0xC0
            and pos + 1 < end
            and (data[pos + 1] & 0xC0) == 0x80
        ):
            # Two-byte form; this is also how U+0000 (C0 80) arrives.
            width, unit = 2, ((lead & 0x1F) << 6) | (data[pos + 1] & 0x3F)
        elif (
            (lead & 0xF0) == 0xE0
            and pos + 2 < end
            and (data[pos + 1] & 0xC0) == 0x80
            and (data[pos + 2] & 0xC0) == 0x80
        ):
            # Three-byte form; each half of a surrogate pair uses it.
            width = 3
            unit = (
                ((lead & 0x0F) << 12)
                | ((data[pos + 1] & 0x3F) << 6)
                | (data[pos + 2] & 0x3F)
            )
        else:
            # Truncated sequence, stray continuation byte or 4-byte lead.
            width, unit = 1, _REPLACEMENT
        code_units.extend((unit >> 8, unit & 0xFF))
        pos += width
    # Combine surrogate pairs; keep lone surrogates as-is (Java permits them).
    return code_units.decode("utf-16-be", errors="surrogatepass")
51
+
52
+
53
def encode(text: str) -> bytes:
    """Encode ``text`` as modified UTF-8.

    The text is split into UTF-16 code units and each unit is written on its
    own. A character outside the BMP therefore becomes two 3-byte sequences,
    and U+0000 becomes ``C0 80``.
    """
    utf16 = text.encode("utf-16-be", errors="surrogatepass")
    encoded = bytearray()
    # Pair each high byte with its low byte to rebuild the 16-bit code units.
    for high, low in zip(utf16[0::2], utf16[1::2]):
        code_unit = (high << 8) | low
        if 0 < code_unit <= 0x7F:
            encoded.append(code_unit)
        elif code_unit <= 0x7FF:
            # Two-byte form; a zero code unit lands here and yields C0 80.
            encoded.append(0xC0 | (code_unit >> 6))
            encoded.append(0x80 | (code_unit & 0x3F))
        else:
            encoded.append(0xE0 | (code_unit >> 12))
            encoded.append(0x80 | ((code_unit >> 6) & 0x3F))
            encoded.append(0x80 | (code_unit & 0x3F))
    return bytes(encoded)