deaced 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deaced/__init__.py +49 -0
- deaced/__main__.py +6 -0
- deaced/cli.py +94 -0
- deaced/errors.py +27 -0
- deaced/jfloat.py +86 -0
- deaced/model.py +209 -0
- deaced/mutf8.py +70 -0
- deaced/parser.py +699 -0
- deaced/py.typed +0 -0
- deaced/reader.py +63 -0
- deaced/render/__init__.py +17 -0
- deaced/render/_safe.py +18 -0
- deaced/render/json.py +151 -0
- deaced/render/pretty.py +137 -0
- deaced/render/text.py +430 -0
- deaced/tags.py +49 -0
- deaced-0.1.0.dist-info/METADATA +151 -0
- deaced-0.1.0.dist-info/RECORD +23 -0
- deaced-0.1.0.dist-info/WHEEL +5 -0
- deaced-0.1.0.dist-info/entry_points.txt +2 -0
- deaced-0.1.0.dist-info/licenses/LICENSE +22 -0
- deaced-0.1.0.dist-info/licenses/NOTICE +43 -0
- deaced-0.1.0.dist-info/top_level.txt +1 -0
deaced/__init__.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""DeACED - dump and inspect Java Object Serialization (``0xAC 0xED``) streams.
|
|
2
|
+
|
|
3
|
+
A small, dependency-free library and CLI that turns a Java serialization stream
|
|
4
|
+
(and Java RMI packet contents) into a human-readable, hierarchical text dump.
|
|
5
|
+
|
|
6
|
+
This is a Python port of SerializationDumper by Nicky Bloor (MIT); see NOTICE.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .model import Stream
|
|
12
|
+
from .parser import Parser, parse, run_deep
|
|
13
|
+
from .render import render_json, render_pretty, render_text
|
|
14
|
+
|
|
15
|
+
__all__ = ["dump", "parse", "__version__"]
|
|
16
|
+
__version__ = "0.1.0"
|
|
17
|
+
|
|
18
|
+
_FORMATS = ("text", "json", "pretty")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def dump(data: bytes, *, format: str = "text", offsets: bool = False) -> str:
    """Parse a serialization stream and return it rendered as one string.

    Args:
        data: The raw serialization stream.
        format: Which renderer to use -- ``"text"`` (the default hierarchical
            dump), ``"json"`` (structured, machine-readable) or ``"pretty"``
            (compact, human-readable data tree).
        offsets: Forwarded to the text renderer; only accepted together with
            ``format="text"``.

    Returns:
        The rendered output.

    Raises:
        ValueError: If ``format`` is not one of the known formats, or if
            ``offsets`` is requested for a non-text format.
    """
    if format not in _FORMATS:
        raise ValueError(f"unknown format: {format!r}")
    as_text = format == "text"
    if offsets and not as_text:
        raise ValueError("offsets are only supported for the 'text' format")

    # Parsing and rendering are bundled into one callable so that both run
    # inside whatever execution context ``run_deep`` provides.
    def render() -> str:
        stream: Stream = Parser(data).parse()
        if as_text:
            return render_text(stream, offsets=offsets)
        return render_json(stream) if format == "json" else render_pretty(stream)

    return run_deep(render)
|
deaced/__main__.py
ADDED
deaced/cli.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Command-line interface for DeACED."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from . import __version__, dump
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _read_input(args: argparse.Namespace) -> bytes:
|
|
12
|
+
if args.hex is not None:
|
|
13
|
+
return bytes.fromhex("".join(args.hex.split()))
|
|
14
|
+
if args.hex_file is not None:
|
|
15
|
+
with open(args.hex_file, "rb") as f:
|
|
16
|
+
txt = f.read().decode("latin-1")
|
|
17
|
+
return bytes.fromhex("".join(c for c in txt if c in "0123456789abcdefABCDEF"))
|
|
18
|
+
# raw bytes ('-' = stdin)
|
|
19
|
+
if args.raw == "-":
|
|
20
|
+
return sys.stdin.buffer.read()
|
|
21
|
+
with open(args.raw, "rb") as f:
|
|
22
|
+
return f.read()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``deaced`` command-line interface.

    Args:
        argv: Arguments to parse; ``None`` lets argparse fall back to the
            process arguments.

    Returns:
        ``0`` on success, ``1`` when parsing/rendering the stream fails, and
        ``2`` when the input cannot be read or the output cannot be written.
        Invalid command-line usage is reported by argparse itself (which
        raises ``SystemExit``).
    """
    p = argparse.ArgumentParser(
        prog="deaced",
        description="Dump and inspect Java Object Serialization (0xAC 0xED) streams "
        "and RMI packets in human-readable form.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "examples:\n"
            " deaced -r dump.bin dump a raw serialized file\n"
            " cat dump.bin | deaced -r - read from stdin\n"
            " deaced -x aced0005740004414243... decode hex from the command line\n"
            " deaced -f hexdump.txt read a file of hex-ascii bytes\n"
            " deaced -r dump.bin -F json emit structured JSON\n"
            " deaced -r dump.bin -F pretty emit a compact data tree\n"
            " deaced -r dump.bin --offsets annotate each line with its byte offset\n"
            " deaced -r dump.bin -o out.txt write the dump to a file"
        ),
    )
    # Exactly one input source must be given.
    src = p.add_mutually_exclusive_group(required=True)
    src.add_argument(
        "-r", "--raw", metavar="FILE", help="raw binary serialization file ('-' for stdin)"
    )
    src.add_argument(
        "-f", "--hex-file", metavar="FILE", dest="hex_file", help="file of hex-ascii bytes"
    )
    src.add_argument("-x", "--hex", metavar="HEX", help="hex-ascii bytes on the command line")
    p.add_argument("-o", "--output", metavar="FILE", help="write the dump to FILE (default stdout)")
    p.add_argument(
        "-F",
        "--format",
        choices=["text", "json", "pretty"],
        default="text",
        help="output format (default: text)",
    )
    p.add_argument(
        "--offsets",
        action="store_true",
        help="prefix each line with '@<byte-offset>|' (text format only)",
    )
    p.add_argument("-V", "--version", action="version", version=f"deaced {__version__}")
    args = p.parse_args(argv)

    # Reject the invalid combination up front so it is reported as a usage error.
    if args.offsets and args.format != "text":
        p.error("--offsets is only valid with -F text")

    try:
        data = _read_input(args)
    except (OSError, ValueError) as e:
        print(f"deaced: cannot read input: {e}", file=sys.stderr)
        return 2

    # Deliberately broad: this is the top-level boundary of the CLI, and any
    # failure while parsing/rendering should become a one-line diagnostic.
    try:
        text = dump(data, format=args.format, offsets=args.offsets)
    except Exception as e:  # parser errors carry an offset in the message
        print(f"deaced: parse error: {e}", file=sys.stderr)
        return 1

    # Output failures (unwritable path, full disk, closed pipe) are reported
    # the same way as input failures instead of escaping as a traceback.
    try:
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                f.write(text)
        else:
            # Force UTF-8 on stdout when the stream object supports it.
            reconfigure = getattr(sys.stdout, "reconfigure", None)
            if reconfigure is not None:
                reconfigure(encoding="utf-8")
            sys.stdout.write(text)
    except OSError as e:
        print(f"deaced: cannot write output: {e}", file=sys.stderr)
        return 2
    return 0
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
if __name__ == "__main__": # pragma: no cover
|
|
94
|
+
raise SystemExit(main())
|
deaced/errors.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Exception types for DeACED.
|
|
2
|
+
|
|
3
|
+
Every error carries the byte offset in the serialization stream where it was
|
|
4
|
+
detected, which makes truncated or malformed streams much easier to diagnose.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class SerDumpError(Exception):
    """Root of the DeACED parse-error hierarchy.

    The stream offset, when supplied, is stored on ``offset`` and appended to
    the exception message as ``" (at offset N)"``.
    """

    def __init__(self, message: str, offset: int | None = None) -> None:
        # An offset of 0 is a real position, so test against None explicitly.
        if offset is None:
            rendered = message
        else:
            rendered = f"{message} (at offset {offset})"
        super().__init__(rendered)
        self.offset = offset
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TruncatedStreamError(SerDumpError):
    """Raised when the stream runs out before the expected bytes are available."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class UnknownTagError(SerDumpError):
    """Raised when a type-code byte is unexpected or illegal at its position."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class IllegalStateError(SerDumpError):
    """Raised when the stream breaks a structural rule of the serialization protocol."""
|
deaced/jfloat.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Format floats/doubles exactly as Java's ``Double.toString`` / ``Float.toString``.
|
|
2
|
+
|
|
3
|
+
The reference dumper is Java, so to reproduce its output we must match Java's
|
|
4
|
+
notation: decimal form when ``1e-3 <= |x| < 1e7`` and "computerized scientific
|
|
5
|
+
notation" (``d.dddEexp``) otherwise, always with at least one fractional digit,
|
|
6
|
+
an uppercase ``E`` and no ``+`` on the exponent.
|
|
7
|
+
|
|
8
|
+
The shortest round-tripping digits are found by formatting with increasing
|
|
9
|
+
precision until the value reparses to the identical bit pattern (checked via
|
|
10
|
+
:mod:`struct`), which matches the shortest-representation algorithm Java uses
|
|
11
|
+
(JDK 19+). Java itself does not emit the shortest digits for the smallest
|
|
12
|
+
subnormals (a small cluster near ``Double``/``Float.MIN_VALUE``), so DeACED
|
|
13
|
+
diverges from Java there; every value it prints still round-trips exactly.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import math
|
|
19
|
+
import struct
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _shortest_sci(m: float, fmt: str, max_prec: int) -> str:
|
|
23
|
+
"""Return Python scientific notation with the fewest digits that round-trips."""
|
|
24
|
+
target = struct.pack(fmt, m)
|
|
25
|
+
for prec in range(max_prec + 1):
|
|
26
|
+
s = f"{m:.{prec}e}"
|
|
27
|
+
try:
|
|
28
|
+
packed = struct.pack(fmt, float(s))
|
|
29
|
+
except OverflowError:
|
|
30
|
+
# rounding pushed the value just past the type's max; try more digits
|
|
31
|
+
continue
|
|
32
|
+
if packed == target:
|
|
33
|
+
return s
|
|
34
|
+
return f"{m:.{max_prec}e}"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _parts(sci: str) -> tuple[str, int]:
|
|
38
|
+
"""Split Python scientific notation into (significant digits, exponent-of-first-digit)."""
|
|
39
|
+
mant, _, exp = sci.partition("e")
|
|
40
|
+
sci_exp = int(exp)
|
|
41
|
+
digits = mant.lstrip("+-").replace(".", "").rstrip("0") or "0"
|
|
42
|
+
return digits, sci_exp
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _format(neg: bool, digits: str, sci_exp: int) -> str:
|
|
46
|
+
sign = "-" if neg else ""
|
|
47
|
+
n = len(digits)
|
|
48
|
+
if -3 <= sci_exp <= 6:
|
|
49
|
+
if sci_exp >= 0:
|
|
50
|
+
int_len = sci_exp + 1
|
|
51
|
+
if n <= int_len:
|
|
52
|
+
int_part = digits + "0" * (int_len - n)
|
|
53
|
+
frac = "0"
|
|
54
|
+
else:
|
|
55
|
+
int_part = digits[:int_len]
|
|
56
|
+
frac = digits[int_len:]
|
|
57
|
+
else:
|
|
58
|
+
int_part = "0"
|
|
59
|
+
frac = "0" * (-sci_exp - 1) + digits
|
|
60
|
+
return f"{sign}{int_part}.{frac}"
|
|
61
|
+
mant = digits + ".0" if n == 1 else digits[0] + "." + digits[1:]
|
|
62
|
+
return f"{sign}{mant}E{sci_exp}"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _to_string(v: float, fmt: str, max_prec: int) -> str:
    """Render ``v`` in Java ``toString`` style for the float width given by ``fmt``.

    Special values are spelled ``NaN``, ``Infinity``, ``-Infinity``, ``0.0``
    and ``-0.0``; every other value goes through the shortest-digits search
    (bounded by ``max_prec``) and the decimal/scientific layout.
    """
    if math.isnan(v):
        return "NaN"
    if math.isinf(v):
        return "Infinity" if v > 0 else "-Infinity"

    # copysign distinguishes -0.0 from 0.0, which a plain comparison cannot.
    negative = math.copysign(1.0, v) < 0
    if v == 0.0:
        return "-0.0" if negative else "0.0"

    sci_text = _shortest_sci(abs(v), fmt, max_prec)
    digits, sci_exp = _parts(sci_text)
    return _format(negative, digits, sci_exp)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def double_to_string(v: float) -> str:
    """Return ``v`` formatted the way Java's ``Double.toString`` prints it.

    Uses the big-endian 64-bit struct format and up to 17 significant digits.
    """
    return _to_string(v, fmt=">d", max_prec=16)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def float_to_string(v: float) -> str:
    """Return the 32-bit float ``v`` formatted the way Java's ``Float.toString`` prints it.

    Uses the big-endian 32-bit struct format and up to 9 significant digits.
    """
    return _to_string(v, fmt=">f", max_prec=8)
|
deaced/model.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
"""Semantic AST for a parsed Java Object Serialization stream (ADR-0001).
|
|
2
|
+
|
|
3
|
+
Nodes mirror the entities of the serialization protocol, not the text dump:
|
|
4
|
+
there are no presentation-only grouping nodes here -- the text renderer adds
|
|
5
|
+
lines like ``Contents``/``values``/``(object)``. Each node keeps the raw bytes
|
|
6
|
+
of its primitives -- the renderers use them to reproduce the hex columns and, in
|
|
7
|
+
``--offsets`` mode, to advance their own byte cursor -- and records its source
|
|
8
|
+
``offset`` (start position in the stream) so callers can map a node back to the
|
|
9
|
+
input.
|
|
10
|
+
|
|
11
|
+
:class:`Stream` is the root; :attr:`Stream.handles` maps wire handles to the
|
|
12
|
+
node that owns them, so :class:`Reference` targets can be resolved.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class Utf:
    """Handle-less modified-UTF-8 string, used for class, field and interface names."""

    # Decoded text.
    value: str
    # Raw bytes kept alongside the decoded text for the renderers.
    raw: bytes
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class Node:
    """Common base of all AST nodes."""

    # Byte position in the input stream at which this node starts.
    offset: int
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# --- values (may appear as stream content, field values or array elements) ---
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class Null(Node):
    """A null reference (``TC_NULL``); carries nothing beyond its offset."""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class Reference(Node):
    """A back-reference (``TC_REFERENCE``) to a handle seen earlier in the stream."""

    # Wire handle being referenced.
    handle: int
    # Node that owns the handle; stays None until/unless it is resolved.
    target: Node | None = None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class StringVal(Node):
    """A string object that owns a handle (``TC_STRING`` / ``TC_LONGSTRING``)."""

    # Wire handle assigned to this string.
    handle: int
    # Decoded text.
    value: str
    # Raw bytes kept alongside the decoded text for the renderers.
    raw: bytes
    # Distinguishes the long form from the short one (presumably True for
    # TC_LONGSTRING -- confirm against the parser).
    long: bool = False
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class Primitive(Node):
    """A primitive value occurring as a field value or array element."""

    # Type code, one of the characters in "BCDFIJSZ".
    tc: str
    # Decoded Python value.
    value: int | float | bool
    # Raw bytes of the primitive as read from the stream.
    raw: bytes
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class BlockData(Node):
    """An opaque block of bytes (``TC_BLOCKDATA`` / ``TC_BLOCKDATALONG``)."""

    # The block's payload.
    data: bytes
    # Distinguishes the long form from the short one (presumably True for
    # TC_BLOCKDATALONG -- confirm against the parser).
    long: bool = False
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
class Reset(Node):
    """A ``TC_RESET`` marker: the stream's handle table goes back to the base handle."""
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
@dataclass
class ExceptionObj(Node):
    """A ``TC_EXCEPTION`` entry: a serialized Throwable describing a serialization abort."""

    # The node holding the serialized Throwable.
    throwable: Node
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass
class ClassObj(Node):
    """A Class object (``TC_CLASS``)."""

    # Wire handle assigned to this Class object.
    handle: int
    # Occupant of the classDesc slot (description, proxy description, null or reference).
    class_desc: ClassDescLike
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
@dataclass
class EnumObj(Node):
    """An enum constant (``TC_ENUM``)."""

    # Wire handle assigned to this enum constant.
    handle: int
    # Occupant of the classDesc slot for the enum's class.
    class_desc: ClassDescLike
    # The constant itself: a string object or a back-reference to one
    # (presumably the constant's name -- confirm against the parser).
    constant: StringVal | Reference
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass
class ArrayObj(Node):
    """An array object (``TC_ARRAY``).

    The element type code, ``component``, is the second character of the array
    class name. A ``byte[]`` stores its contents in ``byte_values``; any other
    array stores one node per element in ``elements``.
    """

    # Wire handle assigned to this array.
    handle: int
    # Occupant of the classDesc slot for the array's class.
    class_desc: ClassDescLike
    # Element type code.
    component: str
    # Array length (presumably the count read from the stream -- confirm).
    size: int
    # Element nodes for non-byte arrays.
    elements: list[Node] = field(default_factory=list)
    # Contents of a byte[] array.
    byte_values: bytes = b""
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass
class ObjectInstance(Node):
    """A serialized object instance (``TC_OBJECT``)."""

    # Wire handle assigned to this object.
    handle: int
    # Occupant of the classDesc slot for the object's class.
    class_desc: ClassDescLike
    # Per-class slices of the object's data.
    data: list[ClassData] = field(default_factory=list)
    # NOTE(review): the meaning of this flag is not evident from this module;
    # check how the parser and renderers use it.
    na: bool = False
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# --- class descriptions (the classDesc slot: one of these or Null/Reference) ---
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@dataclass
class FieldDesc:
    """One field declaration within a class description."""

    # Field type code.
    tc: str
    # Field name.
    name: Utf
    # Optional string (or back-reference to one); presumably the type name
    # carried by object/array-typed fields -- confirm against the parser.
    class_name1: StringVal | Reference | None = None
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@dataclass
class ClassDesc(Node):
    """A concrete class description (``TC_CLASSDESC``)."""

    # Class name.
    name: Utf
    # Raw bytes (presumably the serialVersionUID as read -- confirm).
    svuid: bytes
    # Wire handle assigned to this class description.
    handle: int
    # Flags value (presumably the classDescFlags byte -- confirm).
    flags: int
    # Declared fields.
    fields: list[FieldDesc] = field(default_factory=list)
    # Annotation nodes attached to the class description.
    annotations: list[Node] = field(default_factory=list)
    # Occupant of the super-class classDesc slot, if recorded.
    super_desc: ClassDescLike | None = None
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
@dataclass
class ProxyClassDesc(Node):
    """A dynamic proxy class description (``TC_PROXYCLASSDESC``)."""

    # Wire handle assigned to this proxy class description.
    handle: int
    # Interface names.
    interfaces: list[Utf] = field(default_factory=list)
    # Annotation nodes attached to the class description.
    annotations: list[Node] = field(default_factory=list)
    # Occupant of the super-class classDesc slot, if recorded.
    super_desc: ClassDescLike | None = None
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
# --- object-instance class data ---
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
@dataclass
class FieldValue:
    """The value of a single field inside one class's slice of an object's data."""

    # Field name.
    name: str
    # Type code the field was declared with.
    declared_tc: str
    # Node holding the value.
    value: Node
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@dataclass
class ClassData:
    """What one class contributes to an object's data: field values plus annotations."""

    # Name of the contributing class.
    class_name: str
    # Serializable flag (presumably derived from the class description's
    # flags -- confirm against the parser).
    serializable: bool
    # Field values, one entry per field.
    values: list[FieldValue] = field(default_factory=list)
    # Whether an annotation section is recorded for this class.
    has_annotation: bool = False
    # Annotation nodes.
    annotations: list[Node] = field(default_factory=list)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# --- root ---
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
@dataclass
class Stream(Node):
    """Root of the AST: optional RMI prefix, stream header and top-level contents."""

    # Magic bytes as read from the stream.
    magic: bytes
    # Whether the magic bytes were accepted as valid.
    magic_valid: bool
    # Version bytes as read from the stream, if any.
    version: bytes | None = None
    # Whether the version bytes were accepted as valid.
    version_valid: bool = False
    # RMI prefix information when present (exact encoding of the int is not
    # evident from this module -- confirm against the parser).
    rmi: int | None = None
    # Top-level content nodes.
    contents: list[Node] = field(default_factory=list)
    # Maps each wire handle to the node that owns it, so references can be resolved.
    handles: dict[int, Node] = field(default_factory=dict)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
#: Anything that can occupy a ``classDesc`` slot.
|
|
209
|
+
ClassDescLike = ClassDesc | ProxyClassDesc | Null | Reference
|
deaced/mutf8.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Java "modified UTF-8" -- the string encoding used by serialization.
|
|
2
|
+
|
|
3
|
+
``DataOutput.writeUTF`` / ``DataInput.readUTF`` (and therefore the Object
|
|
4
|
+
Serialization Stream Protocol) encode strings in a variant of UTF-8 that differs
|
|
5
|
+
from the standard in exactly two ways:
|
|
6
|
+
|
|
7
|
+
* ``U+0000`` is written as the two bytes ``C0 80`` (never a bare ``00``);
|
|
8
|
+
* characters outside the Basic Multilingual Plane are written as a UTF-16
|
|
9
|
+
surrogate pair, each surrogate emitted in its 3-byte form (i.e. CESU-8), rather
|
|
10
|
+
than as one 4-byte UTF-8 sequence.
|
|
11
|
+
|
|
12
|
+
For every other (BMP, non-zero) character it is identical to standard UTF-8.
|
|
13
|
+
Decoding is tolerant: malformed sequences yield U+FFFD rather than raising.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
_REPLACEMENT = 0xFFFD
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def decode(data: bytes) -> str:
    """Turn modified-UTF-8 bytes into a Python string.

    One-, two- and three-byte sequences are each decoded to a single UTF-16
    code unit. Any byte that does not start a well-formed sequence produces
    U+FFFD and decoding resumes at the next byte, so this never raises on
    malformed input. Surrogate pairs are joined at the end; unpaired
    surrogates are preserved.
    """
    size = len(data)

    def is_continuation(at: int) -> bool:
        # True when ``at`` is in range and holds a 10xxxxxx byte.
        return at < size and (data[at] & 0xC0) == 0x80

    code_units: list[int] = []
    pos = 0
    while pos < size:
        lead = data[pos]
        if lead < 0x80:
            code_units.append(lead)
            pos += 1
        elif (lead & 0xE0) == 0xC0 and is_continuation(pos + 1):
            code_units.append(((lead & 0x1F) << 6) | (data[pos + 1] & 0x3F))
            pos += 2
        elif (
            (lead & 0xF0) == 0xE0
            and is_continuation(pos + 1)
            and is_continuation(pos + 2)
        ):
            code_units.append(
                ((lead & 0x0F) << 12)
                | ((data[pos + 1] & 0x3F) << 6)
                | (data[pos + 2] & 0x3F)
            )
            pos += 3
        else:
            # Truncated sequence, stray continuation byte or 4-byte lead.
            code_units.append(_REPLACEMENT)
            pos += 1

    # Combine surrogate pairs; keep lone surrogates as-is (Java permits them).
    utf16 = b"".join(cu.to_bytes(2, "big") for cu in code_units)
    return utf16.decode("utf-16-be", errors="surrogatepass")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def encode(text: str) -> bytes:
    """Produce the modified-UTF-8 encoding of ``text``.

    The string is processed one UTF-16 code unit at a time, so a character
    outside the BMP becomes two three-byte sequences (one per surrogate), and
    U+0000 becomes the two bytes ``C0 80``.
    """
    utf16 = text.encode("utf-16-be", errors="surrogatepass")
    encoded = bytearray()
    for high, low in zip(utf16[0::2], utf16[1::2]):
        cu = (high << 8) | low
        if 0 < cu < 0x80:
            encoded.append(cu)
        elif cu < 0x800:
            # Also covers cu == 0, for which this yields exactly C0 80.
            encoded += bytes((0xC0 | (cu >> 6), 0x80 | (cu & 0x3F)))
        else:
            encoded += bytes(
                (
                    0xE0 | (cu >> 12),
                    0x80 | ((cu >> 6) & 0x3F),
                    0x80 | (cu & 0x3F),
                )
            )
    return bytes(encoded)
|