pytecode 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,145 @@
1
+ """Encode and decode JVM Modified UTF-8 strings (§4.4.7).
2
+
3
+ Modified UTF-8 is the encoding used by the JVM for ``CONSTANT_Utf8_info``
4
+ entries in class files. It differs from standard UTF-8 in two ways:
5
+
6
+ * The null character (U+0000) is encoded as the two-byte sequence
7
+ ``0xC0 0x80`` rather than a single ``0x00`` byte.
8
+ * Supplementary characters (U+10000–U+10FFFF) are represented as
9
+ surrogate pairs, each encoded independently as three bytes.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from typing import Never
15
+
16
# Largest code unit emitted/accepted as a single byte.
_ONE_BYTE_MAX = 0x7F
# Largest code unit emitted as a two-byte sequence.
_TWO_BYTE_MAX = 0x7FF
# Largest value encoded directly as one code unit; code points above this
# are split into a surrogate pair by the encoder.
_THREE_BYTE_MAX = 0xFFFF
# Upper bound the encoder checks code points against before splitting them.
_MAX_UNICODE = 0x10FFFF
20
+
21
+
22
+ def _decode_error(data: bytes, start: int, end: int, reason: str) -> Never:
23
+ raise UnicodeDecodeError("modified-utf-8", data, start, end, reason)
24
+
25
+
26
def _encode_code_unit(code_unit: int, out: bytearray) -> None:
    """Append the Modified UTF-8 encoding of one code unit to *out*.

    Args:
        code_unit: The value to encode. The callers in this module pass
            values no larger than 0xFFFF (BMP code points or surrogates).
        out: Buffer that receives the one-, two-, or three-byte sequence.
    """
    if code_unit == 0:
        # NUL is never written as a raw 0x00 byte; it always takes the
        # two-byte form.
        encoded = (0xC0, 0x80)
    elif code_unit <= _ONE_BYTE_MAX:
        encoded = (code_unit,)
    elif code_unit <= _TWO_BYTE_MAX:
        encoded = (
            0xC0 | (code_unit >> 6),
            0x80 | (code_unit & 0x3F),
        )
    else:
        encoded = (
            0xE0 | (code_unit >> 12),
            0x80 | ((code_unit >> 6) & 0x3F),
            0x80 | (code_unit & 0x3F),
        )

    out.extend(encoded)
46
+
47
+
48
def encode_modified_utf8(value: str) -> bytes:
    """Encode a Python string as JVM Modified UTF-8 (§4.4.7).

    Code points up to U+FFFF are encoded directly as a single code unit
    (with U+0000 becoming ``0xC0 0x80``). Code points above U+FFFF are
    split into a high/low surrogate pair, and each surrogate is encoded
    as its own three-byte sequence.

    Args:
        value: The Python string to encode.

    Returns:
        The Modified UTF-8 encoded byte string.

    Raises:
        ValueError: If any code point exceeds U+10FFFF.
    """
    encoded = bytearray()

    for code_point in map(ord, value):
        if code_point > _THREE_BYTE_MAX:
            if code_point > _MAX_UNICODE:
                raise ValueError(f"code point out of range: U+{code_point:06X}")
            # Supplementary character: derive the UTF-16 surrogate pair.
            offset = code_point - 0x10000
            units = (0xD800 | (offset >> 10), 0xDC00 | (offset & 0x3FF))
        else:
            units = (code_point,)

        for unit in units:
            _encode_code_unit(unit, encoded)

    return bytes(encoded)
82
+
83
+
84
def decode_modified_utf8(data: bytes) -> str:
    """Decode JVM Modified UTF-8 bytes into a Python string (§4.4.7).

    Interprets one-, two-, and three-byte Modified UTF-8 sequences as
    UTF-16 code units and then decodes those code units, so an encoded
    surrogate pair is reassembled into a single supplementary character.
    Unpaired surrogates are passed through unchanged.

    Args:
        data: The Modified UTF-8 encoded bytes to decode.

    Returns:
        The decoded Python string.

    Raises:
        UnicodeDecodeError: If *data* contains a bare ``0x00`` byte, a
            truncated or overlong sequence, an invalid continuation byte,
            a four-byte UTF-8 lead byte (``0xF0``-``0xF7``), or any other
            byte that cannot start a sequence (``0x80``-``0xBF``,
            ``0xF8``-``0xFF``).
    """

    size = len(data)
    utf16_bytes = bytearray()
    index = 0

    while index < size:
        first = data[index]

        if first == 0:
            _decode_error(data, index, index + 1, "NUL must use the modified UTF-8 two-byte form")

        if first <= _ONE_BYTE_MAX:
            code_unit = first
            index += 1
        elif 0xC0 <= first <= 0xDF:
            if index + 1 >= size:
                _decode_error(data, index, size, "truncated two-byte sequence")
            second = data[index + 1]
            if second & 0xC0 != 0x80:
                _decode_error(data, index + 1, index + 2, "invalid continuation byte")
            code_unit = ((first & 0x1F) << 6) | (second & 0x3F)
            # The only value below 0x80 allowed in two-byte form is NUL
            # (0xC0 0x80); everything else must have used one byte.
            if code_unit < 0x80 and code_unit != 0:
                _decode_error(data, index, index + 2, "overlong two-byte sequence")
            index += 2
        elif 0xE0 <= first <= 0xEF:
            if index + 2 >= size:
                _decode_error(data, index, size, "truncated three-byte sequence")
            second = data[index + 1]
            third = data[index + 2]
            if second & 0xC0 != 0x80:
                _decode_error(data, index + 1, index + 2, "invalid continuation byte")
            if third & 0xC0 != 0x80:
                _decode_error(data, index + 2, index + 3, "invalid continuation byte")
            code_unit = ((first & 0x0F) << 12) | ((second & 0x3F) << 6) | (third & 0x3F)
            if code_unit < 0x800:
                _decode_error(data, index, index + 3, "overlong three-byte sequence")
            index += 3
        elif 0xF0 <= first <= 0xF7:
            _decode_error(data, index, index + 1, "modified UTF-8 does not permit four-byte sequences")
        else:
            # 0x80-0xBF is a continuation byte with no lead byte before it,
            # and 0xF8-0xFF is not a lead byte of any length; neither is a
            # four-byte sequence, so report them as what they are.
            _decode_error(data, index, index + 1, "invalid start byte")

        # Store the code unit big-endian so the buffer is valid UTF-16-BE.
        utf16_bytes.extend((code_unit >> 8, code_unit & 0xFF))

    # "surrogatepass" keeps unpaired surrogates instead of raising on them.
    return utf16_bytes.decode("utf-16-be", errors="surrogatepass")
143
+
144
+
145
# Names exported by ``from <module> import *``; underscore-prefixed names are omitted.
__all__ = ["decode_modified_utf8", "encode_modified_utf8"]