storetle 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
storetle/decoder.py ADDED
@@ -0,0 +1,211 @@
1
+ # decoder.py — v4
2
+ # Two-stream decoder: structure stream + content stream, both zlib-compressed.
3
+ # v4: text node payloads (T_TEXT, T_DOCTYPE, T_COMMENT, T_RAWTEXT) are written
4
+ # inline in the content stream (0xFD marker) rather than via string table IDs.
5
+
6
+ import struct
7
+ import zlib
8
+ from .vocab import ID_TO_TAG, ID_TO_ATTR, VOID_ELEMENTS, UNKNOWN_ID, SHARED_STRINGS, SHARED_COUNT
9
+ from .encoder import (
10
+ T_OPEN, T_CLOSE, T_TEXT, T_DOCTYPE, T_COMMENT, T_SELFCLOSE, T_RAWTEXT,
11
+ MAGIC, VERSION,
12
+ )
13
+
14
+
15
class _Stream:
    """Sequential reader over one decompressed block of bytes.

    Keeps a cursor (``_pos``) into ``_data``; every read method consumes
    bytes starting at the cursor and advances it.
    """

    def __init__(self, data: bytes):
        self._data = data
        self._pos = 0

    def read_byte(self) -> int:
        """Return the byte at the cursor as an int and advance by one.

        Raises:
            IndexError: if the cursor is already past the end of the data.
        """
        b = self._data[self._pos]
        self._pos += 1
        return b

    def read_sid(self):
        """Read a varint string ID (not the string itself).

        Encoding, keyed on the first byte:
            0x00-0xFB  -> that value (IDs 0-251, 1 byte)
            0xFC       -> class token list marker; rejected here
            0xFD       -> inline string marker; rejected here
            0xFE       -> None (boolean attribute, no value)
            0xFF HH LL -> big-endian uint16 ID (3 bytes total)

        Returns:
            The integer ID, or None for the 0xFE marker.

        Raises:
            ValueError: if the first byte is 0xFC or 0xFD; positions that may
                hold those markers must be read with read_string().
        """
        b = self.read_byte()
        if b == 0xFE:
            return None
        if b == 0xFF:
            hi = self.read_byte()
            lo = self.read_byte()
            return (hi << 8) | lo
        if b == 0xFD:
            raise ValueError('Unexpected inline string marker in read_sid; use read_string()')
        if b == 0xFC:
            raise ValueError('Unexpected class-list marker in read_sid; use read_string()')
        return b

    def read_string(self, all_strings):
        """Read one string value, resolving IDs through ``all_strings``.

        Encoding, keyed on the first byte:
            0x00-0xFB  -> 1-byte string ID, looked up in all_strings
            0xFC       -> class token list: count (1 byte), then that many
                          nested read_string() values, joined with spaces
                          (None tokens are skipped)
            0xFD       -> inline string: 1-byte length (0-254), or 0xFF
                          followed by a big-endian uint32 length; then that
                          many UTF-8 bytes
            0xFE       -> None (boolean / no-value attribute)
            0xFF HH LL -> big-endian uint16 string ID, looked up in all_strings

        Args:
            all_strings: indexable sequence of strings addressed by ID.

        Returns:
            The decoded string, or None for the 0xFE marker.

        Raises:
            ValueError: if an inline string's declared length runs past the
                end of the data (UnicodeDecodeError, a ValueError subclass,
                is raised for invalid UTF-8).
            IndexError: if the data ends mid-value or an ID is out of range.
        """
        b = self.read_byte()
        if b == 0xFE:
            return None
        if b == 0xFF:
            hi = self.read_byte()
            lo = self.read_byte()
            return all_strings[(hi << 8) | lo]
        if b == 0xFD:
            length = self.read_byte()
            if length == 0xFF:  # escape: real length follows as uint32
                length = struct.unpack_from('>I', self._data, self._pos)[0]
                self._pos += 4
            end = self._pos + length
            # A plain slice would silently return a shortened string on
            # truncated input; fail loudly instead of emitting corrupt text.
            if end > len(self._data):
                available = len(self._data) - self._pos
                raise ValueError(
                    f'Truncated inline string: need {length} bytes, '
                    f'only {available} available'
                )
            s = self._data[self._pos:end].decode('utf-8')
            self._pos = end
            return s
        if b == 0xFC:
            count = self.read_byte()
            tokens = []
            for _ in range(count):
                t = self.read_string(all_strings)
                if t is not None:
                    tokens.append(t)
            return ' '.join(tokens)
        # 0x00-0xFB: 1-byte ID
        return all_strings[b]
86
+
87
+
88
def decode(cube_bytes: bytes) -> str:
    """Rebuild HTML text from the bytes of an encoded .cube container.

    Layout as read here: 4-byte magic, 1-byte version, four big-endian
    uint32 fields (node count, string-table size, structure-stream size,
    content-stream size), followed by three zlib-compressed sections in
    that same order. Node types and tag/attribute IDs come from the
    structure stream; string payloads come from the content stream.

    Args:
        cube_bytes: the complete encoded file contents.

    Returns:
        The reconstructed markup, one output entry per line.

    Raises:
        ValueError: if the magic does not equal MAGIC, the version byte does
            not equal VERSION, or an unrecognised node type is encountered.
    """

    def u32(buf, at):
        # Big-endian unsigned 32-bit integer located at offset `at`.
        return struct.unpack_from('>I', buf, at)[0]

    # --- Header ---
    magic = cube_bytes[0:4]
    if magic != MAGIC:
        raise ValueError(f'Not a valid .cube file (magic: {magic!r})')

    version = cube_bytes[4]
    if version != VERSION:
        # Only files whose version byte matches VERSION are accepted.
        raise ValueError(f'Decoder handles v{VERSION}, file is v{version}. Re-encode the file.')

    node_count = u32(cube_bytes, 5)
    st_size = u32(cube_bytes, 9)
    struct_size = u32(cube_bytes, 13)
    content_size = u32(cube_bytes, 17)
    cursor = 21  # first byte after the fixed-size header

    # --- String table: uint32 count, then (uint32 length, UTF-8 bytes) pairs ---
    table_end = cursor + st_size
    raw_table = zlib.decompress(cube_bytes[cursor:table_end])
    cursor = table_end

    string_count = u32(raw_table, 0)
    offset = 4
    file_strings = []
    for _ in range(string_count):
        size = u32(raw_table, offset)
        offset += 4
        file_strings.append(raw_table[offset:offset + size].decode('utf-8'))
        offset += size

    # Shared strings occupy the low IDs; per-file strings follow them.
    all_strings = SHARED_STRINGS + file_strings

    # --- The two body streams ---
    struct_end = cursor + struct_size
    struct_stream = _Stream(zlib.decompress(cube_bytes[cursor:struct_end]))
    content_stream = _Stream(
        zlib.decompress(cube_bytes[struct_end:struct_end + content_size])
    )

    # --- Reconstruct HTML ---
    output = []
    indent = 0
    tag_stack = []  # open tags, so a close node carries no tag ID of its own

    for _ in range(node_count):
        node_type = struct_stream.read_byte()

        if node_type in (T_OPEN, T_SELFCLOSE):
            tag_id = struct_stream.read_byte()
            # UNKNOWN_ID means the tag name is spelled out in the content stream.
            tag = (
                content_stream.read_string(all_strings)
                if tag_id == UNKNOWN_ID
                else ID_TO_TAG.get(tag_id, f'unknown_{tag_id}')
            )

            attr_parts = []
            for _attr in range(struct_stream.read_byte()):
                attr_id = struct_stream.read_byte()
                if attr_id == UNKNOWN_ID:
                    attr_name = content_stream.read_string(all_strings)
                else:
                    attr_name = ID_TO_ATTR.get(attr_id, f'unknown_attr_{attr_id}')

                attr_value = content_stream.read_string(all_strings)
                if attr_value is None:
                    # Valueless (boolean) attribute: emit the bare name.
                    attr_parts.append(f' {attr_name}')
                    continue

                escaped = attr_value.replace('&', '&amp;')
                escaped = escaped.replace('"', '&quot;')
                escaped = escaped.replace('<', '&lt;')
                attr_parts.append(f' {attr_name}="{escaped}"')

            attrs_str = ''.join(attr_parts)
            pad = ' ' * indent
            # Open and self-closing nodes are written identically; only an
            # open node is remembered and deepens the indentation.
            output.append(f'{pad}<{tag}{attrs_str}>')
            if node_type != T_SELFCLOSE:
                tag_stack.append(tag)
                indent += 1

        elif node_type == T_CLOSE:
            # The tag name comes from the stack; an empty stack yields ''.
            closing = tag_stack.pop() if tag_stack else ''
            if indent > 0:
                indent -= 1
            pad = ' ' * indent
            output.append(f'{pad}</{closing}>')

        elif node_type == T_TEXT:
            text = content_stream.read_string(all_strings) or ''
            text = text.replace('&', '&amp;')
            text = text.replace('<', '&lt;')
            text = text.replace('>', '&gt;')
            output.append(text)

        elif node_type == T_RAWTEXT:
            # Raw text is emitted without escaping.
            output.append(content_stream.read_string(all_strings) or '')

        elif node_type == T_DOCTYPE:
            doctype = content_stream.read_string(all_strings) or ''
            output.append(f'<!{doctype}>')

        elif node_type == T_COMMENT:
            comment = content_stream.read_string(all_strings) or ''
            output.append(f'<!--{comment}-->')

        else:
            raise ValueError(f'Unknown node type {node_type:#x}')

    return '\n'.join(output)
194
+
195
+
196
def decode_file(input_path: str, output_path: str) -> dict:
    """Decode an encoded file from disk and write the resulting HTML.

    Args:
        input_path: path of the encoded input file, read as binary.
        output_path: path the decoded text is written to, as UTF-8.

    Returns:
        A dict with:
            'cube_size'  - number of bytes in the input file,
            'html_size'  - number of UTF-8 bytes in the decoded text,
            'node_count' - the node count stored in the file header.

    Raises:
        ValueError: propagated from decode() for an invalid input file.
        OSError: if either file cannot be opened, read, or written.
    """
    with open(input_path, 'rb') as f:
        cube_bytes = f.read()

    html_text = decode(cube_bytes)

    # newline='' turns off platform newline translation so the bytes on disk
    # are exactly html_text encoded as UTF-8. Without it, Windows rewrites
    # every '\n' as '\r\n', which alters the decoded content and makes the
    # reported 'html_size' disagree with the real file size.
    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        f.write(html_text)

    # The node count is the big-endian uint32 that follows the 4-byte magic
    # and the 1-byte version, i.e. at byte offset 5.
    node_count = struct.unpack_from('>I', cube_bytes, 5)[0]

    return {
        'cube_size': len(cube_bytes),
        'html_size': len(html_text.encode('utf-8')),
        'node_count': node_count,
    }