storetle 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- storetle/__init__.py +111 -0
- storetle/brotli_compat.py +96 -0
- storetle/cli.py +302 -0
- storetle/cube_dict_v10.bin +0 -0
- storetle/decoder.py +211 -0
- storetle/encoder.py +717 -0
- storetle/folder.py +249 -0
- storetle/stream.py +464 -0
- storetle/vocab.py +635 -0
- storetle/warc.py +478 -0
- storetle/zstd_compat.py +202 -0
- storetle-0.2.0.dist-info/METADATA +161 -0
- storetle-0.2.0.dist-info/RECORD +17 -0
- storetle-0.2.0.dist-info/WHEEL +5 -0
- storetle-0.2.0.dist-info/entry_points.txt +2 -0
- storetle-0.2.0.dist-info/licenses/LICENSE +21 -0
- storetle-0.2.0.dist-info/top_level.txt +1 -0
storetle/decoder.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# decoder.py — v4
|
|
2
|
+
# Two-stream decoder: structure stream + content stream, both zlib-compressed.
|
|
3
|
+
# v4: text node payloads (T_TEXT, T_DOCTYPE, T_COMMENT, T_RAWTEXT) are written
|
|
4
|
+
# inline in the content stream (0xFD marker) rather than via string table IDs.
|
|
5
|
+
|
|
6
|
+
import struct
|
|
7
|
+
import zlib
|
|
8
|
+
from .vocab import ID_TO_TAG, ID_TO_ATTR, VOID_ELEMENTS, UNKNOWN_ID, SHARED_STRINGS, SHARED_COUNT
|
|
9
|
+
from .encoder import (
|
|
10
|
+
T_OPEN, T_CLOSE, T_TEXT, T_DOCTYPE, T_COMMENT, T_SELFCLOSE, T_RAWTEXT,
|
|
11
|
+
MAGIC, VERSION,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class _Stream:
    """Sequential reader over a decompressed bytes block.

    Holds the data and a cursor; every read returns the value at the cursor
    and advances it past the bytes consumed.
    """

    def __init__(self, data: bytes):
        self._data = data
        self._pos = 0

    def read_byte(self) -> int:
        """Return the byte at the cursor as an int and advance by one.

        Raises IndexError when the cursor is already at the end of the data.
        """
        b = self._data[self._pos]
        self._pos += 1
        return b

    def _read_u16(self) -> int:
        """Read two bytes and combine them as a big-endian unsigned 16-bit value."""
        hi = self.read_byte()
        lo = self.read_byte()
        return (hi << 8) | lo

    def read_sid(self):
        """
        Varint string ID:
          0x00-0xFB   → that value (IDs 0-251, 1 byte)
          0xFC        → class token list — raises ValueError
                        (use read_string() for positions that may contain class lists)
          0xFD        → inline string — raises ValueError
                        (use read_string() for positions that may contain inline strings)
          0xFE        → None (boolean attribute, no value)
          0xFF HH LL  → big-endian uint16 ID (IDs 252+, 3 bytes total)
        """
        b = self.read_byte()
        if b == 0xFE:
            return None
        if b == 0xFF:
            return self._read_u16()
        if b == 0xFD:
            raise ValueError('Unexpected inline string marker in read_sid; use read_string()')
        if b == 0xFC:
            raise ValueError('Unexpected class-list marker in read_sid; use read_string()')
        return b

    def read_string(self, all_strings):
        """
        Read a string value from the stream, resolving IDs through all_strings.
        Handles:
          0x00-0xFB   → 1-byte string ID (IDs 0-251)
          0xFC        → class token list: count (1B) + N token reads → joined with spaces
                        (tokens that read as None are skipped)
          0xFD        → inline string: 1-byte length (0-254) + UTF-8 bytes, or
                        0xFF + 4-byte big-endian length + UTF-8 bytes
          0xFE        → None (boolean / no-value attribute)
          0xFF HH LL  → 3-byte string ID (IDs 252+)

        Raises ValueError if an inline string declares more bytes than remain
        in the stream; IndexError if an ID is outside all_strings.
        """
        b = self.read_byte()
        if b == 0xFE:
            return None
        if b == 0xFF:
            return all_strings[self._read_u16()]
        if b == 0xFD:
            length = self.read_byte()
            if length == 0xFF:  # escape: the real length follows as a 4-byte big-endian int
                length = struct.unpack_from('>I', self._data, self._pos)[0]
                self._pos += 4
            end = self._pos + length
            # A plain slice would silently return a shortened string here.
            if end > len(self._data):
                raise ValueError(
                    f'Truncated inline string: need {length} bytes, '
                    f'only {len(self._data) - self._pos} available'
                )
            s = self._data[self._pos:end].decode('utf-8')
            self._pos = end
            return s
        if b == 0xFC:
            count = self.read_byte()
            tokens = []
            for _ in range(count):
                t = self.read_string(all_strings)
                if t is not None:
                    tokens.append(t)
            return ' '.join(tokens)
        # 0x00-0xFB: 1-byte ID
        return all_strings[b]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _escape_attr_value(value: str) -> str:
    """Escape ``&``, ``"`` and ``<`` for use inside a double-quoted attribute."""
    # '&' goes first so the entities produced below are not escaped a second time.
    return value.replace('&', '&amp;').replace('"', '&quot;').replace('<', '&lt;')


def _escape_text(text: str) -> str:
    """Escape ``&``, ``<`` and ``>`` in the payload of a text node."""
    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def decode(cube_bytes: bytes) -> str:
    """Rebuild HTML text from the raw bytes of a .cube file.

    Layout read by this function:
      * 4-byte magic, 1-byte version
      * four big-endian uint32 values: node count, compressed string-table
        size, compressed structure-stream size, compressed content-stream size
      * the zlib-compressed string table, structure stream and content
        stream, in that order

    The string table (a uint32 count followed by length-prefixed UTF-8
    strings) is appended to SHARED_STRINGS to form the ID lookup list.
    Node types and tag/attribute IDs come from the structure stream; names
    not in the vocabulary, attribute values and text payloads come from the
    content stream.

    Returns the emitted pieces joined with newlines. Open/close tags are
    indented one space per nesting level; text, raw text, doctype and
    comment nodes are emitted without indentation.

    Raises ValueError for a wrong magic, a version other than VERSION, or an
    unknown node type. Malformed input may also surface as zlib.error,
    struct.error or IndexError from the underlying reads.
    """
    pos = 0

    # --- Header ---
    magic = cube_bytes[pos:pos + 4]
    pos += 4
    if magic != MAGIC:
        raise ValueError(f'Not a valid .cube file (magic: {magic!r})')

    version = cube_bytes[pos]
    pos += 1
    if version != VERSION:
        raise ValueError(f'Decoder handles v{VERSION}, file is v{version}. Re-encode the file.')
    # Note: v5 adds class-list encoding (0xFC). Files encoded with v4 cannot
    # be decoded by this decoder without re-encoding.

    node_count, st_size, struct_size, content_size = struct.unpack_from('>IIII', cube_bytes, pos)
    pos += 16

    # --- Decompress and read string table ---
    raw_table = zlib.decompress(cube_bytes[pos:pos + st_size])
    pos += st_size

    tpos = 0
    string_count = struct.unpack_from('>I', raw_table, tpos)[0]
    tpos += 4
    file_strings = []
    for _ in range(string_count):
        length = struct.unpack_from('>I', raw_table, tpos)[0]
        tpos += 4
        file_strings.append(raw_table[tpos:tpos + length].decode('utf-8'))
        tpos += length

    # File-local strings are numbered after the shared ones.
    all_strings = SHARED_STRINGS + file_strings

    # --- Decompress the two body streams ---
    struct_stream = _Stream(zlib.decompress(cube_bytes[pos:pos + struct_size]))
    pos += struct_size
    content_stream = _Stream(zlib.decompress(cube_bytes[pos:pos + content_size]))

    # --- Reconstruct HTML ---
    output = []
    indent = 0
    tag_stack = []  # tracks open tags so T_CLOSE needs no tag_id in stream

    for _ in range(node_count):
        node_type = struct_stream.read_byte()

        if node_type in (T_OPEN, T_SELFCLOSE):
            tag_id = struct_stream.read_byte()
            if tag_id == UNKNOWN_ID:
                # Tag name is not in the vocabulary; it is stored in the content stream.
                tag = content_stream.read_string(all_strings)
            else:
                tag = ID_TO_TAG.get(tag_id, f'unknown_{tag_id}')

            attr_count = struct_stream.read_byte()
            attr_parts = []
            for _attr in range(attr_count):
                attr_id = struct_stream.read_byte()
                if attr_id == UNKNOWN_ID:
                    attr_name = content_stream.read_string(all_strings)
                else:
                    attr_name = ID_TO_ATTR.get(attr_id, f'unknown_attr_{attr_id}')

                attr_value = content_stream.read_string(all_strings)
                if attr_value is None:
                    # Boolean attribute: name only, no ="..." part.
                    attr_parts.append(f' {attr_name}')
                else:
                    attr_parts.append(f' {attr_name}="{_escape_attr_value(attr_value)}"')
            attrs_str = ''.join(attr_parts)

            # Both node types emit the same start tag; only a non-self-closing
            # tag is pushed so a later T_CLOSE can recover its name.
            output.append(f'{" " * indent}<{tag}{attrs_str}>')
            if node_type != T_SELFCLOSE:
                tag_stack.append(tag)
                indent += 1

        elif node_type == T_CLOSE:
            # No tag_id in stream — pop the open-tag stack
            tag = tag_stack.pop() if tag_stack else ''
            indent = max(0, indent - 1)
            output.append(f'{" " * indent}</{tag}>')

        elif node_type == T_TEXT:
            raw = content_stream.read_string(all_strings) or ''
            output.append(_escape_text(raw))

        elif node_type == T_RAWTEXT:
            # Raw text is emitted verbatim, without escaping.
            output.append(content_stream.read_string(all_strings) or '')

        elif node_type == T_DOCTYPE:
            output.append(f'<!{content_stream.read_string(all_strings) or ""}>')

        elif node_type == T_COMMENT:
            output.append(f'<!--{content_stream.read_string(all_strings) or ""}-->')

        else:
            raise ValueError(f'Unknown node type {node_type:#x}')

    return '\n'.join(output)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def decode_file(input_path: str, output_path: str) -> dict:
    """Decode the .cube file at input_path and write the HTML to output_path.

    The input is read fully as bytes, passed to decode(), and the resulting
    text is written as UTF-8.

    Returns a dict with:
      'cube_size'  – number of bytes read from input_path
      'html_size'  – length of the decoded HTML when encoded as UTF-8
      'node_count' – big-endian uint32 stored at byte offset 5 of the input
    """
    with open(input_path, 'rb') as src:
        payload = src.read()

    markup = decode(payload)

    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.write(markup)

    # Offset 5 is where decode() reads the node count: after the 4-byte
    # magic and the 1-byte version.
    (nodes,) = struct.unpack_from('>I', payload, 5)

    stats = {}
    stats['cube_size'] = len(payload)
    stats['html_size'] = len(markup.encode('utf-8'))
    stats['node_count'] = nodes
    return stats
|