storetle 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
storetle/__init__.py ADDED
@@ -0,0 +1,111 @@
1
+ # storetle — HTML-aware streaming compression for large document collections
2
+ #
3
+ # Primary API:
4
+ # StreamWriter — append HTML documents to a .storetle file
5
+ # StreamReader — read/iterate/random-access a .storetle file
6
+ # pack — compress a folder of HTML files → .storetle
7
+ # unpack — decompress .storetle → folder of HTML files
8
+ # benchmark — compare storetle vs gzip on your own data
9
+
10
+ from .stream import StreamWriter, StreamReader
11
+ from .folder import pack, unpack
12
+
13
+ __version__ = '0.2.0'
14
+ __all__ = ['StreamWriter', 'StreamReader', 'pack', 'unpack', 'benchmark']
15
+
16
+
17
def benchmark(folder, quiet=False):
    """Benchmark storetle vs gzip WARC on a folder of HTML files.

    Every ``*.html`` file below ``folder`` (searched recursively) is read
    into memory and compressed three ways: a single gzip stream over a
    minimal WARC-style concatenation, gzip applied per file, and a storetle
    stream written to a temporary file that is deleted afterwards.

    Returns a dict with size comparisons. Prints a table unless quiet=True.

    Returned keys: 'files', 'original_bytes', 'gzip_warc', 'gzip_per_file',
    'storetle', 'savings_vs_gzip_warc_pct', 'write_kbps', 'read_kbps' and
    'roundtrip_ok'. 'roundtrip_ok' is True only when the reader yields the
    same number of documents AND each one equals the bytes that were written.

    Raises:
        ValueError: if no .html files are found in ``folder``.

    Example:
        import storetle
        results = storetle.benchmark('my_crawl_data/')
    """
    import gzip
    import os
    import tempfile
    import time
    from pathlib import Path

    files = sorted(Path(folder).glob('**/*.html'))
    if not files:
        raise ValueError(f'No .html files found in {folder}')

    docs = [f.read_bytes() for f in files]
    total_html = sum(len(d) for d in docs)

    # gzip WARC (industry standard)
    warc_raw = b''.join(
        'WARC/1.0\r\nContent-Length: {}\r\n\r\n'.format(len(d)).encode()
        + d + b'\r\n\r\n'
        for d in docs
    )
    warc_gz = len(gzip.compress(warc_raw, compresslevel=9))

    # gzip per-file
    gz_pf = sum(len(gzip.compress(d, compresslevel=9)) for d in docs)

    # storetle: only the path is needed, the writer opens the file itself
    with tempfile.NamedTemporaryFile(suffix='.storetle', delete=False) as tf:
        tmp = tf.name
    try:
        # perf_counter is monotonic, so clock adjustments cannot skew timings
        t0 = time.perf_counter()
        with StreamWriter(tmp) as w:
            for d in docs:
                w.append(d)
        write_time = time.perf_counter() - t0

        t1 = time.perf_counter()
        with StreamReader(tmp) as r:
            recovered = list(r)
        read_time = time.perf_counter() - t1

        cube_size = os.path.getsize(tmp)
    finally:
        os.unlink(tmp)

    # Verify contents, not just the count: the report claims the documents
    # were verified, so a matching count alone is not enough.
    rt_ok = len(recovered) == len(docs) and all(
        got == want for got, want in zip(recovered, docs)
    )

    result = {
        'files': len(files),
        'original_bytes': total_html,
        'gzip_warc': warc_gz,
        'gzip_per_file': gz_pf,
        'storetle': cube_size,
        'savings_vs_gzip_warc_pct': round((warc_gz - cube_size) / warc_gz * 100, 1),
        # Clamp elapsed time so a very fast run cannot divide by zero
        'write_kbps': int(total_html / 1024 / max(write_time, 0.001)),
        'read_kbps': int(total_html / 1024 / max(read_time, 0.001)),
        'roundtrip_ok': rt_ok,
    }

    if not quiet:
        _print_benchmark(result)

    return result
84
+
85
+
86
def _print_benchmark(r):
    """Print a human-readable comparison table for a benchmark result dict.

    Keys read from ``r``: 'files', 'original_bytes', 'gzip_per_file',
    'gzip_warc', 'storetle', 'savings_vs_gzip_warc_pct', 'write_kbps',
    'read_kbps' and 'roundtrip_ok'. Returns None; output goes to stdout.
    """
    def fmt(n):
        # Byte count as B / KB / MB depending on magnitude
        if n < 1024:
            return f'{n}B'
        if n < 1048576:
            return f'{n/1024:.1f}KB'
        return f'{n/1048576:.2f}MB'

    def pct(a, b):
        # Savings of a relative to b. A corpus made only of empty files has
        # b == 0, where the ratio is undefined; report that, do not crash.
        if not b:
            return 'n/a'
        return f'{100*(1-a/b):.1f}%'

    orig = r['original_bytes']
    print(f'\n storetle benchmark — {r["files"]} files, {fmt(orig)} original\n')
    print(f' {"Format":<28} {"Size":>10} {"Savings":>8}')
    print(f' {"─"*50}')
    print(f' {"Original HTML":<28} {fmt(orig):>10}')
    print(f' {"gzip per-file (current)":<28} {fmt(r["gzip_per_file"]):>10} {pct(r["gzip_per_file"], orig):>8}')
    print(f' {"gzip WARC (Common Crawl std)":<28} {fmt(r["gzip_warc"]):>10} {pct(r["gzip_warc"], orig):>8}')
    print(f' {"storetle":<28} {fmt(r["storetle"]):>10} {pct(r["storetle"], orig):>8}')
    print()

    # Positive means storetle came out smaller than the gzip WARC baseline
    saved = r["gzip_warc"] - r["storetle"]
    sign = '+' if saved > 0 else ''
    print(f' vs gzip WARC: {sign}{r["savings_vs_gzip_warc_pct"]}% smaller ({fmt(abs(saved))} {"saved" if saved > 0 else "larger"})')
    print(f' Write speed: {r["write_kbps"]:,} KB/s')
    print(f' Read speed: {r["read_kbps"]:,} KB/s')
    print(f' Round-trip: {"✓ all documents verified" if r["roundtrip_ok"] else "✗ FAILED"}')
    print()
@@ -0,0 +1,96 @@
1
+ # brotli_compat.py — thin ctypes wrapper around the system brotli library
2
+ #
3
+ # Exposes compress(data, quality=11) and decompress(data) with the same
4
+ # calling convention as zlib.compress / zlib.decompress so callers can
5
+ # swap one for the other with a single import change.
6
+ #
7
+ # Falls back to zlib if the brotli dylib is not present (other machines).
8
+
9
+ import ctypes, zlib, os
10
+
11
# Module state filled in by _try_load(); stays at these values when no
# usable brotli encoder/decoder pair can be loaded.
_BROTLI_OK = False
_enc = None
_dec = None


def _try_load():
    """Locate and load the brotli encoder/decoder shared libraries.

    Candidate (encoder, decoder) pairs are tried in order. The first pair
    that loads and exposes the required symbols is stored in the module
    globals ``_enc`` / ``_dec`` and ``_BROTLI_OK`` is set to True. If none
    works, the globals are left untouched, so callers fall back to zlib.

    Absolute candidates are skipped cheaply when a file is missing. Bare
    library names are handed straight to the dynamic loader:
    os.path.exists() on a bare name would only test the current working
    directory, so an installed system library would never be found.
    """
    global _BROTLI_OK, _enc, _dec
    from ctypes.util import find_library

    candidates = [
        ('/usr/local/lib/libbrotlienc.dylib',
         '/usr/local/lib/libbrotlidec.dylib'),
        ('/usr/local/Cellar/brotli/1.2.0/lib/libbrotlienc.dylib',
         '/usr/local/Cellar/brotli/1.2.0/lib/libbrotlidec.dylib'),
        ('libbrotlienc.so', 'libbrotlidec.so'),
        ('libbrotlienc.dylib', 'libbrotlidec.dylib'),
    ]
    # Also ask the platform for versioned names such as libbrotlienc.so.1
    found_enc = find_library('brotlienc')
    found_dec = find_library('brotlidec')
    if found_enc and found_dec:
        candidates.append((found_enc, found_dec))

    for ep, dp in candidates:
        if os.path.isabs(ep) and not (os.path.exists(ep) and os.path.exists(dp)):
            continue
        try:
            e = ctypes.CDLL(ep)
            d = ctypes.CDLL(dp)
            # wire up encoder
            e.BrotliEncoderMaxCompressedSize.restype = ctypes.c_size_t
            e.BrotliEncoderMaxCompressedSize.argtypes = [ctypes.c_size_t]
            e.BrotliEncoderCompress.restype = ctypes.c_int
            e.BrotliEncoderCompress.argtypes = [
                ctypes.c_int, ctypes.c_int, ctypes.c_int,
                ctypes.c_size_t, ctypes.c_char_p,
                ctypes.POINTER(ctypes.c_size_t), ctypes.c_char_p,
            ]
            # wire up decoder
            d.BrotliDecoderDecompress.restype = ctypes.c_int
            d.BrotliDecoderDecompress.argtypes = [
                ctypes.c_size_t, ctypes.c_char_p,
                ctypes.POINTER(ctypes.c_size_t), ctypes.c_char_p,
            ]
        except (OSError, AttributeError):
            # OSError: library cannot be loaded; AttributeError: symbol missing
            continue
        # Publish only once both libraries are fully wired
        _enc = e
        _dec = d
        _BROTLI_OK = True
        return


_try_load()
51
+
52
+
53
def compress(data: bytes, quality: int = 11, lgwin: int = 24, mode: int = 0) -> bytes:
    """Compress data with brotli (quality 0-11, mode 0=generic 1=text 2=font).

    Falls back to zlib level 9 when the brotli libraries were not loaded;
    ``quality``, ``lgwin`` and ``mode`` are ignored in that case.

    NOTE(review): the two outputs are different formats. Data written on a
    machine with brotli cannot be read by decompress() on a machine without
    it, and vice versa.

    Raises:
        RuntimeError: if the brotli encoder reports a failure.
    """
    if not _BROTLI_OK:
        return zlib.compress(data, level=9)

    # The ctypes signature uses c_char_p, which accepts only ``bytes``.
    # Coerce so bytearray/memoryview work here as they do in the fallback.
    data = bytes(data)

    max_out = _enc.BrotliEncoderMaxCompressedSize(len(data))
    if not max_out:
        # The encoder reports 0 when it cannot compute a size bound
        raise RuntimeError('brotli compression failed')
    out_buf = ctypes.create_string_buffer(max_out)
    # In/out parameter: capacity on entry, bytes actually written on return
    out_size = ctypes.c_size_t(max_out)

    ok = _enc.BrotliEncoderCompress(
        quality, lgwin, mode,
        len(data), data,
        ctypes.byref(out_size), out_buf,
    )
    if not ok:
        raise RuntimeError('brotli compression failed')
    return out_buf.raw[:out_size.value]
71
+
72
+
73
def decompress(data: bytes, max_output: int = 1 << 30) -> bytes:
    """Decompress brotli data. Falls back to zlib if brotli unavailable.

    The decompressed size is not known in advance. The one-shot
    BrotliDecoderDecompress call reports a too-small output buffer as a
    plain error (result 0), the same as corrupt input; it does not return
    a "needs more output" code. So every failure is retried with a buffer
    four times larger until ``max_output`` bytes has been tried.

    Args:
        data: compressed payload (any bytes-like object).
        max_output: largest output buffer, in bytes, that the retry loop
            grows to before giving up (default 1 GiB). The first attempt
            always uses max(10x input, 1 MB), even if that is larger.

    Raises:
        RuntimeError: if decoding still fails at the largest buffer, which
            means corrupt input or output larger than ``max_output``.
    """
    if not _BROTLI_OK:
        return zlib.decompress(data)

    # The ctypes signature uses c_char_p, which accepts only ``bytes``
    data = bytes(data)

    size = max(len(data) * 10, 1 << 20)  # start at 10× or 1 MB
    while True:
        out_buf = ctypes.create_string_buffer(size)
        # In/out parameter: capacity on entry, bytes produced on return
        out_size = ctypes.c_size_t(size)
        result = _dec.BrotliDecoderDecompress(
            len(data), data,
            ctypes.byref(out_size), out_buf,
        )
        if result == 1:  # BROTLI_DECODER_RESULT_SUCCESS
            return out_buf.raw[:out_size.value]
        if size >= max_output:
            raise RuntimeError(f'brotli decompression failed (result={result})')
        size = min(size * 4, max_output)
93
+
94
+
95
def available() -> bool:
    """Report whether the brotli libraries were loaded at import time."""
    loaded = _BROTLI_OK
    return loaded
storetle/cli.py ADDED
@@ -0,0 +1,302 @@
1
+ #!/usr/bin/env python3
2
+ # cli.py — storetle command-line interface
3
+ #
4
+ # Commands:
5
+ # storetle bench <folder> — benchmark your data
6
+ # storetle pack <folder> <output> — compress folder → .storetle
7
+ # storetle unpack <input> <output_folder> — decompress .storetle → files
8
+ # storetle info <file.storetle> — show file stats
9
+ # storetle get <file.storetle> <idx> — extract one document by index
10
+
11
+ import sys
12
+ from pathlib import Path
13
+
14
+
15
+
16
def cmd_bench(args):
    """Handle ``storetle bench <folder>``.

    Runs the package-level benchmark on ``args[0]``. Prints a usage line
    and exits with status 1 when no folder is given; prints the error and
    exits with status 1 when the benchmark raises ValueError.
    """
    if args:
        from . import benchmark
        try:
            benchmark(args[0])
        except ValueError as err:
            print(f'Error: {err}')
            sys.exit(1)
        return

    print('Usage: storetle bench <folder>')
    sys.exit(1)
26
+
27
+
28
def cmd_pack(args):
    """Handle ``storetle pack <folder> <output.storetle>``.

    Appends every ``*.html`` file found recursively under the folder, in
    sorted path order, to a new stream at the output path, then prints the
    statistics reported by StreamReader.info(). Exits with status 1 on
    missing arguments or when the folder contains no .html files.
    """
    if len(args) < 2:
        print('Usage: storetle pack <folder> <output.storetle>')
        sys.exit(1)
    from .stream import StreamWriter

    source_dir = Path(args[0])
    target = args[1]
    html_paths = sorted(source_dir.glob('**/*.html'))
    if len(html_paths) == 0:
        print(f'No .html files found in {source_dir}')
        sys.exit(1)

    print(f'Packing {len(html_paths)} files...')
    with StreamWriter(target) as writer:
        for path in html_paths:
            writer.append(path.read_bytes())

    from .stream import StreamReader
    stats = StreamReader.info(target)

    def human(n):
        # KB below 1 MiB, otherwise MB
        return f'{n/1024:.1f}KB' if n < 1048576 else f'{n/1048576:.2f}MB'

    before = human(stats["original_bytes"])
    after = human(stats["compressed_bytes"])
    print(f'Done: {before} → {after} '
          f'({stats["ratio_pct"]}% saved, {stats["docs"]} docs, {stats["chunks"]} chunks)')
    print(f'Output: {target}')
55
+
56
+
57
def cmd_unpack(args):
    """Handle ``storetle unpack <file.storetle> <output_folder>``.

    Creates the output folder if needed and writes each document from the
    stream as ``doc_NNNNNN.html`` (zero-padded index), printing progress
    every 100 documents. Exits with status 1 on missing arguments.
    """
    if len(args) < 2:
        print('Usage: storetle unpack <file.storetle> <output_folder>')
        sys.exit(1)
    from .stream import StreamReader

    archive = args[0]
    out_dir = Path(args[1])
    out_dir.mkdir(parents=True, exist_ok=True)

    with StreamReader(archive) as reader:
        print(f'Extracting {reader.doc_count} documents to {out_dir}/')
        for index, payload in enumerate(reader):
            (out_dir / f'doc_{index:06d}.html').write_bytes(payload)
            written = index + 1
            if written % 100 == 0:
                print(f' {written}/{reader.doc_count}...')
        print(f'Done: {reader.doc_count} files written to {out_dir}/')
74
+
75
+
76
def cmd_info(args):
    """Handle ``storetle info <file.storetle>``.

    Prints the document count, chunk count, original size and compressed
    size reported by StreamReader.info(). Exits with status 1 when no file
    is given.
    """
    if not args:
        print('Usage: storetle info <file.storetle>')
        sys.exit(1)
    from .stream import StreamReader

    def human(n):
        # KB below 1 MiB, otherwise MB
        return f'{n/1024:.1f}KB' if n < 1048576 else f'{n/1048576:.2f}MB'

    path = args[0]
    stats = StreamReader.info(path)
    lines = [
        f'\n {path}',
        f' Documents: {stats["docs"]:,}',
        f' Chunks: {stats["chunks"]:,}',
        f' Original: {human(stats["original_bytes"])}',
        f' Compressed: {human(stats["compressed_bytes"])} ({stats["ratio_pct"]}% saved)',
        '',
    ]
    for line in lines:
        print(line)
93
+
94
+
95
def cmd_get(args):
    """Handle ``storetle get <file.storetle> <index>``.

    Writes the raw bytes of the document at the given index to stdout.
    A non-integer index (ValueError) or an IndexError from the reader is
    reported as ``Error: ...`` with exit status 1, as are missing arguments
    (usage line).
    """
    if len(args) < 2:
        print('Usage: storetle get <file.storetle> <index>')
        sys.exit(1)
    from .stream import StreamReader

    path, raw_index = args[0], args[1]
    with StreamReader(path) as reader:
        try:
            # Parse first so a bad index never reaches the reader
            position = int(raw_index)
            sys.stdout.buffer.write(reader[position])
        except (IndexError, ValueError) as err:
            print(f'Error: {err}')
            sys.exit(1)
108
+
109
+
110
def cmd_from_warc(args):
    """Handle ``storetle from-warc <input.warc[.gz]> <output.storetle>``.

    Delegates to warc.from_warc with verbose output. Any exception raised
    by the conversion is printed as ``Error: ...`` and the process exits
    with status 1; missing arguments print the usage line and exit 1.
    """
    if len(args) < 2:
        print('Usage: storetle from-warc <input.warc[.gz]> <output.storetle>')
        sys.exit(1)
    from .warc import from_warc

    source, target = args[0], args[1]
    try:
        from_warc(source, target, verbose=True)
    except Exception as err:
        # ValueError was handled identically, so one handler covers both
        print('Error: %s' % err)
        sys.exit(1)
123
+
124
+
125
def cmd_to_warc(args):
    """Handle ``storetle to-warc <input.storetle> <output.warc[.gz]>``.

    Delegates to warc.to_warc with verbose output. Any exception raised by
    the conversion is printed as ``Error: ...`` and the process exits with
    status 1; missing arguments print the usage line and exit 1.
    """
    if len(args) < 2:
        print('Usage: storetle to-warc <input.storetle> <output.warc[.gz]>')
        sys.exit(1)
    from .warc import to_warc

    source, target = args[0], args[1]
    try:
        to_warc(source, target, verbose=True)
    except Exception as err:
        print('Error: %s' % err)
        sys.exit(1)
135
+
136
+
137
def cmd_train(args):
    """Train a custom zstd dictionary from a folder of HTML files.

    Usage: storetle train <folder> [--output dict.bin] [--size 1024]

    The trained dict can then be used with StreamWriter(path, dictionary=...).
    Default output: storetle_dict.bin in current directory.
    Default size: 1024KB (the same as the built-in dict).

    Each .html file under the folder (recursive) is passed through
    stream._encode_doc before training. Files that fail to encode are
    skipped and counted. Exits with status 1 on bad arguments, when zstd is
    unavailable, when no .html files exist, or when nothing could be encoded.
    """
    usage = 'Usage: storetle train <folder> [--output dict.bin] [--size 1024]'
    if not args:
        print(usage)
        sys.exit(1)

    folder = Path(args[0])
    outfile = 'storetle_dict.bin'
    size_kb = 1024

    # Reject unknown or incomplete options. Silently ignoring them would
    # train with defaults and write to the default output path.
    options = args[1:]
    pos = 0
    while pos < len(options):
        flag = options[pos]
        if flag not in ('--output', '--size'):
            print('Error: unrecognized argument %s' % flag)
            print(usage)
            sys.exit(1)
        if pos + 1 >= len(options):
            print('Error: %s requires a value' % flag)
            sys.exit(1)
        value = options[pos + 1]
        if flag == '--output':
            outfile = value
        else:
            try:
                size_kb = int(value)
            except ValueError:
                print('Error: --size must be an integer (KB)')
                sys.exit(1)
            if size_kb <= 0:
                print('Error: --size must be a positive integer (KB)')
                sys.exit(1)
        pos += 2

    from . import zstd_compat as _zs
    if not _zs.available():
        print('Error: zstd not available. Install libzstd first.')
        sys.exit(1)

    from .stream import _encode_doc

    files = sorted(folder.glob('**/*.html'))
    if not files:
        print('Error: no .html files found in %s' % folder)
        sys.exit(1)

    print('Encoding %d HTML files for dictionary training...' % len(files))
    samples = []
    errors = 0
    for f in files:
        # Best effort: an unreadable or unencodable file is skipped and
        # counted, so one bad document does not abort the whole run.
        try:
            raw = _encode_doc(f.read_bytes())
            samples.append(raw)
        except Exception:
            errors += 1

    if not samples:
        print('Error: failed to encode any files')
        sys.exit(1)

    total_kb = sum(len(s) for s in samples) // 1024
    print('Training %dKB dictionary on %d samples (%dKB total)...' % (
        size_kb, len(samples), total_kb))

    dict_bytes = _zs.train_dictionary(samples, dict_size=size_kb * 1024)

    Path(outfile).write_bytes(dict_bytes)
    actual_kb = len(dict_bytes) // 1024
    print('Done: %dKB dictionary saved to %s' % (actual_kb, outfile))
    if errors:
        print(' (%d files skipped due to encoding errors)' % errors)
    print()
    print('To use this dictionary:')
    print(' import storetle')
    print(' d = open("%s", "rb").read()' % outfile)
    print(' with storetle.StreamWriter("out.storetle", dictionary=d) as w:')
    print(' w.append(html)')
    print(' with storetle.StreamReader("out.storetle") as r:')
    print(' # reader auto-loads dict from disk; pass dictionary=d if custom')
213
+
214
+
215
def cmd_warc_encode(args):
    """Handle ``storetle warc-encode <input.warc[.gz]> <output.warc[.gz]>``.

    Delegates to warc.warc_encode with verbose output. Any exception is
    printed as ``Error: ...`` and the process exits with status 1; missing
    arguments print the usage line and exit 1.
    """
    if len(args) < 2:
        print('Usage: storetle warc-encode <input.warc[.gz]> <output.warc[.gz]>')
        sys.exit(1)
    from .warc import warc_encode

    source, target = args[0], args[1]
    try:
        warc_encode(source, target, verbose=True)
    except Exception as err:
        print('Error: %s' % err)
        sys.exit(1)
225
+
226
+
227
def cmd_warc_decode(args):
    """Handle ``storetle warc-decode <encoded.warc[.gz]> <output.warc[.gz]>``.

    Delegates to warc.warc_decode with verbose output. Any exception is
    printed as ``Error: ...`` and the process exits with status 1; missing
    arguments print the usage line and exit 1.
    """
    if len(args) < 2:
        print('Usage: storetle warc-decode <encoded.warc[.gz]> <output.warc[.gz]>')
        sys.exit(1)
    from .warc import warc_decode

    source, target = args[0], args[1]
    try:
        warc_decode(source, target, verbose=True)
    except Exception as err:
        print('Error: %s' % err)
        sys.exit(1)
237
+
238
+
239
# Sub-command name (as typed on the command line) -> handler function.
# Each handler receives the argument list that follows the command name.
COMMANDS = {
    # benchmarking and native .storetle archives
    'bench': cmd_bench,
    'pack': cmd_pack,
    'unpack': cmd_unpack,
    'info': cmd_info,
    'get': cmd_get,
    # WARC conversion
    'from-warc': cmd_from_warc,
    'to-warc': cmd_to_warc,
    'warc-encode': cmd_warc_encode,
    'warc-decode': cmd_warc_decode,
    # dictionary training
    'train': cmd_train,
}

# Text printed for -h/--help/help, for no arguments, and after an unknown
# command name.
HELP = """storetle — HTML-aware compression for large document collections

Commands:
bench <folder> Benchmark your HTML data vs gzip WARC
pack <folder> <output> Compress a folder → .storetle file
unpack <file> <output_folder> Extract a .storetle → HTML files
info <file> Show file statistics
get <file> <index> Extract one document by index (0-based)
from-warc <input.warc[.gz]> <out> Convert WARC → .storetle
to-warc <input.storetle> <out> Convert .storetle → WARC (or .warc.gz)
warc-encode <input.warc[.gz]> <out> Encode HTML in-place → valid .warc.gz (smaller, standard format)
warc-decode <encoded.warc[.gz]> <out> Decode back to standard HTML WARC
train <folder> [options] Train a custom dictionary from HTML files
--output dict.bin Output path (default: storetle_dict.bin)
--size 1024 Dictionary size in KB (default: 1024)

Examples:
storetle bench my_crawl/
storetle pack my_crawl/ archive.storetle
storetle info archive.storetle
storetle get archive.storetle 0
storetle from-warc CC-MAIN.warc.gz archive.storetle
storetle to-warc archive.storetle output.warc.gz
storetle train my_corpus/ --output my_domain.bin --size 1024
"""
277
+
278
+
279
def main():
    """Command-line entry point: dispatch sys.argv[1] to its handler.

    Prints HELP and exits 0 when no command or a help flag is given; prints
    an error plus HELP and exits 1 for an unknown command. A BrokenPipeError
    raised by the handler exits 0 after pointing stdout at the null device;
    KeyboardInterrupt exits with status 130.
    """
    argv = sys.argv
    wants_help = len(argv) < 2 or argv[1] in ('-h', '--help', 'help')
    if wants_help:
        print(HELP)
        sys.exit(0)

    name = argv[1]
    if name not in COMMANDS:
        print(f'Unknown command: {name}\n')
        print(HELP)
        sys.exit(1)

    handler = COMMANDS[name]
    try:
        handler(argv[2:])
    except BrokenPipeError:
        # downstream consumer (e.g. `| head`) closed the pipe — not an error.
        # Redirect stdout to the null device so the flush at interpreter
        # exit does not hit the closed pipe again.
        import os
        null_fd = os.open(os.devnull, os.O_WRONLY)
        os.dup2(null_fd, sys.stdout.fileno())
        sys.exit(0)
    except KeyboardInterrupt:
        sys.exit(130)


if __name__ == '__main__':
    main()
Binary file