storetle 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- storetle/__init__.py +111 -0
- storetle/brotli_compat.py +96 -0
- storetle/cli.py +302 -0
- storetle/cube_dict_v10.bin +0 -0
- storetle/decoder.py +211 -0
- storetle/encoder.py +717 -0
- storetle/folder.py +249 -0
- storetle/stream.py +464 -0
- storetle/vocab.py +635 -0
- storetle/warc.py +478 -0
- storetle/zstd_compat.py +202 -0
- storetle-0.2.0.dist-info/METADATA +161 -0
- storetle-0.2.0.dist-info/RECORD +17 -0
- storetle-0.2.0.dist-info/WHEEL +5 -0
- storetle-0.2.0.dist-info/entry_points.txt +2 -0
- storetle-0.2.0.dist-info/licenses/LICENSE +21 -0
- storetle-0.2.0.dist-info/top_level.txt +1 -0
storetle/__init__.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# storetle — HTML-aware streaming compression for large document collections
|
|
2
|
+
#
|
|
3
|
+
# Primary API:
|
|
4
|
+
# StreamWriter — append HTML documents to a .storetle file
|
|
5
|
+
# StreamReader — read/iterate/random-access a .storetle file
|
|
6
|
+
# pack — compress a folder of HTML files → .storetle
|
|
7
|
+
# unpack — decompress .storetle → folder of HTML files
|
|
8
|
+
# benchmark — compare storetle vs gzip on your own data
|
|
9
|
+
|
|
10
|
+
from .stream import StreamWriter, StreamReader
|
|
11
|
+
from .folder import pack, unpack
|
|
12
|
+
|
|
13
|
+
__version__ = '0.2.0'
|
|
14
|
+
__all__ = ['StreamWriter', 'StreamReader', 'pack', 'unpack', 'benchmark']
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def benchmark(folder, quiet=False):
    """Benchmark storetle vs gzip WARC on a folder of HTML files.

    Every ``*.html`` file found recursively under *folder* is read into
    memory and measured three ways: a single gzip stream over a minimal
    WARC-style concatenation, gzip applied to each file separately, and a
    temporary ``.storetle`` file written with StreamWriter and read back
    with StreamReader.

    Args:
        folder: Directory to scan for ``*.html`` files.
        quiet: When true, do not print the summary table.

    Returns:
        A dict with the keys ``files``, ``original_bytes``, ``gzip_warc``,
        ``gzip_per_file``, ``storetle``, ``savings_vs_gzip_warc_pct``,
        ``write_kbps``, ``read_kbps`` and ``roundtrip_ok``.
        ``roundtrip_ok`` only reports that the number of documents read back
        equals the number written; contents are not compared.

    Raises:
        ValueError: If no ``.html`` files are found in *folder*.

    Example:
        import storetle
        results = storetle.benchmark('my_crawl_data/')
    """
    import gzip
    import os
    import tempfile
    import time
    from pathlib import Path

    files = sorted(Path(folder).glob('**/*.html'))
    if not files:
        raise ValueError(f'No .html files found in {folder}')

    docs = [f.read_bytes() for f in files]
    total_html = sum(len(d) for d in docs)

    # gzip WARC (industry standard): one gzip stream over all records.
    warc_raw = b''.join(
        'WARC/1.0\r\nContent-Length: {}\r\n\r\n'.format(len(d)).encode()
        + d + b'\r\n\r\n'
        for d in docs
    )
    warc_gz = len(gzip.compress(warc_raw, compresslevel=9))

    # gzip per-file
    gz_pf = sum(len(gzip.compress(d, compresslevel=9)) for d in docs)

    # storetle: reserve a temp path, close the handle, then let StreamWriter
    # open the path itself.
    with tempfile.NamedTemporaryFile(suffix='.storetle', delete=False) as tf:
        tmp = tf.name
    try:
        # perf_counter is monotonic, so the measured intervals cannot be
        # skewed (or go negative) when the system clock is adjusted.
        t0 = time.perf_counter()
        with StreamWriter(tmp) as w:
            for d in docs:
                w.append(d)
        write_time = time.perf_counter() - t0

        t1 = time.perf_counter()
        with StreamReader(tmp) as r:
            recovered = list(r)
        read_time = time.perf_counter() - t1

        cube_size = os.path.getsize(tmp)
    finally:
        os.unlink(tmp)

    # Count-only check; document contents are not compared here.
    rt_ok = len(recovered) == len(docs)

    result = {
        'files': len(files),
        'original_bytes': total_html,
        'gzip_warc': warc_gz,
        'gzip_per_file': gz_pf,
        'storetle': cube_size,
        'savings_vs_gzip_warc_pct': round((warc_gz - cube_size) / warc_gz * 100, 1),
        # Clamp elapsed time to 1 ms so tiny inputs cannot divide by zero.
        'write_kbps': int(total_html / 1024 / max(write_time, 0.001)),
        'read_kbps': int(total_html / 1024 / max(read_time, 0.001)),
        'roundtrip_ok': rt_ok,
    }

    if not quiet:
        _print_benchmark(result)

    return result
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _print_benchmark(r):
|
|
87
|
+
def fmt(n):
|
|
88
|
+
if n < 1024: return f'{n}B'
|
|
89
|
+
if n < 1048576: return f'{n/1024:.1f}KB'
|
|
90
|
+
return f'{n/1048576:.2f}MB'
|
|
91
|
+
|
|
92
|
+
def pct(a, b):
|
|
93
|
+
return f'{100*(1-a/b):.1f}%'
|
|
94
|
+
|
|
95
|
+
orig = r['original_bytes']
|
|
96
|
+
print(f'\n storetle benchmark — {r["files"]} files, {fmt(orig)} original\n')
|
|
97
|
+
print(f' {"Format":<28} {"Size":>10} {"Savings":>8}')
|
|
98
|
+
print(f' {"─"*50}')
|
|
99
|
+
print(f' {"Original HTML":<28} {fmt(orig):>10}')
|
|
100
|
+
print(f' {"gzip per-file (current)":<28} {fmt(r["gzip_per_file"]):>10} {pct(r["gzip_per_file"], orig):>8}')
|
|
101
|
+
print(f' {"gzip WARC (Common Crawl std)":<28} {fmt(r["gzip_warc"]):>10} {pct(r["gzip_warc"], orig):>8}')
|
|
102
|
+
print(f' {"storetle":<28} {fmt(r["storetle"]):>10} {pct(r["storetle"], orig):>8}')
|
|
103
|
+
print()
|
|
104
|
+
|
|
105
|
+
saved = r["gzip_warc"] - r["storetle"]
|
|
106
|
+
sign = '+' if saved > 0 else ''
|
|
107
|
+
print(f' vs gzip WARC: {sign}{r["savings_vs_gzip_warc_pct"]}% smaller ({fmt(abs(saved))} {"saved" if saved > 0 else "larger"})')
|
|
108
|
+
print(f' Write speed: {r["write_kbps"]:,} KB/s')
|
|
109
|
+
print(f' Read speed: {r["read_kbps"]:,} KB/s')
|
|
110
|
+
print(f' Round-trip: {"✓ all documents verified" if r["roundtrip_ok"] else "✗ FAILED"}')
|
|
111
|
+
print()
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# brotli_compat.py — thin ctypes wrapper around the system brotli library
|
|
2
|
+
#
|
|
3
|
+
# Exposes compress(data, quality=11) and decompress(data) with the same
|
|
4
|
+
# calling convention as zlib.compress / zlib.decompress so callers can
|
|
5
|
+
# swap one for the other with a single import change.
|
|
6
|
+
#
|
|
7
|
+
# Falls back to zlib if the brotli dylib is not present (other machines).
|
|
8
|
+
|
|
9
|
+
import ctypes, zlib, os
|
|
10
|
+
|
|
11
|
+
_BROTLI_OK = False
|
|
12
|
+
_enc = None
|
|
13
|
+
_dec = None
|
|
14
|
+
|
|
15
|
+
def _try_load():
|
|
16
|
+
global _BROTLI_OK, _enc, _dec
|
|
17
|
+
candidates = [
|
|
18
|
+
'/usr/local/lib/libbrotlienc.dylib',
|
|
19
|
+
'/usr/local/Cellar/brotli/1.2.0/lib/libbrotlienc.dylib',
|
|
20
|
+
'libbrotlienc.so',
|
|
21
|
+
'libbrotlienc.dylib',
|
|
22
|
+
]
|
|
23
|
+
dec_candidates = [p.replace('enc', 'dec') for p in candidates]
|
|
24
|
+
|
|
25
|
+
for ep, dp in zip(candidates, dec_candidates):
|
|
26
|
+
if os.path.exists(ep) and os.path.exists(dp):
|
|
27
|
+
try:
|
|
28
|
+
e = ctypes.CDLL(ep)
|
|
29
|
+
d = ctypes.CDLL(dp)
|
|
30
|
+
# wire up encoder
|
|
31
|
+
e.BrotliEncoderMaxCompressedSize.restype = ctypes.c_size_t
|
|
32
|
+
e.BrotliEncoderMaxCompressedSize.argtypes = [ctypes.c_size_t]
|
|
33
|
+
e.BrotliEncoderCompress.restype = ctypes.c_int
|
|
34
|
+
e.BrotliEncoderCompress.argtypes = [
|
|
35
|
+
ctypes.c_int, ctypes.c_int, ctypes.c_int,
|
|
36
|
+
ctypes.c_size_t, ctypes.c_char_p,
|
|
37
|
+
ctypes.POINTER(ctypes.c_size_t), ctypes.c_char_p,
|
|
38
|
+
]
|
|
39
|
+
# wire up decoder
|
|
40
|
+
d.BrotliDecoderDecompress.restype = ctypes.c_int
|
|
41
|
+
d.BrotliDecoderDecompress.argtypes = [
|
|
42
|
+
ctypes.c_size_t, ctypes.c_char_p,
|
|
43
|
+
ctypes.POINTER(ctypes.c_size_t), ctypes.c_char_p,
|
|
44
|
+
]
|
|
45
|
+
_enc = e; _dec = d; _BROTLI_OK = True
|
|
46
|
+
return
|
|
47
|
+
except Exception:
|
|
48
|
+
continue
|
|
49
|
+
|
|
50
|
+
_try_load()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def compress(data: bytes, quality: int = 11, lgwin: int = 24, mode: int = 0) -> bytes:
    """Compress *data* with brotli, or with zlib level 9 when brotli is absent.

    Args:
        data: Bytes to compress.
        quality: Brotli quality, 0-11.
        lgwin: Value passed as the encoder's ``lgwin`` argument.
        mode: Brotli mode (0=generic, 1=text, 2=font).

    Returns:
        The compressed bytes. In the zlib fallback ``quality``, ``lgwin`` and
        ``mode`` are ignored.

    Raises:
        RuntimeError: If the brotli encoder reports failure.
    """
    if _BROTLI_OK:
        # Size the output buffer to the encoder's stated worst case.
        capacity = _enc.BrotliEncoderMaxCompressedSize(len(data))
        dest = ctypes.create_string_buffer(capacity)
        written = ctypes.c_size_t(capacity)

        succeeded = _enc.BrotliEncoderCompress(
            quality, lgwin, mode,
            len(data), data,
            ctypes.byref(written), dest,
        )
        if succeeded:
            return dest.raw[:written.value]
        raise RuntimeError('brotli compression failed')

    return zlib.compress(data, level=9)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def decompress(data: bytes) -> bytes:
    """Decompress brotli data. Falls back to zlib if brotli is unavailable.

    The decompressed size is not known up front, so the output buffer starts
    at max(10x the input, 1 MiB) and is grown by 4x on each failed attempt
    until it reaches a 256 MiB cap.

    Args:
        data: Compressed bytes.

    Returns:
        The decompressed bytes.

    Raises:
        RuntimeError: If brotli decoding still fails once the buffer has
            reached the cap (corrupt input, or output larger than the cap).
    """
    if not _BROTLI_OK:
        return zlib.decompress(data)

    # Per the brotli C API documentation, the one-shot
    # BrotliDecoderDecompress returns ERROR (0) both for corrupt input and
    # for an output buffer that is too small; it does not return
    # NEEDS_MORE_OUTPUT (3). A too-small buffer therefore has to be handled
    # by retrying on 0, and the retries must be bounded so corrupt input
    # still fails.
    grow_limit = 1 << 28  # stop growing once the buffer reaches 256 MiB
    max_out = max(len(data) * 10, 1 << 20)  # start at 10x or 1 MB
    while True:
        out_buf = ctypes.create_string_buffer(max_out)
        out_size = ctypes.c_size_t(max_out)
        result = _dec.BrotliDecoderDecompress(
            len(data), data,
            ctypes.byref(out_size), out_buf,
        )
        if result == 1:  # BROTLI_DECODER_RESULT_SUCCESS
            return out_buf.raw[:out_size.value]
        # 3 is still accepted in case a library build ever reports it.
        if result in (0, 3) and max_out < grow_limit:
            max_out *= 4
            continue
        raise RuntimeError(f'brotli decompression failed (result={result})')
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def available() -> bool:
    """Return the module's ``_BROTLI_OK`` flag (True once brotli was loaded)."""
    loaded = _BROTLI_OK
    return loaded
|
storetle/cli.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# cli.py — storetle command-line interface
|
|
3
|
+
#
|
|
4
|
+
# Commands:
|
|
5
|
+
# storetle bench <folder> — benchmark your data
|
|
6
|
+
# storetle pack <folder> <output> — compress folder → .storetle
|
|
7
|
+
# storetle unpack <input> <output_folder> — decompress .storetle → files
|
|
8
|
+
# storetle info <file.storetle> — show file stats
|
|
9
|
+
# storetle get <file.storetle> <idx> — extract one document by index
|
|
10
|
+
|
|
11
|
+
import sys
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def cmd_bench(args):
    """Handle ``storetle bench <folder>``.

    Prints usage and exits with status 1 when no folder is given. Otherwise
    runs the package-level ``benchmark`` on ``args[0]``; a ValueError from it
    is printed as ``Error: ...`` and the process exits with status 1.
    """
    if args:
        from . import benchmark
        try:
            benchmark(args[0])
        except ValueError as err:
            print(f'Error: {err}')
            sys.exit(1)
        return

    print('Usage: storetle bench <folder>')
    sys.exit(1)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def cmd_pack(args):
    """Handle ``storetle pack <folder> <output.storetle>``.

    Appends every ``*.html`` file found recursively under the folder (in
    sorted path order) to a StreamWriter opened on the output path, then
    prints a summary built from ``StreamReader.info(output)``. Exits with
    status 1 on missing arguments or when no ``.html`` files are found.
    """
    if len(args) < 2:
        print('Usage: storetle pack <folder> <output.storetle>')
        sys.exit(1)

    from .stream import StreamWriter

    source_dir = Path(args[0])
    output = args[1]
    html_paths = sorted(source_dir.glob('**/*.html'))
    if not html_paths:
        print(f'No .html files found in {source_dir}')
        sys.exit(1)

    print(f'Packing {len(html_paths)} files...')
    with StreamWriter(output) as writer:
        for html_path in html_paths:
            writer.append(html_path.read_bytes())

    from .stream import StreamReader
    stats = StreamReader.info(output)

    def human(size):
        # KB below 1 MiB, MB otherwise.
        return f'{size/1024:.1f}KB' if size < 1048576 else f'{size/1048576:.2f}MB'

    before = human(stats["original_bytes"])
    after = human(stats["compressed_bytes"])
    print(f'Done: {before} → {after} '
          f'({stats["ratio_pct"]}% saved, {stats["docs"]} docs, {stats["chunks"]} chunks)')
    print(f'Output: {output}')
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def cmd_unpack(args):
    """Handle ``storetle unpack <file.storetle> <output_folder>``.

    Creates the output folder if needed and writes each document from the
    reader as ``doc_NNNNNN.html`` (zero-based, six digits), printing a
    progress line after every 100 documents. Exits with status 1 when fewer
    than two arguments are given.
    """
    if len(args) < 2:
        print('Usage: storetle unpack <file.storetle> <output_folder>')
        sys.exit(1)

    from .stream import StreamReader

    archive = args[0]
    target = Path(args[1])
    target.mkdir(parents=True, exist_ok=True)

    with StreamReader(archive) as reader:
        print(f'Extracting {reader.doc_count} documents to {target}/')
        for written, doc in enumerate(reader, start=1):
            # File names are zero-based while `written` counts from one.
            (target / f'doc_{written - 1:06d}.html').write_bytes(doc)
            if written % 100 == 0:
                print(f' {written}/{reader.doc_count}...')
        print(f'Done: {reader.doc_count} files written to {target}/')
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def cmd_info(args):
    """Handle ``storetle info <file.storetle>``.

    Prints the document count, chunk count, original size and compressed size
    (with the saved percentage) taken from ``StreamReader.info(args[0])``.
    Exits with status 1 when no file is given.
    """
    if not args:
        print('Usage: storetle info <file.storetle>')
        sys.exit(1)

    from .stream import StreamReader

    def human(size):
        # KB below 1 MiB, MB otherwise.
        return f'{size/1024:.1f}KB' if size < 1048576 else f'{size/1048576:.2f}MB'

    path = args[0]
    stats = StreamReader.info(path)
    original = human(stats["original_bytes"])
    compressed = human(stats["compressed_bytes"])

    print(f'\n {path}')
    print(f' Documents: {stats["docs"]:,}')
    print(f' Chunks: {stats["chunks"]:,}')
    print(f' Original: {original}')
    print(f' Compressed: {compressed} ({stats["ratio_pct"]}% saved)')
    print()
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def cmd_get(args):
    """Handle ``storetle get <file.storetle> <index>``.

    Opens the file, parses the index as an integer and writes the raw bytes
    of that document to standard output. An IndexError or ValueError is
    printed as ``Error: ...`` and the process exits with status 1, as it does
    when fewer than two arguments are given.
    """
    if len(args) < 2:
        print('Usage: storetle get <file.storetle> <index>')
        sys.exit(1)

    from .stream import StreamReader

    path, raw_index = args[0], args[1]
    with StreamReader(path) as reader:
        try:
            # Written through the binary layer so bytes reach stdout as-is.
            sys.stdout.buffer.write(reader[int(raw_index)])
        except (IndexError, ValueError) as err:
            print(f'Error: {err}')
            sys.exit(1)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def cmd_from_warc(args):
    """Handle ``storetle from-warc <input.warc[.gz]> <output.storetle>``.

    Delegates to ``warc.from_warc`` with ``verbose=True``. Any exception it
    raises is printed as ``Error: ...`` and the process exits with status 1,
    as it does when fewer than two arguments are given.
    """
    if len(args) < 2:
        print('Usage: storetle from-warc <input.warc[.gz]> <output.storetle>')
        sys.exit(1)

    from .warc import from_warc

    source, destination = args[0], args[1]
    try:
        from_warc(source, destination, verbose=True)
    except Exception as err:
        # ValueError is an Exception subclass and was reported identically,
        # so a single handler covers both cases.
        print('Error: %s' % err)
        sys.exit(1)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def cmd_to_warc(args):
    """Handle ``storetle to-warc <input.storetle> <output.warc[.gz]>``.

    Delegates to ``warc.to_warc`` with ``verbose=True``. Any exception it
    raises is printed as ``Error: ...`` and the process exits with status 1,
    as it does when fewer than two arguments are given.
    """
    if len(args) < 2:
        print('Usage: storetle to-warc <input.storetle> <output.warc[.gz]>')
        sys.exit(1)

    from .warc import to_warc

    source, destination = args[0], args[1]
    try:
        to_warc(source, destination, verbose=True)
    except Exception as err:
        print('Error: %s' % err)
        sys.exit(1)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def cmd_train(args):
    """Train a custom zstd dictionary from a folder of HTML files.

    Usage: storetle train <folder> [--output dict.bin] [--size 1024]

    The trained dict can then be used with StreamWriter(path, dictionary=...).
    Default output: storetle_dict.bin in current directory.
    Default size: 1024KB (the same as the built-in dict).

    Each ``*.html`` file under the folder is passed through
    ``stream._encode_doc`` and the results are used as training samples;
    files that fail to encode are counted and reported, not fatal. Exits with
    status 1 on a missing folder argument, a non-integer or non-positive
    ``--size``, unavailable zstd, no ``.html`` files, or no encodable files.
    """
    if not args:
        print('Usage: storetle train <folder> [--output dict.bin] [--size 1024]')
        sys.exit(1)

    folder = Path(args[0])
    outfile = 'storetle_dict.bin'
    size_kb = 1024

    # Minimal option scan; unrecognised tokens are skipped.
    i = 1
    while i < len(args):
        if args[i] == '--output' and i + 1 < len(args):
            outfile = args[i + 1]
            i += 2
        elif args[i] == '--size' and i + 1 < len(args):
            try:
                size_kb = int(args[i + 1])
            except ValueError:
                print('Error: --size must be an integer (KB)')
                sys.exit(1)
            # A zero or negative size would reach the trainer as a
            # non-positive dict_size; reject it here with a clear message.
            if size_kb <= 0:
                print('Error: --size must be a positive integer (KB)')
                sys.exit(1)
            i += 2
        else:
            i += 1

    from . import zstd_compat as _zs
    if not _zs.available():
        print('Error: zstd not available. Install libzstd first.')
        sys.exit(1)

    from .stream import _encode_doc

    files = sorted(folder.glob('**/*.html'))
    if not files:
        print('Error: no .html files found in %s' % folder)
        sys.exit(1)

    print('Encoding %d HTML files for dictionary training...' % len(files))
    samples = []
    errors = 0
    for f in files:
        try:
            raw = _encode_doc(f.read_bytes())
            samples.append(raw)
        except Exception:
            # Best effort: one bad file must not abort training. The count
            # is reported at the end.
            errors += 1

    if not samples:
        print('Error: failed to encode any files')
        sys.exit(1)

    total_kb = sum(len(s) for s in samples) // 1024
    print('Training %dKB dictionary on %d samples (%dKB total)...' % (
        size_kb, len(samples), total_kb))

    dict_bytes = _zs.train_dictionary(samples, dict_size=size_kb * 1024)

    Path(outfile).write_bytes(dict_bytes)
    actual_kb = len(dict_bytes) // 1024
    print('Done: %dKB dictionary saved to %s' % (actual_kb, outfile))
    if errors:
        print(' (%d files skipped due to encoding errors)' % errors)
    print()
    print('To use this dictionary:')
    print(' import storetle')
    print(' d = open("%s", "rb").read()' % outfile)
    print(' with storetle.StreamWriter("out.storetle", dictionary=d) as w:')
    print(' w.append(html)')
    print(' with storetle.StreamReader("out.storetle") as r:')
    print(' # reader auto-loads dict from disk; pass dictionary=d if custom')
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def cmd_warc_encode(args):
    """Handle ``storetle warc-encode <input.warc[.gz]> <output.warc[.gz]>``.

    Delegates to ``warc.warc_encode`` with ``verbose=True``. Any exception it
    raises is printed as ``Error: ...`` and the process exits with status 1,
    as it does when fewer than two arguments are given.
    """
    if len(args) < 2:
        print('Usage: storetle warc-encode <input.warc[.gz]> <output.warc[.gz]>')
        sys.exit(1)

    from .warc import warc_encode

    source, destination = args[0], args[1]
    try:
        warc_encode(source, destination, verbose=True)
    except Exception as err:
        print('Error: %s' % err)
        sys.exit(1)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def cmd_warc_decode(args):
    """Handle ``storetle warc-decode <encoded.warc[.gz]> <output.warc[.gz]>``.

    Delegates to ``warc.warc_decode`` with ``verbose=True``. Any exception it
    raises is printed as ``Error: ...`` and the process exits with status 1,
    as it does when fewer than two arguments are given.
    """
    if len(args) < 2:
        print('Usage: storetle warc-decode <encoded.warc[.gz]> <output.warc[.gz]>')
        sys.exit(1)

    from .warc import warc_decode

    source, destination = args[0], args[1]
    try:
        warc_decode(source, destination, verbose=True)
    except Exception as err:
        print('Error: %s' % err)
        sys.exit(1)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
COMMANDS = {
|
|
240
|
+
'bench': cmd_bench,
|
|
241
|
+
'pack': cmd_pack,
|
|
242
|
+
'unpack': cmd_unpack,
|
|
243
|
+
'info': cmd_info,
|
|
244
|
+
'get': cmd_get,
|
|
245
|
+
'from-warc': cmd_from_warc,
|
|
246
|
+
'to-warc': cmd_to_warc,
|
|
247
|
+
'warc-encode': cmd_warc_encode,
|
|
248
|
+
'warc-decode': cmd_warc_decode,
|
|
249
|
+
'train': cmd_train,
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
HELP = """storetle — HTML-aware compression for large document collections
|
|
253
|
+
|
|
254
|
+
Commands:
|
|
255
|
+
bench <folder> Benchmark your HTML data vs gzip WARC
|
|
256
|
+
pack <folder> <output> Compress a folder → .storetle file
|
|
257
|
+
unpack <file> <output_folder> Extract a .storetle → HTML files
|
|
258
|
+
info <file> Show file statistics
|
|
259
|
+
get <file> <index> Extract one document by index (0-based)
|
|
260
|
+
from-warc <input.warc[.gz]> <out> Convert WARC → .storetle
|
|
261
|
+
to-warc <input.storetle> <out> Convert .storetle → WARC (or .warc.gz)
|
|
262
|
+
warc-encode <input.warc[.gz]> <out> Encode HTML in-place → valid .warc.gz (smaller, standard format)
|
|
263
|
+
warc-decode <encoded.warc[.gz]> <out> Decode back to standard HTML WARC
|
|
264
|
+
train <folder> [options] Train a custom dictionary from HTML files
|
|
265
|
+
--output dict.bin Output path (default: storetle_dict.bin)
|
|
266
|
+
--size 1024 Dictionary size in KB (default: 1024)
|
|
267
|
+
|
|
268
|
+
Examples:
|
|
269
|
+
storetle bench my_crawl/
|
|
270
|
+
storetle pack my_crawl/ archive.storetle
|
|
271
|
+
storetle info archive.storetle
|
|
272
|
+
storetle get archive.storetle 0
|
|
273
|
+
storetle from-warc CC-MAIN.warc.gz archive.storetle
|
|
274
|
+
storetle to-warc archive.storetle output.warc.gz
|
|
275
|
+
storetle train my_corpus/ --output my_domain.bin --size 1024
|
|
276
|
+
"""
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def main():
    """Command-line entry point: dispatch ``sys.argv[1]`` via COMMANDS.

    With no command, or with ``-h`` / ``--help`` / ``help``, prints HELP and
    exits 0. An unknown command prints a message plus HELP and exits 1.
    Otherwise the handler is called with the remaining arguments. A
    BrokenPipeError exits 0 and a KeyboardInterrupt exits 130.
    """
    argv = sys.argv
    wants_help = len(argv) < 2 or argv[1] in ('-h', '--help', 'help')
    if wants_help:
        print(HELP)
        sys.exit(0)

    cmd = argv[1]
    if cmd not in COMMANDS:
        print(f'Unknown command: {cmd}\n')
        print(HELP)
        sys.exit(1)

    handler = COMMANDS[cmd]
    try:
        handler(sys.argv[2:])
    except BrokenPipeError:
        # downstream consumer (e.g. `| head`) closed the pipe — not an error.
        # Point stdout at devnull before exiting.
        import os
        devnull_fd = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull_fd, sys.stdout.fileno())
        sys.exit(0)
    except KeyboardInterrupt:
        sys.exit(130)
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
if __name__ == '__main__':
|
|
302
|
+
main()
|
|
Binary file
|