storetle 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- storetle-0.2.0/LICENSE +21 -0
- storetle-0.2.0/PKG-INFO +161 -0
- storetle-0.2.0/README.md +138 -0
- storetle-0.2.0/pyproject.toml +42 -0
- storetle-0.2.0/setup.cfg +4 -0
- storetle-0.2.0/storetle/__init__.py +111 -0
- storetle-0.2.0/storetle/brotli_compat.py +96 -0
- storetle-0.2.0/storetle/cli.py +302 -0
- storetle-0.2.0/storetle/cube_dict_v10.bin +0 -0
- storetle-0.2.0/storetle/decoder.py +211 -0
- storetle-0.2.0/storetle/encoder.py +717 -0
- storetle-0.2.0/storetle/folder.py +249 -0
- storetle-0.2.0/storetle/stream.py +464 -0
- storetle-0.2.0/storetle/vocab.py +635 -0
- storetle-0.2.0/storetle/warc.py +478 -0
- storetle-0.2.0/storetle/zstd_compat.py +202 -0
- storetle-0.2.0/storetle.egg-info/PKG-INFO +161 -0
- storetle-0.2.0/storetle.egg-info/SOURCES.txt +20 -0
- storetle-0.2.0/storetle.egg-info/dependency_links.txt +1 -0
- storetle-0.2.0/storetle.egg-info/entry_points.txt +2 -0
- storetle-0.2.0/storetle.egg-info/requires.txt +3 -0
- storetle-0.2.0/storetle.egg-info/top_level.txt +1 -0
storetle-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Davis Brief
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
storetle-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: storetle
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: HTML-aware compression for document corpora — solid-archive ratios with random access
|
|
5
|
+
Author-email: Davis Brief <davis@team8.co>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/adventurelands/storetle
|
|
8
|
+
Project-URL: Specification, https://github.com/adventurelands/storetle/blob/main/FORMAT.md
|
|
9
|
+
Keywords: html,compression,warc,web-archive,corpus,dataset,streaming
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
15
|
+
Classifier: Topic :: System :: Archiving :: Compression
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Requires-Python: >=3.8
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Provides-Extra: fast
|
|
21
|
+
Requires-Dist: lxml; extra == "fast"
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# Storetle
|
|
25
|
+
|
|
26
|
+
**HTML-aware compression for document corpora — solid-archive ratios with random access.**
|
|
27
|
+
|
|
28
|
+
Storetle stores large collections of HTML (web crawls, academic corpora,
|
|
29
|
+
training datasets) in a format that is ~46% smaller than the per-record
|
|
30
|
+
gzip WARC files the web-archiving world ships today, while still letting
|
|
31
|
+
you pull any single document out of a multi-gigabyte archive without
|
|
32
|
+
decompressing the rest — locally, or straight off object storage.
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
pip: storetle (Python, read/write) · rust/: storetle-rs (Rust, read) · web/: read .storetle in the browser
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## The honest benchmark
|
|
39
|
+
|
|
40
|
+
Two different questions, two tables. Corpus: 10 real pages (Wikipedia,
|
|
41
|
+
arXiv abstracts, PLOS articles), 1.75 MB raw HTML, measured June 2026.
|
|
42
|
+
Reproduce with `storetle bench <folder>`.
|
|
43
|
+
|
|
44
|
+
**1. Among formats with random access** (you can extract one doc without
|
|
45
|
+
decompressing everything before it — this is how WARC is actually deployed):
|
|
46
|
+
|
|
47
|
+
| method | bytes | vs deployed standard |
|
|
48
|
+
|---|---|---|
|
|
49
|
+
| per-record gzip -9 (standard WARC) | 373,626 | — |
|
|
50
|
+
| per-record zstd -19 | 325,807 | −12.8% |
|
|
51
|
+
| per-record zstd -19 + trained dict | 274,226 | −26.6% |
|
|
52
|
+
| **storetle** | **200,598** | **−46.3%** |
|
|
53
|
+
|
|
54
|
+
**2. Against solid archives** (maximum compression, no random access):
|
|
55
|
+
|
|
56
|
+
| method | bytes |
|
|
57
|
+
|---|---|
|
|
58
|
+
| tar + gzip -9 | 370,307 |
|
|
59
|
+
| tar + zstd -19 | 220,512 |
|
|
60
|
+
| tar + zstd -22 --long | 220,422 |
|
|
61
|
+
| tar + zstd -22 + trained dict | 204,386 |
|
|
62
|
+
| **storetle (keeps random access)** | **200,598** |
|
|
63
|
+
|
|
64
|
+
Storetle matches solid zstd-22 while remaining randomly accessible. The
|
|
65
|
+
margin comes from three things: HTML-aware encoding (tags/attributes become
|
|
66
|
+
1-byte IDs from a shared vocabulary, structure and text compressed as
|
|
67
|
+
separate streams), a 1 MB dictionary trained on the binary encoding, and
|
|
68
|
+
256-document chunks that capture cross-page template redundancy.
|
|
69
|
+
|
|
70
|
+
On larger corpora measured against gzip WARC: 28.4% smaller on 3,000 live
|
|
71
|
+
Common Crawl docs (348.6 MB), 27–82% on same-domain collections (191 pages,
|
|
72
|
+
20 domains) where template sharing is strongest. Round-trip verified on all
|
|
73
|
+
of the above. Stream it yourself: `python3 bench_cc.py --docs 3000`.
|
|
74
|
+
|
|
75
|
+
## Install
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
brew install zstd # macOS (Ubuntu: apt install libzstd-dev)
|
|
79
|
+
pip install storetle
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
No Python dependencies — stdlib plus system libzstd via ctypes (brotli
|
|
83
|
+
fallback if zstd is missing). `lxml` is optional but strongly recommended
|
|
84
|
+
for encoding speed.
|
|
85
|
+
|
|
86
|
+
## CLI
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
storetle pack my_crawl/ archive.storetle # folder of .html → archive
|
|
90
|
+
storetle unpack archive.storetle out/ # archive → .html files
|
|
91
|
+
storetle info archive.storetle # stats
|
|
92
|
+
storetle get archive.storetle 42 # one doc to stdout, O(1)
|
|
93
|
+
storetle bench my_crawl/ # benchmark on YOUR data
|
|
94
|
+
storetle from-warc CC-MAIN.warc.gz archive.storetle
|
|
95
|
+
storetle to-warc archive.storetle out.warc.gz
|
|
96
|
+
storetle train my_corpus/ --output my.bin # domain-specific dictionary
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## Python API
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
import storetle
|
|
103
|
+
|
|
104
|
+
with storetle.StreamWriter('archive.storetle', workers=8) as w:
|
|
105
|
+
for html in crawl:
|
|
106
|
+
w.append(html)
|
|
107
|
+
|
|
108
|
+
with storetle.StreamReader('archive.storetle') as r:
|
|
109
|
+
print(r.doc_count)
|
|
110
|
+
doc = r[42] # random access: decompresses one ~2MB chunk
|
|
111
|
+
batch = r[100:200]
|
|
112
|
+
for doc in r: # sequential
|
|
113
|
+
...
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Rust reader
|
|
117
|
+
|
|
118
|
+
A read-only Rust implementation lives in [`rust/`](rust/) — library plus a
|
|
119
|
+
`storetle-rs` CLI (`ls` / `get` / `unpack`), differentially tested
|
|
120
|
+
byte-for-byte against the Python decoder.
|
|
121
|
+
|
|
122
|
+
## In the browser
|
|
123
|
+
|
|
124
|
+
[`web/`](web/) has a zero-dependency demo page: the Rust reader compiled to
|
|
125
|
+
WebAssembly. Drop a `.storetle` file onto the page and browse its documents.
|
|
126
|
+
|
|
127
|
+
## How it works
|
|
128
|
+
|
|
129
|
+
1. **Parse** — HTML is tokenized to a node stream (lxml fast path, pure-Python fallback).
|
|
130
|
+
2. **Encode** — tags and attribute names become 1-byte IDs from a fixed
|
|
131
|
+
vocabulary (130 tags, 163 attributes, 1,394 shared strings).
|
|
132
|
+
`class="flex items-center gap-4"` is split into per-token vocabulary
|
|
133
|
+
lookups. Structure and text go to separate streams.
|
|
134
|
+
3. **Chunk** — up to 256 docs / 2 MiB are concatenated, preserving
|
|
135
|
+
cross-document redundancy.
|
|
136
|
+
4. **Compress** — zstd-22 with a 1 MB dictionary trained on the binary
|
|
137
|
+
encoding (ships with the codec).
|
|
138
|
+
5. **Index** — a footer index maps documents to chunks, so readers seek
|
|
139
|
+
instead of scanning. Works over HTTP range requests against plain
|
|
140
|
+
object storage.
|
|
141
|
+
|
|
142
|
+
Full byte-level spec: [FORMAT.md](FORMAT.md).
|
|
143
|
+
|
|
144
|
+
## Limitations — read these
|
|
145
|
+
|
|
146
|
+
- **Structural, not byte-exact.** Reconstructed HTML preserves every tag,
|
|
147
|
+
attribute, text node, comment, and script/style body, but is
|
|
148
|
+
re-serialized (indentation and inter-tag whitespace differ). Fine for
|
|
149
|
+
corpora and ML pipelines; **wrong for byte-exact archival** — if you need
|
|
150
|
+
forensic fidelity, use WARC.
|
|
151
|
+
- **HTML only.** `from-warc` keeps HTML response records and skips
|
|
152
|
+
everything else. A raw passthrough mode for JSON/text is on the roadmap.
|
|
153
|
+
- **Encoding speed** ~3.5 MB/s per core (Python). Parallel via
|
|
154
|
+
`workers=N`. Decoding is zstd-bound and fast. A native encoder is on the
|
|
155
|
+
roadmap.
|
|
156
|
+
- **Alpha.** Format version 2. Validated on 150k+ Common Crawl documents,
|
|
157
|
+
but expect rough edges.
|
|
158
|
+
|
|
159
|
+
## License
|
|
160
|
+
|
|
161
|
+
MIT © 2026 Davis Brief
|
storetle-0.2.0/README.md
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# Storetle
|
|
2
|
+
|
|
3
|
+
**HTML-aware compression for document corpora — solid-archive ratios with random access.**
|
|
4
|
+
|
|
5
|
+
Storetle stores large collections of HTML (web crawls, academic corpora,
|
|
6
|
+
training datasets) in a format that is ~46% smaller than the per-record
|
|
7
|
+
gzip WARC files the web-archiving world ships today, while still letting
|
|
8
|
+
you pull any single document out of a multi-gigabyte archive without
|
|
9
|
+
decompressing the rest — locally, or straight off object storage.
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
pip: storetle (Python, read/write) · rust/: storetle-rs (Rust, read) · web/: read .storetle in the browser
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## The honest benchmark
|
|
16
|
+
|
|
17
|
+
Two different questions, two tables. Corpus: 10 real pages (Wikipedia,
|
|
18
|
+
arXiv abstracts, PLOS articles), 1.75 MB raw HTML, measured June 2026.
|
|
19
|
+
Reproduce with `storetle bench <folder>`.
|
|
20
|
+
|
|
21
|
+
**1. Among formats with random access** (you can extract one doc without
|
|
22
|
+
decompressing everything before it — this is how WARC is actually deployed):
|
|
23
|
+
|
|
24
|
+
| method | bytes | vs deployed standard |
|
|
25
|
+
|---|---|---|
|
|
26
|
+
| per-record gzip -9 (standard WARC) | 373,626 | — |
|
|
27
|
+
| per-record zstd -19 | 325,807 | −12.8% |
|
|
28
|
+
| per-record zstd -19 + trained dict | 274,226 | −26.6% |
|
|
29
|
+
| **storetle** | **200,598** | **−46.3%** |
|
|
30
|
+
|
|
31
|
+
**2. Against solid archives** (maximum compression, no random access):
|
|
32
|
+
|
|
33
|
+
| method | bytes |
|
|
34
|
+
|---|---|
|
|
35
|
+
| tar + gzip -9 | 370,307 |
|
|
36
|
+
| tar + zstd -19 | 220,512 |
|
|
37
|
+
| tar + zstd -22 --long | 220,422 |
|
|
38
|
+
| tar + zstd -22 + trained dict | 204,386 |
|
|
39
|
+
| **storetle (keeps random access)** | **200,598** |
|
|
40
|
+
|
|
41
|
+
Storetle matches solid zstd-22 while remaining randomly accessible. The
|
|
42
|
+
margin comes from three things: HTML-aware encoding (tags/attributes become
|
|
43
|
+
1-byte IDs from a shared vocabulary, structure and text compressed as
|
|
44
|
+
separate streams), a 1 MB dictionary trained on the binary encoding, and
|
|
45
|
+
256-document chunks that capture cross-page template redundancy.
|
|
46
|
+
|
|
47
|
+
On larger corpora measured against gzip WARC: 28.4% smaller on 3,000 live
|
|
48
|
+
Common Crawl docs (348.6 MB), 27–82% on same-domain collections (191 pages,
|
|
49
|
+
20 domains) where template sharing is strongest. Round-trip verified on all
|
|
50
|
+
of the above. Stream it yourself: `python3 bench_cc.py --docs 3000`.
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
brew install zstd # macOS (Ubuntu: apt install libzstd-dev)
|
|
56
|
+
pip install storetle
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
No Python dependencies — stdlib plus system libzstd via ctypes (brotli
|
|
60
|
+
fallback if zstd is missing). `lxml` is optional but strongly recommended
|
|
61
|
+
for encoding speed.
|
|
62
|
+
|
|
63
|
+
## CLI
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
storetle pack my_crawl/ archive.storetle # folder of .html → archive
|
|
67
|
+
storetle unpack archive.storetle out/ # archive → .html files
|
|
68
|
+
storetle info archive.storetle # stats
|
|
69
|
+
storetle get archive.storetle 42 # one doc to stdout, O(1)
|
|
70
|
+
storetle bench my_crawl/ # benchmark on YOUR data
|
|
71
|
+
storetle from-warc CC-MAIN.warc.gz archive.storetle
|
|
72
|
+
storetle to-warc archive.storetle out.warc.gz
|
|
73
|
+
storetle train my_corpus/ --output my.bin # domain-specific dictionary
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Python API
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import storetle
|
|
80
|
+
|
|
81
|
+
with storetle.StreamWriter('archive.storetle', workers=8) as w:
|
|
82
|
+
for html in crawl:
|
|
83
|
+
w.append(html)
|
|
84
|
+
|
|
85
|
+
with storetle.StreamReader('archive.storetle') as r:
|
|
86
|
+
print(r.doc_count)
|
|
87
|
+
doc = r[42] # random access: decompresses one ~2MB chunk
|
|
88
|
+
batch = r[100:200]
|
|
89
|
+
for doc in r: # sequential
|
|
90
|
+
...
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Rust reader
|
|
94
|
+
|
|
95
|
+
A read-only Rust implementation lives in [`rust/`](rust/) — library plus a
|
|
96
|
+
`storetle-rs` CLI (`ls` / `get` / `unpack`), differentially tested
|
|
97
|
+
byte-for-byte against the Python decoder.
|
|
98
|
+
|
|
99
|
+
## In the browser
|
|
100
|
+
|
|
101
|
+
[`web/`](web/) has a zero-dependency demo page: the Rust reader compiled to
|
|
102
|
+
WebAssembly. Drop a `.storetle` file onto the page and browse its documents.
|
|
103
|
+
|
|
104
|
+
## How it works
|
|
105
|
+
|
|
106
|
+
1. **Parse** — HTML is tokenized to a node stream (lxml fast path, pure-Python fallback).
|
|
107
|
+
2. **Encode** — tags and attribute names become 1-byte IDs from a fixed
|
|
108
|
+
vocabulary (130 tags, 163 attributes, 1,394 shared strings).
|
|
109
|
+
`class="flex items-center gap-4"` is split into per-token vocabulary
|
|
110
|
+
lookups. Structure and text go to separate streams.
|
|
111
|
+
3. **Chunk** — up to 256 docs / 2 MiB are concatenated, preserving
|
|
112
|
+
cross-document redundancy.
|
|
113
|
+
4. **Compress** — zstd-22 with a 1 MB dictionary trained on the binary
|
|
114
|
+
encoding (ships with the codec).
|
|
115
|
+
5. **Index** — a footer index maps documents to chunks, so readers seek
|
|
116
|
+
instead of scanning. Works over HTTP range requests against plain
|
|
117
|
+
object storage.
|
|
118
|
+
|
|
119
|
+
Full byte-level spec: [FORMAT.md](FORMAT.md).
|
|
120
|
+
|
|
121
|
+
## Limitations — read these
|
|
122
|
+
|
|
123
|
+
- **Structural, not byte-exact.** Reconstructed HTML preserves every tag,
|
|
124
|
+
attribute, text node, comment, and script/style body, but is
|
|
125
|
+
re-serialized (indentation and inter-tag whitespace differ). Fine for
|
|
126
|
+
corpora and ML pipelines; **wrong for byte-exact archival** — if you need
|
|
127
|
+
forensic fidelity, use WARC.
|
|
128
|
+
- **HTML only.** `from-warc` keeps HTML response records and skips
|
|
129
|
+
everything else. A raw passthrough mode for JSON/text is on the roadmap.
|
|
130
|
+
- **Encoding speed** ~3.5 MB/s per core (Python). Parallel via
|
|
131
|
+
`workers=N`. Decoding is zstd-bound and fast. A native encoder is on the
|
|
132
|
+
roadmap.
|
|
133
|
+
- **Alpha.** Format version 2. Validated on 150k+ Common Crawl documents,
|
|
134
|
+
but expect rough edges.
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
|
|
138
|
+
MIT © 2026 Davis Brief
|
|
storetle-0.2.0/pyproject.toml
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "storetle"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "HTML-aware compression for document corpora — solid-archive ratios with random access"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Davis Brief", email = "davis@team8.co" }]
|
|
13
|
+
keywords = ["html", "compression", "warc", "web-archive", "corpus", "dataset", "streaming"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
20
|
+
"Topic :: System :: Archiving :: Compression",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
# No hard Python dependencies — stdlib + system libzstd via ctypes.
|
|
25
|
+
# lxml is optional but strongly recommended for encoding throughput.
|
|
26
|
+
dependencies = []
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
fast = ["lxml"]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
Homepage = "https://github.com/adventurelands/storetle"
|
|
33
|
+
Specification = "https://github.com/adventurelands/storetle/blob/main/FORMAT.md"
|
|
34
|
+
|
|
35
|
+
[project.scripts]
|
|
36
|
+
storetle = "storetle.cli:main"
|
|
37
|
+
|
|
38
|
+
[tool.setuptools]
|
|
39
|
+
packages = ["storetle"]
|
|
40
|
+
|
|
41
|
+
[tool.setuptools.package-data]
|
|
42
|
+
storetle = ["cube_dict_v10.bin"]
|
storetle-0.2.0/setup.cfg
ADDED
|
storetle-0.2.0/storetle/__init__.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# storetle — HTML-aware streaming compression for large document collections
|
|
2
|
+
#
|
|
3
|
+
# Primary API:
|
|
4
|
+
# StreamWriter — append HTML documents to a .storetle file
|
|
5
|
+
# StreamReader — read/iterate/random-access a .storetle file
|
|
6
|
+
# pack — compress a folder of HTML files → .storetle
|
|
7
|
+
# unpack — decompress .storetle → folder of HTML files
|
|
8
|
+
# benchmark — compare storetle vs gzip on your own data
|
|
9
|
+
|
|
10
|
+
from .stream import StreamWriter, StreamReader
|
|
11
|
+
from .folder import pack, unpack
|
|
12
|
+
|
|
13
|
+
__version__ = '0.2.0'
|
|
14
|
+
__all__ = ['StreamWriter', 'StreamReader', 'pack', 'unpack', 'benchmark']
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def benchmark(folder, quiet=False):
    """Benchmark storetle vs gzip WARC on a folder of HTML files.

    Every regular file matching ``**/*.html`` under *folder* is read into
    memory and measured three ways: gzip -9 per file, gzip -9 over one
    concatenation of minimal WARC-style records, and a storetle archive
    written to a temporary file and read back.

    Args:
        folder: Directory searched recursively for ``.html`` files.
        quiet: When true, the result table is not printed.

    Returns:
        A dict with the keys ``files``, ``original_bytes``, ``gzip_warc``,
        ``gzip_per_file``, ``storetle``, ``savings_vs_gzip_warc_pct``,
        ``write_kbps``, ``read_kbps`` and ``roundtrip_ok``.
        ``roundtrip_ok`` only states that the number of documents read back
        equals the number written; document contents are not compared.

    Raises:
        ValueError: If no ``.html`` files are found under *folder*.

    Example:
        import storetle
        results = storetle.benchmark('my_crawl_data/')
    """
    import gzip
    import os
    import tempfile
    import time
    from pathlib import Path

    # The glob also matches directories whose name ends in ".html"; reading
    # one of those would raise, so keep regular files only.
    files = sorted(p for p in Path(folder).glob('**/*.html') if p.is_file())
    if not files:
        raise ValueError(f'No .html files found in {folder}')

    docs = [f.read_bytes() for f in files]
    total_html = sum(len(d) for d in docs)

    # gzip WARC (industry standard): minimal record headers, one gzip stream
    warc_raw = b''.join(
        'WARC/1.0\r\nContent-Length: {}\r\n\r\n'.format(len(d)).encode()
        + d + b'\r\n\r\n'
        for d in docs
    )
    warc_gz = len(gzip.compress(warc_raw, compresslevel=9))

    # gzip per-file
    gz_pf = sum(len(gzip.compress(d, compresslevel=9)) for d in docs)

    # storetle: only a path is needed, so the handle is closed right away and
    # the file is removed in the finally block below.
    with tempfile.NamedTemporaryFile(suffix='.storetle', delete=False) as tf:
        tmp = tf.name
    try:
        t0 = time.time()
        with StreamWriter(tmp) as w:
            for d in docs:
                w.append(d)
        write_time = time.time() - t0

        t1 = time.time()
        with StreamReader(tmp) as r:
            recovered = list(r)
        read_time = time.time() - t1

        cube_size = os.path.getsize(tmp)
    finally:
        os.unlink(tmp)

    # Count-only check: see the note on ``roundtrip_ok`` in the docstring.
    rt_ok = len(recovered) == len(docs)

    result = {
        'files': len(files),
        'original_bytes': total_html,
        'gzip_warc': warc_gz,
        'gzip_per_file': gz_pf,
        'storetle': cube_size,
        'savings_vs_gzip_warc_pct': round((warc_gz - cube_size) / warc_gz * 100, 1),
        # Clamp elapsed time so a very fast run cannot divide by zero.
        'write_kbps': int(total_html / 1024 / max(write_time, 0.001)),
        'read_kbps': int(total_html / 1024 / max(read_time, 0.001)),
        'roundtrip_ok': rt_ok,
    }

    if not quiet:
        _print_benchmark(result)

    return result
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _print_benchmark(r):
    """Print the result dict produced by ``benchmark`` as a text table.

    Args:
        r: Mapping with the keys ``files``, ``original_bytes``,
            ``gzip_per_file``, ``gzip_warc``, ``storetle``,
            ``savings_vs_gzip_warc_pct``, ``write_kbps``, ``read_kbps`` and
            ``roundtrip_ok``.
    """
    def fmt(n):
        # Human-readable size using 1024-based units.
        if n < 1024:
            return f'{n}B'
        if n < 1048576:
            return f'{n/1024:.1f}KB'
        return f'{n/1048576:.2f}MB'

    def pct(a, b):
        # Savings of a relative to b. A corpus of empty files has b == 0;
        # report "n/a" instead of dividing by zero.
        if not b:
            return 'n/a'
        return f'{100*(1-a/b):.1f}%'

    orig = r['original_bytes']
    print(f'\n storetle benchmark — {r["files"]} files, {fmt(orig)} original\n')
    print(f' {"Format":<28} {"Size":>10} {"Savings":>8}')
    print(f' {"─"*50}')
    print(f' {"Original HTML":<28} {fmt(orig):>10}')
    print(f' {"gzip per-file (current)":<28} {fmt(r["gzip_per_file"]):>10} {pct(r["gzip_per_file"], orig):>8}')
    print(f' {"gzip WARC (Common Crawl std)":<28} {fmt(r["gzip_warc"]):>10} {pct(r["gzip_warc"], orig):>8}')
    print(f' {"storetle":<28} {fmt(r["storetle"]):>10} {pct(r["storetle"], orig):>8}')
    print()

    # Positive when the storetle archive is smaller than the gzip WARC.
    saved = r["gzip_warc"] - r["storetle"]
    sign = '+' if saved > 0 else ''
    print(f' vs gzip WARC: {sign}{r["savings_vs_gzip_warc_pct"]}% smaller ({fmt(abs(saved))} {"saved" if saved > 0 else "larger"})')
    print(f' Write speed: {r["write_kbps"]:,} KB/s')
    print(f' Read speed: {r["read_kbps"]:,} KB/s')
    print(f' Round-trip: {"✓ all documents verified" if r["roundtrip_ok"] else "✗ FAILED"}')
    print()
|
|
storetle-0.2.0/storetle/brotli_compat.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
# brotli_compat.py — thin ctypes wrapper around the system brotli library
|
|
2
|
+
#
|
|
3
|
+
# Exposes compress(data, quality=11) and decompress(data) with the same
|
|
4
|
+
# calling convention as zlib.compress / zlib.decompress so callers can
|
|
5
|
+
# swap one for the other with a single import change.
|
|
6
|
+
#
|
|
7
|
+
# Falls back to zlib if the brotli dylib is not present (other machines).
|
|
8
|
+
|
|
9
|
+
import ctypes, zlib, os
|
|
10
|
+
|
|
11
|
+
# Module state filled in by _try_load(): the two ctypes library handles and a
# flag telling the public functions whether they may be used.
_BROTLI_OK = False
_enc = None
_dec = None


def _try_load():
    """Locate and load the system brotli encoder/decoder libraries.

    On success the module globals ``_enc``, ``_dec`` and ``_BROTLI_OK`` are
    set and the one-shot encode/decode entry points get their ctypes
    signatures. On failure the globals are left untouched, so callers keep
    using the zlib fallback. Load failures never propagate.
    """
    global _BROTLI_OK, _enc, _dec
    # Function-scope import keeps the module's import line unchanged.
    from ctypes.util import find_library

    candidates = [
        '/usr/local/lib/libbrotlienc.dylib',
        '/usr/local/Cellar/brotli/1.2.0/lib/libbrotlienc.dylib',
        'libbrotlienc.so',
        'libbrotlienc.dylib',
    ]
    pairs = [(p, p.replace('enc', 'dec')) for p in candidates]

    # Also ask the platform for the libraries; this finds versioned sonames
    # that the literal names above do not cover.
    try:
        found = (find_library('brotlienc'), find_library('brotlidec'))
    except OSError:
        found = (None, None)
    if found[0] and found[1]:
        pairs.append(found)

    for ep, dp in pairs:
        # No os.path.exists() guard: a bare name such as "libbrotlienc.so" is
        # resolved by the dynamic loader's search path, not relative to the
        # current directory, so an exists() check would reject it wrongly.
        # A missing library simply raises OSError below.
        try:
            e = ctypes.CDLL(ep)
            d = ctypes.CDLL(dp)
            # wire up encoder
            e.BrotliEncoderMaxCompressedSize.restype = ctypes.c_size_t
            e.BrotliEncoderMaxCompressedSize.argtypes = [ctypes.c_size_t]
            e.BrotliEncoderCompress.restype = ctypes.c_int
            e.BrotliEncoderCompress.argtypes = [
                ctypes.c_int, ctypes.c_int, ctypes.c_int,
                ctypes.c_size_t, ctypes.c_char_p,
                ctypes.POINTER(ctypes.c_size_t), ctypes.c_char_p,
            ]
            # wire up decoder
            d.BrotliDecoderDecompress.restype = ctypes.c_int
            d.BrotliDecoderDecompress.argtypes = [
                ctypes.c_size_t, ctypes.c_char_p,
                ctypes.POINTER(ctypes.c_size_t), ctypes.c_char_p,
            ]
        except (OSError, AttributeError):
            # OSError: library not loadable; AttributeError: symbol missing.
            continue
        _enc = e
        _dec = d
        _BROTLI_OK = True
        return


_try_load()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def compress(data: bytes, quality: int = 11, lgwin: int = 24, mode: int = 0) -> bytes:
    """Compress data with brotli, or with zlib level 9 if brotli is unavailable.

    Args:
        data: Payload to compress. Any bytes-like object is accepted.
        quality: Brotli quality, 0-11. Ignored by the zlib fallback.
        lgwin: Brotli window size (log2). Ignored by the zlib fallback.
        mode: Brotli mode, 0=generic 1=text 2=font. Ignored by the zlib
            fallback.

    Returns:
        The compressed bytes.

    Raises:
        RuntimeError: If the brotli encoder reports failure.
    """
    if not _BROTLI_OK:
        return zlib.compress(data, level=9)

    # The ctypes signature takes c_char_p, which only accepts ``bytes``.
    # Copy other bytes-like inputs so both backends accept the same types.
    if not isinstance(data, bytes):
        data = bytes(data)

    max_out = _enc.BrotliEncoderMaxCompressedSize(len(data))
    out_buf = ctypes.create_string_buffer(max_out)
    # In: capacity of out_buf. Out: number of bytes actually written.
    out_size = ctypes.c_size_t(max_out)

    ok = _enc.BrotliEncoderCompress(
        quality, lgwin, mode,
        len(data), data,
        ctypes.byref(out_size), out_buf,
    )
    if not ok:
        raise RuntimeError('brotli compression failed')
    return out_buf.raw[:out_size.value]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def decompress(data: bytes) -> bytes:
    """Decompress brotli data, or zlib data if brotli is unavailable.

    The brotli path uses the streaming decoder. The one-shot
    ``BrotliDecoderDecompress`` call reports a too-small output buffer as a
    plain error, indistinguishable from corrupt input, so it cannot be used
    when the decompressed size is unknown.

    Args:
        data: Compressed payload. Any bytes-like object is accepted.

    Returns:
        The decompressed bytes.

    Raises:
        RuntimeError: If the decoder cannot be created, or the input is
            corrupt or truncated.
    """
    if not _BROTLI_OK:
        return zlib.decompress(data)

    lib = _dec
    # Declare the streaming entry points here so this function does not
    # depend on which signatures were wired up when the library was loaded.
    lib.BrotliDecoderCreateInstance.restype = ctypes.c_void_p
    lib.BrotliDecoderCreateInstance.argtypes = [
        ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p,
    ]
    lib.BrotliDecoderDestroyInstance.restype = None
    lib.BrotliDecoderDestroyInstance.argtypes = [ctypes.c_void_p]
    lib.BrotliDecoderDecompressStream.restype = ctypes.c_int
    lib.BrotliDecoderDecompressStream.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_size_t), ctypes.POINTER(ctypes.c_void_p),
        ctypes.POINTER(ctypes.c_size_t), ctypes.POINTER(ctypes.c_void_p),
        ctypes.POINTER(ctypes.c_size_t),
    ]

    data = bytes(data)
    # Private copy of the input with a stable address; at least one byte is
    # allocated so the buffer is never zero-length.
    in_buf = ctypes.create_string_buffer(data, max(len(data), 1))
    avail_in = ctypes.c_size_t(len(data))
    next_in = ctypes.c_void_p(ctypes.addressof(in_buf))

    chunk = 1 << 20  # output is drained 1 MiB at a time
    out_buf = ctypes.create_string_buffer(chunk)

    state = lib.BrotliDecoderCreateInstance(None, None, None)
    if not state:
        raise RuntimeError('brotli decoder allocation failed')

    pieces = []
    try:
        while True:
            # Hand the decoder the whole output buffer again on every pass.
            avail_out = ctypes.c_size_t(chunk)
            next_out = ctypes.c_void_p(ctypes.addressof(out_buf))
            result = lib.BrotliDecoderDecompressStream(
                state,
                ctypes.byref(avail_in), ctypes.byref(next_in),
                ctypes.byref(avail_out), ctypes.byref(next_out),
                None,
            )
            produced = chunk - avail_out.value
            if produced:
                pieces.append(out_buf.raw[:produced])
            if result == 1:  # BROTLI_DECODER_RESULT_SUCCESS
                return b''.join(pieces)
            if result != 3:  # anything but NEEDS_MORE_OUTPUT is fatal here
                # 0 = error (corrupt input); 2 = needs more input, which
                # means the payload is truncated since all of it was given.
                raise RuntimeError(f'brotli decompression failed (result={result})')
    finally:
        lib.BrotliDecoderDestroyInstance(state)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def available() -> bool:
    """Return the module's ``_BROTLI_OK`` flag (true once brotli was loaded)."""
    loaded = _BROTLI_OK
    return loaded
|