storetle 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {storetle-0.2.1/storetle.egg-info → storetle-0.2.2}/PKG-INFO +30 -1
  2. {storetle-0.2.1 → storetle-0.2.2}/README.md +29 -0
  3. {storetle-0.2.1 → storetle-0.2.2}/pyproject.toml +1 -1
  4. {storetle-0.2.1 → storetle-0.2.2}/storetle/__init__.py +1 -1
  5. {storetle-0.2.1 → storetle-0.2.2}/storetle/cli.py +17 -8
  6. {storetle-0.2.1 → storetle-0.2.2}/storetle/remote.py +17 -0
  7. {storetle-0.2.1 → storetle-0.2.2}/storetle/stream.py +13 -0
  8. storetle-0.2.2/storetle/text.py +78 -0
  9. {storetle-0.2.1 → storetle-0.2.2/storetle.egg-info}/PKG-INFO +30 -1
  10. {storetle-0.2.1 → storetle-0.2.2}/storetle.egg-info/SOURCES.txt +1 -0
  11. {storetle-0.2.1 → storetle-0.2.2}/LICENSE +0 -0
  12. {storetle-0.2.1 → storetle-0.2.2}/setup.cfg +0 -0
  13. {storetle-0.2.1 → storetle-0.2.2}/storetle/brotli_compat.py +0 -0
  14. {storetle-0.2.1 → storetle-0.2.2}/storetle/cube_dict_v10.bin +0 -0
  15. {storetle-0.2.1 → storetle-0.2.2}/storetle/decoder.py +0 -0
  16. {storetle-0.2.1 → storetle-0.2.2}/storetle/encoder.py +0 -0
  17. {storetle-0.2.1 → storetle-0.2.2}/storetle/folder.py +0 -0
  18. {storetle-0.2.1 → storetle-0.2.2}/storetle/vocab.py +0 -0
  19. {storetle-0.2.1 → storetle-0.2.2}/storetle/warc.py +0 -0
  20. {storetle-0.2.1 → storetle-0.2.2}/storetle/zstd_compat.py +0 -0
  21. {storetle-0.2.1 → storetle-0.2.2}/storetle.egg-info/dependency_links.txt +0 -0
  22. {storetle-0.2.1 → storetle-0.2.2}/storetle.egg-info/entry_points.txt +0 -0
  23. {storetle-0.2.1 → storetle-0.2.2}/storetle.egg-info/requires.txt +0 -0
  24. {storetle-0.2.1 → storetle-0.2.2}/storetle.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: storetle
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: HTML-aware compression for document corpora — solid-archive ratios with random access
5
5
  Author-email: Davis Brief <davis@team8.co>
6
6
  License: MIT
@@ -96,6 +96,35 @@ storetle to-warc archive.storetle out.warc.gz
96
96
  storetle train my_corpus/ --output my.bin # domain-specific dictionary
97
97
  ```
98
98
 
99
+ ## Hosted corpora — free
100
+
101
+ **Simple English Wikipedia, complete** — 267,503 articles, 10.06 GB of HTML
102
+ in 843 MB, snapshot 2025-03-20, CC-BY-SA-4.0. Six self-contained shards with
103
+ JSONL metadata indexes (title ↔ doc index) and a SHA-256 manifest:
104
+
105
+ ```
106
+ https://pub-0a9a18b1320f46f794f8374a71aa608b.r2.dev/simplewiki/manifest.json
107
+ ```
108
+
109
+ Pull one article out of a 100+ MB shard, by index, in ~2 seconds:
110
+
111
+ ```bash
112
+ storetle get https://pub-0a9a18b1320f46f794f8374a71aa608b.r2.dev/simplewiki/simplewiki-20250320-0005.storetle 11244 # Albert Einstein, full HTML
113
+ storetle get https://pub-0a9a18b1320f46f794f8374a71aa608b.r2.dev/simplewiki/simplewiki-20250320-0005.storetle 11244 --text # …as clean plain text
114
+ ```
115
+
116
+ Find a title's index by grepping the shard's `.index.jsonl`. More corpora
117
+ (arXiv, PubMed Central OA) coming.
118
+
119
+ ## Plain text extraction (v0.2.2)
120
+
121
+ `--text` on `get`/`unpack` (and `get_text()`/`iter_text()` in the API)
122
+ extracts tag-stripped clean text **without re-parsing HTML** — the encoding
123
+ already separates structure from content, so text extraction is a walk over
124
+ the structure opcodes that keeps text nodes, drops script/style bodies, and
125
+ emits newlines at block boundaries. A 383 KB Wikipedia article becomes 39 KB
126
+ of readable text.
127
+
99
128
  ## Remote archives (v0.2.1)
100
129
 
101
130
  `get`, `info`, and `unpack` accept URLs. Opening an archive costs a few KB
@@ -73,6 +73,35 @@ storetle to-warc archive.storetle out.warc.gz
73
73
  storetle train my_corpus/ --output my.bin # domain-specific dictionary
74
74
  ```
75
75
 
76
+ ## Hosted corpora — free
77
+
78
+ **Simple English Wikipedia, complete** — 267,503 articles, 10.06 GB of HTML
79
+ in 843 MB, snapshot 2025-03-20, CC-BY-SA-4.0. Six self-contained shards with
80
+ JSONL metadata indexes (title ↔ doc index) and a SHA-256 manifest:
81
+
82
+ ```
83
+ https://pub-0a9a18b1320f46f794f8374a71aa608b.r2.dev/simplewiki/manifest.json
84
+ ```
85
+
86
+ Pull one article out of a 100+ MB shard, by index, in ~2 seconds:
87
+
88
+ ```bash
89
+ storetle get https://pub-0a9a18b1320f46f794f8374a71aa608b.r2.dev/simplewiki/simplewiki-20250320-0005.storetle 11244 # Albert Einstein, full HTML
90
+ storetle get https://pub-0a9a18b1320f46f794f8374a71aa608b.r2.dev/simplewiki/simplewiki-20250320-0005.storetle 11244 --text # …as clean plain text
91
+ ```
92
+
93
+ Find a title's index by grepping the shard's `.index.jsonl`. More corpora
94
+ (arXiv, PubMed Central OA) coming.
95
+
96
+ ## Plain text extraction (v0.2.2)
97
+
98
+ `--text` on `get`/`unpack` (and `get_text()`/`iter_text()` in the API)
99
+ extracts tag-stripped clean text **without re-parsing HTML** — the encoding
100
+ already separates structure from content, so text extraction is a walk over
101
+ the structure opcodes that keeps text nodes, drops script/style bodies, and
102
+ emits newlines at block boundaries. A 383 KB Wikipedia article becomes 39 KB
103
+ of readable text.
104
+
76
105
  ## Remote archives (v0.2.1)
77
106
 
78
107
  `get`, `info`, and `unpack` accept URLs. Opening an archive costs a few KB
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "storetle"
7
- version = "0.2.1"
7
+ version = "0.2.2"
8
8
  description = "HTML-aware compression for document corpora — solid-archive ratios with random access"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
@@ -11,7 +11,7 @@ from .stream import StreamWriter, StreamReader
11
11
  from .remote import RemoteReader
12
12
  from .folder import pack, unpack
13
13
 
14
- __version__ = '0.2.1'
14
+ __version__ = '0.2.2'
15
15
  __all__ = ['StreamWriter', 'StreamReader', 'RemoteReader', 'pack', 'unpack', 'benchmark']
16
16
 
17
17
 
@@ -68,17 +68,21 @@ def _open_reader(src):
68
68
 
69
69
 
70
70
  def cmd_unpack(args):
71
+ text = '--text' in args
72
+ args = [a for a in args if a != '--text']
71
73
  if len(args) < 2:
72
- print('Usage: storetle unpack <file-or-url> <output_folder>')
74
+ print('Usage: storetle unpack <file-or-url> <output_folder> [--text]')
73
75
  sys.exit(1)
74
76
  src = args[0]
75
77
  dst = Path(args[1])
76
78
  dst.mkdir(parents=True, exist_ok=True)
77
79
 
80
+ ext = 'txt' if text else 'html'
78
81
  with _open_reader(src) as r:
79
- print(f'Extracting {r.doc_count} documents to {dst}/')
80
- for i, doc in enumerate(r):
81
- out = dst / f'doc_{i:06d}.html'
82
+ print(f'Extracting {r.doc_count} documents to {dst}/ as .{ext}')
83
+ docs = r.iter_text() if text else iter(r)
84
+ for i, doc in enumerate(docs):
85
+ out = dst / f'doc_{i:06d}.{ext}'
82
86
  out.write_bytes(doc)
83
87
  if (i + 1) % 100 == 0:
84
88
  print(f' {i+1}/{r.doc_count}...')
@@ -110,14 +114,18 @@ def cmd_info(args):
110
114
 
111
115
 
112
116
  def cmd_get(args):
117
+ text = '--text' in args
118
+ args = [a for a in args if a != '--text']
113
119
  if len(args) < 2:
114
- print('Usage: storetle get <file-or-url> <index>')
120
+ print('Usage: storetle get <file-or-url> <index> [--text]')
115
121
  sys.exit(1)
116
122
  with _open_reader(args[0]) as r:
117
123
  try:
118
124
  idx = int(args[1])
119
- doc = r[idx]
125
+ doc = r.get_text(idx) if text else r[idx]
120
126
  sys.stdout.buffer.write(doc)
127
+ if text:
128
+ sys.stdout.buffer.write(b'\n')
121
129
  except (IndexError, ValueError) as e:
122
130
  print(f'Error: {e}')
123
131
  sys.exit(1)
@@ -270,10 +278,11 @@ HELP = """storetle — HTML-aware compression for large document collections
270
278
  Commands:
271
279
  bench <folder> Benchmark your HTML data vs gzip WARC
272
280
  pack <folder> <output> Compress a folder → .storetle file
273
- unpack <file-or-url> <out_folder> Extract a .storetle → HTML files
281
+ unpack <file-or-url> <out> [--text] Extract → HTML files (or clean .txt)
274
282
  info <file-or-url> Show file statistics
275
283
  get <file-or-url> <index> Extract one doc by index — over HTTP this
276
- fetches only the containing ~2MB chunk
284
+ fetches only the containing ~2MB chunk.
285
+ Add --text for tag-stripped plain text
277
286
  from-warc <input.warc[.gz]> <out> Convert WARC → .storetle
278
287
  to-warc <input.storetle> <out> Convert .storetle → WARC (or .warc.gz)
279
288
  warc-encode <input.warc[.gz]> <out> Encode HTML in-place → valid .warc.gz (smaller, standard format)
@@ -152,6 +152,23 @@ class RemoteReader:
152
152
  for raw in self._load_chunk(ci):
153
153
  yield _decode_doc(raw)
154
154
 
155
+ def get_text(self, idx):
156
+ """Return extracted plain text (no tags) for a single document."""
157
+ from .text import decode_text
158
+ if idx < 0:
159
+ idx += self.doc_count
160
+ if not 0 <= idx < self.doc_count:
161
+ raise IndexError('doc %d out of range (%d docs)' % (idx, self.doc_count))
162
+ ci = self._locate(idx)
163
+ return decode_text(self._load_chunk(ci)[idx - self._cum[ci]])
164
+
165
+ def iter_text(self):
166
+ """Yield extracted plain text for every document, in order."""
167
+ from .text import decode_text
168
+ for ci in range(len(self._index)):
169
+ for raw in self._load_chunk(ci):
170
+ yield decode_text(raw)
171
+
155
172
  def info(self):
156
173
  comp = self._chunk_ends[-1] - self._index[0][0] if self._index else 0
157
174
  return {
@@ -426,6 +426,19 @@ class StreamReader:
426
426
  raw = self._read_chunk(ci)[wi]
427
427
  return _decode_doc(raw)
428
428
 
429
+ def get_text(self, doc_idx: int):
430
+ """Return extracted plain text (no tags) for a single document."""
431
+ from .text import decode_text
432
+ ci, wi = self._locate(doc_idx)
433
+ return decode_text(self._read_chunk(ci)[wi])
434
+
435
+ def iter_text(self):
436
+ """Yield extracted plain text for every document, in order."""
437
+ from .text import decode_text
438
+ for ci in range(len(self._index)):
439
+ for raw in self._read_chunk(ci):
440
+ yield decode_text(raw)
441
+
429
442
  def __getitem__(self, key):
430
443
  if isinstance(key, int):
431
444
  return self.get(key)
@@ -0,0 +1,78 @@
1
+ # text.py — plain-text extraction straight from the NodeOp encoding.
2
+ #
3
+ # The encoded form already separates structure (struct stream) from content
4
+ # (content stream), so producing clean text never re-parses HTML: walk the
5
+ # opcodes, keep T_TEXT payloads, skip script/style bodies (T_RAWTEXT),
6
+ # comments and doctypes, and emit newlines at block-element boundaries.
7
+ #
8
+ # This consumes the content stream in exact lockstep with stream._decode_doc —
9
+ # every string the HTML decoder would read, this reads too, it just throws
10
+ # most of them away.
11
+
12
+ import re
13
+ import struct
14
+
15
+ from .decoder import _Stream
16
+ from .encoder import (T_OPEN, T_CLOSE, T_TEXT, T_DOCTYPE,
17
+ T_COMMENT, T_SELFCLOSE, T_RAWTEXT)
18
+ from .vocab import ID_TO_TAG, SHARED_STRINGS, UNKNOWN_ID
19
+
20
+ _BLOCK_TAGS = frozenset((
21
+ 'p', 'div', 'br', 'li', 'ul', 'ol', 'dl', 'dt', 'dd',
22
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
23
+ 'table', 'tr', 'caption', 'thead', 'tbody',
24
+ 'section', 'article', 'aside', 'header', 'footer', 'main', 'nav',
25
+ 'blockquote', 'pre', 'figure', 'figcaption', 'hr', 'title',
26
+ ))
27
+ _CELL_TAGS = frozenset(('td', 'th'))
28
+
29
+ _collapse_blank = re.compile(r'\n\s*\n+')
30
+ _collapse_space = re.compile(r'[ \t\f\v]+')
31
+
32
+
33
+ def decode_text(raw: bytes) -> bytes:
34
+ """Extract plain text from one encoded document (the blob stored in a
35
+ chunk), without reconstructing HTML."""
36
+ ss_len = struct.unpack_from('>I', raw, 0)[0]
37
+ ss = _Stream(raw[4: 4 + ss_len])
38
+ cs = _Stream(raw[4 + ss_len:])
39
+ ss_data_len = ss_len
40
+
41
+ out = []
42
+
43
+ def boundary(tag):
44
+ if tag in _BLOCK_TAGS:
45
+ out.append('\n')
46
+ elif tag in _CELL_TAGS:
47
+ out.append('\t')
48
+
49
+ while ss._pos < ss_data_len:
50
+ nt = ss.read_byte()
51
+
52
+ if nt in (T_OPEN, T_SELFCLOSE):
53
+ tag_id = ss.read_byte()
54
+ tag = cs.read_string(SHARED_STRINGS) if tag_id == UNKNOWN_ID \
55
+ else ID_TO_TAG.get(tag_id, '')
56
+ ac = ss.read_byte()
57
+ for _ in range(ac):
58
+ aid = ss.read_byte()
59
+ if aid == UNKNOWN_ID:
60
+ cs.read_string(SHARED_STRINGS) # attr name — discard
61
+ cs.read_string(SHARED_STRINGS) # attr value — discard
62
+ boundary(tag)
63
+
64
+ elif nt == T_CLOSE:
65
+ pass # no payload; block boundary handled at open
66
+
67
+ elif nt == T_TEXT:
68
+ t = cs.read_string(SHARED_STRINGS)
69
+ if t:
70
+ out.append(t)
71
+
72
+ elif nt in (T_RAWTEXT, T_DOCTYPE, T_COMMENT):
73
+ cs.read_string(SHARED_STRINGS) # script/style/meta — discard
74
+
75
+ text = ''.join(out)
76
+ text = _collapse_space.sub(' ', text)
77
+ text = _collapse_blank.sub('\n', text)
78
+ return text.strip().encode('utf-8')
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: storetle
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: HTML-aware compression for document corpora — solid-archive ratios with random access
5
5
  Author-email: Davis Brief <davis@team8.co>
6
6
  License: MIT
@@ -96,6 +96,35 @@ storetle to-warc archive.storetle out.warc.gz
96
96
  storetle train my_corpus/ --output my.bin # domain-specific dictionary
97
97
  ```
98
98
 
99
+ ## Hosted corpora — free
100
+
101
+ **Simple English Wikipedia, complete** — 267,503 articles, 10.06 GB of HTML
102
+ in 843 MB, snapshot 2025-03-20, CC-BY-SA-4.0. Six self-contained shards with
103
+ JSONL metadata indexes (title ↔ doc index) and a SHA-256 manifest:
104
+
105
+ ```
106
+ https://pub-0a9a18b1320f46f794f8374a71aa608b.r2.dev/simplewiki/manifest.json
107
+ ```
108
+
109
+ Pull one article out of a 100+ MB shard, by index, in ~2 seconds:
110
+
111
+ ```bash
112
+ storetle get https://pub-0a9a18b1320f46f794f8374a71aa608b.r2.dev/simplewiki/simplewiki-20250320-0005.storetle 11244 # Albert Einstein, full HTML
113
+ storetle get https://pub-0a9a18b1320f46f794f8374a71aa608b.r2.dev/simplewiki/simplewiki-20250320-0005.storetle 11244 --text # …as clean plain text
114
+ ```
115
+
116
+ Find a title's index by grepping the shard's `.index.jsonl`. More corpora
117
+ (arXiv, PubMed Central OA) coming.
118
+
119
+ ## Plain text extraction (v0.2.2)
120
+
121
+ `--text` on `get`/`unpack` (and `get_text()`/`iter_text()` in the API)
122
+ extracts tag-stripped clean text **without re-parsing HTML** — the encoding
123
+ already separates structure from content, so text extraction is a walk over
124
+ the structure opcodes that keeps text nodes, drops script/style bodies, and
125
+ emits newlines at block boundaries. A 383 KB Wikipedia article becomes 39 KB
126
+ of readable text.
127
+
99
128
  ## Remote archives (v0.2.1)
100
129
 
101
130
  `get`, `info`, and `unpack` accept URLs. Opening an archive costs a few KB
@@ -10,6 +10,7 @@ storetle/encoder.py
10
10
  storetle/folder.py
11
11
  storetle/remote.py
12
12
  storetle/stream.py
13
+ storetle/text.py
13
14
  storetle/vocab.py
14
15
  storetle/warc.py
15
16
  storetle/zstd_compat.py
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes