trieset 0.1.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Daniel Benjamim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,27 @@
1
+ # Make the sdist self-contained. ASCII only.
2
+ # The package sources (trieset/*.py) are included automatically; everything the
3
+ # build needs that lives OUTSIDE the package must be listed here:
4
+ # - the C-extension source (pybind/tt_trieset.cc),
5
+ # - the build-time-vendored shared core (_vendor/standalone/...), which setup.py
6
+ # populates from the repo-root standalone/ before the sdist is built,
7
+ # - the typing marker + stub, and the wheel smoke test.
8
+ include README.md
9
+ include LICENSE
10
+ include pyproject.toml
11
+ include setup.py
12
+ include pybind/tt_trieset.cc
13
+ recursive-include _vendor *.h *.cc
14
+ include trieset/py.typed
15
+ include trieset/*.pyi
16
+ include tests/test_package.py
17
+ # docs/ is intentionally NOT shipped. The source repo stays PRIVATE; the internal
18
+ # docs (STATUS / GOVERNANCE / NEXT_OPTIONS / FINDINGS / PUBLISH) carry session ids,
19
+ # box paths, and references to a separate private project -- none of which belong in
20
+ # a public PyPI artifact. The public README is self-contained: it states the honest
21
+ # wins/losses inline and does NOT relative-link into docs/.
22
+
23
+ # Never ship build junk.
24
+ global-exclude *.so *.o *.pyc
25
+ prune build
26
+ prune dist
27
+ prune wheelhouse
@@ -0,0 +1,251 @@
1
+ Metadata-Version: 2.4
2
+ Name: trieset
3
+ Version: 0.1.0a1
4
+ Summary: A sorted byte-string set backed by a threaded trie: a faster drop-in than sortedcontainers.SortedSet for build + prefix-scan + serialized-flush workloads.
5
+ Author-email: Daniel Benjamim <dangstan.gbr@gmail.com>
6
+ License-Expression: MIT
7
+ Keywords: trie,sorted-set,sortedcontainers,prefix,byte-string,ordered-set
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Programming Language :: C++
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3 :: Only
13
+ Classifier: Programming Language :: Python :: Implementation :: CPython
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Classifier: Operating System :: POSIX :: Linux
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Dynamic: license-file
20
+
21
+ # trieset
22
+
23
+ A sorted **byte-string** set backed by a threaded trie. A *faster drop-in* than
24
+ [`sortedcontainers.SortedSet`](https://grantjenks.com/docs/sortedcontainers/) for
25
+ one specific, common workload: you **build** a set of byte strings and then do
26
+ **prefix scans / counts** and/or a **serialized sorted flush**.
27
+
28
+ It is **not** a "faster sorted set" in general. The wins and the costs are both
29
+ stated honestly below, measured per-element against `SortedSet` (the fair drop-in
30
+ test), not in a bulk microbenchmark.
31
+
32
+ > Status: **public alpha (0.1.0a1).** First release is an **sdist** -- `pip install
33
+ > trieset` compiles the C-extension from source (needs a C++17 compiler; standard
34
+ > on Linux dev machines). Prebuilt binary (manylinux) wheels are a planned
35
+ > follow-up. The benchmark numbers below are measured on a quiesced 16-core Linux
36
+ > box; re-measure on your hardware before relying on a specific ratio.
37
+
38
+ ## When to use it (and when not)
39
+
40
+ `SortedSet` is the de-facto answer for an ordered, mutable set with fast
41
+ membership + ordered iteration in pure Python. **Use `trieset` instead only when**
42
+ your keys are byte strings and your workload is dominated by ANY of:
43
+
44
+ - **autocomplete** -- `startsWith` / prefix scans: `startsWith` existence is
45
+ ~9-15x faster than `SortedSet` at every prefix width; NARROW prefix collect (a
46
+ prefix matching few keys) is a few x faster too. (WIDE prefix *collect* -- where
47
+ you materialize a large fraction of the set -- favors `SortedSet`; see the cost
48
+ section.)
49
+ - **prefix counting** -- `prefix_count(p)` is `O(prefix-length)` (per-node subtree
50
+ counts), so it beats `SortedSet`'s bisect at EVERY width (~16-25x measured).
51
+ - **building** the set (lots of inserts) -- the fastest builder of any
52
+ PREFIX-CAPABLE structure here (a plain hash set builds faster, but answers no
53
+ prefix query),
54
+ - **serialized sorted flush** (persist / send on the wire / feed a C consumer).
55
+
56
+ **Do NOT reach for it** when your hot path is point membership at scale, or
57
+ exploding all keys back into a Python list, or you are memory-constrained -- see
58
+ the costs below.
59
+
60
+ ## Measured drop-in verdict
61
+
62
+ Per-element, quiesced box, best-of-5, ratio = trie / `SortedSet` (`<1` = trie
63
+ faster). These core ops are unaffected by the prefix internals:
64
+
65
+ | operation | N=1k | N=10k | N=100k | verdict |
66
+ |-----------|------|-------|--------|---------|
67
+ | `add` (per-element build) | 0.13x | 0.12x | 0.15x | **trie WINS ~7x, every N** |
68
+ | `flush()` serialized | 0.04x | 0.13x | 0.25x | **trie WINS 4-25x, every N** |
69
+ | `x in t` membership HIT | 0.67x | 0.99x | 1.53x | wins small, **wash-to-~1.5x at scale** |
70
+ | `x in t` membership MISS | 0.45x | 0.59x | 1.08x | wins to ~10k, ~wash at 100k |
71
+ | `tolist()` (C loop) | 1.87x | 2.43x | 3.04x | **SS wins** (storage floor) |
72
+ | `list(t)` (per-elem iter) | 2.20x | 3.46x | 4.39x | **SS wins** (use `tolist()`) |
73
+
74
+ ### Prefix operations, by prefix width
75
+
76
+ Prefix performance depends on how WIDE the prefix is (how many keys it matches),
77
+ so a single ratio hides the story. Measured per query, N=100k, quiesced box,
78
+ clustered keys (us/query; lower is faster; **bold** = winner of trie vs `SortedSet`):
79
+
80
+ | prefix width | ~matches/query | `prefix_count`: trie / SS | `startsWith`: trie / SS | `prefix` collect: trie / SS |
81
+ |---|---|---|---|---|
82
+ | wide (1 char) | ~3850 | **0.06** / 1.51 (trie ~25x) | **0.10** / 1.48 (trie ~15x) | 175 / **53** (SS ~3.3x) |
83
+ | medium (3) | ~144 | **0.08** / 1.85 (trie ~23x) | **0.13** / 1.60 (trie ~12x) | 7.2 / **5.1** (SS ~1.4x) |
84
+ | narrow (6) | ~1 | **0.11** / 1.79 (trie ~16x) | **0.17** / 1.58 (trie ~9x) | **0.26** / 1.32 (trie ~5x) |
85
+
86
+ Reading it:
87
+
88
+ - **`prefix_count` wins at every width (~16-25x).** It is `O(prefix-length)` -- a
89
+ descend-and-read of a per-node subtree count, not a walk over matches -- so it
90
+ beats `SortedSet`'s `O(log n)` bisect-difference even on wide prefixes. (Earlier
91
+ pre-release builds walked the matches and LOST on wide prefixes; that is fixed.)
92
+ - **`startsWith` / prefix existence wins at every width (~9-15x).** An
93
+ `O(prefix-length)` descent vs a bisect + `startswith` compare, flat in the match
94
+ count. This is the most robust prefix win.
95
+ - **`prefix` collect (materializing the matching keys) is split:** the trie wins
96
+ NARROW (~5x -- few keys to build), but `SortedSet` wins WIDE (a contiguous slice
97
+ of `PyBytes` it already holds, vs the trie allocating each key). If your prefixes
98
+ routinely return a large fraction of the set, `SortedSet` collect is faster.
99
+
100
+ ## What WINS (the drop-in claim)
101
+
102
+ - **build / `add`: ~7x faster** than `SortedSet` at every N (and the fastest
103
+ builder of every PREFIX-CAPABLE structure benchmarked; a plain hash set builds
104
+ ~5x faster than `trieset` but answers no prefix query) -- per-element insert
105
+ beats `SortedSet`'s bisect + list-insert, with no relocation and no declared
106
+ alphabet.
107
+ - **`startsWith` / prefix existence: ~9-15x faster** than `SortedSet`, every width.
108
+ - **`prefix_count`: ~16-25x faster** than `SortedSet`, every width (`O(prefix-len)`).
109
+ - **NARROW prefix collect** (autocomplete -- a prefix matching few keys): a few x
110
+ faster than `SortedSet`, and 5-60x faster than every other Python trie.
111
+ - **serialized sorted flush (`flush`): 4-25x faster** -- one contiguous C pass vs
112
+ per-element Python serialization.
113
+ - **prefix-walk (ancestors): `O(word-length)`.** `shortest_prefix(word)` /
114
+ `longest_prefix(word)` / `prefixes(word)` return the stored keys that are
115
+ PREFIXES OF `word` (the dual of `prefix()`) in a single descent -- the natural
116
+ "is any stored key a prefix of X" / "shortest dictionary root of X" query.
117
+ (Honest caveat: on the whole *replace-words* style problem a plain `set` still
118
+ wins end-to-end because build cost dominates -- this primitive closes the gap
119
+ versus other tries, not a new headline versus a hash set.)
120
+
121
+ ## What does NOT win (stated up front)
122
+
123
+ - **Point membership (`x in t`)** is a **modest win at small N and a wash-to-slight-loss
124
+ at scale** (measured ~1.0-1.5x SortedSet at N=100k, depending on the query mix).
125
+ `SortedSet`'s bisect is `O(log n)` over contiguous, cache-friendly arrays; the
126
+ trie does `O(k)` pointer-chasing across arena-scattered nodes (a cache miss per
127
+ level), so it does not WIN membership and a hash set beats both. **Do not pick
128
+ `trieset` for a membership-heavy workload** -- but it is the fastest *trie* here.
129
+ - **WIDE prefix collect** (materializing a large fraction of the set) is **slower**
130
+ than `SortedSet`'s contiguous slice -- see the by-width table. (Only the COUNT is
131
+ width-independent; collecting the keys is not.)
132
+ - **Listing all keys** is **slower** -- the storage-model floor: `SortedSet`
133
+ already holds the `PyBytes`, while the trie must `malloc` + `memcpy` a `bytes`
134
+ per key. `tolist()` (one C loop) is ~2-3x slower; the per-element `list(t)` is
135
+ ~2-4x slower. Use `tolist()` to materialize and `for k in t` to stream /
136
+ early-exit; do not use `list(t)`.
137
+ - **Memory: modestly higher (~1.1-1.5x, measurement-sensitive).** TrieSet's
138
+ structure is ~73 B/key (exact arena); `SortedSet` measures ~50-70 B/key by RSS
139
+ (it stores the `PyBytes` plus index pointers), so a like-for-like build-peak RSS
140
+ comparison is ~1.1x and an RSS-delta comparison ~1.4x (an older optimistic
141
+ SortedSet estimate gave ~1.8x). Either way it uses more than `SortedSet`, and far
142
+ more than `marisa-trie`'s succinct ~18 B/key loaded. (Was ~3-7x before the
143
+ leaf-tagging + inline-key shrink; +4% back from the per-node subtree counts that
144
+ make `prefix_count` `O(prefix-len)`.)
145
+
146
+ In one sentence: *the fastest structure here to BUILD a mutable byte-string
147
+ dictionary and do AUTOCOMPLETE on it (`startsWith` + `prefix_count` + narrow
148
+ prefix suggestions) and SERIALIZED FLUSH; WIDE prefix collect and point membership
149
+ favor `SortedSet`; ~2-4x slower to list all keys; modestly more memory (~1.1-1.5x).*
150
+
151
+ ## Versus compiled tries (datrie, marisa-trie)
152
+
153
+ `SortedSet` is pure Python; the honest next question is how `trieset` sits among
154
+ *compiled* trie libraries. Measured (lowercase-ascii keys, quiesced; peers:
155
+ `datrie`, `marisa-trie`, `pygtrie`, `pyahocorasick`):
156
+
157
+ - **Build:** `trieset` is the fastest -- **~2.9x** faster than `pyahocorasick`,
158
+ **~3.4x** faster than `marisa-trie`'s bulk build, **~18x** faster than `pygtrie`,
159
+ and orders of magnitude faster than `datrie` (~80x even pre-sorted, hundreds of x
160
+ in random order -- its double-array relocates on incremental insert).
161
+ - **Prefix COUNT + COLLECT:** `trieset` beats every compiled trie at **every width**
162
+ -- it **counts** in `O(prefix-len)` without building Python objects (the peers
163
+ must materialize and count: ~600-8000x slower on wide prefixes) and **collects**
164
+ ~3-60x faster.
165
+ - **Prefix EXISTENCE (`startsWith`):** `trieset` does **not** lead here. `datrie`
166
+ and `pyahocorasick` (a compiled Aho-Corasick automaton) answer `startsWith`
167
+ ~1.5-2.5x faster than `trieset` at every width (their compiled lookup beats the
168
+ trie's Seek-first-element). `trieset` still beats `marisa`/`pygtrie` on existence,
169
+ and beats `SortedSet` ~9-15x.
170
+ - **Membership:** `trieset` is the fastest *trie* (beats `datrie`, `marisa`,
171
+ `pyahocorasick`), though still a wash-to-slight-loss versus `SortedSet`'s bisect.
172
+ - **Memory:** `marisa-trie` is **far smaller** (succinct: ~4.4 B/key image, ~18 B/key
173
+ loaded vs ~73) and serializes/loads a real queryable index almost for free -- but
174
+ it is **read-only** (no `add`/`delete`). `pyahocorasick` is heavier than `trieset`;
175
+ `datrie` needs a fixed alphabet declared up front and builds pathologically slowly.
176
+ (`hat-trie` does not build on Python 3.11+ and was not tested.)
177
+
178
+ So the niche is a **mutable, arbitrary-byte, no-declared-alphabet** set you
179
+ **build incrementally** and **prefix count / collect / autocomplete** -- where
180
+ `trieset` is the fastest of the Python tries. If you only need `startsWith`
181
+ existence on a near-static set, `datrie` or `pyahocorasick` answer that one op
182
+ faster; for a static set where memory is paramount, prefer `marisa-trie`.
183
+
184
+ ## Install
185
+
186
+ ```sh
187
+ pip install trieset
188
+ ```
189
+
190
+ The first release is a source distribution, so `pip` compiles the CPython
191
+ C-extension (`trieset.tt_trieset`) from the binding plus the bundled trie core.
192
+ You need a **C++17 compiler** (`g++`/`clang++`; standard on Linux dev machines).
193
+ **Portability:** the build uses `-O3` and **not** `-march=native`. The core's SIMD
194
+ child-lookup is `__SSE2__`-guarded and SSE2 is the x86-64 baseline, so the build is
195
+ correct on every x86-64 CPU (and falls back to a scalar path elsewhere). Prebuilt
196
+ binary wheels (no compiler needed) are a planned follow-up.
197
+
198
+ ## Quickstart
199
+
200
+ ```python
201
+ from trieset import TrieSet
202
+
203
+ t = TrieSet([b"apple", b"apricot", b"banana"])
204
+ t.add(b"avocado")
205
+ t.update([b"cherry", b"apple"]) # set semantics: duplicate is a no-op
206
+
207
+ b"banana" in t # True (membership: O(len key))
208
+ len(t) # 5
209
+
210
+ list(t.prefix(b"ap")) # [b'apple', b'apricot'] (sorted)
211
+ t.prefix_count(b"a") # 3 (O(prefix-len), counted in C)
212
+
213
+ t.shortest_prefix(b"applesauce") # b'apple' (shortest stored key that prefixes the word)
214
+ t.longest_prefix(b"applesauce") # b'apple'
215
+ list(t.prefixes(b"applesauce")) # [b'apple'] (all stored keys that prefix the word)
216
+
217
+ t.tolist() # all keys, sorted (one C loop)
218
+ for k in t: # stream sorted keys, O(1) peak memory
219
+ ...
220
+ t.first(), t.last() # (b'apple', b'cherry')
221
+
222
+ blob = t.flush() # all keys, sorted, length-framed bytes:
223
+ # repeated [u32-le len][key bytes]
224
+ ```
225
+
226
+ ## API and honest limitations
227
+
228
+ - **Keys are byte strings only.** Any buffer-protocol object
229
+ (`bytes` / `bytearray` / `memoryview`) is accepted on input; every returned key
230
+ is `bytes`. **`str` raises `TypeError`** -- by design (this is a byte-string set).
231
+ - **No random `remove` / `discard`.** The core supports only "drain the smallest",
232
+ not delete-by-key, so it is not exposed (avoids an `O(n)` emulation). This is the
233
+ build-then-query / sorted-flush niche; a delete-heavy set is not the target.
234
+ - **No multiplicity.** It is a set: a key is present 0 or 1 times.
235
+ - **Not thread-safe for concurrent writes** (single-writer core). The GIL is held
236
+ throughout, so concurrent Python threads are serialized correctly.
237
+
238
+ Type hints ship with the package (`py.typed` + `trieset/__init__.pyi`).
239
+
240
+ ## Prior art (not novel, not first)
241
+
242
+ The structure is not novel: prior art includes `marisa-trie`, `datrie`, the Cuckoo
243
+ Trie, and the Patricia-trie memtables in ToplingDB / TerarkDB. The contribution
244
+ here is the **clean from-scratch derivation** plus the **packaged
245
+ build + prefix + sorted-flush combination** with honest, measured numbers --
246
+ something `SortedSet` and the existing trie packages do not jointly offer. The
247
+ shared trie core also backs a RocksDB `MemTableRep` plugin (a separate project).
248
+
249
+ ## License
250
+
251
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,231 @@
1
+ # trieset
2
+
3
+ A sorted **byte-string** set backed by a threaded trie. A *faster drop-in* than
4
+ [`sortedcontainers.SortedSet`](https://grantjenks.com/docs/sortedcontainers/) for
5
+ one specific, common workload: you **build** a set of byte strings and then do
6
+ **prefix scans / counts** and/or a **serialized sorted flush**.
7
+
8
+ It is **not** a "faster sorted set" in general. The wins and the costs are both
9
+ stated honestly below, measured per-element against `SortedSet` (the fair drop-in
10
+ test), not in a bulk microbenchmark.
11
+
12
+ > Status: **public alpha (0.1.0a1).** First release is an **sdist** -- `pip install
13
+ > trieset` compiles the C-extension from source (needs a C++17 compiler; standard
14
+ > on Linux dev machines). Prebuilt binary (manylinux) wheels are a planned
15
+ > follow-up. The benchmark numbers below are measured on a quiesced 16-core Linux
16
+ > box; re-measure on your hardware before relying on a specific ratio.
17
+
18
+ ## When to use it (and when not)
19
+
20
+ `SortedSet` is the de-facto answer for an ordered, mutable set with fast
21
+ membership + ordered iteration in pure Python. **Use `trieset` instead only when**
22
+ your keys are byte strings and your workload is dominated by ANY of:
23
+
24
+ - **autocomplete** -- `startsWith` / prefix scans: `startsWith` existence is
25
+ ~9-15x faster than `SortedSet` at every prefix width; NARROW prefix collect (a
26
+ prefix matching few keys) is a few x faster too. (WIDE prefix *collect* -- where
27
+ you materialize a large fraction of the set -- favors `SortedSet`; see the cost
28
+ section.)
29
+ - **prefix counting** -- `prefix_count(p)` is `O(prefix-length)` (per-node subtree
30
+ counts), so it beats `SortedSet`'s bisect at EVERY width (~16-25x measured).
31
+ - **building** the set (lots of inserts) -- the fastest builder of any
32
+ PREFIX-CAPABLE structure here (a plain hash set builds faster, but answers no
33
+ prefix query),
34
+ - **serialized sorted flush** (persist / send on the wire / feed a C consumer).
35
+
36
+ **Do NOT reach for it** when your hot path is point membership at scale, or
37
+ exploding all keys back into a Python list, or you are memory-constrained -- see
38
+ the costs below.
39
+
40
+ ## Measured drop-in verdict
41
+
42
+ Per-element, quiesced box, best-of-5, ratio = trie / `SortedSet` (`<1` = trie
43
+ faster). These core ops are unaffected by the prefix internals:
44
+
45
+ | operation | N=1k | N=10k | N=100k | verdict |
46
+ |-----------|------|-------|--------|---------|
47
+ | `add` (per-element build) | 0.13x | 0.12x | 0.15x | **trie WINS ~7x, every N** |
48
+ | `flush()` serialized | 0.04x | 0.13x | 0.25x | **trie WINS 4-25x, every N** |
49
+ | `x in t` membership HIT | 0.67x | 0.99x | 1.53x | wins small, **wash-to-~1.5x at scale** |
50
+ | `x in t` membership MISS | 0.45x | 0.59x | 1.08x | wins to ~10k, ~wash at 100k |
51
+ | `tolist()` (C loop) | 1.87x | 2.43x | 3.04x | **SS wins** (storage floor) |
52
+ | `list(t)` (per-elem iter) | 2.20x | 3.46x | 4.39x | **SS wins** (use `tolist()`) |
53
+
54
+ ### Prefix operations, by prefix width
55
+
56
+ Prefix performance depends on how WIDE the prefix is (how many keys it matches),
57
+ so a single ratio hides the story. Measured per query, N=100k, quiesced box,
58
+ clustered keys (us/query; lower is faster; **bold** = winner of trie vs `SortedSet`):
59
+
60
+ | prefix width | ~matches/query | `prefix_count`: trie / SS | `startsWith`: trie / SS | `prefix` collect: trie / SS |
61
+ |---|---|---|---|---|
62
+ | wide (1 char) | ~3850 | **0.06** / 1.51 (trie ~25x) | **0.10** / 1.48 (trie ~15x) | 175 / **53** (SS ~3.3x) |
63
+ | medium (3) | ~144 | **0.08** / 1.85 (trie ~23x) | **0.13** / 1.60 (trie ~12x) | 7.2 / **5.1** (SS ~1.4x) |
64
+ | narrow (6) | ~1 | **0.11** / 1.79 (trie ~16x) | **0.17** / 1.58 (trie ~9x) | **0.26** / 1.32 (trie ~5x) |
65
+
66
+ Reading it:
67
+
68
+ - **`prefix_count` wins at every width (~16-25x).** It is `O(prefix-length)` -- a
69
+ descend-and-read of a per-node subtree count, not a walk over matches -- so it
70
+ beats `SortedSet`'s `O(log n)` bisect-difference even on wide prefixes. (Earlier
71
+ pre-release builds walked the matches and LOST on wide prefixes; that is fixed.)
72
+ - **`startsWith` / prefix existence wins at every width (~9-15x).** An
73
+ `O(prefix-length)` descent vs a bisect + `startswith` compare, flat in the match
74
+ count. This is the most robust prefix win.
75
+ - **`prefix` collect (materializing the matching keys) is split:** the trie wins
76
+ NARROW (~5x -- few keys to build), but `SortedSet` wins WIDE (a contiguous slice
77
+ of `PyBytes` it already holds, vs the trie allocating each key). If your prefixes
78
+ routinely return a large fraction of the set, `SortedSet` collect is faster.
79
+
80
+ ## What WINS (the drop-in claim)
81
+
82
+ - **build / `add`: ~7x faster** than `SortedSet` at every N (and the fastest
83
+ builder of every PREFIX-CAPABLE structure benchmarked; a plain hash set builds
84
+ ~5x faster than `trieset` but answers no prefix query) -- per-element insert
85
+ beats `SortedSet`'s bisect + list-insert, with no relocation and no declared
86
+ alphabet.
87
+ - **`startsWith` / prefix existence: ~9-15x faster** than `SortedSet`, every width.
88
+ - **`prefix_count`: ~16-25x faster** than `SortedSet`, every width (`O(prefix-len)`).
89
+ - **NARROW prefix collect** (autocomplete -- a prefix matching few keys): a few x
90
+ faster than `SortedSet`, and 5-60x faster than every other Python trie.
91
+ - **serialized sorted flush (`flush`): 4-25x faster** -- one contiguous C pass vs
92
+ per-element Python serialization.
93
+ - **prefix-walk (ancestors): `O(word-length)`.** `shortest_prefix(word)` /
94
+ `longest_prefix(word)` / `prefixes(word)` return the stored keys that are
95
+ PREFIXES OF `word` (the dual of `prefix()`) in a single descent -- the natural
96
+ "is any stored key a prefix of X" / "shortest dictionary root of X" query.
97
+ (Honest caveat: on the whole *replace-words* style problem a plain `set` still
98
+ wins end-to-end because build cost dominates -- this primitive closes the gap
99
+ versus other tries, not a new headline versus a hash set.)
100
+
101
+ ## What does NOT win (stated up front)
102
+
103
+ - **Point membership (`x in t`)** is a **modest win at small N and a wash-to-slight-loss
104
+ at scale** (measured ~1.0-1.5x SortedSet at N=100k, depending on the query mix).
105
+ `SortedSet`'s bisect is `O(log n)` over contiguous, cache-friendly arrays; the
106
+ trie does `O(k)` pointer-chasing across arena-scattered nodes (a cache miss per
107
+ level), so it does not WIN membership and a hash set beats both. **Do not pick
108
+ `trieset` for a membership-heavy workload** -- but it is the fastest *trie* here.
109
+ - **WIDE prefix collect** (materializing a large fraction of the set) is **slower**
110
+ than `SortedSet`'s contiguous slice -- see the by-width table. (Only the COUNT is
111
+ width-independent; collecting the keys is not.)
112
+ - **Listing all keys** is **slower** -- the storage-model floor: `SortedSet`
113
+ already holds the `PyBytes`, while the trie must `malloc` + `memcpy` a `bytes`
114
+ per key. `tolist()` (one C loop) is ~2-3x slower; the per-element `list(t)` is
115
+ ~2-4x slower. Use `tolist()` to materialize and `for k in t` to stream /
116
+ early-exit; do not use `list(t)`.
117
+ - **Memory: modestly higher (~1.1-1.5x, measurement-sensitive).** TrieSet's
118
+ structure is ~73 B/key (exact arena); `SortedSet` measures ~50-70 B/key by RSS
119
+ (it stores the `PyBytes` plus index pointers), so a like-for-like build-peak RSS
120
+ comparison is ~1.1x and an RSS-delta comparison ~1.4x (an older optimistic
121
+ SortedSet estimate gave ~1.8x). Either way it uses more than `SortedSet`, and far
122
+ more than `marisa-trie`'s succinct ~18 B/key loaded. (Was ~3-7x before the
123
+ leaf-tagging + inline-key shrink; +4% back from the per-node subtree counts that
124
+ make `prefix_count` `O(prefix-len)`.)
125
+
126
+ In one sentence: *the fastest structure here to BUILD a mutable byte-string
127
+ dictionary and do AUTOCOMPLETE on it (`startsWith` + `prefix_count` + narrow
128
+ prefix suggestions) and SERIALIZED FLUSH; WIDE prefix collect and point membership
129
+ favor `SortedSet`; ~2-4x slower to list all keys; modestly more memory (~1.1-1.5x).*
130
+
131
+ ## Versus compiled tries (datrie, marisa-trie)
132
+
133
+ `SortedSet` is pure Python; the honest next question is how `trieset` sits among
134
+ *compiled* trie libraries. Measured (lowercase-ascii keys, quiesced; peers:
135
+ `datrie`, `marisa-trie`, `pygtrie`, `pyahocorasick`):
136
+
137
+ - **Build:** `trieset` is the fastest -- **~2.9x** faster than `pyahocorasick`,
138
+ **~3.4x** faster than `marisa-trie`'s bulk build, **~18x** faster than `pygtrie`,
139
+ and orders of magnitude faster than `datrie` (~80x even pre-sorted, hundreds of x
140
+ in random order -- its double-array relocates on incremental insert).
141
+ - **Prefix COUNT + COLLECT:** `trieset` beats every compiled trie at **every width**
142
+ -- it **counts** in `O(prefix-len)` without building Python objects (the peers
143
+ must materialize and count: ~600-8000x slower on wide prefixes) and **collects**
144
+ ~3-60x faster.
145
+ - **Prefix EXISTENCE (`startsWith`):** `trieset` does **not** lead here. `datrie`
146
+ and `pyahocorasick` (a compiled Aho-Corasick automaton) answer `startsWith`
147
+ ~1.5-2.5x faster than `trieset` at every width (their compiled lookup beats the
148
+ trie's Seek-first-element). `trieset` still beats `marisa`/`pygtrie` on existence,
149
+ and beats `SortedSet` ~9-15x.
150
+ - **Membership:** `trieset` is the fastest *trie* (beats `datrie`, `marisa`,
151
+ `pyahocorasick`), though still a wash-to-slight-loss versus `SortedSet`'s bisect.
152
+ - **Memory:** `marisa-trie` is **far smaller** (succinct: ~4.4 B/key image, ~18 B/key
153
+ loaded vs ~73) and serializes/loads a real queryable index almost for free -- but
154
+ it is **read-only** (no `add`/`delete`). `pyahocorasick` is heavier than `trieset`;
155
+ `datrie` needs a fixed alphabet declared up front and builds pathologically slowly.
156
+ (`hat-trie` does not build on Python 3.11+ and was not tested.)
157
+
158
+ So the niche is a **mutable, arbitrary-byte, no-declared-alphabet** set you
159
+ **build incrementally** and **prefix count / collect / autocomplete** -- where
160
+ `trieset` is the fastest of the Python tries. If you only need `startsWith`
161
+ existence on a near-static set, `datrie` or `pyahocorasick` answer that one op
162
+ faster; for a static set where memory is paramount, prefer `marisa-trie`.
163
+
164
+ ## Install
165
+
166
+ ```sh
167
+ pip install trieset
168
+ ```
169
+
170
+ The first release is a source distribution, so `pip` compiles the CPython
171
+ C-extension (`trieset.tt_trieset`) from the binding plus the bundled trie core.
172
+ You need a **C++17 compiler** (`g++`/`clang++`; standard on Linux dev machines).
173
+ **Portability:** the build uses `-O3` and **not** `-march=native`. The core's SIMD
174
+ child-lookup is `__SSE2__`-guarded and SSE2 is the x86-64 baseline, so the build is
175
+ correct on every x86-64 CPU (and falls back to a scalar path elsewhere). Prebuilt
176
+ binary wheels (no compiler needed) are a planned follow-up.
177
+
178
+ ## Quickstart
179
+
180
+ ```python
181
+ from trieset import TrieSet
182
+
183
+ t = TrieSet([b"apple", b"apricot", b"banana"])
184
+ t.add(b"avocado")
185
+ t.update([b"cherry", b"apple"]) # set semantics: duplicate is a no-op
186
+
187
+ b"banana" in t # True (membership: O(len key))
188
+ len(t) # 5
189
+
190
+ list(t.prefix(b"ap")) # [b'apple', b'apricot'] (sorted)
191
+ t.prefix_count(b"a") # 3 (O(prefix-len), counted in C)
192
+
193
+ t.shortest_prefix(b"applesauce") # b'apple' (shortest stored key that prefixes the word)
194
+ t.longest_prefix(b"applesauce") # b'apple'
195
+ list(t.prefixes(b"applesauce")) # [b'apple'] (all stored keys that prefix the word)
196
+
197
+ t.tolist() # all keys, sorted (one C loop)
198
+ for k in t: # stream sorted keys, O(1) peak memory
199
+ ...
200
+ t.first(), t.last() # (b'apple', b'cherry')
201
+
202
+ blob = t.flush() # all keys, sorted, length-framed bytes:
203
+ # repeated [u32-le len][key bytes]
204
+ ```
205
+
206
+ ## API and honest limitations
207
+
208
+ - **Keys are byte strings only.** Any buffer-protocol object
209
+ (`bytes` / `bytearray` / `memoryview`) is accepted on input; every returned key
210
+ is `bytes`. **`str` raises `TypeError`** -- by design (this is a byte-string set).
211
+ - **No random `remove` / `discard`.** The core supports only "drain the smallest",
212
+ not delete-by-key, so it is not exposed (avoids an `O(n)` emulation). This is the
213
+ build-then-query / sorted-flush niche; a delete-heavy set is not the target.
214
+ - **No multiplicity.** It is a set: a key is present 0 or 1 times.
215
+ - **Not thread-safe for concurrent writes** (single-writer core). The GIL is held
216
+ throughout, so concurrent Python threads are serialized correctly.
217
+
218
+ Type hints ship with the package (`py.typed` + `trieset/__init__.pyi`).
219
+
220
+ ## Prior art (not novel, not first)
221
+
222
+ The structure is not novel: prior art includes `marisa-trie`, `datrie`, the Cuckoo
223
+ Trie, and the Patricia-trie memtables in ToplingDB / TerarkDB. The contribution
224
+ here is the **clean from-scratch derivation** plus the **packaged
225
+ build + prefix + sorted-flush combination** with honest, measured numbers --
226
+ something `SortedSet` and the existing trie packages do not jointly offer. The
227
+ shared trie core also backs a RocksDB `MemTableRep` plugin (a separate project).
228
+
229
+ ## License
230
+
231
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,134 @@
1
+ // hash_linklist.h -- a HashLinkList-style baseline, the "hash memtable" foil.
2
+ //
3
+ // This mirrors RocksDB's HashLinkListRep shape closely enough to make the
4
+ // article's point: keys are hashed into buckets, and each bucket is a sorted
5
+ // singly-linked list. That gives fast point ops (hash + short bucket walk), but
6
+ // the buckets are only LOCALLY ordered -- there is no global order across
7
+ // buckets. So a FLUSH (emit every key in sorted order, which an LSM-tree must do
8
+ // to write an SST) cannot just walk a list: it must gather all entries and sort
9
+ // them, O(n log n). The threaded trie's flush is an O(n) `next` walk -- that is
10
+ // the hash-vs-sorted tradeoff this baseline exists to expose.
11
+ //
12
+ // Same arena + bytewise comparator as the trie/skiplist so the comparison is
13
+ // honest. Single-threaded (Phase 1). ASCII-only.
14
+ #ifndef TTRIE_HASH_LINKLIST_H_
15
+ #define TTRIE_HASH_LINKLIST_H_
16
+
17
+ #include <algorithm>
18
+ #include <cstdint>
19
+ #include <cstring>
20
+ #include <vector>
21
+
22
+ #include "trie_arena.h"
23
+
24
+ namespace ttrie {
25
+
26
+ class HashLinkList {
27
+ public:
28
+ struct Node {
29
+ const char* key;
30
+ uint32_t key_len;
31
+ Node* next; // next within the bucket (sorted by full key)
32
+ };
33
+
34
+ // num_buckets is rounded up to a power of two (so hash & mask selects a bucket).
35
+ explicit HashLinkList(size_t num_buckets = (1u << 16)) {
36
+ size_t b = 1;
37
+ while (b < num_buckets) b <<= 1;
38
+ mask_ = b - 1;
39
+ buckets_.assign(b, nullptr);
40
+ }
41
+
42
+ HashLinkList(const HashLinkList&) = delete;
43
+ HashLinkList& operator=(const HashLinkList&) = delete;
44
+
45
+ // Inserts the key. Returns false (no-op) if already present -- set semantics,
46
+ // matching the trie and skiplist baselines.
47
+ bool Insert(const char* key, uint32_t len) {
48
+ size_t h = Hash(key, len) & mask_;
49
+ Node** pp = &buckets_[h];
50
+ while (*pp != nullptr) {
51
+ int c = Compare((*pp)->key, (*pp)->key_len, key, len);
52
+ if (c == 0) return false; // duplicate
53
+ if (c > 0) break; // insertion point (keep bucket sorted)
54
+ pp = &(*pp)->next;
55
+ }
56
+ Node* n = NewNode(key, len);
57
+ n->next = *pp;
58
+ *pp = n;
59
+ ++count_;
60
+ return true;
61
+ }
62
+
63
+ bool Contains(const char* key, uint32_t len) const {
64
+ size_t h = Hash(key, len) & mask_;
65
+ for (const Node* n = buckets_[h]; n != nullptr; n = n->next) {
66
+ int c = Compare(n->key, n->key_len, key, len);
67
+ if (c == 0) return true;
68
+ if (c > 0) return false; // bucket is sorted -> past it
69
+ }
70
+ return false;
71
+ }
72
+
73
+ // FLUSH: emit every key in GLOBAL sorted order. This is the cost the trie
74
+ // avoids -- buckets are only locally sorted, so a global sort is unavoidable.
75
+ // Gathers O(n) then std::sort O(n log n). Returns the count emitted.
76
+ size_t FlushSorted(std::vector<const Node*>& out) const {
77
+ out.clear();
78
+ out.reserve(count_);
79
+ for (Node* b : buckets_) {
80
+ for (Node* n = b; n != nullptr; n = n->next) out.push_back(n);
81
+ }
82
+ std::sort(out.begin(), out.end(), [](const Node* a, const Node* b) {
83
+ return Compare(a->key, a->key_len, b->key, b->key_len) < 0;
84
+ });
85
+ return out.size();
86
+ }
87
+
88
+ size_t Size() const { return count_; }
89
+ // Arena (nodes + keys) plus the bucket array -- the honest memory footprint.
90
+ size_t BytesReserved() const {
91
+ return arena_.BytesReserved() + buckets_.size() * sizeof(Node*);
92
+ }
93
+ size_t BytesRequested() const {
94
+ return arena_.BytesRequested() + buckets_.size() * sizeof(Node*);
95
+ }
96
+
97
+ private:
98
+ static int Compare(const char* a, uint32_t al, const char* b, uint32_t bl) {
99
+ uint32_t n = al < bl ? al : bl;
100
+ int c = n ? std::memcmp(a, b, n) : 0;
101
+ if (c) return c;
102
+ if (al < bl) return -1;
103
+ if (al > bl) return 1;
104
+ return 0;
105
+ }
106
+
107
+ static uint64_t Hash(const char* key, uint32_t len) { // FNV-1a, 64-bit
108
+ uint64_t h = 1469598103934665603ull;
109
+ for (uint32_t i = 0; i < len; ++i) {
110
+ h ^= static_cast<uint8_t>(key[i]);
111
+ h *= 1099511628211ull;
112
+ }
113
+ return h;
114
+ }
115
+
116
+ Node* NewNode(const char* key, uint32_t len) {
117
+ char* kc = arena_.Allocate(len == 0 ? 1 : len);
118
+ if (len) std::memcpy(kc, key, len);
119
+ Node* n = reinterpret_cast<Node*>(arena_.Allocate(sizeof(Node)));
120
+ n->key = kc;
121
+ n->key_len = len;
122
+ n->next = nullptr;
123
+ return n;
124
+ }
125
+
126
+ Arena arena_;
127
+ std::vector<Node*> buckets_;
128
+ size_t mask_ = 0;
129
+ size_t count_ = 0;
130
+ };
131
+
132
+ } // namespace ttrie
133
+
134
+ #endif // TTRIE_HASH_LINKLIST_H_