PyPI - trieset - Versions diffs - 0.1.0a1__tar.gz - Mend

trieset 0.1.0a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

trieset-0.1.0a1/LICENSE +21 -0
trieset-0.1.0a1/MANIFEST.in +27 -0
trieset-0.1.0a1/PKG-INFO +251 -0
trieset-0.1.0a1/README.md +231 -0
trieset-0.1.0a1/_vendor/standalone/include/hash_linklist.h +134 -0
trieset-0.1.0a1/_vendor/standalone/include/skiplist.h +133 -0
trieset-0.1.0a1/_vendor/standalone/include/threaded_trie.h +219 -0
trieset-0.1.0a1/_vendor/standalone/include/trie_arena.h +76 -0
trieset-0.1.0a1/_vendor/standalone/src/threaded_trie.cc +915 -0
trieset-0.1.0a1/pybind/tt_trieset.cc +645 -0
trieset-0.1.0a1/pyproject.toml +79 -0
trieset-0.1.0a1/setup.cfg +4 -0
trieset-0.1.0a1/setup.py +89 -0
trieset-0.1.0a1/tests/test_package.py +173 -0
trieset-0.1.0a1/trieset/__init__.py +39 -0
trieset-0.1.0a1/trieset/__init__.pyi +91 -0
trieset-0.1.0a1/trieset/py.typed +0 -0
trieset-0.1.0a1/trieset.egg-info/PKG-INFO +251 -0
trieset-0.1.0a1/trieset.egg-info/SOURCES.txt +19 -0
trieset-0.1.0a1/trieset.egg-info/dependency_links.txt +1 -0
trieset-0.1.0a1/trieset.egg-info/top_level.txt +1 -0

trieset-0.1.0a1/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Daniel Benjamim
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

trieset-0.1.0a1/MANIFEST.in ADDED Viewed

@@ -0,0 +1,27 @@
+# Make the sdist self-contained. ASCII only.
+# The package sources (trieset/*.py) are included automatically; everything the
+# build needs that lives OUTSIDE the package must be listed here:
+#   - the C-extension source (pybind/tt_trieset.cc),
+#   - the build-time-vendored shared core (_vendor/standalone/...), which setup.py
+#     populates from the repo-root standalone/ before the sdist is built,
+#   - the typing marker + stub, and the wheel smoke test.
+include README.md
+include LICENSE
+include pyproject.toml
+include setup.py
+include pybind/tt_trieset.cc
+recursive-include _vendor *.h *.cc
+include trieset/py.typed
+include trieset/*.pyi
+include tests/test_package.py
+# docs/ is intentionally NOT shipped. The source repo stays PRIVATE; the internal
+# docs (STATUS / GOVERNANCE / NEXT_OPTIONS / FINDINGS / PUBLISH) carry session ids,
+# box paths, and references to a separate private project -- none of which belong in
+# a public PyPI artifact. The public README is self-contained: it states the honest
+# wins/losses inline and does NOT relative-link into docs/.
+# Never ship build junk.
+global-exclude *.so *.o *.pyc
+prune build
+prune dist
+prune wheelhouse

trieset-0.1.0a1/PKG-INFO ADDED Viewed

@@ -0,0 +1,251 @@
+Metadata-Version: 2.4
+Name: trieset
+Version: 0.1.0a1
+Summary: A sorted byte-string set backed by a threaded trie: a faster drop-in than sortedcontainers.SortedSet for build + prefix-scan + serialized-flush workloads.
+Author-email: Daniel Benjamim <dangstan.gbr@gmail.com>
+License-Expression: MIT
+Keywords: trie,sorted-set,sortedcontainers,prefix,byte-string,ordered-set
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: C++
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: Implementation :: CPython
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Operating System :: POSIX :: Linux
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Dynamic: license-file
+# trieset
+A sorted **byte-string** set backed by a threaded trie. A *faster drop-in* than
+[`sortedcontainers.SortedSet`](https://grantjenks.com/docs/sortedcontainers/) for
+one specific, common workload: you **build** a set of byte strings and then do
+**prefix scans / counts** and/or a **serialized sorted flush**.
+It is **not** a "faster sorted set" in general. The wins and the costs are both
+stated honestly below, measured per-element against `SortedSet` (the fair drop-in
+test), not in a bulk microbenchmark.
+> Status: **public alpha (0.1.0a1).** First release is an **sdist** -- `pip install
+> trieset` compiles the C-extension from source (needs a C++17 compiler; standard
+> on Linux dev machines). Prebuilt binary (manylinux) wheels are a planned
+> follow-up. The benchmark numbers below are measured on a quiesced 16-core Linux
+> box; re-measure on your hardware before relying on a specific ratio.
+## When to use it (and when not)
+`SortedSet` is the de-facto answer for an ordered, mutable set with fast
+membership + ordered iteration in pure Python. **Use `trieset` instead only when**
+your keys are byte strings and your workload is dominated by ANY of:
+- **autocomplete** -- `startsWith` / prefix scans: `startsWith` existence is
+  ~9-15x faster than `SortedSet` at every prefix width; NARROW prefix collect (a
+  prefix matching few keys) is a few x faster too. (WIDE prefix *collect* -- where
+  you materialize a large fraction of the set -- favors `SortedSet`; see the cost
+  section.)
+- **prefix counting** -- `prefix_count(p)` is `O(prefix-length)` (per-node subtree
+  counts), so it beats `SortedSet`'s bisect at EVERY width (~16-25x measured).
+- **building** the set (lots of inserts) -- the fastest builder of any
+  PREFIX-CAPABLE structure here (a plain hash set builds faster, but answers no
+  prefix query),
+- **serialized sorted flush** (persist / send on the wire / feed a C consumer).
+**Do NOT reach for it** when your hot path is point membership at scale, or
+exploding all keys back into a Python list, or you are memory-constrained -- see
+the costs below.
+## Measured drop-in verdict
+Per-element, quiesced box, best-of-5, ratio = trie / `SortedSet` (`<1` = trie
+faster). These core ops are unaffected by the prefix internals:
+| operation | N=1k | N=10k | N=100k | verdict |
+|-----------|------|-------|--------|---------|
+| `add` (per-element build) | 0.13x | 0.12x | 0.15x | **trie WINS ~7x, every N** |
+| `flush()` serialized       | 0.04x | 0.13x | 0.25x | **trie WINS 4-25x, every N** |
+| `x in t` membership HIT    | 0.67x | 0.99x | 1.53x | wins small, **wash-to-~1.5x at scale** |
+| `x in t` membership MISS   | 0.45x | 0.59x | 1.08x | wins to ~10k, ~wash at 100k |
+| `tolist()` (C loop)        | 1.87x | 2.43x | 3.04x | **SS wins** (storage floor) |
+| `list(t)` (per-elem iter)  | 2.20x | 3.46x | 4.39x | **SS wins** (use `tolist()`) |
+### Prefix operations, by prefix width
+Prefix performance depends on how WIDE the prefix is (how many keys it matches),
+so a single ratio hides the story. Measured per query, N=100k, quiesced box,
+clustered keys (us/query; lower is faster; **bold** = winner of trie vs `SortedSet`):
+| prefix width | ~matches/query | `prefix_count`: trie / SS | `startsWith`: trie / SS | `prefix` collect: trie / SS |
+|---|---|---|---|---|
+| wide (1 char) | ~3850 | **0.06** / 1.51  (trie ~25x) | **0.10** / 1.48  (trie ~15x) | 175 / **53**  (SS ~3.3x) |
+| medium (3)    | ~144  | **0.08** / 1.85  (trie ~23x) | **0.13** / 1.60  (trie ~12x) | 7.2 / **5.1**  (SS ~1.4x) |
+| narrow (6)    | ~1    | **0.11** / 1.79  (trie ~16x) | **0.17** / 1.58  (trie ~9x)  | **0.26** / 1.32  (trie ~5x) |
+Reading it:
+- **`prefix_count` wins at every width (~16-25x).** It is `O(prefix-length)` -- a
+  descend-and-read of a per-node subtree count, not a walk over matches -- so it
+  beats `SortedSet`'s `O(log n)` bisect-difference even on wide prefixes. (Earlier
+  pre-release builds walked the matches and LOST on wide prefixes; that is fixed.)
+- **`startsWith` / prefix existence wins at every width (~9-15x).** An
+  `O(prefix-length)` descent vs a bisect + `startswith` compare, flat in the match
+  count. This is the most robust prefix win.
+- **`prefix` collect (materializing the matching keys) is split:** the trie wins
+  NARROW (~5x -- few keys to build), but `SortedSet` wins WIDE (a contiguous slice
+  of `PyBytes` it already holds, vs the trie allocating each key). If your prefixes
+  routinely return a large fraction of the set, `SortedSet` collect is faster.
+## What WINS (the drop-in claim)
+- **build / `add`: ~7x faster** than `SortedSet` at every N (and the fastest
+  builder of every PREFIX-CAPABLE structure benchmarked; a plain hash set builds
+  ~5x faster than `trieset` but answers no prefix query) -- per-element insert
+  beats `SortedSet`'s bisect + list-insert, with no relocation and no declared
+  alphabet.
+- **`startsWith` / prefix existence: ~9-15x faster** than `SortedSet`, every width.
+- **`prefix_count`: ~16-25x faster** than `SortedSet`, every width (`O(prefix-len)`).
+- **NARROW prefix collect** (autocomplete -- a prefix matching few keys): a few x
+  faster than `SortedSet`, and 5-60x faster than every other Python trie.
+- **serialized sorted flush (`flush`): 4-25x faster** -- one contiguous C pass vs
+  per-element Python serialization.
+- **prefix-walk (ancestors): `O(word-length)`.** `shortest_prefix(word)` /
+  `longest_prefix(word)` / `prefixes(word)` return the stored keys that are
+  PREFIXES OF `word` (the dual of `prefix()`) in a single descent -- the natural
+  "is any stored key a prefix of X" / "shortest dictionary root of X" query.
+  (Honest caveat: on the whole *replace-words* style problem a plain `set` still
+  wins end-to-end because build cost dominates -- this primitive closes the gap
+  versus other tries, not a new headline versus a hash set.)
+## What does NOT win (stated up front)
+- **Point membership (`x in t`)** is a **wash at small N and a wash-to-slight-loss
+  at scale** (measured ~1.0-1.5x SortedSet at N=100k, depending on the query mix).
+  `SortedSet`'s bisect is `O(log n)` over contiguous, cache-friendly arrays; the
+  trie does `O(k)` pointer-chasing across arena-scattered nodes (a cache miss per
+  level), so it does not WIN membership and a hash set beats both. **Do not pick
+  `trieset` for a membership-heavy workload** -- but it is the fastest *trie* here.
+- **WIDE prefix collect** (materializing a large fraction of the set) is **slower**
+  than `SortedSet`'s contiguous slice -- see the by-width table. (Only the COUNT is
+  width-independent; collecting the keys is not.)
+- **Listing all keys** is **slower** -- the storage-model floor: `SortedSet`
+  already holds the `PyBytes`, while the trie must `malloc` + `memcpy` a `bytes`
+  per key. `tolist()` (one C loop) is ~2-3x slower; the per-element `list(t)` is
+  ~2-4x slower. Use `tolist()` to materialize and `for k in t` to stream /
+  early-exit; do not use `list(t)`.
+- **Memory: modestly higher (~1.1-1.5x, measurement-sensitive).** TrieSet's
+  structure is ~73 B/key (exact arena); `SortedSet` measures ~50-70 B/key by RSS
+  (it stores the `PyBytes` plus index pointers), so a like-for-like build-peak RSS
+  comparison is ~1.1x and an RSS-delta comparison ~1.4x (an older optimistic
+  SortedSet estimate gave ~1.8x). Either way it uses more than `SortedSet`, and far
+  more than `marisa-trie`'s succinct ~18 B/key loaded. (Was ~3-7x before the
+  leaf-tagging + inline-key shrink; +4% back from the per-node subtree counts that
+  make `prefix_count` `O(prefix-len)`.)
+In one sentence: *the fastest structure here to BUILD a mutable byte-string
+dictionary and do AUTOCOMPLETE on it (`startsWith` + `prefix_count` + narrow
+prefix suggestions) and SERIALIZED FLUSH; WIDE prefix collect and point membership
+favor `SortedSet`; ~2-4x slower to list all keys; modestly more memory (~1.1-1.5x).*
+## Versus compiled tries (datrie, marisa-trie)
+`SortedSet` is pure Python; the honest next question is how `trieset` sits among
+*compiled* trie libraries. Measured (lowercase-ascii keys, quiesced; peers:
+`datrie`, `marisa-trie`, `pygtrie`, `pyahocorasick`):
+- **Build:** `trieset` is the fastest -- **~2.9x** faster than `pyahocorasick`,
+  **~3.4x** faster than `marisa-trie`'s bulk build, **~18x** faster than `pygtrie`,
+  and orders of magnitude faster than `datrie` (~80x even pre-sorted, hundreds of x
+  in random order -- its double-array relocates on incremental insert).
+- **Prefix COUNT + COLLECT:** `trieset` beats every compiled trie at **every width**
+  -- it **counts** in `O(prefix-len)` without building Python objects (the peers
+  must materialize and count: ~600-8000x slower on wide prefixes) and **collects**
+  ~3-60x faster.
+- **Prefix EXISTENCE (`startsWith`):** `trieset` does **not** lead here. `datrie`
+  and `pyahocorasick` (a compiled Aho-Corasick automaton) answer `startsWith`
+  ~1.5-2.5x faster than `trieset` at every width (their compiled lookup beats the
+  trie's Seek-first-element). `trieset` still beats `marisa`/`pygtrie` on existence,
+  and beats `SortedSet` ~9-15x.
+- **Membership:** `trieset` is the fastest *trie* (beats `datrie`, `marisa`,
+  `pyahocorasick`), though still a wash-to-slight-loss versus `SortedSet`'s bisect.
+- **Memory:** `marisa-trie` is **far smaller** (succinct: ~4.4 B/key image, ~18 B/key
+  loaded vs ~73) and serializes/loads a real queryable index almost for free -- but
+  it is **read-only** (no `add`/`delete`). `pyahocorasick` is heavier than `trieset`;
+  `datrie` needs a fixed alphabet declared up front and builds pathologically slowly.
+  (`hat-trie` does not build on Python 3.11+ and was not tested.)
+So the niche is a **mutable, arbitrary-byte, no-declared-alphabet** set you
+**build incrementally** and **prefix count / collect / autocomplete** -- where
+`trieset` is the fastest of the Python tries. If you only need `startsWith`
+existence on a near-static set, `datrie` or `pyahocorasick` answer that one op
+faster; for a static set where memory is paramount, prefer `marisa-trie`.
+## Install
+```sh
+pip install trieset
+```
+The first release is a source distribution, so `pip` compiles the CPython
+C-extension (`trieset.tt_trieset`) from the binding plus the bundled trie core.
+You need a **C++17 compiler** (`g++`/`clang++`; standard on Linux dev machines).
+**Portability:** the build uses `-O3` and **not** `-march=native`. The core's SIMD
+child-lookup is `__SSE2__`-guarded and SSE2 is the x86-64 baseline, so the build is
+correct on every x86-64 CPU (and falls back to a scalar path elsewhere). Prebuilt
+binary wheels (no compiler needed) are a planned follow-up.
+## Quickstart
+```python
+from trieset import TrieSet
+t = TrieSet([b"apple", b"apricot", b"banana"])
+t.add(b"avocado")
+t.update([b"cherry", b"apple"])     # set semantics: duplicate is a no-op
+b"banana" in t                       # True  (membership: O(len key))
+len(t)                               # 5
+list(t.prefix(b"ap"))                # [b'apple', b'apricot']  (sorted)
+t.prefix_count(b"a")                 # 3                       (O(prefix-len), counted in C)
+t.shortest_prefix(b"applesauce")     # b'apple'   (shortest stored key that prefixes the word)
+t.longest_prefix(b"applesauce")      # b'apple'
+list(t.prefixes(b"applesauce"))      # [b'apple']  (all stored keys that prefix the word)
+t.tolist()                           # all keys, sorted (one C loop)
+for k in t:                          # stream sorted keys, O(1) peak memory
+    ...
+t.first(), t.last()                  # (b'apple', b'cherry')
+blob = t.flush()                     # all keys, sorted, length-framed bytes:
+                                     #   repeated [u32-le len][key bytes]
+```
+## API and honest limitations
+- **Keys are byte strings only.** Any buffer-protocol object
+  (`bytes` / `bytearray` / `memoryview`) is accepted on input; every returned key
+  is `bytes`. **`str` raises `TypeError`** -- by design (this is a byte-string set).
+- **No random `remove` / `discard`.** The core supports only "drain the smallest",
+  not delete-by-key, so it is not exposed (avoids an `O(n)` emulation). This is the
+  build-then-query / sorted-flush niche; a delete-heavy set is not the target.
+- **No multiplicity.** It is a set: a key is present 0 or 1 times.
+- **Not thread-safe for concurrent writes** (single-writer core). The GIL is held
+  throughout, so concurrent Python threads are serialized correctly.
+Type hints ship with the package (`py.typed` + `trieset/__init__.pyi`).
+## Prior art (not novel, not first)
+The structure is not novel: prior art includes `marisa-trie`, `datrie`, the Cuckoo
+Trie, and the Patricia-trie memtables in ToplingDB / TerarkDB. The contribution
+here is the **clean from-scratch derivation** plus the **packaged
+build + prefix + sorted-flush combination** with honest, measured numbers --
+something `SortedSet` and the existing trie packages do not jointly offer. The
+shared trie core also backs a RocksDB `MemTableRep` plugin (a separate project).
+## License
+MIT. See [LICENSE](LICENSE).

trieset-0.1.0a1/README.md ADDED Viewed

@@ -0,0 +1,231 @@
+# trieset
+A sorted **byte-string** set backed by a threaded trie. A *faster drop-in* than
+[`sortedcontainers.SortedSet`](https://grantjenks.com/docs/sortedcontainers/) for
+one specific, common workload: you **build** a set of byte strings and then do
+**prefix scans / counts** and/or a **serialized sorted flush**.
+It is **not** a "faster sorted set" in general. The wins and the costs are both
+stated honestly below, measured per-element against `SortedSet` (the fair drop-in
+test), not in a bulk microbenchmark.
+> Status: **public alpha (0.1.0a1).** First release is an **sdist** -- `pip install
+> trieset` compiles the C-extension from source (needs a C++17 compiler; standard
+> on Linux dev machines). Prebuilt binary (manylinux) wheels are a planned
+> follow-up. The benchmark numbers below are measured on a quiesced 16-core Linux
+> box; re-measure on your hardware before relying on a specific ratio.
+## When to use it (and when not)
+`SortedSet` is the de-facto answer for an ordered, mutable set with fast
+membership + ordered iteration in pure Python. **Use `trieset` instead only when**
+your keys are byte strings and your workload is dominated by ANY of:
+- **autocomplete** -- `startsWith` / prefix scans: `startsWith` existence is
+  ~9-15x faster than `SortedSet` at every prefix width; NARROW prefix collect (a
+  prefix matching few keys) is a few x faster too. (WIDE prefix *collect* -- where
+  you materialize a large fraction of the set -- favors `SortedSet`; see the cost
+  section.)
+- **prefix counting** -- `prefix_count(p)` is `O(prefix-length)` (per-node subtree
+  counts), so it beats `SortedSet`'s bisect at EVERY width (~16-25x measured).
+- **building** the set (lots of inserts) -- the fastest builder of any
+  PREFIX-CAPABLE structure here (a plain hash set builds faster, but answers no
+  prefix query),
+- **serialized sorted flush** (persist / send on the wire / feed a C consumer).
+**Do NOT reach for it** when your hot path is point membership at scale, or
+exploding all keys back into a Python list, or you are memory-constrained -- see
+the costs below.
+## Measured drop-in verdict
+Per-element, quiesced box, best-of-5, ratio = trie / `SortedSet` (`<1` = trie
+faster). These core ops are unaffected by the prefix internals:
+| operation | N=1k | N=10k | N=100k | verdict |
+|-----------|------|-------|--------|---------|
+| `add` (per-element build) | 0.13x | 0.12x | 0.15x | **trie WINS ~7x, every N** |
+| `flush()` serialized       | 0.04x | 0.13x | 0.25x | **trie WINS 4-25x, every N** |
+| `x in t` membership HIT    | 0.67x | 0.99x | 1.53x | wins small, **wash-to-~1.5x at scale** |
+| `x in t` membership MISS   | 0.45x | 0.59x | 1.08x | wins to ~10k, ~wash at 100k |
+| `tolist()` (C loop)        | 1.87x | 2.43x | 3.04x | **SS wins** (storage floor) |
+| `list(t)` (per-elem iter)  | 2.20x | 3.46x | 4.39x | **SS wins** (use `tolist()`) |
+### Prefix operations, by prefix width
+Prefix performance depends on how WIDE the prefix is (how many keys it matches),
+so a single ratio hides the story. Measured per query, N=100k, quiesced box,
+clustered keys (us/query; lower is faster; **bold** = winner of trie vs `SortedSet`):
+| prefix width | ~matches/query | `prefix_count`: trie / SS | `startsWith`: trie / SS | `prefix` collect: trie / SS |
+|---|---|---|---|---|
+| wide (1 char) | ~3850 | **0.06** / 1.51  (trie ~25x) | **0.10** / 1.48  (trie ~15x) | 175 / **53**  (SS ~3.3x) |
+| medium (3)    | ~144  | **0.08** / 1.85  (trie ~23x) | **0.13** / 1.60  (trie ~12x) | 7.2 / **5.1**  (SS ~1.4x) |
+| narrow (6)    | ~1    | **0.11** / 1.79  (trie ~16x) | **0.17** / 1.58  (trie ~9x)  | **0.26** / 1.32  (trie ~5x) |
+Reading it:
+- **`prefix_count` wins at every width (~16-25x).** It is `O(prefix-length)` -- a
+  descend-and-read of a per-node subtree count, not a walk over matches -- so it
+  beats `SortedSet`'s `O(log n)` bisect-difference even on wide prefixes. (Earlier
+  pre-release builds walked the matches and LOST on wide prefixes; that is fixed.)
+- **`startsWith` / prefix existence wins at every width (~9-15x).** An
+  `O(prefix-length)` descent vs a bisect + `startswith` compare, flat in the match
+  count. This is the most robust prefix win.
+- **`prefix` collect (materializing the matching keys) is split:** the trie wins
+  NARROW (~5x -- few keys to build), but `SortedSet` wins WIDE (a contiguous slice
+  of `PyBytes` it already holds, vs the trie allocating each key). If your prefixes
+  routinely return a large fraction of the set, `SortedSet` collect is faster.
+## What WINS (the drop-in claim)
+- **build / `add`: ~7x faster** than `SortedSet` at every N (and the fastest
+  builder of every PREFIX-CAPABLE structure benchmarked; a plain hash set builds
+  ~5x faster than `trieset` but answers no prefix query) -- per-element insert
+  beats `SortedSet`'s bisect + list-insert, with no relocation and no declared
+  alphabet.
+- **`startsWith` / prefix existence: ~9-15x faster** than `SortedSet`, every width.
+- **`prefix_count`: ~16-25x faster** than `SortedSet`, every width (`O(prefix-len)`).
+- **NARROW prefix collect** (autocomplete -- a prefix matching few keys): a few x
+  faster than `SortedSet`, and 5-60x faster than every other Python trie.
+- **serialized sorted flush (`flush`): 4-25x faster** -- one contiguous C pass vs
+  per-element Python serialization.
+- **prefix-walk (ancestors): `O(word-length)`.** `shortest_prefix(word)` /
+  `longest_prefix(word)` / `prefixes(word)` return the stored keys that are
+  PREFIXES OF `word` (the dual of `prefix()`) in a single descent -- the natural
+  "is any stored key a prefix of X" / "shortest dictionary root of X" query.
+  (Honest caveat: on the whole *replace-words* style problem a plain `set` still
+  wins end-to-end because build cost dominates -- this primitive closes the gap
+  versus other tries, not a new headline versus a hash set.)
+## What does NOT win (stated up front)
+- **Point membership (`x in t`)** is a **wash at small N and a wash-to-slight-loss
+  at scale** (measured ~1.0-1.5x SortedSet at N=100k, depending on the query mix).
+  `SortedSet`'s bisect is `O(log n)` over contiguous, cache-friendly arrays; the
+  trie does `O(k)` pointer-chasing across arena-scattered nodes (a cache miss per
+  level), so it does not WIN membership and a hash set beats both. **Do not pick
+  `trieset` for a membership-heavy workload** -- but it is the fastest *trie* here.
+- **WIDE prefix collect** (materializing a large fraction of the set) is **slower**
+  than `SortedSet`'s contiguous slice -- see the by-width table. (Only the COUNT is
+  width-independent; collecting the keys is not.)
+- **Listing all keys** is **slower** -- the storage-model floor: `SortedSet`
+  already holds the `PyBytes`, while the trie must `malloc` + `memcpy` a `bytes`
+  per key. `tolist()` (one C loop) is ~2-3x slower; the per-element `list(t)` is
+  ~2-4x slower. Use `tolist()` to materialize and `for k in t` to stream /
+  early-exit; do not use `list(t)`.
+- **Memory: modestly higher (~1.1-1.5x, measurement-sensitive).** TrieSet's
+  structure is ~73 B/key (exact arena); `SortedSet` measures ~50-70 B/key by RSS
+  (it stores the `PyBytes` plus index pointers), so a like-for-like build-peak RSS
+  comparison is ~1.1x and an RSS-delta comparison ~1.4x (an older optimistic
+  SortedSet estimate gave ~1.8x). Either way it uses more than `SortedSet`, and far
+  more than `marisa-trie`'s succinct ~18 B/key loaded. (Was ~3-7x before the
+  leaf-tagging + inline-key shrink; +4% back from the per-node subtree counts that
+  make `prefix_count` `O(prefix-len)`.)
+In one sentence: *the fastest structure here to BUILD a mutable byte-string
+dictionary and do AUTOCOMPLETE on it (`startsWith` + `prefix_count` + narrow
+prefix suggestions) and SERIALIZED FLUSH; WIDE prefix collect and point membership
+favor `SortedSet`; ~2-4x slower to list all keys; modestly more memory (~1.1-1.5x).*
+## Versus compiled tries (datrie, marisa-trie)
+`SortedSet` is pure Python; the honest next question is how `trieset` sits among
+*compiled* trie libraries. Measured (lowercase-ascii keys, quiesced; peers:
+`datrie`, `marisa-trie`, `pygtrie`, `pyahocorasick`):
+- **Build:** `trieset` is the fastest -- **~2.9x** faster than `pyahocorasick`,
+  **~3.4x** faster than `marisa-trie`'s bulk build, **~18x** faster than `pygtrie`,
+  and orders of magnitude faster than `datrie` (~80x even pre-sorted, hundreds of x
+  in random order -- its double-array relocates on incremental insert).
+- **Prefix COUNT + COLLECT:** `trieset` beats every compiled trie at **every width**
+  -- it **counts** in `O(prefix-len)` without building Python objects (the peers
+  must materialize and count: ~600-8000x slower on wide prefixes) and **collects**
+  ~3-60x faster.
+- **Prefix EXISTENCE (`startsWith`):** `trieset` does **not** lead here. `datrie`
+  and `pyahocorasick` (a compiled Aho-Corasick automaton) answer `startsWith`
+  ~1.5-2.5x faster than `trieset` at every width (their compiled lookup beats the
+  trie's Seek-first-element). `trieset` still beats `marisa`/`pygtrie` on existence,
+  and beats `SortedSet` ~9-15x.
+- **Membership:** `trieset` is the fastest *trie* (beats `datrie`, `marisa`,
+  `pyahocorasick`), though still a wash-to-slight-loss versus `SortedSet`'s bisect.
+- **Memory:** `marisa-trie` is **far smaller** (succinct: ~4.4 B/key image, ~18 B/key
+  loaded vs ~73) and serializes/loads a real queryable index almost for free -- but
+  it is **read-only** (no `add`/`delete`). `pyahocorasick` is heavier than `trieset`;
+  `datrie` needs a fixed alphabet declared up front and builds pathologically slowly.
+  (`hat-trie` does not build on Python 3.11+ and was not tested.)
+So the niche is a **mutable, arbitrary-byte, no-declared-alphabet** set you
+**build incrementally** and **prefix count / collect / autocomplete** -- where
+`trieset` is the fastest of the Python tries. If you only need `startsWith`
+existence on a near-static set, `datrie` or `pyahocorasick` answer that one op
+faster; for a static set where memory is paramount, prefer `marisa-trie`.
+## Install
+```sh
+pip install trieset
+```
+The first release is a source distribution, so `pip` compiles the CPython
+C-extension (`trieset.tt_trieset`) from the binding plus the bundled trie core.
+You need a **C++17 compiler** (`g++`/`clang++`; standard on Linux dev machines).
+**Portability:** the build uses `-O3` and **not** `-march=native`. The core's SIMD
+child-lookup is `__SSE2__`-guarded and SSE2 is the x86-64 baseline, so the build is
+correct on every x86-64 CPU (and falls back to a scalar path elsewhere). Prebuilt
+binary wheels (no compiler needed) are a planned follow-up.
+## Quickstart
+```python
+from trieset import TrieSet
+t = TrieSet([b"apple", b"apricot", b"banana"])
+t.add(b"avocado")
+t.update([b"cherry", b"apple"])     # set semantics: duplicate is a no-op
+b"banana" in t                       # True  (membership: O(len key))
+len(t)                               # 5
+list(t.prefix(b"ap"))                # [b'apple', b'apricot']  (sorted)
+t.prefix_count(b"a")                 # 3                       (O(prefix-len), counted in C)
+t.shortest_prefix(b"applesauce")     # b'apple'   (shortest stored key that prefixes the word)
+t.longest_prefix(b"applesauce")      # b'apple'
+list(t.prefixes(b"applesauce"))      # [b'apple']  (all stored keys that prefix the word)
+t.tolist()                           # all keys, sorted (one C loop)
+for k in t:                          # stream sorted keys, O(1) peak memory
+    ...
+t.first(), t.last()                  # (b'apple', b'cherry')
+blob = t.flush()                     # all keys, sorted, length-framed bytes:
+                                     #   repeated [u32-le len][key bytes]
+```
+## API and honest limitations
+- **Keys are byte strings only.** Any buffer-protocol object
+  (`bytes` / `bytearray` / `memoryview`) is accepted on input; every returned key
+  is `bytes`. **`str` raises `TypeError`** -- by design (this is a byte-string set).
+- **No random `remove` / `discard`.** The core supports only "drain the smallest",
+  not delete-by-key, so it is not exposed (avoids an `O(n)` emulation). This is the
+  build-then-query / sorted-flush niche; a delete-heavy set is not the target.
+- **No multiplicity.** It is a set: a key is present 0 or 1 times.
+- **Not thread-safe for concurrent writes** (single-writer core). The GIL is held
+  throughout, so concurrent Python threads are serialized correctly.
+Type hints ship with the package (`py.typed` + `trieset/__init__.pyi`).
+## Prior art (not novel, not first)
+The structure is not novel: prior art includes `marisa-trie`, `datrie`, the Cuckoo
+Trie, and the Patricia-trie memtables in ToplingDB / TerarkDB. The contribution
+here is the **clean from-scratch derivation** plus the **packaged
+build + prefix + sorted-flush combination** with honest, measured numbers --
+something `SortedSet` and the existing trie packages do not jointly offer. The
+shared trie core also backs a RocksDB `MemTableRep` plugin (a separate project).
+## License
+MIT. See [LICENSE](LICENSE).

trieset-0.1.0a1/_vendor/standalone/include/hash_linklist.h ADDED Viewed

@@ -0,0 +1,134 @@
+// hash_linklist.h -- a HashLinkList-style baseline, the "hash memtable" foil.
+//
+// This mirrors RocksDB's HashLinkListRep shape closely enough to make the
+// article's point: keys are hashed into buckets, and each bucket is a sorted
+// singly-linked list. That gives fast point ops (hash + short bucket walk), but
+// the buckets are only LOCALLY ordered -- there is no global order across
+// buckets. So a FLUSH (emit every key in sorted order, which an LSM-tree must do
+// to write an SST) cannot just walk a list: it must gather all entries and sort
+// them, O(n log n). The threaded trie's flush is an O(n) `next` walk -- that is
+// the hash-vs-sorted tradeoff this baseline exists to expose.
+//
+// Same arena + bytewise comparator as the trie/skiplist so the comparison is
+// honest. Single-threaded (Phase 1). ASCII-only.
+#ifndef TTRIE_HASH_LINKLIST_H_
+#define TTRIE_HASH_LINKLIST_H_
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <vector>
+#include "trie_arena.h"
+namespace ttrie {
+// HashLinkList -- a set of byte-string keys kept in hash buckets.
+//
+// A key is hashed with FNV-1a and masked into one of a power-of-two number of
+// buckets. Each bucket is a singly-linked list held in ascending bytewise key
+// order, so a lookup can stop at the first node that compares greater. Key
+// bytes and nodes are carved out of the member arena; this class never frees
+// them individually. Order is per-bucket only, which is why FlushSorted() has
+// to gather every node and sort. Non-copyable; no locking is done here.
+class HashLinkList {
+ public:
+  // One stored key: `key` points at `key_len` bytes copied into the arena (no
+  // terminator is written).
+  struct Node {
+    const char* key;
+    uint32_t key_len;
+    Node* next;  // next within the bucket (sorted by full key)
+  };
+  // Creates an empty table. num_buckets is rounded up to a power of two (so
+  // hash & mask selects a bucket); 0 or 1 gives a single bucket.
+  explicit HashLinkList(size_t num_buckets = (1u << 16)) {
+    // Cap the doubling at the largest power of two a size_t can hold. Without
+    // the cap, a request above that value shifts `b` out to 0 and the loop
+    // never ends; with it, the oversized request falls through to assign(),
+    // which is expected to fail with an exception instead of hanging.
+    constexpr size_t kLastDoublable = SIZE_MAX >> 1;
+    size_t b = 1;
+    while (b < num_buckets && b <= kLastDoublable) b <<= 1;
+    mask_ = b - 1;
+    buckets_.assign(b, nullptr);
+  }
+  HashLinkList(const HashLinkList&) = delete;
+  HashLinkList& operator=(const HashLinkList&) = delete;
+  // Inserts a copy of the `len` bytes at `key`. Returns false (no-op) if the
+  // key is already present -- set semantics, matching the trie and skiplist
+  // baselines -- and true once a new node has been linked in.
+  bool Insert(const char* key, uint32_t len) {
+    size_t h = Hash(key, len) & mask_;
+    // pp walks the link slots (the bucket head, then each node's `next`), so
+    // the new node can be spliced in without tracking a predecessor.
+    Node** pp = &buckets_[h];
+    while (*pp != nullptr) {
+      int c = Compare((*pp)->key, (*pp)->key_len, key, len);
+      if (c == 0) return false;  // duplicate
+      if (c > 0) break;          // insertion point (keep bucket sorted)
+      pp = &(*pp)->next;
+    }
+    Node* n = NewNode(key, len);
+    n->next = *pp;
+    *pp = n;
+    ++count_;
+    return true;
+  }
+  // Returns true iff a key equal to the `len` bytes at `key` is stored.
+  bool Contains(const char* key, uint32_t len) const {
+    size_t h = Hash(key, len) & mask_;
+    for (const Node* n = buckets_[h]; n != nullptr; n = n->next) {
+      int c = Compare(n->key, n->key_len, key, len);
+      if (c == 0) return true;
+      if (c > 0) return false;  // bucket is sorted -> past it
+    }
+    return false;
+  }
+  // FLUSH: emit every key in GLOBAL sorted order. This is the cost the trie
+  // avoids -- buckets are only locally sorted, so a global sort is unavoidable.
+  // Gathers O(n) then std::sort O(n log n). `out` is cleared first and ends up
+  // holding pointers to the nodes owned by this table. Returns the count
+  // emitted.
+  size_t FlushSorted(std::vector<const Node*>& out) const {
+    out.clear();
+    out.reserve(count_);
+    for (const Node* head : buckets_) {
+      for (const Node* n = head; n != nullptr; n = n->next) out.push_back(n);
+    }
+    std::sort(out.begin(), out.end(), [](const Node* a, const Node* b) {
+      return Compare(a->key, a->key_len, b->key, b->key_len) < 0;
+    });
+    return out.size();
+  }
+  // Number of distinct keys inserted so far.
+  size_t Size() const { return count_; }
+  // Arena (nodes + keys) plus the bucket array -- the honest memory footprint.
+  size_t BytesReserved() const {
+    return arena_.BytesReserved() + buckets_.size() * sizeof(Node*);
+  }
+  // Same sum as BytesReserved(), but using the arena's BytesRequested() figure.
+  size_t BytesRequested() const {
+    return arena_.BytesRequested() + buckets_.size() * sizeof(Node*);
+  }
+ private:
+  // Three-way bytewise compare: memcmp over the common length, and on a tie
+  // the shorter key orders first. Returns <0, 0 or >0.
+  static int Compare(const char* a, uint32_t al, const char* b, uint32_t bl) {
+    uint32_t n = al < bl ? al : bl;
+    int c = n ? std::memcmp(a, b, n) : 0;  // memcmp is skipped for length 0
+    if (c) return c;
+    if (al < bl) return -1;
+    if (al > bl) return 1;
+    return 0;
+  }
+  // FNV-1a, 64-bit: start from the offset basis, then for every byte XOR it in
+  // and multiply by the FNV prime.
+  static uint64_t Hash(const char* key, uint32_t len) {
+    // Standard 64-bit offset basis (decimal 14695981039346656037). The value
+    // used previously had lost its final digit, so the function was not the
+    // FNV-1a it claimed to be.
+    uint64_t h = 0xcbf29ce484222325ull;
+    for (uint32_t i = 0; i < len; ++i) {
+      h ^= static_cast<uint8_t>(key[i]);
+      h *= 1099511628211ull;  // 64-bit FNV prime
+    }
+    return h;
+  }
+  // Copies the key into the arena and returns a fresh, unlinked node for it.
+  Node* NewNode(const char* key, uint32_t len) {
+    // At least one byte is requested even for an empty key -- presumably so
+    // the arena is never asked for a zero-sized allocation; confirm.
+    char* kc = arena_.Allocate(len == 0 ? 1 : len);
+    if (len) std::memcpy(kc, key, len);
+    // NOTE(review): this allocation follows a key of arbitrary length; it
+    // assumes Arena::Allocate returns storage suitably aligned for Node --
+    // confirm against trie_arena.h.
+    Node* n = reinterpret_cast<Node*>(arena_.Allocate(sizeof(Node)));
+    n->key = kc;
+    n->key_len = len;
+    n->next = nullptr;
+    return n;
+  }
+  Arena arena_;                 // backing storage for key bytes and nodes
+  std::vector<Node*> buckets_;  // bucket heads; size is a power of two
+  size_t mask_ = 0;             // buckets_.size() - 1
+  size_t count_ = 0;            // number of stored keys
+};
+}  // namespace ttrie
+#endif  // TTRIE_HASH_LINKLIST_H_