lexindex 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexindex-0.1.0/.github/workflows/ci.yml +94 -0
- lexindex-0.1.0/.github/workflows/release.yml +100 -0
- lexindex-0.1.0/.gitignore +8 -0
- lexindex-0.1.0/CHANGELOG.md +28 -0
- lexindex-0.1.0/Cargo.lock +1518 -0
- lexindex-0.1.0/Cargo.toml +42 -0
- lexindex-0.1.0/LICENSE +21 -0
- lexindex-0.1.0/PKG-INFO +190 -0
- lexindex-0.1.0/README.md +163 -0
- lexindex-0.1.0/examples/bench.rs +130 -0
- lexindex-0.1.0/examples/bridge_clustering.py +81 -0
- lexindex-0.1.0/pyproject.toml +54 -0
- lexindex-0.1.0/python/lexindex/__init__.py +11 -0
- lexindex-0.1.0/python/lexindex/__init__.pyi +46 -0
- lexindex-0.1.0/python/lexindex/py.typed +0 -0
- lexindex-0.1.0/src/arena.rs +85 -0
- lexindex-0.1.0/src/lib.rs +92 -0
- lexindex-0.1.0/src/perfect_hash.rs +281 -0
- lexindex-0.1.0/src/python.rs +195 -0
- lexindex-0.1.0/src/string_index.rs +267 -0
- lexindex-0.1.0/tests/integration.rs +50 -0
- lexindex-0.1.0/tests/test_python.py +69 -0
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main, master]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
concurrency:
|
|
12
|
+
group: ci-${{ github.ref }}
|
|
13
|
+
cancel-in-progress: true
|
|
14
|
+
|
|
15
|
+
env:
|
|
16
|
+
CARGO_TERM_COLOR: always
|
|
17
|
+
|
|
18
|
+
jobs:
|
|
19
|
+
rust:
|
|
20
|
+
name: rust (fmt · clippy · test · audit)
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
steps:
|
|
23
|
+
- uses: actions/checkout@v4
|
|
24
|
+
- uses: dtolnay/rust-toolchain@stable
|
|
25
|
+
with:
|
|
26
|
+
components: rustfmt, clippy
|
|
27
|
+
- uses: Swatinem/rust-cache@v2
|
|
28
|
+
- uses: taiki-e/install-action@v2
|
|
29
|
+
with:
|
|
30
|
+
tool: cargo-audit
|
|
31
|
+
- name: fmt
|
|
32
|
+
run: cargo fmt --all --check
|
|
33
|
+
- name: clippy (default = with mph)
|
|
34
|
+
run: cargo clippy --all-targets -- -D warnings
|
|
35
|
+
- name: clippy (no-default-features = fst only)
|
|
36
|
+
run: cargo clippy --no-default-features --all-targets -- -D warnings
|
|
37
|
+
- name: test (default)
|
|
38
|
+
run: cargo test
|
|
39
|
+
- name: test (no-default-features)
|
|
40
|
+
run: cargo test --no-default-features
|
|
41
|
+
- name: cargo audit
|
|
42
|
+
run: cargo audit
|
|
43
|
+
|
|
44
|
+
python-build:
|
|
45
|
+
name: python (ruff · clippy · build)
|
|
46
|
+
runs-on: ubuntu-latest
|
|
47
|
+
env:
|
|
48
|
+
PYO3_USE_ABI3_FORWARD_COMPATIBILITY: "1"
|
|
49
|
+
steps:
|
|
50
|
+
- uses: actions/checkout@v4
|
|
51
|
+
- uses: dtolnay/rust-toolchain@stable
|
|
52
|
+
with:
|
|
53
|
+
components: clippy
|
|
54
|
+
- uses: Swatinem/rust-cache@v2
|
|
55
|
+
- uses: astral-sh/setup-uv@v5
|
|
56
|
+
- name: ruff check
|
|
57
|
+
run: uvx ruff check python/ tests/test_python.py
|
|
58
|
+
- name: ruff format --check
|
|
59
|
+
run: uvx ruff format --check python/ tests/test_python.py
|
|
60
|
+
- name: clippy (python bindings)
|
|
61
|
+
run: cargo clippy --features python -- -D warnings
|
|
62
|
+
- name: build abi3 wheel
|
|
63
|
+
run: uv run --with maturin maturin build --release --out dist
|
|
64
|
+
- uses: actions/upload-artifact@v4
|
|
65
|
+
with:
|
|
66
|
+
name: wheel
|
|
67
|
+
path: dist/*.whl
|
|
68
|
+
|
|
69
|
+
python-test:
|
|
70
|
+
name: python (pytest · stubtest · py${{ matrix.python-version }})
|
|
71
|
+
needs: python-build
|
|
72
|
+
runs-on: ubuntu-latest
|
|
73
|
+
strategy:
|
|
74
|
+
fail-fast: false
|
|
75
|
+
matrix:
|
|
76
|
+
python-version: ["3.11", "3.12", "3.13", "3.14"]
|
|
77
|
+
steps:
|
|
78
|
+
- uses: actions/checkout@v4
|
|
79
|
+
- uses: astral-sh/setup-uv@v5
|
|
80
|
+
- uses: actions/download-artifact@v4
|
|
81
|
+
with:
|
|
82
|
+
name: wheel
|
|
83
|
+
path: dist
|
|
84
|
+
# Install-only: the single abi3 wheel must import and pass pytest and stubtest on every supported interpreter.
|
|
85
|
+
- name: pytest
|
|
86
|
+
run: |
|
|
87
|
+
wheel=$(ls dist/*.whl | head -1)
|
|
88
|
+
uv run --python ${{ matrix.python-version }} --with pytest --with "$wheel" \
|
|
89
|
+
pytest tests/test_python.py -q
|
|
90
|
+
- name: stubtest
|
|
91
|
+
run: |
|
|
92
|
+
wheel=$(ls dist/*.whl | head -1)
|
|
93
|
+
uv run --python ${{ matrix.python-version }} --with mypy --with "$wheel" \
|
|
94
|
+
python -m mypy.stubtest lexindex
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# Build redistributable wheels for every platform and (on a version tag) publish to PyPI.
|
|
4
|
+
# Publishing uses PyPI Trusted Publishing (OIDC) — add a pending publisher for this repo at
|
|
5
|
+
# https://pypi.org/manage/account/publishing/ (workflow `release.yml`, environment `pypi`) before the
|
|
6
|
+
# first `v*` tag, and create a `pypi` environment in the repo settings.
|
|
7
|
+
|
|
8
|
+
on:
|
|
9
|
+
push:
|
|
10
|
+
tags: ["v*"]
|
|
11
|
+
workflow_dispatch:
|
|
12
|
+
|
|
13
|
+
permissions:
|
|
14
|
+
contents: read
|
|
15
|
+
|
|
16
|
+
jobs:
|
|
17
|
+
wheels:
|
|
18
|
+
name: wheels ${{ matrix.platform.runner }} ${{ matrix.platform.target }}
|
|
19
|
+
runs-on: ${{ matrix.platform.runner }}
|
|
20
|
+
strategy:
|
|
21
|
+
fail-fast: false
|
|
22
|
+
matrix:
|
|
23
|
+
platform:
|
|
24
|
+
- { runner: ubuntu-latest, target: x86_64 }
|
|
25
|
+
- { runner: ubuntu-latest, target: aarch64 }
|
|
26
|
+
# macOS x86_64 is cross-built on the arm64 macos-14 runner: dedicated Intel (macos-13)
|
|
27
|
+
# runners are scarce/deprecated. abi3 needs no interpreter at build time, so this is sound.
|
|
28
|
+
- { runner: macos-14, target: x86_64 }
|
|
29
|
+
- { runner: macos-14, target: aarch64 }
|
|
30
|
+
- { runner: windows-latest, target: x64 }
|
|
31
|
+
steps:
|
|
32
|
+
- uses: actions/checkout@v4
|
|
33
|
+
- uses: actions/setup-python@v5
|
|
34
|
+
with:
|
|
35
|
+
python-version: "3.x"
|
|
36
|
+
- name: Build wheels
|
|
37
|
+
uses: PyO3/maturin-action@v1
|
|
38
|
+
with:
|
|
39
|
+
target: ${{ matrix.platform.target }}
|
|
40
|
+
args: --release --out dist
|
|
41
|
+
manylinux: auto
|
|
42
|
+
sccache: "true"
|
|
43
|
+
- uses: actions/upload-artifact@v4
|
|
44
|
+
with:
|
|
45
|
+
name: wheels-${{ matrix.platform.runner }}-${{ matrix.platform.target }}
|
|
46
|
+
path: dist
|
|
47
|
+
|
|
48
|
+
sdist:
|
|
49
|
+
name: sdist
|
|
50
|
+
runs-on: ubuntu-latest
|
|
51
|
+
steps:
|
|
52
|
+
- uses: actions/checkout@v4
|
|
53
|
+
- name: Build sdist
|
|
54
|
+
uses: PyO3/maturin-action@v1
|
|
55
|
+
with:
|
|
56
|
+
command: sdist
|
|
57
|
+
args: --out dist
|
|
58
|
+
- uses: actions/upload-artifact@v4
|
|
59
|
+
with:
|
|
60
|
+
name: wheels-sdist
|
|
61
|
+
path: dist
|
|
62
|
+
|
|
63
|
+
publish:
|
|
64
|
+
name: publish to PyPI
|
|
65
|
+
runs-on: ubuntu-latest
|
|
66
|
+
needs: [wheels, sdist]
|
|
67
|
+
if: startsWith(github.ref, 'refs/tags/')
|
|
68
|
+
environment: pypi
|
|
69
|
+
permissions:
|
|
70
|
+
id-token: write # OIDC token for PyPI Trusted Publishing
|
|
71
|
+
steps:
|
|
72
|
+
- uses: actions/download-artifact@v4
|
|
73
|
+
with:
|
|
74
|
+
pattern: wheels-*
|
|
75
|
+
merge-multiple: true
|
|
76
|
+
path: dist
|
|
77
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
78
|
+
|
|
79
|
+
github-release:
|
|
80
|
+
name: GitHub Release
|
|
81
|
+
runs-on: ubuntu-latest
|
|
82
|
+
needs: [publish]
|
|
83
|
+
if: startsWith(github.ref, 'refs/tags/')
|
|
84
|
+
permissions:
|
|
85
|
+
contents: write # create the GitHub Release
|
|
86
|
+
steps:
|
|
87
|
+
- uses: actions/checkout@v4
|
|
88
|
+
- name: Extract this version's CHANGELOG section
|
|
89
|
+
run: |
|
|
90
|
+
ver="${GITHUB_REF_NAME#v}"
|
|
91
|
+
awk -v v="$ver" '
|
|
92
|
+
$0 ~ "^## \\[" v "\\]" { f = 1; next }
|
|
93
|
+
f && /^## \[/ { exit }
|
|
94
|
+
f { print }
|
|
95
|
+
' CHANGELOG.md > release-notes.md
|
|
96
|
+
echo "----- release notes -----"; cat release-notes.md
|
|
97
|
+
- uses: softprops/action-gh-release@v2
|
|
98
|
+
with:
|
|
99
|
+
name: lexindex ${{ github.ref_name }}
|
|
100
|
+
body_path: release-notes.md
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format follows
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and the project adheres to
|
|
5
|
+
[Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [0.1.0] — 2026-06-28
|
|
8
|
+
|
|
9
|
+
First public release — compact, immutable string<->id indexes for huge catalogs; a standalone Rust +
|
|
10
|
+
Python library that also pairs with `betula-cluster` (map string ids to cluster ids and back).
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- **`StringIndex`** — ordered, FST-backed index: exact `string <-> id`, plus prefix, range, fuzzy
|
|
15
|
+
(bounded Levenshtein edit distance), and subsequence iteration — all automaton-driven over the FST,
|
|
16
|
+
never a full scan. Serialises to a flat, relocatable blob (`save` / `load` / `to_bytes` /
|
|
17
|
+
`from_bytes`) with fully length- and offset-validated parsing (safe on untrusted input).
|
|
18
|
+
- **`PerfectHashIndex`** — minimal-perfect-hash dictionary (`ptr_hash`): verified-membership `id`,
|
|
19
|
+
a faster `id_unchecked` for closed vocabularies (~1.25× faster than `std::collections::HashMap` on point lookup),
|
|
20
|
+
reverse lookup, and persistence (`save` / `load`) via `epserde`, keyed on a version-stable hash
|
|
21
|
+
(FNV-1a + splitmix64) so a serialised MPH reloads and queries identically on any build.
|
|
22
|
+
- **Python bindings** (PyO3 abi3 extension, CPython 3.11+): `pip install lexindex`, zero runtime
|
|
23
|
+
dependencies, typed (`py.typed` + stubs).
|
|
24
|
+
- **Feature gating** — `mph` (default) provides `PerfectHashIndex` (pulls `ptr_hash` + `epserde`);
|
|
25
|
+
`--no-default-features` is an `fst`-only build, free of the informational RustSec advisories on the
|
|
26
|
+
`ptr_hash` dependency tree. `fst`'s `levenshtein` is always on for fuzzy search.
|
|
27
|
+
- **Benchmark** — `cargo run --release --example bench` compares both indexes against
|
|
28
|
+
`std::collections::HashMap` / `BTreeMap` (build time, lookup latency, serialised size).
|