stride-align 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {stride_align-0.2.0 → stride_align-0.3.0}/.claude/settings.local.json +5 -1
- {stride_align-0.2.0 → stride_align-0.3.0}/.gitignore +5 -1
- {stride_align-0.2.0 → stride_align-0.3.0}/BENCHMARK.md +248 -0
- stride_align-0.3.0/CHANGELOG.md +89 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/CMakeLists.txt +7 -2
- {stride_align-0.2.0 → stride_align-0.3.0}/PKG-INFO +77 -21
- {stride_align-0.2.0 → stride_align-0.3.0}/README.md +73 -18
- stride_align-0.3.0/docs/adding-a-new-algorithm.md +307 -0
- stride_align-0.3.0/docs/loongarch_lsx_lasx_vandn_gotcha.md +100 -0
- stride_align-0.3.0/docs/loongson-vs-tiger-lake-cdist-2026-05-24.md +127 -0
- stride_align-0.3.0/docs/power8-gcc10-workarounds.md +91 -0
- stride_align-0.3.0/include/stride_align/hamming.hpp +45 -0
- stride_align-0.3.0/include/stride_align/indel.hpp +154 -0
- stride_align-0.3.0/include/stride_align/jaro.hpp +273 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/include/stride_align/levenshtein.hpp +105 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/pyproject.toml +6 -5
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/arm_neon128.hpp +69 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_aarch64_neon.hpp +64 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_aarch64_sve.hpp +74 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_aarch64_sve2.hpp +74 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_loongarch64_lasx.hpp +69 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_loongarch64_lsx.hpp +69 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_powerpc64_vsx.hpp +74 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/macos_arm64_neon.hpp +64 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/profile_traceback.hpp +12 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/swar.hpp +17 -2
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_avx10_256.hpp +122 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_avx10_512.hpp +122 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_avx2.hpp +122 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_avx512bwvl.hpp +122 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_sse41.hpp +73 -0
- stride_align-0.3.0/src/cpp/cdist_runtime.hpp +346 -0
- stride_align-0.3.0/src/cpp/cdist_simd.hpp +563 -0
- stride_align-0.3.0/src/cpp/cdist_threshold.hpp +600 -0
- stride_align-0.3.0/src/cpp/cdist_topk.hpp +477 -0
- stride_align-0.3.0/src/cpp/hamming_dispatch.hpp +150 -0
- stride_align-0.3.0/src/cpp/hamming_simd.hpp +258 -0
- stride_align-0.3.0/src/cpp/indel_dispatch.hpp +132 -0
- stride_align-0.3.0/src/cpp/indel_simd.hpp +250 -0
- stride_align-0.3.0/src/cpp/jaro_dispatch.hpp +204 -0
- stride_align-0.3.0/src/cpp/jaro_simd.hpp +695 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/levenshtein_simd.hpp +213 -97
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/levenshtein_simd_ops.hpp +68 -5
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/module_bindings.hpp +1086 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/osa_simd.hpp +127 -22
- stride_align-0.3.0/src/cpp/topk.hpp +152 -0
- stride_align-0.3.0/src/cpp/true_damerau_dispatch.hpp +126 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/stride_align/__init__.py +914 -34
- {stride_align-0.2.0 → stride_align-0.3.0}/src/stride_align/benchmark.py +31 -32
- stride_align-0.3.0/tests/test_cdist.py +337 -0
- stride_align-0.3.0/tests/test_cdist_above_threshold.py +289 -0
- stride_align-0.3.0/tests/test_cdist_length_pruning.py +334 -0
- stride_align-0.3.0/tests/test_cdist_top_k.py +244 -0
- stride_align-0.3.0/tests/test_indel.py +175 -0
- stride_align-0.3.0/tests/test_jaro.py +190 -0
- stride_align-0.3.0/tests/test_top_k.py +344 -0
- stride_align-0.3.0/tests/test_true_damerau_levenshtein.py +184 -0
- stride_align-0.3.0/tools/bench_cdist_pruning.py +86 -0
- stride_align-0.2.0/11} +0 -0
- stride_align-0.2.0/12} +0 -0
- stride_align-0.2.0/5} +0 -0
- stride_align-0.2.0/TRANSLATION_REPORT.txt +0 -34
- stride_align-0.2.0/wheelhouse/stride_align-0.1.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- stride_align-0.2.0/wheelhouse/stride_align-0.1.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- stride_align-0.2.0/wheelhouse/stride_align-0.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- stride_align-0.2.0/wheelhouse/stride_align-0.1.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- stride_align-0.2.0/wheelhouse/stride_align-0.1.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- stride_align-0.2.0/wheelhouse_v0.2.0/stride_align-0.2.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- stride_align-0.2.0/wheelhouse_v0.2.0/stride_align-0.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- stride_align-0.2.0/wheelhouse_v0.2.0/stride_align-0.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- stride_align-0.2.0/wheelhouse_v0.2.0/stride_align-0.2.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- stride_align-0.2.0/wheelhouse_v0.2.0/stride_align-0.2.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/.codex +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/.gitattributes +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/CITATION.bib +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/LICENSE +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/SME-EXPERIMENT.md +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/TRANSLATION_REPORT.md +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/TRANSLATION_RULES.md +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmark.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/graviton4-arm-simd-parasail-2026-05-16.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/graviton4-arm-simd-parasail-2026-05-18.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/graviton4-lev-osa-2026-05-19.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/intel-damerau-levenshtein-2026-05-19.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/intel-levenshtein-2026-05-19.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/intel-levenshtein-v2-2026-05-19.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/loongson-2026-05-13.md +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/loongson-lev-osa-2026-05-19.txt +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/loongson-native-2026-05-18.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/loongson-path-native-2026-05-13.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/loongson-score-1to1-parasail-2026-05-13.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/loongson-score-native-2026-05-13.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/loongson-sw-farrar-exactfill-baseline-2026-05-14.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/loongson-sw-farrar-exactfill-study-2026-05-14.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-2026-05-13.md +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-2026-05-14.md +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-2026-05-18.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-focused-2026-05-14.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-lev-osa-2026-05-19.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-linear-trace-onepass-parasail-study-2026-05-14.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-microbench-2026-05-14.txt +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-microbench-nw-affine-primitives-2026-05-14.txt +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-nw-affine-fastpaths-2026-05-14.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-nw-affine-primitives-2026-05-14.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-path-parasail-2026-05-13.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-score-native-2026-05-13.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-score-parasail-2026-05-13.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/macos-arm64-neon-sw-farrar-parasail-study-2026-05-14.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/power8-lev-osa-2026-05-19.txt +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/power8-vsx-2026-05-17.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/power8-vsx-2026-05-17.md +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/power8-vsx-2026-05-18.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/x86-sw-farrar-exactfill-study-2026-05-14.csv +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/benchmarks/x86_microbench_baseline.json +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/demo/demo2.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/demo/demo3.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/docs/x86_algorithmic_deltas.txt +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/fast.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/BENCHMARK.html +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/apple-touch-icon.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/fast.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon-128.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon-16.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon-180.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon-192.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon-256.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon-32.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon-48.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon-512.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon-64.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/favicon.ico +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/icon-192.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/icon-512.png +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/index.html +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/site.webmanifest +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/html/style.css +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/include/stride_align/alignment.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/affine.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/affine_fixed_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/affine_scalable_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/arm_neon_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/arm_sve_backend.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/arm_sve_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/farrar_fixed_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/farrar_scalable_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/generic.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/generic.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_aarch64_neon.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_aarch64_sve.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_aarch64_sve2.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_loongarch64_lasx.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_loongarch64_lsx.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_powerpc64_vsx.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_riscv64_rvv.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/linux_riscv64_rvv.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/loongarch_fixed_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/macos_arm64_neon.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/powerpc_vsx_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/riscv_rvv_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/score_fast_paths.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/swar.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_avx10_256.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_avx10_512.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_avx2.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_avx512.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_fixed_kernel.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/backends/x86_sse.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/byte_view.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/cpu.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/cpu.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/cpu_module.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/farrar_preprocess.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/levenshtein_dispatch.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/preprocess.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/promotion.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/tools/arm_neon_microbench_backend.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/tools/x86_microbench.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/tools/x86_microbench_avx2.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/tools/x86_microbench_avx512bwvl.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/tools/x86_microbench_common.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/tools/x86_microbench_kernels.hpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/cpp/tools/x86_microbench_parasail.cpp +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/stride_align/_cpu.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/stride_align/_fallback.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/stride_align/_pybackend.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/stride_align/file_compare.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/src/stride_align/py.typed +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/tests/test_api.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/tools/benchmark_libs.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/tools/correctness_check.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/tools/md_to_html.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/tools/pinned_benchmark_sweep.py +0 -0
- {stride_align-0.2.0 → stride_align-0.3.0}/tools/x86_microbench_regression.py +0 -0
|
@@ -32,7 +32,11 @@
|
|
|
32
32
|
"Bash(~/.pyenv/bin/pyenv versions *)",
|
|
33
33
|
"Bash(scp *)",
|
|
34
34
|
"Bash(gh --version)",
|
|
35
|
-
"Bash(gh auth *)"
|
|
35
|
+
"Bash(gh auth *)",
|
|
36
|
+
"Bash(env)",
|
|
37
|
+
"Bash(.venv/bin/pip install *)",
|
|
38
|
+
"Bash(podman info *)",
|
|
39
|
+
"Bash(.venv/bin/cibuildwheel --help)"
|
|
36
40
|
]
|
|
37
41
|
}
|
|
38
42
|
}
|
|
@@ -45,6 +45,8 @@ ratio = baseline_median_seconds / stride_align_median_seconds
|
|
|
45
45
|
| Damerau-Lev (Graviton4, short tgts) | `linux_aarch64_neon`/`sve`/`sve2` | rapidfuzz OSA | 4 | **2.85x** | 2.83x | 2.27x | 3.89x |
|
|
46
46
|
| Lev (Power8 VSX, mixed tgts) | `linux_powerpc64_vsx` | generic (no rapidfuzz wheel) | 8 | **2.40x** | 2.51x | 1.56x | 3.03x |
|
|
47
47
|
| Damerau-Lev (Power8 VSX, mixed tgts) | `linux_powerpc64_vsx` | generic (no rapidfuzz wheel) | 7 | **1.99x** | 2.22x | 1.46x | 2.57x |
|
|
48
|
+
| Jaro batch (cross-arch, N=1000) | `x86_avx512bwvl` / `*_neon` / `*_lasx` / `*_vsx` | rapidfuzz | 10 | **5.1x** | 3.7x | 1.54x | 263x |
|
|
49
|
+
| cdist pruning (cross-arch, T=0.99) | `x86_avx512bwvl` / `*_neon` / `*_lasx` | own T=0 baseline | 24 | **426x** | 512x | 145x | 1,408x |
|
|
48
50
|
|
|
49
51
|
## Intel x86 - 2026-05-18
|
|
50
52
|
|
|
@@ -972,6 +974,252 @@ useful as a correctness/reference backend only.
|
|
|
972
974
|
4. Build parasail from source for ppc64le and add a parasail column to the next sweep — every other family in this file has at least one parasail point of reference.
|
|
973
975
|
5. Investigate why SWAR loses to generic on Power8 via an asm dump of the generic score loop.
|
|
974
976
|
|
|
977
|
+
## Jaro + Jaro-Winkler (cross-arch) - 2026-05-23
|
|
978
|
+
|
|
979
|
+
First cross-arch sweep of the new Jaro / Jaro-Winkler SIMD batch
|
|
980
|
+
kernels. One target per 64-bit SIMD lane; the query's per-byte PEQ is
|
|
981
|
+
gathered per-lane on each iteration, and the per-lane window mask is
|
|
982
|
+
built via the new `shl_var_u64` / `shr_var_u64` Ops primitives. After
|
|
983
|
+
the SIMD inner loop, a scalar finishing pass per lane computes
|
|
984
|
+
match/transposition counts from the bitmaps.
|
|
985
|
+
|
|
986
|
+
Same workload everywhere: random lowercase strings, one query of the
|
|
987
|
+
listed length, 1000 targets of the same length. Median of 3 runs of
|
|
988
|
+
50 iterations each. Baseline is `rapidfuzz.distance.Jaro.similarity`
|
|
989
|
+
called in a Python list comprehension — the natural "fuzzy match one
|
|
990
|
+
query against many targets" pattern.
|
|
991
|
+
|
|
992
|
+
Output is bit-equivalent to rapidfuzz across all listed backends:
|
|
993
|
+
verified on 500 random batches × ~25 targets each (~12,500 pairs per
|
|
994
|
+
backend); 0 mismatches at machine precision.
|
|
995
|
+
|
|
996
|
+
### Singular SIMD batch (one query, 1000 targets)
|
|
997
|
+
|
|
998
|
+
| Host / backend | Query len | stride-align | rapidfuzz | Ratio |
|
|
999
|
+
| --- | ---: | ---: | ---: | ---: |
|
|
1000
|
+
| Tiger Lake `x86_avx512bwvl` | 12 | 40.7 us | 181.5 us | **4.46x** |
|
|
1001
|
+
| Tiger Lake `x86_avx512bwvl` | 32 | 105.1 us | 289.2 us | **2.75x** |
|
|
1002
|
+
| Graviton4 `linux_aarch64_neon` | 12 | 43 us | 269 us | **6.26x** |
|
|
1003
|
+
| Graviton4 `linux_aarch64_neon` | 32 | 100 us | 353 us | **3.53x** |
|
|
1004
|
+
| Apple M-series `macos_arm64_neon` | 12 | 16 us | 151 us | **9.36x** |
|
|
1005
|
+
| Apple M-series `macos_arm64_neon` | 32 | 48 us | 183 us | **3.86x** |
|
|
1006
|
+
| Loongson `linux_loongarch64_lasx` | 12 | 87 us | 13,952 us | 161x |
|
|
1007
|
+
| Loongson `linux_loongarch64_lasx` | 32 | 187 us | 49,299 us | 263x |
|
|
1008
|
+
| Power8 `linux_powerpc64_vsx` | 12 | 194 us | 600 us | **3.09x** |
|
|
1009
|
+
| Power8 `linux_powerpc64_vsx` | 32 | 467 us | 719 us | **1.54x** |
|
|
1010
|
+
|
|
1011
|
+
The Loongson ratios are dramatic because rapidfuzz has no LSX/LASX
|
|
1012
|
+
SIMD path on LoongArch64 — it falls through to a scalar C kernel,
|
|
1013
|
+
while our LSX/LASX bit-parallel batch fans out 2/4 targets per vector
|
|
1014
|
+
iteration.
|
|
1015
|
+
|
|
1016
|
+
### One bug surfaced during deployment
|
|
1017
|
+
|
|
1018
|
+
The VSX backend (which used `*reinterpret_cast<Vec*>(ptr)` for `load_aligned`/`store_aligned`)
|
|
1019
|
+
ran into a strict-aliasing miscompile under GCC -O3: the scalar
|
|
1020
|
+
writes to per-iteration `LaneScratch` could be reordered past the
|
|
1021
|
+
same-block Vec read, silently dropping lane-1 match updates on every
|
|
1022
|
+
2-target group. The Levenshtein SIMD kernel uses the same primitives
|
|
1023
|
+
but a different scratch pattern, so it didn't trip. Fix: switch VSX
|
|
1024
|
+
to `vec_xl` / `vec_xst`, the proper VSX load/store intrinsics.
|
|
1025
|
+
Documented in commit `8ae4905`.
|
|
1026
|
+
|
|
1027
|
+
### Multi-word query batch (q_len in (64, 256], m_len ≤ 64)
|
|
1028
|
+
|
|
1029
|
+
Same workload shape, query length stretched into the multi-word path
|
|
1030
|
+
(W = 2 for q in (64, 128], W = 3 for (128, 192], W = 4 for (192, 256]).
|
|
1031
|
+
Targets stay short; b_matched fits in a single word.
|
|
1032
|
+
|
|
1033
|
+
| Host / backend | q_len | m_len | N | stride-align | rapidfuzz | Ratio |
|
|
1034
|
+
| --- | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
1035
|
+
| Tiger Lake `x86_avx512bwvl` | 50 | 20 | 500 | 45 us | 227 us | **5.01x** |
|
|
1036
|
+
| Tiger Lake `x86_avx512bwvl` | 100 | 20 | 500 | 44 us | 255 us | **5.81x** |
|
|
1037
|
+
| Tiger Lake `x86_avx512bwvl` | 150 | 20 | 500 | 52 us | 299 us | **5.79x** |
|
|
1038
|
+
| Tiger Lake `x86_avx512bwvl` | 200 | 20 | 500 | 60 us | 328 us | **5.45x** |
|
|
1039
|
+
| Graviton4 `linux_aarch64_neon` | 100 | 20 | 500 | 53 us | 238 us | **4.52x** |
|
|
1040
|
+
| Apple M-series `macos_arm64_neon` | 100 | 20 | 500 | 25 us | 124 us | **4.96x** |
|
|
1041
|
+
| Loongson `linux_loongarch64_lasx` | 100 | 20 | 500 | 110 us | 39,461 us | 358x |
|
|
1042
|
+
| Power8 `linux_powerpc64_vsx` | 100 | 20 | 500 | 1,532 us | 581 us | 0.38x |
|
|
1043
|
+
|
|
1044
|
+
Power8 is the one regression. The 2-block (W=2) inner loop doubles
|
|
1045
|
+
the gather count per j vs the single-word path, and Power8's VSX
|
|
1046
|
+
gather is emulated as scalar `vec_extract`/`vec_insert` (no native
|
|
1047
|
+
ppc gather instruction at this lane count). The per-iteration
|
|
1048
|
+
overhead exceeds rapidfuzz's tight scalar loop at this size.
|
|
1049
|
+
Workaround: if q_len ≤ 64 the single-word path stays 3x ahead;
|
|
1050
|
+
above 64 on Power8 specifically, prefer the per-target scalar
|
|
1051
|
+
dispatch (which the singular-API path already uses for q > 64).
|
|
1052
|
+
Future work: native pre-shuffle of the gather indices, or a
|
|
1053
|
+
Power-specific tuned gather using `vec_perm`.
|
|
1054
|
+
|
|
1055
|
+
### Multi-word target batch (q_len ≤ 256, m_len in (64, 256])
|
|
1056
|
+
|
|
1057
|
+
The second multi-word axis: target length crossing the 64-bit
|
|
1058
|
+
register boundary, in addition to (or independent of) the query
|
|
1059
|
+
length. b_matched becomes `std::array<Vec, W_target>`; the inner
|
|
1060
|
+
loop only updates block `j / 64` so the per-iteration work is the
|
|
1061
|
+
same as single-word target.
|
|
1062
|
+
|
|
1063
|
+
| Host / backend | q | m | N | stride-align | rapidfuzz | Ratio |
|
|
1064
|
+
| --- | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
1065
|
+
| Tiger Lake `x86_avx512bwvl` | 100 | 100 | 500 | 196 us | 789 us | **4.02x** |
|
|
1066
|
+
| Tiger Lake `x86_avx512bwvl` | 200 | 150 | 500 | 440 us | 1305 us | **2.97x** |
|
|
1067
|
+
| Tiger Lake `x86_avx512bwvl` | 60 | 100 | 500 | 171 us | 631 us | **3.70x** |
|
|
1068
|
+
| Tiger Lake `x86_avx512bwvl` | 80 | 200 | 500 | 343 us | 1230 us | **3.58x** |
|
|
1069
|
+
| Graviton4 `linux_aarch64_neon` | 100 | 100 | 500 | 246 us | 643 us | **2.62x** |
|
|
1070
|
+
| Apple M-series `macos_arm64_neon` | 100 | 100 | 500 | 100 us | 296 us | **2.95x** |
|
|
1071
|
+
| Loongson `linux_loongarch64_lasx` | 100 | 100 | 500 | 509 us | 142,489 us | 279x |
|
|
1072
|
+
|
|
1073
|
+
The dispatch picks `(W_query, W_target)` from the actual lengths in
|
|
1074
|
+
the batch, so short-target inputs still get `W_target = 1` (no wasted
|
|
1075
|
+
work). 16 instantiations max per backend.
|
|
1076
|
+
|
|
1077
|
+
### Constraints (v0.3.0)
|
|
1078
|
+
|
|
1079
|
+
* SIMD path covers query lengths up to 256 AND target lengths up to
|
|
1080
|
+
256 (W = 1..4 blocks per side). Above 256 on either side it falls
|
|
1081
|
+
through to per-target scalar dispatch (bit-parallel single-word
|
|
1082
|
+
for ≤ 64 inputs and the scalar reference above that).
|
|
1083
|
+
* Byte-compatible inputs (bytes / 1-byte unicode). Wider unicode
|
|
1084
|
+
falls through to scalar via the prepared-token path.
|
|
1085
|
+
|
|
1086
|
+
### Levenshtein audit (no changes needed)
|
|
1087
|
+
|
|
1088
|
+
Lev's SIMD batch already handles query lengths up to 256 via the
|
|
1089
|
+
same W = 1..4 multi-word pattern. Target length on Lev's side is
|
|
1090
|
+
just an iteration count over the inner DP loop — no per-target
|
|
1091
|
+
register-width constraint — so multi-word target is a non-issue for
|
|
1092
|
+
Lev. Above q_len = 256, Lev's scalar dispatch picks up via Hyyrö's
|
|
1093
|
+
multi-word Myers (no upper bound on q_len). Extending the SIMD
|
|
1094
|
+
batch beyond W = 4 would be a small change, but the use case (queries >
|
|
1095
|
+
256 chars in batches of 1000+) is rare.
|
|
1096
|
+
|
|
1097
|
+
## cdist pruning + cutoff push-down (Intel x86) - 2026-05-24
|
|
1098
|
+
|
|
1099
|
+
Three optimizations stacked on top of `cdist_above_threshold` and
|
|
1100
|
+
`cdist_top_k`:
|
|
1101
|
+
|
|
1102
|
+
1. **Length-difference pruning.** Each pair `(q, t)` is gated by a
|
|
1103
|
+
closed-form upper bound on the achievable normalized similarity
|
|
1104
|
+
before any SIMD work runs. Bounds: `min/max` for Lev / OSA /
|
|
1105
|
+
true-DL, `(2 + min/max)/3` for Jaro, `2*min/(q+t)` for Indel,
|
|
1106
|
+
`1.0` if equal-length for Hamming.
|
|
1107
|
+
2. **Row-sort by query length, descending.** `cdist_top_k`
|
|
1108
|
+
processes the longest queries first so close-length high-scoring
|
|
1109
|
+
pairs surface early and the shared `global_min_bound` atomic
|
|
1110
|
+
reaches a useful value before the short-query rows run.
|
|
1111
|
+
3. **Per-pair cutoff push-down into the SIMD kernel.** The Myers /
|
|
1112
|
+
OSA / Hamming inner loops bail per lane when the running distance exceeds
|
|
1113
|
+
the per-pair cutoff plus the remaining-chars allowance; bailed
|
|
1114
|
+
lanes return the `cutoff + 1` sentinel. Lev/OSA use
|
|
1115
|
+
`floor((1-T)*max(|q|, |t|) + 1e-9)`; Hamming uses
|
|
1116
|
+
`floor((1-T)*|q|)`. Indel's bit-parallel Allison-Dix doesn't
|
|
1117
|
+
track a running distance per column, and Jaro/JW have multi-term
|
|
1118
|
+
scores without a clean per-column bail, so those two scorer
|
|
1119
|
+
families benefit from length pruning only.
|
|
1120
|
+
|
|
1121
|
+
All three are correctness-preserving — tests in
|
|
1122
|
+
`tests/test_cdist_length_pruning.py` pin the result set against the
|
|
1123
|
+
un-pruned full `cdist` matrix at multiple thresholds and at the
|
|
1124
|
+
floating-point integer-boundary edges.
|
|
1125
|
+
|
|
1126
|
+
### Setup
|
|
1127
|
+
|
|
1128
|
+
Tiger Lake `x86_avx512bwvl`, N=400 queries × M=400 targets =
|
|
1129
|
+
160,000 pairs, random lowercase ASCII. Lengths 4–40 for Lev / OSA /
|
|
1130
|
+
Indel / Jaro / JW; length 100 (equal-length) for Hamming.
|
|
1131
|
+
`cpu_count=4`. Reproduce via `tools/bench_cdist_pruning.py --scorer
|
|
1132
|
+
<name>`.
|
|
1133
|
+
|
|
1134
|
+
### `cdist_above_threshold` throughput (pairs/sec)
|
|
1135
|
+
|
|
1136
|
+
| Scorer | T=0 | T=0.3 | T=0.5 | T=0.7 | T=0.85 | T=0.95 | T=0.99 |
|
|
1137
|
+
| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
1138
|
+
| LEVENSHTEIN_NORMALIZED | 0.49M | 12.7M | 31.3M | 53.7M | 116M | 226M | **290M** |
|
|
1139
|
+
| DAMERAU_LEVENSHTEIN_NORMALIZED| 0.50M | 13.0M | 35.5M | 70.2M | 147M | 190M | **297M** |
|
|
1140
|
+
| HAMMING_NORMALIZED (n=100) | 0.46M | 22.5M | 122M | 108M | 110M | 114M | **136M** |
|
|
1141
|
+
| INDEL_NORMALIZED | 0.47M | 11.3M | 24.7M | 39.0M | 66M | 143M | **286M** |
|
|
1142
|
+
| JARO | 0.53M | 0.5M | 2.2M | 12.6M | 24M | 47M | **129M** |
|
|
1143
|
+
| JARO_WINKLER | 0.51M | 0.5M | 1.8M | 11.9M | 14M | 41M | **141M** |
|
|
1144
|
+
|
|
1145
|
+
### Speedup ratio vs `T=0` (same scorer, same workload)
|
|
1146
|
+
|
|
1147
|
+
| Scorer | T=0.3 | T=0.5 | T=0.7 | T=0.85 | T=0.95 | T=0.99 |
|
|
1148
|
+
| --- | ---: | ---: | ---: | ---: | ---: | ---: |
|
|
1149
|
+
| LEVENSHTEIN_NORMALIZED | **26x** | **64x** | **109x** | **236x** | **458x** | **587x** |
|
|
1150
|
+
| DAMERAU_LEVENSHTEIN_NORMALIZED| **26x** | **72x** | **141x** | **298x** | **384x** | **598x** |
|
|
1151
|
+
| HAMMING_NORMALIZED | **49x** | **264x** | **233x** | **238x** | **246x** | **293x** |
|
|
1152
|
+
| INDEL_NORMALIZED | **24x** | **53x** | **83x** | **142x** | **306x** | **611x** |
|
|
1153
|
+
| JARO | 0.9x | 4.1x | **24x** | **46x** | **90x** | **245x** |
|
|
1154
|
+
| JARO_WINKLER | 1.0x | 3.5x | **23x** | **27x** | **80x** | **275x** |
|
|
1155
|
+
|
|
1156
|
+
Jaro / Jaro-Winkler show no benefit until the threshold rises above
|
|
1157
|
+
the natural-distribution floor of the `(2 + min/max)/3` length
|
|
1158
|
+
bound. For length 4–40 random strings that happens around T ≈ 0.7;
|
|
1159
|
+
above that the bound rules out most pairs and the speedup compounds.
|
|
1160
|
+
|
|
1161
|
+
### `cdist_top_k` throughput (pairs/sec)
|
|
1162
|
+
|
|
1163
|
+
| Scorer | k=1 | k=10 | k=100 | k=1000 | k=10000 |
|
|
1164
|
+
| --- | ---: | ---: | ---: | ---: | ---: |
|
|
1165
|
+
| LEVENSHTEIN_NORMALIZED | 8.0M | 25.3M | 21.1M | 17.0M | 8.5M |
|
|
1166
|
+
| DAMERAU_LEVENSHTEIN_NORMALIZED| 20.0M | 23.4M | 19.4M | 18.6M | 12.2M |
|
|
1167
|
+
| HAMMING_NORMALIZED (n=100) | 78.0M | 68.9M | 71.1M | 61.6M | 24.0M |
|
|
1168
|
+
| INDEL_NORMALIZED | 29.1M | 24.5M | 21.5M | 17.4M | 7.5M |
|
|
1169
|
+
| JARO | 15.3M | 14.2M | 13.9M | 12.6M | 8.9M |
|
|
1170
|
+
| JARO_WINKLER | 15.1M | 12.2M | 13.6M | 13.4M | 10.0M |
|
|
1171
|
+
|
|
1172
|
+
The row-sort matters most at small `k`: with the longest queries
|
|
1173
|
+
processed first, the global heap-min bound rises early and the
|
|
1174
|
+
per-pair cutoff push-down has a tight value to compare against for
|
|
1175
|
+
the bulk of the remaining rows. At very large `k` the heap rarely
|
|
1176
|
+
fills with strong matches so the bound stays close to the
|
|
1177
|
+
`(1.0 - safe margin)` floor and the kernel-level cutoff doesn't bite.
|
|
1178
|
+
|
|
1179
|
+
### Cross-arch throughput at `T=0.99` (pairs/sec)
|
|
1180
|
+
|
|
1181
|
+
Same script, same workload, four different SIMD backends.
|
|
1182
|
+
|
|
1183
|
+
| Scorer | Tiger Lake `x86_avx512bwvl` | Graviton4 `linux_aarch64_neon` | Mac M-series `macos_arm64_neon` | Loongson `linux_loongarch64_lasx` |
|
|
1184
|
+
| --- | ---: | ---: | ---: | ---: |
|
|
1185
|
+
| LEVENSHTEIN_NORMALIZED | 290M | 318M | 996M | 370M |
|
|
1186
|
+
| DAMERAU_LEVENSHTEIN_NORMALIZED| 297M | 318M | 1,014M | 426M |
|
|
1187
|
+
| HAMMING_NORMALIZED (n=100) | 136M | 70M | 295M | 139M |
|
|
1188
|
+
| INDEL_NORMALIZED | 286M | 272M | 995M | 402M |
|
|
1189
|
+
| JARO | 129M | 160M | 784M | 190M |
|
|
1190
|
+
| JARO_WINKLER | 141M | 105M | 543M | 136M |
|
|
1191
|
+
|
|
1192
|
+
### Cross-arch speedup vs `T=0` (same scorer, same host)
|
|
1193
|
+
|
|
1194
|
+
| Scorer | Tiger Lake | Graviton4 | Mac M-series | Loongson |
|
|
1195
|
+
| --- | ---: | ---: | ---: | ---: |
|
|
1196
|
+
| LEVENSHTEIN_NORMALIZED | 587x | 699x | 537x | **1,199x** |
|
|
1197
|
+
| DAMERAU_LEVENSHTEIN_NORMALIZED| 598x | 714x | 478x | **1,353x** |
|
|
1198
|
+
| HAMMING_NORMALIZED | 293x | 145x | 147x | 487x |
|
|
1199
|
+
| INDEL_NORMALIZED | 611x | 567x | 540x | **1,408x** |
|
|
1200
|
+
| JARO | 245x | 339x | 429x | 642x |
|
|
1201
|
+
| JARO_WINKLER | 275x | 225x | 293x | 472x |
|
|
1202
|
+
|
|
1203
|
+
The speedup ratios are algorithmic — the bound math is independent
|
|
1204
|
+
of ISA, so the cross-host spread reflects only how much the
|
|
1205
|
+
un-pruned baseline costs vs the post-pruning hot path on each
|
|
1206
|
+
machine. Mac's M-series tops the absolute throughput because the
|
|
1207
|
+
inner SIMD loops are bit-parallel ops that the Apple core hands
|
|
1208
|
+
back at high IPC; Loongson posts the largest *ratio* because its
|
|
1209
|
+
un-pruned baseline (full Myers / OSA / Indel scan per pair at
|
|
1210
|
+
length 4–40) is slowest in absolute terms.
|
|
1211
|
+
|
|
1212
|
+
Power8 numbers are deferred (host RAM too tight for a full `-O3`
|
|
1213
|
+
rebuild — see `docs/power8-gcc10-workarounds.md`).
|
|
1214
|
+
|
|
1215
|
+
### Reading the numbers
|
|
1216
|
+
|
|
1217
|
+
The relative speedups carry over across hosts; the absolute
|
|
1218
|
+
throughput numbers don't. The pre-pruning baseline (`T=0`) is
|
|
1219
|
+
`cdist_above_threshold` running every pair through full SIMD —
|
|
1220
|
+
equivalent to a full `cdist` plus the iterator overhead — so it's
|
|
1221
|
+
the right "no optimization" reference for the speedup ratios.
|
|
1222
|
+
|
|
975
1223
|
## Notes on comparing across families
|
|
976
1224
|
|
|
977
1225
|
These numbers are intended for engineering direction, not publication-grade
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `stride-align` are recorded here. The format
|
|
4
|
+
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and
|
|
5
|
+
this project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
## [0.3.0] - 2026-05-24
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
|
|
11
|
+
* **Indel distance** (`Scorer.INDEL` / `Scorer.INDEL_NORMALIZED`).
|
|
12
|
+
Levenshtein restricted to insertions and deletions; equivalent to
|
|
13
|
+
`|a| + |b| - 2 * LCS(a, b)`. Bit-parallel single-word kernel uses
|
|
14
|
+
the Allison-Dix (1986) recurrence; multi-word patterns fall back
|
|
15
|
+
to scalar DP. Public API: `indel_score`, `indel_normalized_score`,
|
|
16
|
+
`indel_scores`, `indel_normalized_scores`, `indel_top_k`,
|
|
17
|
+
`indel_best`, and the corresponding normalized variants. Wired
|
|
18
|
+
through every backend, `cdist`, `cdist_above_threshold`,
|
|
19
|
+
`cdist_top_k`, and the function-reference dispatch in `extract`.
|
|
20
|
+
|
|
21
|
+
* **True (unrestricted) Damerau-Levenshtein**
|
|
22
|
+
(`Scorer.TRUE_DAMERAU_LEVENSHTEIN` /
|
|
23
|
+
`Scorer.TRUE_DAMERAU_LEVENSHTEIN_NORMALIZED`). The unrestricted
|
|
24
|
+
form where a single character may participate in multiple edits.
|
|
25
|
+
Diverges from OSA on overlapping transpositions
|
|
26
|
+
(e.g. `"ca"`→`"abc"`: OSA=3, true-DL=2). Scalar DP only; no
|
|
27
|
+
bit-parallel kernel yet (Hyyrö 2003 exists but is significantly
|
|
28
|
+
more complex than OSA's bit-parallel kernel, and true-DL is rarely the bottleneck).
|
|
29
|
+
Existing `Scorer.DAMERAU_LEVENSHTEIN` continues to refer to OSA —
|
|
30
|
+
the API name is unchanged.
|
|
31
|
+
|
|
32
|
+
* **Length-difference pruning** for `cdist_above_threshold` and
|
|
33
|
+
`cdist_top_k`. Each pair is gated by a closed-form upper bound on
|
|
34
|
+
the achievable normalized similarity before any SIMD work runs;
|
|
35
|
+
bounds are scorer-specific (`min/max` for Lev/OSA/true-DL,
|
|
36
|
+
`(2 + min/max)/3` for Jaro, `2*min/(q+t)` for Indel, `1.0` if
|
|
37
|
+
equal-length for Hamming; `min` / `max` are the shorter / longer of the query length `q` and target length `t`).
|
|
38
|
+
|
|
39
|
+
* **`cdist_top_k` row-sort by query length, descending.** Longest
|
|
40
|
+
queries processed first so close-length high-scoring pairs
|
|
41
|
+
surface early and the shared `global_min_bound` atomic reaches a
|
|
42
|
+
useful value before the short-query rows run.
|
|
43
|
+
|
|
44
|
+
* **Per-pair cutoff push-down into the SIMD kernels.** Myers
|
|
45
|
+
(Levenshtein single-word + multi-word), OSA single-word, and the
|
|
46
|
+
Hamming inner loop all bail when the running distance plus
|
|
47
|
+
remaining-chars allowance proves the pair can't reach its cutoff;
|
|
48
|
+
bailed lanes return the per-pair `cutoff + 1` sentinel.
|
|
49
|
+
|
|
50
|
+
* **`docs/adding-a-new-algorithm.md`**: grep-able checklist for the
|
|
51
|
+
touch points (`Scorer` enum, runtime helpers, cdist switches,
|
|
52
|
+
bindings, per-backend Implementation methods, tests) a new
|
|
53
|
+
scorer / alignment algorithm / SIMD backend has to hit.
|
|
54
|
+
|
|
55
|
+
* **Python 3.9 support.** The three `match` blocks in the Python
|
|
56
|
+
layer became dict lookups; `from __future__ import annotations`
|
|
57
|
+
was already in place project-wide. `pyproject.toml`
|
|
58
|
+
now declares `requires-python = ">=3.9"`, and the classifiers were extended to match.
|
|
59
|
+
|
|
60
|
+
### Changed
|
|
61
|
+
|
|
62
|
+
* **Lowered the build-time C++ requirement from C++23 to C++20.**
|
|
63
|
+
The project doesn't actually use any C++23 library feature — the
|
|
64
|
+
`cxx_std_23` setting was aspirational. Lowering it lets gcc 10
|
|
65
|
+
toolchains build the project (POWER8 Ubuntu 20.04 ships gcc 9.4
|
|
66
|
+
and 10.5). Two stdlib gaps in gcc-10 libstdc++ are bridged with
|
|
67
|
+
feature-test-gated fallbacks (`std::bit_cast` →
|
|
68
|
+
`__builtin_bit_cast`, `std::make_unique_for_overwrite` → plain
|
|
69
|
+
`new T[n]`). See `docs/power8-gcc10-workarounds.md` for the full
|
|
70
|
+
list and the revert recipe once gcc 16 lands.
|
|
71
|
+
|
|
72
|
+
### Fixed
|
|
73
|
+
|
|
74
|
+
* **`cdist_above_threshold` iterator on macOS and LoongArch64.** The
|
|
75
|
+
end-of-stream signal previously used `throw nb::stop_iteration()`,
|
|
76
|
+
which relies on cross-DSO RTTI matching for nanobind's
|
|
77
|
+
`builtin_exception`. macOS's two-level namespace and at least one
|
|
78
|
+
LoongArch toolchain configuration defeat that lookup, and the
|
|
79
|
+
exception ended up routed through nanobind's generic
|
|
80
|
+
`std::exception` translator → bare `RuntimeError` instead of
|
|
81
|
+
Python's `StopIteration`. Replaced with the C-API path
|
|
82
|
+
(`PyErr_SetNone(PyExc_StopIteration)` plus a null `nb::object`
|
|
83
|
+
return), which bypasses C++ exception machinery entirely. Fixes
|
|
84
|
+
91 macOS test failures.
|
|
85
|
+
|
|
86
|
+
## [0.2.0]
|
|
87
|
+
|
|
88
|
+
(Existing behavior at this tag was not previously tracked in this
|
|
89
|
+
file; future releases will list specific deltas.)
|
|
@@ -11,7 +11,7 @@ if(NOT SKBUILD)
|
|
|
11
11
|
)
|
|
12
12
|
endif()
|
|
13
13
|
|
|
14
|
-
set(CMAKE_CXX_STANDARD
|
|
14
|
+
set(CMAKE_CXX_STANDARD 20)
|
|
15
15
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
|
16
16
|
set(CMAKE_CXX_EXTENSIONS OFF)
|
|
17
17
|
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" STRIDE_ALIGN_SYSTEM_PROCESSOR)
|
|
@@ -134,7 +134,12 @@ function(apply_stride_align_optimization_flags target_name)
|
|
|
134
134
|
endfunction()
|
|
135
135
|
|
|
136
136
|
function(configure_stride_align_target target_name)
|
|
137
|
-
|
|
137
|
+
# C++20 is sufficient — we use std::popcount, <bit>, consteval,
|
|
138
|
+
# if-constexpr-requires; nothing from the C++23 stdlib (no expected,
|
|
139
|
+
# no format/print, no flat_*, no generator, no mdspan). Keeping the
|
|
140
|
+
# required standard at 20 lets older toolchains (gcc 10, e.g. on
|
|
141
|
+
# POWER8 Ubuntu 20.04) build the project.
|
|
142
|
+
target_compile_features(${target_name} PRIVATE cxx_std_20)
|
|
138
143
|
target_include_directories(${target_name} PRIVATE include src/cpp)
|
|
139
144
|
|
|
140
145
|
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang|GNU")
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: stride-align
|
|
3
|
-
Version: 0.
|
|
4
|
-
Summary: Smith-Waterman and Needleman-Wunsch alignments with a nanobind C++
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Smith-Waterman and Needleman-Wunsch alignments with a nanobind C++20 backend.
|
|
5
5
|
Author: Adam
|
|
6
6
|
License-Expression: Apache-2.0
|
|
7
7
|
Classifier: Development Status :: 3 - Alpha
|
|
8
8
|
Classifier: Intended Audience :: Developers
|
|
9
9
|
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
11
|
Classifier: Programming Language :: Python :: 3.10
|
|
11
12
|
Classifier: Programming Language :: Python :: 3.11
|
|
12
13
|
Classifier: Programming Language :: Python :: 3.12
|
|
@@ -15,7 +16,7 @@ Classifier: Programming Language :: Python :: 3.14
|
|
|
15
16
|
Classifier: Programming Language :: C++
|
|
16
17
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
17
18
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
-
Requires-Python: >=3.
|
|
19
|
+
Requires-Python: >=3.9
|
|
19
20
|
Requires-Dist: numpy>=1.22
|
|
20
21
|
Provides-Extra: dev
|
|
21
22
|
Requires-Dist: build>=1.2; extra == "dev"
|
|
@@ -49,14 +50,14 @@ sudo apt install python3-numpy
|
|
|
49
50
|
|
|
50
51
|
PY=$(python3 -c 'import sys; print(f"cp{sys.version_info.major}{sys.version_info.minor}")')
|
|
51
52
|
pip install \
|
|
52
|
-
https://github.com/adamdeprince/stride-align/releases/download/v0.
|
|
53
|
+
https://github.com/adamdeprince/stride-align/releases/download/v0.3.0/stride_align-0.3.0-${PY}-${PY}-linux_loongarch64.whl
|
|
53
54
|
```
|
|
54
55
|
|
|
55
|
-
Prebuilt LoongArch64 wheels are available for Python 3.
|
|
56
|
-
3.13, and 3.14. If you are on a different Python (or
|
|
57
|
-
build from source), `pip install stride-align` falls
|
|
58
|
-
source distribution on PyPI, which compiles the
|
|
59
|
-
locally.
|
|
56
|
+
Prebuilt LoongArch64 wheels are available for Python 3.9, 3.10,
|
|
57
|
+
3.11, 3.12, 3.13, and 3.14. If you are on a different Python (or
|
|
58
|
+
just want to build from source), `pip install stride-align` falls
|
|
59
|
+
back to the source distribution on PyPI, which compiles the
|
|
60
|
+
LSX/LASX kernels locally.
|
|
60
61
|
|
|
61
62
|
First, just a disclaimer: I'm not using religious texts here to push
|
|
62
63
|
an agenda - for this demo I need multiple largish public domain
|
|
@@ -326,10 +327,11 @@ Gapped Alignment Report". CIGAR is the compact alignment-operation notation
|
|
|
326
327
|
used by SAM/BAM tooling. If you want the full formal version, see the
|
|
327
328
|
[SAM specification](https://samtools.github.io/hts-specs/SAMv1.pdf).
|
|
328
329
|
|
|
329
|
-
###
|
|
330
|
+
### Edit-distance scorers
|
|
330
331
|
|
|
331
|
-
Beyond Smith-Waterman and Needleman-Wunsch, `stride-align` exposes
|
|
332
|
-
unit-cost edit-distance
|
|
332
|
+
Beyond Smith-Waterman and Needleman-Wunsch, `stride-align` exposes
|
|
333
|
+
six unit-cost edit-distance and similarity metrics — each with its
|
|
334
|
+
own SIMD-batched code path:
|
|
333
335
|
|
|
334
336
|
```python
|
|
335
337
|
import stride_align
|
|
@@ -338,7 +340,6 @@ import stride_align
|
|
|
338
340
|
stride_align.levenshtein_score("kitten", "sitting") # -> 3
|
|
339
341
|
stride_align.levenshtein_normalized_score("kitten", "sitting") # -> 0.571...
|
|
340
342
|
stride_align.levenshtein_scores("kitten", ["kit", "sitting"]) # -> ndarray[int64]
|
|
341
|
-
stride_align.levenshtein_normalized_scores("kitten", targets) # -> ndarray[float64]
|
|
342
343
|
|
|
343
344
|
# Optional `score_cutoff` (rapidfuzz convention): bail early per-target,
|
|
344
345
|
# results that exceed the cutoff come back as `cutoff + 1`.
|
|
@@ -349,23 +350,78 @@ stride_align.levenshtein_scores(query, targets, score_cutoff=3)
|
|
|
349
350
|
# OSA.distance and is what most callers asking for
|
|
350
351
|
# "Damerau-Levenshtein" actually want.
|
|
351
352
|
stride_align.damerau_levenshtein_score("ab", "ba") # -> 1
|
|
352
|
-
|
|
353
|
+
|
|
354
|
+
# True Damerau-Levenshtein — the unrestricted form, where one
|
|
355
|
+
# character may participate in more than one edit. Slower (no
|
|
356
|
+
# bit-parallel kernel yet) but matches rapidfuzz.distance.DamerauLevenshtein
|
|
357
|
+
# exactly. Diverges from OSA on overlapping transpositions, e.g.
|
|
358
|
+
# "ca" -> "abc": OSA=3, true-DL=2.
|
|
359
|
+
stride_align.true_damerau_levenshtein_score("ca", "abc") # -> 2
|
|
360
|
+
|
|
361
|
+
# Indel — Levenshtein restricted to insertions and deletions, no
|
|
362
|
+
# substitutions. Equivalent to |a| + |b| - 2 * LCS(a, b). Bit-
|
|
363
|
+
# parallel Allison-Dix (1986) inner loop.
|
|
364
|
+
stride_align.indel_score("kitten", "sitting") # -> 5
|
|
365
|
+
|
|
366
|
+
# Hamming — count of positions where two equal-length strings differ.
|
|
367
|
+
# Cutoff variant bails the byte loop once mismatches exceed the cap.
|
|
368
|
+
stride_align.hamming_score("100", "110") # -> 1
|
|
369
|
+
|
|
370
|
+
# Jaro / Jaro-Winkler — similarities in [0, 1]; Winkler adds a
|
|
371
|
+
# capped prefix bonus.
|
|
372
|
+
stride_align.jaro_similarity("martha", "marhta") # -> 0.944...
|
|
373
|
+
stride_align.jaro_winkler_similarity("martha", "marhta") # -> 0.961...
|
|
353
374
|
```
|
|
354
375
|
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
specialize on every architecture's primary 64-bit-lane SIMD:
|
|
376
|
+
The batch variants (`*_scores`, `*_similarities`) pack one target
|
|
377
|
+
per SIMD lane on every supported backend:
|
|
358
378
|
|
|
359
379
|
- x86: SSE4.1 / AVX2 / AVX-512 / AVX10-256 / AVX10-512
|
|
360
380
|
- ARM: NEON (Linux + macOS), SVE / SVE2
|
|
361
381
|
- LoongArch: LSX / LASX
|
|
362
382
|
- PowerPC: VSX
|
|
363
383
|
|
|
364
|
-
|
|
365
|
-
multi-word kernel (W=2/3/4).
|
|
366
|
-
|
|
384
|
+
For Lev / OSA, patterns up to 64 chars run a single-word Myers;
|
|
385
|
+
Lev patterns of 65–256 chars use the multi-word kernel (W=2/3/4). Indel and OSA
|
|
386
|
+
fall back to scalar bit-parallel for patterns >64 (multi-word
|
|
387
|
+
generalization deferred); true-DL is scalar DP only.
|
|
388
|
+
|
|
389
|
+
### `cdist`, `cdist_above_threshold`, `cdist_top_k`
|
|
390
|
+
|
|
391
|
+
For all-pairs scoring across two lists of strings, `stride-align`
|
|
392
|
+
ships three matrix-style entry points:
|
|
393
|
+
|
|
394
|
+
```python
|
|
395
|
+
import stride_align as sa

qs = ["kitten", "sitting", "kit"]
|
|
396
|
+
ts = ["kitten", "kit", "sitting", "biting"]
|
|
397
|
+
|
|
398
|
+
# Full N×M score matrix — ndarray[float64] (similarity scorers)
|
|
399
|
+
# or ndarray[int64] (distance scorers).
|
|
400
|
+
sa.cdist(qs, ts, scorer=sa.Scorer.JARO)
|
|
401
|
+
|
|
402
|
+
# Streaming filter — yields only pairs whose similarity exceeds the
|
|
403
|
+
# threshold. Workers feed a bounded queue; the caller drains it.
|
|
404
|
+
# Length pruning + per-pair cutoff push-down into the kernel skip
|
|
405
|
+
# most of the work at high thresholds.
|
|
406
|
+
for score, q, t in sa.cdist_above_threshold(
|
|
407
|
+
qs, ts, scorer=sa.Scorer.LEVENSHTEIN_NORMALIZED, threshold=0.7,
|
|
408
|
+
):
|
|
409
|
+
...
|
|
410
|
+
|
|
411
|
+
# Top-k by score — returns at most k highest-scoring (or lowest, for
|
|
412
|
+
# distance scorers) (score, query, target) tuples. Heaps are
|
|
413
|
+
# per-thread; a shared atomic global-min bound lets the per-pair
|
|
414
|
+
# cutoff push-down lift the prune threshold as work progresses.
|
|
415
|
+
sa.cdist_top_k(qs, ts, scorer=sa.Scorer.JARO, k=10)
|
|
416
|
+
```
|
|
417
|
+
|
|
418
|
+
At high thresholds the pruning is dramatic — see the cross-arch
|
|
419
|
+
table in [BENCHMARK.md](BENCHMARK.md) (the `cdist pruning` rows).
|
|
420
|
+
Loongson LASX in particular flips the expected ranking against
|
|
421
|
+
Tiger Lake AVX-512 at T=0.99; the comparison report lives at
|
|
422
|
+
[docs/loongson-vs-tiger-lake-cdist-2026-05-24.md](docs/loongson-vs-tiger-lake-cdist-2026-05-24.md).
|
|
367
423
|
|
|
368
|
-
See [BENCHMARK.md](BENCHMARK.md) for cross-architecture numbers.
|
|
424
|
+
See [BENCHMARK.md](BENCHMARK.md) for full cross-architecture numbers.
|
|
369
425
|
|
|
370
426
|
## Optimizations and Benchmarks
|
|
371
427
|
|