fuzzybunny 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Aftaab Siddiqui
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,114 @@
1
+ Metadata-Version: 2.4
2
+ Name: fuzzybunny
3
+ Version: 0.1.0
4
+ Summary: A fuzzy search tool written in python (actually C++)
5
+ Requires-Python: >=3.8
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Dynamic: description
9
+ Dynamic: description-content-type
10
+ Dynamic: license-file
11
+ Dynamic: requires-python
12
+ Dynamic: summary
13
+
14
+ <p align="center">
15
+ <img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
16
+ </p>
17
+
18
+ <h1 align="center">FuzzyBunny</h1>
19
+
20
+ <p align="center">
21
+ <b> A fuzzy search tool written in C++ with Python bindings </b>
22
+ </p>
23
+
24
+ <p align="center">
25
+ <img src="https://img.shields.io/badge/License-MIT-green" />
26
+ <img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
27
+ <img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
28
+ </p>
29
+
30
+ ## Overview
31
+
32
+ FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
33
+
34
+ ## Features
35
+
36
+ - **Fast C++ Core**: Optimized string matching algorithms.
37
+ - **Multiple Scorers**:
38
+ - `levenshtein`: Standard edit distance ratio.
39
+ - `jaccard`: Set-based similarity.
40
+ - `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
41
+ - **Ranking**: Efficiently rank a list of candidates against a query.
42
+ - **Partial Matching**: Support for substring matching via `mode='partial'`.
43
+ - **Unicode Support**: Correctly handles UTF-8 input.
44
+
45
+ ## Installation
46
+
47
+ ### Prerequisites
48
+ - Python 3.8+
49
+ - C++17 compatible compiler (GCC, Clang, MSVC)
50
+
51
+ ### Using uv (Recommended)
52
+
53
+ ```bash
54
+ uv pip install .
55
+ ```
56
+
57
+ ### Using pip
58
+
59
+ ```bash
60
+ pip install .
61
+ ```
62
+
63
+ ## Usage
64
+
65
+ ```python
66
+ import fuzzybunny
67
+
68
+ # Basic Levenshtein Ratio
69
+ score = fuzzybunny.levenshtein("kitten", "sitting")
70
+ print(f"Score: {score}") # ~0.57
71
+
72
+ # Partial Matching
73
+ # "apple" is a perfect substring of "apple pie"
74
+ score = fuzzybunny.partial_ratio("apple", "apple pie")
75
+ print(f"Partial Score: {score}") # 1.0
76
+
77
+ # Ranking Candidates
78
+ candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
79
+ results = fuzzybunny.rank(
80
+ query="apple",
81
+ candidates=candidates,
82
+ scorer="levenshtein",
83
+ mode="partial",
84
+ top_n=2
85
+ )
86
+
87
+ for candidate, score in results:
88
+ print(f"{candidate}: {score}")
89
+ # Output:
90
+ # apple pie: 1.0
91
+ # apple crisp: 1.0
92
+ ```
93
+
94
+ ## Development
95
+
96
+ 1. **Setup Environment**:
97
+ ```bash
98
+ uv venv
99
+ source .venv/bin/activate
100
+ ```
101
+
102
+ 2. **Install in Editable Mode**:
103
+ ```bash
104
+ uv pip install -e .
105
+ ```
106
+
107
+ 3. **Run Tests**:
108
+ ```bash
109
+ pytest
110
+ ```
111
+
112
+ ## License
113
+
114
+ This project is licensed under the [MIT License](LICENSE).
@@ -0,0 +1,101 @@
1
+ <p align="center">
2
+ <img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
3
+ </p>
4
+
5
+ <h1 align="center">FuzzyBunny</h1>
6
+
7
+ <p align="center">
8
+ <b> A fuzzy search tool written in C++ with Python bindings </b>
9
+ </p>
10
+
11
+ <p align="center">
12
+ <img src="https://img.shields.io/badge/License-MIT-green" />
13
+ <img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
14
+ <img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
15
+ </p>
16
+
17
+ ## Overview
18
+
19
+ FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
20
+
21
+ ## Features
22
+
23
+ - **Fast C++ Core**: Optimized string matching algorithms.
24
+ - **Multiple Scorers**:
25
+ - `levenshtein`: Standard edit distance ratio.
26
+ - `jaccard`: Set-based similarity.
27
+ - `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
28
+ - **Ranking**: Efficiently rank a list of candidates against a query.
29
+ - **Partial Matching**: Support for substring matching via `mode='partial'`.
30
+ - **Unicode Support**: Correctly handles UTF-8 input.
31
+
32
+ ## Installation
33
+
34
+ ### Prerequisites
35
+ - Python 3.8+
36
+ - C++17 compatible compiler (GCC, Clang, MSVC)
37
+
38
+ ### Using uv (Recommended)
39
+
40
+ ```bash
41
+ uv pip install .
42
+ ```
43
+
44
+ ### Using pip
45
+
46
+ ```bash
47
+ pip install .
48
+ ```
49
+
50
+ ## Usage
51
+
52
+ ```python
53
+ import fuzzybunny
54
+
55
+ # Basic Levenshtein Ratio
56
+ score = fuzzybunny.levenshtein("kitten", "sitting")
57
+ print(f"Score: {score}") # ~0.57
58
+
59
+ # Partial Matching
60
+ # "apple" is a perfect substring of "apple pie"
61
+ score = fuzzybunny.partial_ratio("apple", "apple pie")
62
+ print(f"Partial Score: {score}") # 1.0
63
+
64
+ # Ranking Candidates
65
+ candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
66
+ results = fuzzybunny.rank(
67
+ query="apple",
68
+ candidates=candidates,
69
+ scorer="levenshtein",
70
+ mode="partial",
71
+ top_n=2
72
+ )
73
+
74
+ for candidate, score in results:
75
+ print(f"{candidate}: {score}")
76
+ # Output:
77
+ # apple pie: 1.0
78
+ # apple crisp: 1.0
79
+ ```
80
+
81
+ ## Development
82
+
83
+ 1. **Setup Environment**:
84
+ ```bash
85
+ uv venv
86
+ source .venv/bin/activate
87
+ ```
88
+
89
+ 2. **Install in Editable Mode**:
90
+ ```bash
91
+ uv pip install -e .
92
+ ```
93
+
94
+ 3. **Run Tests**:
95
+ ```bash
96
+ pytest
97
+ ```
98
+
99
+ ## License
100
+
101
+ This project is licensed under the [MIT License](LICENSE).
@@ -0,0 +1,114 @@
1
+ Metadata-Version: 2.4
2
+ Name: fuzzybunny
3
+ Version: 0.1.0
4
+ Summary: A fuzzy search tool written in python (actually C++)
5
+ Requires-Python: >=3.8
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Dynamic: description
9
+ Dynamic: description-content-type
10
+ Dynamic: license-file
11
+ Dynamic: requires-python
12
+ Dynamic: summary
13
+
14
+ <p align="center">
15
+ <img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
16
+ </p>
17
+
18
+ <h1 align="center">FuzzyBunny</h1>
19
+
20
+ <p align="center">
21
+ <b> A fuzzy search tool written in C++ with Python bindings </b>
22
+ </p>
23
+
24
+ <p align="center">
25
+ <img src="https://img.shields.io/badge/License-MIT-green" />
26
+ <img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
27
+ <img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
28
+ </p>
29
+
30
+ ## Overview
31
+
32
+ FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
33
+
34
+ ## Features
35
+
36
+ - **Fast C++ Core**: Optimized string matching algorithms.
37
+ - **Multiple Scorers**:
38
+ - `levenshtein`: Standard edit distance ratio.
39
+ - `jaccard`: Set-based similarity.
40
+ - `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
41
+ - **Ranking**: Efficiently rank a list of candidates against a query.
42
+ - **Partial Matching**: Support for substring matching via `mode='partial'`.
43
+ - **Unicode Support**: Correctly handles UTF-8 input.
44
+
45
+ ## Installation
46
+
47
+ ### Prerequisites
48
+ - Python 3.8+
49
+ - C++17 compatible compiler (GCC, Clang, MSVC)
50
+
51
+ ### Using uv (Recommended)
52
+
53
+ ```bash
54
+ uv pip install .
55
+ ```
56
+
57
+ ### Using pip
58
+
59
+ ```bash
60
+ pip install .
61
+ ```
62
+
63
+ ## Usage
64
+
65
+ ```python
66
+ import fuzzybunny
67
+
68
+ # Basic Levenshtein Ratio
69
+ score = fuzzybunny.levenshtein("kitten", "sitting")
70
+ print(f"Score: {score}") # ~0.57
71
+
72
+ # Partial Matching
73
+ # "apple" is a perfect substring of "apple pie"
74
+ score = fuzzybunny.partial_ratio("apple", "apple pie")
75
+ print(f"Partial Score: {score}") # 1.0
76
+
77
+ # Ranking Candidates
78
+ candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
79
+ results = fuzzybunny.rank(
80
+ query="apple",
81
+ candidates=candidates,
82
+ scorer="levenshtein",
83
+ mode="partial",
84
+ top_n=2
85
+ )
86
+
87
+ for candidate, score in results:
88
+ print(f"{candidate}: {score}")
89
+ # Output:
90
+ # apple pie: 1.0
91
+ # apple crisp: 1.0
92
+ ```
93
+
94
+ ## Development
95
+
96
+ 1. **Setup Environment**:
97
+ ```bash
98
+ uv venv
99
+ source .venv/bin/activate
100
+ ```
101
+
102
+ 2. **Install in Editable Mode**:
103
+ ```bash
104
+ uv pip install -e .
105
+ ```
106
+
107
+ 3. **Run Tests**:
108
+ ```bash
109
+ pytest
110
+ ```
111
+
112
+ ## License
113
+
114
+ This project is licensed under the [MIT License](LICENSE).
@@ -0,0 +1,12 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ fuzzybunny.egg-info/PKG-INFO
6
+ fuzzybunny.egg-info/SOURCES.txt
7
+ fuzzybunny.egg-info/dependency_links.txt
8
+ fuzzybunny.egg-info/not-zip-safe
9
+ fuzzybunny.egg-info/top_level.txt
10
+ src/bindings.cpp
11
+ src/scorers.cpp
12
+ tests/test_basic.py
@@ -0,0 +1 @@
1
+ fuzzybunny
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools>=42", "wheel", "pybind11>=2.10.0"]
3
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,46 @@
1
+ import sys
2
+ from setuptools import setup, Extension
3
+ from setuptools.command.build_ext import build_ext
4
+ import pybind11
5
+
6
class PybindBuildExt(build_ext):
    """Custom ``build_ext`` that adds compiler-specific flags to every extension.

    For the ``unix`` and ``msvc`` compiler types this selects C++17 and defines
    the ``VERSION_INFO`` macro as the distribution's version string; ``unix``
    builds additionally compile with hidden symbol visibility.  Any other
    compiler type gets no extra flags.
    """

    def build_extensions(self):
        """Append the per-compiler flags to each extension, then build them."""
        ct = self.compiler.compiler_type
        version = self.distribution.get_version()

        opts = []
        if ct == 'unix':
            opts.append('-DVERSION_INFO="%s"' % version)
            opts.append('-std=c++17')
            opts.append('-fvisibility=hidden')
        elif ct == 'msvc':
            opts.append('/DVERSION_INFO=\"%s\"' % version)
            opts.append('/std:c++17')

        for ext in self.extensions:
            # Extend instead of overwrite so flags already configured on the
            # Extension (extra_compile_args=...) are not silently discarded.
            ext.extra_compile_args = list(ext.extra_compile_args or []) + opts

        super().build_extensions()
23
+
24
# The single C++ extension module, built from the binding and scorer sources.
ext_modules = [
    Extension(
        "fuzzybunny",
        ["src/bindings.cpp", "src/scorers.cpp"],
        include_dirs=[
            pybind11.get_include(),
            "src",
        ],
        language="c++",
    ),
]

# Read the README with an explicit encoding and close the handle promptly:
# a bare open().read() leaks the file object and decodes with the platform's
# default encoding, which is not UTF-8 everywhere.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="fuzzybunny",
    version="0.1.0",
    description="A fuzzy search tool written in python (actually C++)",
    long_description=long_description,
    long_description_content_type="text/markdown",
    ext_modules=ext_modules,
    cmdclass={"build_ext": PybindBuildExt},
    zip_safe=False,
    python_requires=">=3.8",
)
@@ -0,0 +1,60 @@
1
+ #include <pybind11/pybind11.h>
2
+ #include <pybind11/stl.h>
3
+ #include "scorers.hpp"
4
+
5
+ namespace py = pybind11;
6
+ using namespace fuzzybunny;
7
+
8
+ PYBIND11_MODULE(fuzzybunny, m) {
9
+ m.doc() = R"pbdoc(
10
+ FuzzyBunny: A fast fuzzy string matching library
11
+ ------------------------------------------------
12
+ .. currentmodule:: fuzzybunny
13
+ .. autosummary::
14
+ :toctree: _generate
15
+ levenshtein
16
+ jaccard
17
+ token_sort
18
+ rank
19
+ )pbdoc";
20
+
21
+ m.def("levenshtein", [](const std::string& s1, const std::string& s2) {
22
+ return levenshtein_ratio(utf8_to_u32(s1), utf8_to_u32(s2));
23
+ }, py::arg("s1"), py::arg("s2"), "Calculate Levenshtein ratio (0.0 - 1.0)");
24
+
25
+ m.def("partial_ratio", [](const std::string& s1, const std::string& s2) {
26
+ return partial_ratio(utf8_to_u32(s1), utf8_to_u32(s2));
27
+ }, py::arg("s1"), py::arg("s2"), "Calculate Partial Levenshtein ratio (0.0 - 1.0)");
28
+
29
+ m.def("jaccard", [](const std::string& s1, const std::string& s2) {
30
+ return jaccard_similarity(utf8_to_u32(s1), utf8_to_u32(s2));
31
+ }, py::arg("s1"), py::arg("s2"), "Calculate Jaccard similarity (0.0 - 1.0)");
32
+
33
+ m.def("token_sort", [](const std::string& s1, const std::string& s2) {
34
+ return token_sort_ratio(utf8_to_u32(s1), utf8_to_u32(s2));
35
+ }, py::arg("s1"), py::arg("s2"), "Calculate Token Sort ratio (0.0 - 1.0)");
36
+
37
+ m.def("rank", &rank,
38
+ py::arg("query"),
39
+ py::arg("candidates"),
40
+ py::arg("scorer") = "levenshtein",
41
+ py::arg("mode") = "full",
42
+ py::arg("threshold") = 0.0,
43
+ py::arg("top_n") = -1,
44
+ "Rank candidates against a query string. Returns list of (string, score) tuples.");
45
+
46
+ m.def("batch_match", &batch_match,
47
+ py::arg("queries"),
48
+ py::arg("candidates"),
49
+ py::arg("scorer") = "levenshtein",
50
+ py::arg("mode") = "full",
51
+ py::arg("threshold") = 0.0,
52
+ py::arg("top_n") = -1,
53
+ "Batch match multiple queries against candidates.");
54
+
55
+ #ifdef VERSION_INFO
56
+ m.attr("__version__") = VERSION_INFO;
57
+ #else
58
+ m.attr("__version__") = "dev";
59
+ #endif
60
+ }
@@ -0,0 +1,250 @@
1
+ #include "scorers.hpp"
2
+ #include <algorithm>
3
+ #include <vector>
4
+ #include <set>
5
+ #include <sstream>
6
+ #include <cmath>
7
+ #include <map>
8
+ #include <iostream>
9
+ #include <cstdint>
10
+
11
+ namespace fuzzybunny {
12
+
13
+ // --- Unicode Helper ---
14
+
15
// Decode a UTF-8 byte string into a sequence of UTF-32 code points.
//
// Converting manually because std::codecvt is deprecated in C++17
// and we want to avoid external dependencies like ICU for this lightweight lib.
//
// Malformed input is handled leniently: an invalid lead byte, a lead byte
// whose continuation bytes are wrong, or a sequence cut short by the end of
// the input causes that single lead byte to be skipped, and decoding resumes
// at the next byte.
std::u32string utf8_to_u32(const std::string& s) {
    std::u32string result;
    result.reserve(s.size());

    const size_t n = s.length();
    for (size_t i = 0; i < n; ) {
        unsigned char c = static_cast<unsigned char>(s[i]);
        uint32_t code_point = 0;
        int seq_len = 0;

        // The lead byte determines the sequence length and carries the
        // highest payload bits.
        if (c < 0x80) {
            code_point = c;
            seq_len = 1;
        } else if ((c & 0xE0) == 0xC0) {
            code_point = c & 0x1F;
            seq_len = 2;
        } else if ((c & 0xF0) == 0xE0) {
            code_point = c & 0x0F;
            seq_len = 3;
        } else if ((c & 0xF8) == 0xF0) {
            code_point = c & 0x07;
            seq_len = 4;
        } else {
            // Skip invalid start bytes to prevent decoding errors
            i++;
            continue;
        }

        // A sequence truncated by the end of the input is malformed. Skip
        // only its lead byte (as for any other malformed sequence) instead of
        // abandoning the rest of the string.
        if (static_cast<size_t>(seq_len) > n - i) {
            i++;
            continue;
        }

        bool valid = true;
        for (int k = 1; k < seq_len; ++k) {
            unsigned char next = static_cast<unsigned char>(s[i + k]);
            if ((next & 0xC0) != 0x80) {
                valid = false;
                break;
            }
            // Each continuation byte contributes six payload bits.
            code_point = (code_point << 6) | (next & 0x3F);
        }

        if (valid) {
            result.push_back(code_point);
            i += seq_len;
        } else {
            i++;
        }
    }
    return result;
}
65
+
66
+ // --- Internal Utils ---
67
+
68
// Split `s` into maximal runs of non-whitespace characters. Space, tab,
// newline and carriage return are the separators; empty tokens are never
// produced.
std::vector<std::u32string> tokenize(const std::u32string& s) {
    const char32_t* const separators = U" \t\n\r";
    std::vector<std::u32string> pieces;

    // Jump from the start of one token to the start of the next.
    std::u32string::size_type begin = s.find_first_not_of(separators);
    while (begin != std::u32string::npos) {
        const std::u32string::size_type end = s.find_first_of(separators, begin);
        if (end == std::u32string::npos) {
            // Final token runs to the end of the string.
            pieces.push_back(s.substr(begin));
            break;
        }
        pieces.push_back(s.substr(begin, end - begin));
        begin = s.find_first_not_of(separators, end);
    }
    return pieces;
}
84
+
85
+ // --- Scorers ---
86
+
87
// Normalised Levenshtein similarity: 1 - distance / max(len1, len2).
//
// Returns 1.0 when both strings are empty and 0.0 when exactly one is empty.
// The distance uses unit costs for insertion, deletion and substitution and
// is computed with two rolling rows of the DP table.
double levenshtein_ratio(const std::u32string& s1, const std::u32string& s2) {
    const size_t len1 = s1.size();
    const size_t len2 = s2.size();

    if (len1 == 0 && len2 == 0) return 1.0;
    if (len1 == 0 || len2 == 0) return 0.0;

    std::vector<size_t> prev(len2 + 1);
    std::vector<size_t> curr(len2 + 1);

    // Row 0: turning the empty prefix of s1 into s2[0..j) takes j insertions.
    for (size_t j = 0; j <= len2; ++j) prev[j] = j;

    for (size_t i = 1; i <= len1; ++i) {
        curr[0] = i;
        for (size_t j = 1; j <= len2; ++j) {
            const size_t cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;
            curr[j] = std::min({
                prev[j] + 1,        // deletion
                curr[j - 1] + 1,    // insertion
                prev[j - 1] + cost  // match / substitution
            });
        }
        // Swap the rows in O(1) instead of copying one per iteration; every
        // cell of `curr` is rewritten before it is read in the next pass.
        prev.swap(curr);
    }

    // After the final swap the last computed row lives in `prev`.
    const size_t dist = prev[len2];
    const size_t max_len = std::max(len1, len2);

    return 1.0 - (static_cast<double>(dist) / static_cast<double>(max_len));
}
117
+
118
+ double partial_ratio(const std::u32string& s1, const std::u32string& s2) {
119
+ if (s1.empty() && s2.empty()) return 1.0;
120
+ if (s1.empty() || s2.empty()) return 0.0;
121
+
122
+ const auto& shorter = (s1.size() <= s2.size()) ? s1 : s2;
123
+ const auto& longer = (s1.size() > s2.size()) ? s1 : s2;
124
+
125
+ double max_ratio = 0.0;
126
+ size_t k = shorter.size();
127
+
128
+ // Sliding window over the longer string to find the best matching substring
129
+ for (size_t i = 0; i <= longer.size() - k; ++i) {
130
+ std::u32string sub = longer.substr(i, k);
131
+ double ratio = levenshtein_ratio(shorter, sub);
132
+ if (ratio > max_ratio) max_ratio = ratio;
133
+ }
134
+ return max_ratio;
135
+ }
136
+
137
+ double jaccard_similarity(const std::u32string& s1, const std::u32string& s2) {
138
+ std::vector<std::u32string> tokens1 = tokenize(s1);
139
+ std::vector<std::u32string> tokens2 = tokenize(s2);
140
+
141
+ if (tokens1.empty() && tokens2.empty()) return 1.0;
142
+ if (tokens1.empty() || tokens2.empty()) return 0.0;
143
+
144
+ std::set<std::u32string> set1(tokens1.begin(), tokens1.end());
145
+ std::set<std::u32string> set2(tokens2.begin(), tokens2.end());
146
+
147
+ std::vector<std::u32string> intersection;
148
+ std::set_intersection(set1.begin(), set1.end(),
149
+ set2.begin(), set2.end(),
150
+ std::back_inserter(intersection));
151
+
152
+ std::vector<std::u32string> union_set;
153
+ std::set_union(set1.begin(), set1.end(),
154
+ set2.begin(), set2.end(),
155
+ std::back_inserter(union_set));
156
+
157
+ if (union_set.empty()) return 0.0;
158
+ return static_cast<double>(intersection.size()) / static_cast<double>(union_set.size());
159
+ }
160
+
161
+ double token_sort_ratio(const std::u32string& s1, const std::u32string& s2) {
162
+ auto t1 = tokenize(s1);
163
+ auto t2 = tokenize(s2);
164
+
165
+ std::sort(t1.begin(), t1.end());
166
+ std::sort(t2.begin(), t2.end());
167
+
168
+ std::u32string joined1, joined2;
169
+ for (size_t i = 0; i < t1.size(); ++i) {
170
+ joined1 += t1[i];
171
+ if (i < t1.size() - 1) joined1 += ' ';
172
+ }
173
+ for (size_t i = 0; i < t2.size(); ++i) {
174
+ joined2 += t2[i];
175
+ if (i < t2.size() - 1) joined2 += ' ';
176
+ }
177
+
178
+ return levenshtein_ratio(joined1, joined2);
179
+ }
180
+
181
+ // --- Ranking ---
182
+
183
+ std::vector<MatchResult> rank(
184
+ const std::string& query,
185
+ const std::vector<std::string>& candidates,
186
+ const std::string& scorer,
187
+ const std::string& mode,
188
+ double threshold,
189
+ int top_n
190
+ ) {
191
+ std::u32string uQuery = utf8_to_u32(query);
192
+ std::vector<MatchResult> results;
193
+ results.reserve(candidates.size());
194
+
195
+ for (const auto& cand : candidates) {
196
+ std::u32string uCand = utf8_to_u32(cand);
197
+ double score = 0.0;
198
+
199
+ if (scorer == "levenshtein") {
200
+ if (mode == "partial") {
201
+ score = partial_ratio(uQuery, uCand);
202
+ } else {
203
+ score = levenshtein_ratio(uQuery, uCand);
204
+ }
205
+ } else if (scorer == "jaccard") {
206
+ // Jaccard is inherently set-based, so partial matching on substrings
207
+ // doesn't align with the standard definition.
208
+ score = jaccard_similarity(uQuery, uCand);
209
+ } else if (scorer == "token_sort") {
210
+ score = token_sort_ratio(uQuery, uCand);
211
+ } else {
212
+ score = 0.0;
213
+ }
214
+
215
+ if (score >= threshold) {
216
+ results.push_back({cand, score});
217
+ }
218
+ }
219
+
220
+ std::sort(results.begin(), results.end(), [](const MatchResult& a, const MatchResult& b) {
221
+ return a.second > b.second;
222
+ });
223
+
224
+ if (top_n > 0 && static_cast<size_t>(top_n) < results.size()) {
225
+ results.resize(top_n);
226
+ }
227
+
228
+ return results;
229
+ }
230
+
231
+ std::vector<std::vector<MatchResult>> batch_match(
232
+ const std::vector<std::string>& queries,
233
+ const std::vector<std::string>& candidates,
234
+ const std::string& scorer,
235
+ const std::string& mode,
236
+ double threshold,
237
+ int top_n
238
+ ) {
239
+ std::vector<std::vector<MatchResult>> batch_results;
240
+ batch_results.reserve(queries.size());
241
+
242
+ // Simple sequential processing for now.
243
+ // Future optimization: OpenMP #pragma omp parallel for
244
+ for (const auto& query : queries) {
245
+ batch_results.push_back(rank(query, candidates, scorer, mode, threshold, top_n));
246
+ }
247
+ return batch_results;
248
+ }
249
+
250
+ } // namespace fuzzybunny
@@ -0,0 +1,70 @@
1
+ import pytest
2
+ import fuzzybunny
3
+
4
def test_levenshtein():
    """Levenshtein ratio on similar, identical, disjoint and empty inputs."""
    ratio = fuzzybunny.levenshtein

    assert ratio("kitten", "sitting") > 0.5
    assert ratio("apple", "apple") == 1.0
    # Three substitutions over a length of three: 1 - 3/3 == 0.0.
    assert ratio("abc", "def") == 0.0
    assert ratio("", "") == 1.0
9
+
10
def test_partial():
    """Partial ratio finds exact substrings; rank() honours mode='partial'."""
    haystack = "apple pie"
    # (needle, expected score): two real substrings and one total miss.
    expectations = (("apple", 1.0), ("pie", 1.0), ("xyz", 0.0))
    for needle, expected in expectations:
        assert fuzzybunny.partial_ratio(needle, haystack) == expected

    # rank with mode='partial'
    ranked = fuzzybunny.rank(
        "apple",
        ["apple pie", "banana split", "cherry tart"],
        scorer="levenshtein",
        mode="partial",
    )
    best = ranked[0]
    assert best[0] == "apple pie"
    assert best[1] == 1.0
23
+
24
def test_jaccard():
    """Jaccard ignores token order and scores disjoint token sets as zero."""
    similarity = fuzzybunny.jaccard

    assert similarity("apple banana", "banana apple") == 1.0
    assert similarity("apple", "banana") == 0.0
    # Two shared tokens out of three distinct ones.
    assert similarity("a b c", "a c") > 0.5
28
+
29
def test_token_sort():
    """Token-sort ratio is 1.0 for the same words in a different order."""
    reordered_pairs = (
        ("apple banana", "banana apple"),
        ("fuzzy bunny", "bunny fuzzy"),
    )
    for left, right in reordered_pairs:
        assert fuzzybunny.token_sort(left, right) == 1.0
32
+
33
def test_unicode():
    """Non-ASCII text is compared per character, not per byte."""
    accented, plain = "café", "cafe"
    # 'café' vs 'cafe': the accent must register as a difference.
    assert fuzzybunny.levenshtein(accented, plain) < 1.0
    assert fuzzybunny.levenshtein(accented, accented) == 1.0

    # Emoji
    smile, joy = "😊", "😂"
    assert fuzzybunny.levenshtein(smile, smile) == 1.0
    assert fuzzybunny.levenshtein(smile, joy) < 1.0
43
+
44
def test_rank():
    """rank() truncates to top_n and filters by threshold."""
    fruit = ["apple", "apricot", "banana", "cherry"]

    top_two = fuzzybunny.rank("app", fruit, scorer="levenshtein", top_n=2)
    assert len(top_two) == 2
    assert top_two[0][0] == "apple"

    # Nothing resembles "xyz" closely enough to clear a 0.9 threshold.
    no_matches = fuzzybunny.rank("xyz", fruit, threshold=0.9)
    assert len(no_matches) == 0
52
+
53
def test_batch_match():
    """batch_match() returns one ranked list per query, in query order."""
    queries = ["apple", "banana"]
    candidates = ["apple pie", "banana bread", "cherry tart"]
    per_query = fuzzybunny.batch_match(queries, candidates, mode="partial")

    assert len(per_query) == 2

    # Each query's best hit is the candidate that contains it verbatim.
    expected_best = ("apple pie", "banana bread")
    for position, expected in enumerate(expected_best):
        top_hit = per_query[position][0]
        assert top_hit[0] == expected
        assert top_hit[1] == 1.0
66
+
67
def test_invalid_scorer_rank():
    """An unrecognised scorer name yields a score of 0.0 instead of failing."""
    ranked = fuzzybunny.rank("a", ["a"], scorer="unknown")
    _, score = ranked[0][0], ranked[0][1]
    assert score == 0.0