fuzzybunny 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fuzzybunny-0.1.0/LICENSE +21 -0
- fuzzybunny-0.1.0/PKG-INFO +114 -0
- fuzzybunny-0.1.0/README.md +101 -0
- fuzzybunny-0.1.0/fuzzybunny.egg-info/PKG-INFO +114 -0
- fuzzybunny-0.1.0/fuzzybunny.egg-info/SOURCES.txt +12 -0
- fuzzybunny-0.1.0/fuzzybunny.egg-info/dependency_links.txt +1 -0
- fuzzybunny-0.1.0/fuzzybunny.egg-info/not-zip-safe +1 -0
- fuzzybunny-0.1.0/fuzzybunny.egg-info/top_level.txt +1 -0
- fuzzybunny-0.1.0/pyproject.toml +3 -0
- fuzzybunny-0.1.0/setup.cfg +4 -0
- fuzzybunny-0.1.0/setup.py +46 -0
- fuzzybunny-0.1.0/src/bindings.cpp +60 -0
- fuzzybunny-0.1.0/src/scorers.cpp +250 -0
- fuzzybunny-0.1.0/tests/test_basic.py +70 -0
fuzzybunny-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Aftaab Siddiqui
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fuzzybunny
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A fuzzy search tool written in python (actually C++)
|
|
5
|
+
Requires-Python: >=3.8
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Dynamic: description
|
|
9
|
+
Dynamic: description-content-type
|
|
10
|
+
Dynamic: license-file
|
|
11
|
+
Dynamic: requires-python
|
|
12
|
+
Dynamic: summary
|
|
13
|
+
|
|
14
|
+
<p align="center">
|
|
15
|
+
<img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
16
|
+
</p>
|
|
17
|
+
|
|
18
|
+
<h1 align="center">FuzzyBunny</h1>
|
|
19
|
+
|
|
20
|
+
<p align="center">
|
|
21
|
+
<b> A fuzzy search tool written in C++ with Python bindings </b>
|
|
22
|
+
</p>
|
|
23
|
+
|
|
24
|
+
<p align="center">
|
|
25
|
+
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
26
|
+
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
27
|
+
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
28
|
+
</p>
|
|
29
|
+
|
|
30
|
+
## Overview
|
|
31
|
+
|
|
32
|
+
FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
|
|
33
|
+
|
|
34
|
+
## Features
|
|
35
|
+
|
|
36
|
+
- **Fast C++ Core**: Optimized string matching algorithms.
|
|
37
|
+
- **Multiple Scorers**:
|
|
38
|
+
- `levenshtein`: Standard edit distance ratio.
|
|
39
|
+
- `jaccard`: Set-based similarity.
|
|
40
|
+
- `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
|
|
41
|
+
- **Ranking**: Efficiently rank a list of candidates against a query.
|
|
42
|
+
- **Partial Matching**: Support for substring matching via `mode='partial'`.
|
|
43
|
+
- **Unicode Support**: Correctly handles UTF-8 input.
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
### Prerequisites
|
|
48
|
+
- Python 3.8+
|
|
49
|
+
- C++17 compatible compiler (GCC, Clang, MSVC)
|
|
50
|
+
|
|
51
|
+
### Using uv (Recommended)
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
uv pip install .
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Using pip
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install .
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Usage
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import fuzzybunny
|
|
67
|
+
|
|
68
|
+
# Basic Levenshtein Ratio
|
|
69
|
+
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
70
|
+
print(f"Score: {score}") # ~0.57
|
|
71
|
+
|
|
72
|
+
# Partial Matching
|
|
73
|
+
# "apple" is a perfect substring of "apple pie"
|
|
74
|
+
score = fuzzybunny.partial_ratio("apple", "apple pie")
|
|
75
|
+
print(f"Partial Score: {score}") # 1.0
|
|
76
|
+
|
|
77
|
+
# Ranking Candidates
|
|
78
|
+
candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
|
|
79
|
+
results = fuzzybunny.rank(
|
|
80
|
+
query="apple",
|
|
81
|
+
candidates=candidates,
|
|
82
|
+
scorer="levenshtein",
|
|
83
|
+
mode="partial",
|
|
84
|
+
top_n=2
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
for candidate, score in results:
|
|
88
|
+
print(f"{candidate}: {score}")
|
|
89
|
+
# Output:
|
|
90
|
+
# apple pie: 1.0
|
|
91
|
+
# apple crisp: 1.0
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Development
|
|
95
|
+
|
|
96
|
+
1. **Setup Environment**:
|
|
97
|
+
```bash
|
|
98
|
+
uv venv
|
|
99
|
+
source .venv/bin/activate
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
2. **Install in Editable Mode**:
|
|
103
|
+
```bash
|
|
104
|
+
uv pip install -e .
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
3. **Run Tests**:
|
|
108
|
+
```bash
|
|
109
|
+
pytest
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
This project is licensed under the [MIT License](LICENSE).
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">FuzzyBunny</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<b> A fuzzy search tool written in C++ with Python bindings </b>
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
13
|
+
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
14
|
+
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
## Overview
|
|
18
|
+
|
|
19
|
+
FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
|
|
20
|
+
|
|
21
|
+
## Features
|
|
22
|
+
|
|
23
|
+
- **Fast C++ Core**: Optimized string matching algorithms.
|
|
24
|
+
- **Multiple Scorers**:
|
|
25
|
+
- `levenshtein`: Standard edit distance ratio.
|
|
26
|
+
- `jaccard`: Set-based similarity.
|
|
27
|
+
- `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
|
|
28
|
+
- **Ranking**: Efficiently rank a list of candidates against a query.
|
|
29
|
+
- **Partial Matching**: Support for substring matching via `mode='partial'`.
|
|
30
|
+
- **Unicode Support**: Correctly handles UTF-8 input.
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
### Prerequisites
|
|
35
|
+
- Python 3.8+
|
|
36
|
+
- C++17 compatible compiler (GCC, Clang, MSVC)
|
|
37
|
+
|
|
38
|
+
### Using uv (Recommended)
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
uv pip install .
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Using pip
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
pip install .
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Usage
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
import fuzzybunny
|
|
54
|
+
|
|
55
|
+
# Basic Levenshtein Ratio
|
|
56
|
+
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
57
|
+
print(f"Score: {score}") # ~0.57
|
|
58
|
+
|
|
59
|
+
# Partial Matching
|
|
60
|
+
# "apple" is a perfect substring of "apple pie"
|
|
61
|
+
score = fuzzybunny.partial_ratio("apple", "apple pie")
|
|
62
|
+
print(f"Partial Score: {score}") # 1.0
|
|
63
|
+
|
|
64
|
+
# Ranking Candidates
|
|
65
|
+
candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
|
|
66
|
+
results = fuzzybunny.rank(
|
|
67
|
+
query="apple",
|
|
68
|
+
candidates=candidates,
|
|
69
|
+
scorer="levenshtein",
|
|
70
|
+
mode="partial",
|
|
71
|
+
top_n=2
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
for candidate, score in results:
|
|
75
|
+
print(f"{candidate}: {score}")
|
|
76
|
+
# Output:
|
|
77
|
+
# apple pie: 1.0
|
|
78
|
+
# apple crisp: 1.0
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Development
|
|
82
|
+
|
|
83
|
+
1. **Setup Environment**:
|
|
84
|
+
```bash
|
|
85
|
+
uv venv
|
|
86
|
+
source .venv/bin/activate
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
2. **Install in Editable Mode**:
|
|
90
|
+
```bash
|
|
91
|
+
uv pip install -e .
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
3. **Run Tests**:
|
|
95
|
+
```bash
|
|
96
|
+
pytest
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
|
|
101
|
+
This project is licensed under the [MIT License](LICENSE).
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fuzzybunny
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A fuzzy search tool written in python (actually C++)
|
|
5
|
+
Requires-Python: >=3.8
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Dynamic: description
|
|
9
|
+
Dynamic: description-content-type
|
|
10
|
+
Dynamic: license-file
|
|
11
|
+
Dynamic: requires-python
|
|
12
|
+
Dynamic: summary
|
|
13
|
+
|
|
14
|
+
<p align="center">
|
|
15
|
+
<img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
16
|
+
</p>
|
|
17
|
+
|
|
18
|
+
<h1 align="center">FuzzyBunny</h1>
|
|
19
|
+
|
|
20
|
+
<p align="center">
|
|
21
|
+
<b> A fuzzy search tool written in C++ with Python bindings </b>
|
|
22
|
+
</p>
|
|
23
|
+
|
|
24
|
+
<p align="center">
|
|
25
|
+
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
26
|
+
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
27
|
+
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
28
|
+
</p>
|
|
29
|
+
|
|
30
|
+
## Overview
|
|
31
|
+
|
|
32
|
+
FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
|
|
33
|
+
|
|
34
|
+
## Features
|
|
35
|
+
|
|
36
|
+
- **Fast C++ Core**: Optimized string matching algorithms.
|
|
37
|
+
- **Multiple Scorers**:
|
|
38
|
+
- `levenshtein`: Standard edit distance ratio.
|
|
39
|
+
- `jaccard`: Set-based similarity.
|
|
40
|
+
- `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
|
|
41
|
+
- **Ranking**: Efficiently rank a list of candidates against a query.
|
|
42
|
+
- **Partial Matching**: Support for substring matching via `mode='partial'`.
|
|
43
|
+
- **Unicode Support**: Correctly handles UTF-8 input.
|
|
44
|
+
|
|
45
|
+
## Installation
|
|
46
|
+
|
|
47
|
+
### Prerequisites
|
|
48
|
+
- Python 3.8+
|
|
49
|
+
- C++17 compatible compiler (GCC, Clang, MSVC)
|
|
50
|
+
|
|
51
|
+
### Using uv (Recommended)
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
uv pip install .
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Using pip
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install .
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Usage
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import fuzzybunny
|
|
67
|
+
|
|
68
|
+
# Basic Levenshtein Ratio
|
|
69
|
+
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
70
|
+
print(f"Score: {score}") # ~0.57
|
|
71
|
+
|
|
72
|
+
# Partial Matching
|
|
73
|
+
# "apple" is a perfect substring of "apple pie"
|
|
74
|
+
score = fuzzybunny.partial_ratio("apple", "apple pie")
|
|
75
|
+
print(f"Partial Score: {score}") # 1.0
|
|
76
|
+
|
|
77
|
+
# Ranking Candidates
|
|
78
|
+
candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
|
|
79
|
+
results = fuzzybunny.rank(
|
|
80
|
+
query="apple",
|
|
81
|
+
candidates=candidates,
|
|
82
|
+
scorer="levenshtein",
|
|
83
|
+
mode="partial",
|
|
84
|
+
top_n=2
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
for candidate, score in results:
|
|
88
|
+
print(f"{candidate}: {score}")
|
|
89
|
+
# Output:
|
|
90
|
+
# apple pie: 1.0
|
|
91
|
+
# apple crisp: 1.0
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Development
|
|
95
|
+
|
|
96
|
+
1. **Setup Environment**:
|
|
97
|
+
```bash
|
|
98
|
+
uv venv
|
|
99
|
+
source .venv/bin/activate
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
2. **Install in Editable Mode**:
|
|
103
|
+
```bash
|
|
104
|
+
uv pip install -e .
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
3. **Run Tests**:
|
|
108
|
+
```bash
|
|
109
|
+
pytest
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## License
|
|
113
|
+
|
|
114
|
+
This project is licensed under the [MIT License](LICENSE).
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
fuzzybunny.egg-info/PKG-INFO
|
|
6
|
+
fuzzybunny.egg-info/SOURCES.txt
|
|
7
|
+
fuzzybunny.egg-info/dependency_links.txt
|
|
8
|
+
fuzzybunny.egg-info/not-zip-safe
|
|
9
|
+
fuzzybunny.egg-info/top_level.txt
|
|
10
|
+
src/bindings.cpp
|
|
11
|
+
src/scorers.cpp
|
|
12
|
+
tests/test_basic.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
fuzzybunny
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from setuptools import setup, Extension
|
|
3
|
+
from setuptools.command.build_ext import build_ext
|
|
4
|
+
import pybind11
|
|
5
|
+
|
|
6
|
+
class PybindBuildExt(build_ext):
    """build_ext variant that picks compiler flags per toolchain.

    For the ``unix`` and ``msvc`` compiler types it defines ``VERSION_INFO``
    from the distribution version and requests C++17; ``unix`` additionally
    hides symbols by default. Any other compiler type gets no extra flags.
    """

    def build_extensions(self):
        """Assign the flag list to every extension, then run the stock build."""
        compiler_kind = self.compiler.compiler_type

        if compiler_kind == 'unix':
            flags = [
                '-DVERSION_INFO="%s"' % self.distribution.get_version(),
                '-std=c++17',
                '-fvisibility=hidden',
            ]
        elif compiler_kind == 'msvc':
            flags = [
                '/DVERSION_INFO=\"%s\"' % self.distribution.get_version(),
                '/std:c++17',
            ]
        else:
            flags = []

        # Replaces (does not extend) whatever extra_compile_args were set.
        for extension in self.extensions:
            extension.extra_compile_args = flags

        super().build_extensions()
|
|
23
|
+
|
|
24
|
+
# The single C++ extension: pybind11 bindings plus the scorer implementations.
ext_modules = [
    Extension(
        "fuzzybunny",
        ["src/bindings.cpp", "src/scorers.cpp"],
        include_dirs=[
            pybind11.get_include(),
            "src",
        ],
        language="c++",
    ),
]

# Read the README with an explicit encoding and close the handle promptly;
# a bare open().read() leaks the file object and decodes with the platform
# locale, which is not guaranteed to be UTF-8.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="fuzzybunny",
    version="0.1.0",
    description="A fuzzy search tool written in python (actually C++)",
    long_description=long_description,
    long_description_content_type="text/markdown",
    ext_modules=ext_modules,
    cmdclass={"build_ext": PybindBuildExt},
    zip_safe=False,
    python_requires=">=3.8",
)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#include <pybind11/pybind11.h>
|
|
2
|
+
#include <pybind11/stl.h>
|
|
3
|
+
#include "scorers.hpp"
|
|
4
|
+
|
|
5
|
+
namespace py = pybind11;
|
|
6
|
+
using namespace fuzzybunny;
|
|
7
|
+
|
|
8
|
+
// Python module definition: exposes the scorers, rank and batch_match.
// The per-string scorers take UTF-8 std::string arguments and convert them
// with utf8_to_u32 before calling the UTF-32 implementations.
PYBIND11_MODULE(fuzzybunny, m) {
    // Module docstring. The autosummary lists every function registered
    // below (partial_ratio and batch_match were previously missing), with the
    // blank lines reST requires between directive options and content.
    m.doc() = R"pbdoc(
        FuzzyBunny: A fast fuzzy string matching library
        ------------------------------------------------

        .. currentmodule:: fuzzybunny

        .. autosummary::
           :toctree: _generate

           levenshtein
           partial_ratio
           jaccard
           token_sort
           rank
           batch_match
    )pbdoc";

    m.def("levenshtein", [](const std::string& s1, const std::string& s2) {
        return levenshtein_ratio(utf8_to_u32(s1), utf8_to_u32(s2));
    }, py::arg("s1"), py::arg("s2"), "Calculate Levenshtein ratio (0.0 - 1.0)");

    m.def("partial_ratio", [](const std::string& s1, const std::string& s2) {
        return partial_ratio(utf8_to_u32(s1), utf8_to_u32(s2));
    }, py::arg("s1"), py::arg("s2"), "Calculate Partial Levenshtein ratio (0.0 - 1.0)");

    m.def("jaccard", [](const std::string& s1, const std::string& s2) {
        return jaccard_similarity(utf8_to_u32(s1), utf8_to_u32(s2));
    }, py::arg("s1"), py::arg("s2"), "Calculate Jaccard similarity (0.0 - 1.0)");

    m.def("token_sort", [](const std::string& s1, const std::string& s2) {
        return token_sort_ratio(utf8_to_u32(s1), utf8_to_u32(s2));
    }, py::arg("s1"), py::arg("s2"), "Calculate Token Sort ratio (0.0 - 1.0)");

    // rank / batch_match take UTF-8 strings directly and convert internally.
    m.def("rank", &rank,
          py::arg("query"),
          py::arg("candidates"),
          py::arg("scorer") = "levenshtein",
          py::arg("mode") = "full",
          py::arg("threshold") = 0.0,
          py::arg("top_n") = -1,
          "Rank candidates against a query string. Returns list of (string, score) tuples.");

    m.def("batch_match", &batch_match,
          py::arg("queries"),
          py::arg("candidates"),
          py::arg("scorer") = "levenshtein",
          py::arg("mode") = "full",
          py::arg("threshold") = 0.0,
          py::arg("top_n") = -1,
          "Batch match multiple queries against candidates.");

    // VERSION_INFO is injected by setup.py's build_ext for unix/msvc compilers.
#ifdef VERSION_INFO
    m.attr("__version__") = VERSION_INFO;
#else
    m.attr("__version__") = "dev";
#endif
}
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
#include "scorers.hpp"
|
|
2
|
+
#include <algorithm>
|
|
3
|
+
#include <vector>
|
|
4
|
+
#include <set>
|
|
5
|
+
#include <sstream>
|
|
6
|
+
#include <cmath>
|
|
7
|
+
#include <map>
|
|
8
|
+
#include <iostream>
|
|
9
|
+
#include <cstdint>
|
|
10
|
+
|
|
11
|
+
namespace fuzzybunny {
|
|
12
|
+
|
|
13
|
+
// --- Unicode Helper ---
|
|
14
|
+
|
|
15
|
+
// Converting manually because std::codecvt is deprecated in C++17
|
|
16
|
+
// and we want to avoid external dependencies like ICU for this lightweight lib.
|
|
17
|
+
// Decode a UTF-8 byte string into a sequence of UTF-32 code points.
//
// Well-formed input decodes one code point per sequence. Malformed input is
// handled leniently and uniformly: whenever the bytes at the current position
// do not form a valid sequence, exactly one byte is skipped and decoding
// resumes at the next byte. A sequence is rejected when
//   * its lead byte is not a legal UTF-8 start byte,
//   * it is truncated by the end of the string,
//   * a continuation byte does not have the form 10xxxxxx,
//   * it is an overlong encoding (value representable in fewer bytes),
//   * it encodes a UTF-16 surrogate (U+D800..U+DFFF), or
//   * it encodes a value above U+10FFFF.
//
// Returns the decoded code points; never throws on bad input.
std::u32string utf8_to_u32(const std::string& s) {
    std::u32string result;
    result.reserve(s.size());

    const size_t n = s.size();
    size_t i = 0;
    while (i < n) {
        const unsigned char lead = static_cast<unsigned char>(s[i]);
        uint32_t code_point = 0;
        size_t seq_len = 0;
        uint32_t min_value = 0;  // smallest code point legal for this length

        if (lead < 0x80) {
            code_point = lead;
            seq_len = 1;
            min_value = 0x0;
        } else if ((lead & 0xE0) == 0xC0) {
            code_point = lead & 0x1F;
            seq_len = 2;
            min_value = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            code_point = lead & 0x0F;
            seq_len = 3;
            min_value = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            code_point = lead & 0x07;
            seq_len = 4;
            min_value = 0x10000;
        } else {
            // Stray continuation byte or illegal lead byte: skip it.
            ++i;
            continue;
        }

        // A truncated sequence is treated like any other malformed one
        // (skip one byte) so that valid bytes after it are still decoded.
        bool valid = (n - i) >= seq_len;

        for (size_t k = 1; valid && k < seq_len; ++k) {
            const unsigned char next = static_cast<unsigned char>(s[i + k]);
            if ((next & 0xC0) != 0x80) {
                valid = false;
            } else {
                code_point = (code_point << 6) | (next & 0x3F);
            }
        }

        if (valid) {
            const bool overlong = code_point < min_value;
            const bool surrogate = code_point >= 0xD800 && code_point <= 0xDFFF;
            const bool out_of_range = code_point > 0x10FFFF;
            valid = !(overlong || surrogate || out_of_range);
        }

        if (valid) {
            result.push_back(static_cast<char32_t>(code_point));
            i += seq_len;
        } else {
            ++i;
        }
    }
    return result;
}
|
|
65
|
+
|
|
66
|
+
// --- Internal Utils ---
|
|
67
|
+
|
|
68
|
+
// Split `s` into maximal runs of non-whitespace code points.
// Separators are space, tab, newline and carriage return; consecutive
// separators never produce empty tokens.
std::vector<std::u32string> tokenize(const std::u32string& s) {
    const auto is_separator = [](char32_t ch) {
        return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r';
    };

    std::vector<std::u32string> pieces;
    auto cursor = s.begin();
    const auto end = s.end();

    while (cursor != end) {
        // Skip the separators in front of the next token, if any.
        cursor = std::find_if_not(cursor, end, is_separator);
        if (cursor == end) break;

        const auto token_end = std::find_if(cursor, end, is_separator);
        pieces.emplace_back(cursor, token_end);
        cursor = token_end;
    }
    return pieces;
}
|
|
84
|
+
|
|
85
|
+
// --- Scorers ---
|
|
86
|
+
|
|
87
|
+
// Normalised Levenshtein similarity of two code-point strings.
//
// Returns 1.0 - distance / max(len1, len2), where distance is the edit
// distance with unit-cost insert, delete and substitute. Two empty strings
// score 1.0; exactly one empty string scores 0.0.
double levenshtein_ratio(const std::u32string& s1, const std::u32string& s2) {
    const size_t len1 = s1.size();
    const size_t len2 = s2.size();

    if (len1 == 0 && len2 == 0) return 1.0;
    if (len1 == 0 || len2 == 0) return 0.0;

    // Two-row dynamic programme: `prev` is row i-1, `curr` is row i.
    std::vector<size_t> prev(len2 + 1);
    std::vector<size_t> curr(len2 + 1);

    for (size_t j = 0; j <= len2; ++j) prev[j] = j;

    for (size_t i = 1; i <= len1; ++i) {
        curr[0] = i;
        for (size_t j = 1; j <= len2; ++j) {
            const size_t cost = (s1[i - 1] == s2[j - 1]) ? 0 : 1;
            curr[j] = std::min({
                prev[j] + 1,        // deletion
                curr[j - 1] + 1,    // insertion
                prev[j - 1] + cost  // match / substitution
            });
        }
        // Swap instead of copying the row: every cell of `curr` is rewritten
        // on the next iteration, so its stale contents never matter.
        prev.swap(curr);
    }

    const size_t dist = prev[len2];
    const size_t max_len = std::max(len1, len2);

    return 1.0 - (static_cast<double>(dist) / static_cast<double>(max_len));
}
|
|
117
|
+
|
|
118
|
+
double partial_ratio(const std::u32string& s1, const std::u32string& s2) {
|
|
119
|
+
if (s1.empty() && s2.empty()) return 1.0;
|
|
120
|
+
if (s1.empty() || s2.empty()) return 0.0;
|
|
121
|
+
|
|
122
|
+
const auto& shorter = (s1.size() <= s2.size()) ? s1 : s2;
|
|
123
|
+
const auto& longer = (s1.size() > s2.size()) ? s1 : s2;
|
|
124
|
+
|
|
125
|
+
double max_ratio = 0.0;
|
|
126
|
+
size_t k = shorter.size();
|
|
127
|
+
|
|
128
|
+
// Sliding window over the longer string to find the best matching substring
|
|
129
|
+
for (size_t i = 0; i <= longer.size() - k; ++i) {
|
|
130
|
+
std::u32string sub = longer.substr(i, k);
|
|
131
|
+
double ratio = levenshtein_ratio(shorter, sub);
|
|
132
|
+
if (ratio > max_ratio) max_ratio = ratio;
|
|
133
|
+
}
|
|
134
|
+
return max_ratio;
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
double jaccard_similarity(const std::u32string& s1, const std::u32string& s2) {
|
|
138
|
+
std::vector<std::u32string> tokens1 = tokenize(s1);
|
|
139
|
+
std::vector<std::u32string> tokens2 = tokenize(s2);
|
|
140
|
+
|
|
141
|
+
if (tokens1.empty() && tokens2.empty()) return 1.0;
|
|
142
|
+
if (tokens1.empty() || tokens2.empty()) return 0.0;
|
|
143
|
+
|
|
144
|
+
std::set<std::u32string> set1(tokens1.begin(), tokens1.end());
|
|
145
|
+
std::set<std::u32string> set2(tokens2.begin(), tokens2.end());
|
|
146
|
+
|
|
147
|
+
std::vector<std::u32string> intersection;
|
|
148
|
+
std::set_intersection(set1.begin(), set1.end(),
|
|
149
|
+
set2.begin(), set2.end(),
|
|
150
|
+
std::back_inserter(intersection));
|
|
151
|
+
|
|
152
|
+
std::vector<std::u32string> union_set;
|
|
153
|
+
std::set_union(set1.begin(), set1.end(),
|
|
154
|
+
set2.begin(), set2.end(),
|
|
155
|
+
std::back_inserter(union_set));
|
|
156
|
+
|
|
157
|
+
if (union_set.empty()) return 0.0;
|
|
158
|
+
return static_cast<double>(intersection.size()) / static_cast<double>(union_set.size());
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
double token_sort_ratio(const std::u32string& s1, const std::u32string& s2) {
|
|
162
|
+
auto t1 = tokenize(s1);
|
|
163
|
+
auto t2 = tokenize(s2);
|
|
164
|
+
|
|
165
|
+
std::sort(t1.begin(), t1.end());
|
|
166
|
+
std::sort(t2.begin(), t2.end());
|
|
167
|
+
|
|
168
|
+
std::u32string joined1, joined2;
|
|
169
|
+
for (size_t i = 0; i < t1.size(); ++i) {
|
|
170
|
+
joined1 += t1[i];
|
|
171
|
+
if (i < t1.size() - 1) joined1 += ' ';
|
|
172
|
+
}
|
|
173
|
+
for (size_t i = 0; i < t2.size(); ++i) {
|
|
174
|
+
joined2 += t2[i];
|
|
175
|
+
if (i < t2.size() - 1) joined2 += ' ';
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
return levenshtein_ratio(joined1, joined2);
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
// --- Ranking ---
|
|
182
|
+
|
|
183
|
+
std::vector<MatchResult> rank(
|
|
184
|
+
const std::string& query,
|
|
185
|
+
const std::vector<std::string>& candidates,
|
|
186
|
+
const std::string& scorer,
|
|
187
|
+
const std::string& mode,
|
|
188
|
+
double threshold,
|
|
189
|
+
int top_n
|
|
190
|
+
) {
|
|
191
|
+
std::u32string uQuery = utf8_to_u32(query);
|
|
192
|
+
std::vector<MatchResult> results;
|
|
193
|
+
results.reserve(candidates.size());
|
|
194
|
+
|
|
195
|
+
for (const auto& cand : candidates) {
|
|
196
|
+
std::u32string uCand = utf8_to_u32(cand);
|
|
197
|
+
double score = 0.0;
|
|
198
|
+
|
|
199
|
+
if (scorer == "levenshtein") {
|
|
200
|
+
if (mode == "partial") {
|
|
201
|
+
score = partial_ratio(uQuery, uCand);
|
|
202
|
+
} else {
|
|
203
|
+
score = levenshtein_ratio(uQuery, uCand);
|
|
204
|
+
}
|
|
205
|
+
} else if (scorer == "jaccard") {
|
|
206
|
+
// Jaccard is inherently set-based, so partial matching on substrings
|
|
207
|
+
// doesn't align with the standard definition.
|
|
208
|
+
score = jaccard_similarity(uQuery, uCand);
|
|
209
|
+
} else if (scorer == "token_sort") {
|
|
210
|
+
score = token_sort_ratio(uQuery, uCand);
|
|
211
|
+
} else {
|
|
212
|
+
score = 0.0;
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
if (score >= threshold) {
|
|
216
|
+
results.push_back({cand, score});
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
std::sort(results.begin(), results.end(), [](const MatchResult& a, const MatchResult& b) {
|
|
221
|
+
return a.second > b.second;
|
|
222
|
+
});
|
|
223
|
+
|
|
224
|
+
if (top_n > 0 && static_cast<size_t>(top_n) < results.size()) {
|
|
225
|
+
results.resize(top_n);
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
return results;
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
std::vector<std::vector<MatchResult>> batch_match(
|
|
232
|
+
const std::vector<std::string>& queries,
|
|
233
|
+
const std::vector<std::string>& candidates,
|
|
234
|
+
const std::string& scorer,
|
|
235
|
+
const std::string& mode,
|
|
236
|
+
double threshold,
|
|
237
|
+
int top_n
|
|
238
|
+
) {
|
|
239
|
+
std::vector<std::vector<MatchResult>> batch_results;
|
|
240
|
+
batch_results.reserve(queries.size());
|
|
241
|
+
|
|
242
|
+
// Simple sequential processing for now.
|
|
243
|
+
// Future optimization: OpenMP #pragma omp parallel for
|
|
244
|
+
for (const auto& query : queries) {
|
|
245
|
+
batch_results.push_back(rank(query, candidates, scorer, mode, threshold, top_n));
|
|
246
|
+
}
|
|
247
|
+
return batch_results;
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
} // namespace fuzzybunny
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import fuzzybunny
|
|
3
|
+
|
|
4
|
+
def test_levenshtein():
    """Levenshtein ratio for similar, identical, disjoint and empty inputs."""
    ratio = fuzzybunny.levenshtein

    assert ratio("kitten", "sitting") > 0.5
    assert ratio("apple", "apple") == 1.0
    # Three substitutions over a maximum length of three: 1 - 3/3 == 0.0.
    assert ratio("abc", "def") == 0.0
    assert ratio("", "") == 1.0
|
|
9
|
+
|
|
10
|
+
def test_partial():
    """Partial ratio on substrings, and rank() using mode='partial'."""
    haystack = "apple pie"

    # Both "apple" and "pie" occur verbatim inside "apple pie".
    for needle in ("apple", "pie"):
        assert fuzzybunny.partial_ratio(needle, haystack) == 1.0

    # "xyz" is not in "apple pie".
    assert fuzzybunny.partial_ratio("xyz", haystack) == 0.0

    # rank with mode='partial'
    ranked = fuzzybunny.rank(
        "apple",
        ["apple pie", "banana split", "cherry tart"],
        scorer="levenshtein",
        mode="partial",
    )
    best = ranked[0]
    assert best[0] == "apple pie"
    assert best[1] == 1.0
|
|
23
|
+
|
|
24
|
+
def test_jaccard():
    """Jaccard similarity ignores token order and rewards shared tokens."""
    similarity = fuzzybunny.jaccard

    assert similarity("apple banana", "banana apple") == 1.0
    assert similarity("apple", "banana") == 0.0
    assert similarity("a b c", "a c") > 0.5
|
|
28
|
+
|
|
29
|
+
def test_token_sort():
    """Token sort ratio is 1.0 when only the word order differs."""
    swapped_pairs = [
        ("apple banana", "banana apple"),
        ("fuzzy bunny", "bunny fuzzy"),
    ]
    for left, right in swapped_pairs:
        assert fuzzybunny.token_sort(left, right) == 1.0
|
|
32
|
+
|
|
33
|
+
def test_unicode():
    """Non-ASCII input: accented letters and emoji are compared correctly."""
    ratio = fuzzybunny.levenshtein

    # 'café' vs 'cafe' (levenshtein should catch diff)
    accented, plain = "café", "cafe"
    assert ratio(accented, plain) < 1.0
    assert ratio(accented, accented) == 1.0

    # Emoji
    assert ratio("😊", "😊") == 1.0
    assert ratio("😊", "😂") < 1.0
|
|
43
|
+
|
|
44
|
+
def test_rank():
    """rank() honours top_n and drops candidates below the threshold."""
    candidates = ["apple", "apricot", "banana", "cherry"]

    top_two = fuzzybunny.rank("app", candidates, scorer="levenshtein", top_n=2)
    assert len(top_two) == 2
    assert top_two[0][0] == "apple"

    # Nothing is close enough to "xyz" to reach a 0.9 score.
    nothing = fuzzybunny.rank("xyz", candidates, threshold=0.9)
    assert len(nothing) == 0
|
|
52
|
+
|
|
53
|
+
def test_batch_match():
    """batch_match() returns one ranked list per query, in query order."""
    queries = ["apple", "banana"]
    candidates = ["apple pie", "banana bread", "cherry tart"]

    results = fuzzybunny.batch_match(queries, candidates, mode="partial")
    assert len(results) == 2

    apple_hits, banana_hits = results

    # First query "apple" matches "apple pie"
    assert apple_hits[0][0] == "apple pie"
    assert apple_hits[0][1] == 1.0

    # Second query "banana" matches "banana bread"
    assert banana_hits[0][0] == "banana bread"
    assert banana_hits[0][1] == 1.0
|
|
66
|
+
|
|
67
|
+
def test_invalid_scorer_rank():
    """An unrecognised scorer name scores the candidate 0.0, without error."""
    ranked = fuzzybunny.rank("a", ["a"], scorer="unknown")
    first_entry = ranked[0]
    assert first_entry[1] == 0.0
|