fuzzybunny 0.1.2__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fuzzybunny-0.2.1/PKG-INFO +101 -0
- fuzzybunny-0.2.1/README.md +83 -0
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1}/setup.py +9 -2
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1}/src/bindings.cpp +3 -1
- fuzzybunny-0.2.1/src/fuzzybunny/__init__.py +75 -0
- fuzzybunny-0.2.1/src/fuzzybunny/benchmark.py +49 -0
- fuzzybunny-0.2.1/src/fuzzybunny.egg-info/PKG-INFO +101 -0
- fuzzybunny-0.2.1/src/fuzzybunny.egg-info/SOURCES.txt +17 -0
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1}/src/scorers.cpp +39 -6
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1}/tests/test_basic.py +6 -2
- fuzzybunny-0.2.1/tests/test_benchmark.py +18 -0
- fuzzybunny-0.2.1/tests/test_hybrid.py +48 -0
- fuzzybunny-0.2.1/tests/test_pandas.py +29 -0
- fuzzybunny-0.1.2/PKG-INFO +0 -114
- fuzzybunny-0.1.2/README.md +0 -101
- fuzzybunny-0.1.2/fuzzybunny.egg-info/PKG-INFO +0 -114
- fuzzybunny-0.1.2/fuzzybunny.egg-info/SOURCES.txt +0 -12
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1}/LICENSE +0 -0
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1}/pyproject.toml +0 -0
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1}/setup.cfg +0 -0
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1/src}/fuzzybunny.egg-info/dependency_links.txt +0 -0
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1/src}/fuzzybunny.egg-info/not-zip-safe +0 -0
- {fuzzybunny-0.1.2 → fuzzybunny-0.2.1/src}/fuzzybunny.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fuzzybunny
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: A fuzzy search tool for python written in C++
|
|
5
|
+
Home-page: https://github.com/cachevector/fuzzybunny
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/cachevector/fuzzybunny/issues
|
|
7
|
+
Project-URL: Source Code, https://github.com/cachevector/fuzzybunny
|
|
8
|
+
Requires-Python: >=3.8
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Dynamic: description
|
|
12
|
+
Dynamic: description-content-type
|
|
13
|
+
Dynamic: home-page
|
|
14
|
+
Dynamic: license-file
|
|
15
|
+
Dynamic: project-url
|
|
16
|
+
Dynamic: requires-python
|
|
17
|
+
Dynamic: summary
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
<img src="https://raw.githubusercontent.com/cachevector/fuzzybunny/master/docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
<h1 align="center">FuzzyBunny</h1>
|
|
24
|
+
|
|
25
|
+
<p align="center">
|
|
26
|
+
<b> A high-performance, lightweight Python library for fuzzy string matching and ranking, implemented in C++ with Pybind11. </b>
|
|
27
|
+
</p>
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
31
|
+
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
32
|
+
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
33
|
+
</p>
|
|
34
|
+
|
|
35
|
+
## Features
|
|
36
|
+
|
|
37
|
+
- **Blazing Fast**: C++ core for 2-5x speed improvement over pure Python alternatives.
|
|
38
|
+
- **Multiple Scorers**: Support for Levenshtein, Jaccard, and Token Sort ratios.
|
|
39
|
+
- **Partial Matching**: Find the best substring matches.
|
|
40
|
+
- **Hybrid Scoring**: Combine multiple scorers with custom weights.
|
|
41
|
+
- **Pandas & NumPy Integration**: Native support for Series and Arrays.
|
|
42
|
+
- **Batch Processing**: Parallelized matching for large datasets using OpenMP.
|
|
43
|
+
- **Unicode Support**: Handles international characters and normalization.
|
|
44
|
+
- **Benchmarking Tools**: Built-in utilities to measure performance.
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install fuzzybunny
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import fuzzybunny
|
|
56
|
+
|
|
57
|
+
# Basic matching
|
|
58
|
+
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
59
|
+
print(f"Similarity: {score:.2f}")
|
|
60
|
+
|
|
61
|
+
# Ranking candidates
|
|
62
|
+
candidates = ["apple", "apricot", "banana", "cherry"]
|
|
63
|
+
results = fuzzybunny.rank("app", candidates, top_n=2)
|
|
64
|
+
# [('apple', 0.6), ('apricot', 0.42)]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Advanced Usage
|
|
68
|
+
|
|
69
|
+
### Hybrid Scorer
|
|
70
|
+
Combine different algorithms to get better results:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
results = fuzzybunny.rank(
|
|
74
|
+
"apple banana",
|
|
75
|
+
["banana apple"],
|
|
76
|
+
scorer="hybrid",
|
|
77
|
+
weights={"levenshtein": 0.3, "token_sort": 0.7}
|
|
78
|
+
)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Pandas Integration
|
|
82
|
+
Use the specialized accessor for clean code:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
import pandas as pd
|
|
86
|
+
import fuzzybunny
|
|
87
|
+
|
|
88
|
+
df = pd.DataFrame({"names": ["apple pie", "banana bread", "cherry tart"]})
|
|
89
|
+
results = df["names"].fuzzy.match("apple", mode="partial")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Benchmarking
|
|
93
|
+
Compare performance on your specific data:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
perf = fuzzybunny.benchmark("query", candidates)
|
|
97
|
+
print(f"Levenshtein mean time: {perf['levenshtein']['mean']:.6f}s")
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## License
|
|
101
|
+
MIT
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/cachevector/fuzzybunny/master/docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">FuzzyBunny</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<b> A high-performance, lightweight Python library for fuzzy string matching and ranking, implemented in C++ with Pybind11. </b>
|
|
9
|
+
</p>
|
|
10
|
+
|
|
11
|
+
<p align="center">
|
|
12
|
+
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
13
|
+
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
14
|
+
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
## Features
|
|
18
|
+
|
|
19
|
+
- **Blazing Fast**: C++ core for 2-5x speed improvement over pure Python alternatives.
|
|
20
|
+
- **Multiple Scorers**: Support for Levenshtein, Jaccard, and Token Sort ratios.
|
|
21
|
+
- **Partial Matching**: Find the best substring matches.
|
|
22
|
+
- **Hybrid Scoring**: Combine multiple scorers with custom weights.
|
|
23
|
+
- **Pandas & NumPy Integration**: Native support for Series and Arrays.
|
|
24
|
+
- **Batch Processing**: Parallelized matching for large datasets using OpenMP.
|
|
25
|
+
- **Unicode Support**: Handles international characters and normalization.
|
|
26
|
+
- **Benchmarking Tools**: Built-in utilities to measure performance.
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install fuzzybunny
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Quick Start
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
import fuzzybunny
|
|
38
|
+
|
|
39
|
+
# Basic matching
|
|
40
|
+
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
41
|
+
print(f"Similarity: {score:.2f}")
|
|
42
|
+
|
|
43
|
+
# Ranking candidates
|
|
44
|
+
candidates = ["apple", "apricot", "banana", "cherry"]
|
|
45
|
+
results = fuzzybunny.rank("app", candidates, top_n=2)
|
|
46
|
+
# [('apple', 0.6), ('apricot', 0.42)]
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Advanced Usage
|
|
50
|
+
|
|
51
|
+
### Hybrid Scorer
|
|
52
|
+
Combine different algorithms to get better results:
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
results = fuzzybunny.rank(
|
|
56
|
+
"apple banana",
|
|
57
|
+
["banana apple"],
|
|
58
|
+
scorer="hybrid",
|
|
59
|
+
weights={"levenshtein": 0.3, "token_sort": 0.7}
|
|
60
|
+
)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Pandas Integration
|
|
64
|
+
Use the specialized accessor for clean code:
|
|
65
|
+
|
|
66
|
+
```python
|
|
67
|
+
import pandas as pd
|
|
68
|
+
import fuzzybunny
|
|
69
|
+
|
|
70
|
+
df = pd.DataFrame({"names": ["apple pie", "banana bread", "cherry tart"]})
|
|
71
|
+
results = df["names"].fuzzy.match("apple", mode="partial")
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Benchmarking
|
|
75
|
+
Compare performance on your specific data:
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
perf = fuzzybunny.benchmark("query", candidates)
|
|
79
|
+
print(f"Levenshtein mean time: {perf['levenshtein']['mean']:.6f}s")
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
MIT
|
|
@@ -28,7 +28,7 @@ class PybindBuildExt(build_ext):
|
|
|
28
28
|
|
|
29
29
|
ext_modules = [
|
|
30
30
|
Extension(
|
|
31
|
-
"fuzzybunny",
|
|
31
|
+
"fuzzybunny._fuzzybunny",
|
|
32
32
|
["src/bindings.cpp", "src/scorers.cpp"],
|
|
33
33
|
include_dirs=[
|
|
34
34
|
pybind11.get_include(),
|
|
@@ -40,10 +40,17 @@ ext_modules = [
|
|
|
40
40
|
|
|
41
41
|
setup(
|
|
42
42
|
name="fuzzybunny",
|
|
43
|
-
version="0.1.2",
|
|
43
|
+
version="0.2.1",
|
|
44
44
|
description="A fuzzy search tool for python written in C++",
|
|
45
45
|
long_description=open("README.md").read(),
|
|
46
46
|
long_description_content_type="text/markdown",
|
|
47
|
+
url="https://github.com/cachevector/fuzzybunny",
|
|
48
|
+
project_urls={
|
|
49
|
+
"Bug Tracker": "https://github.com/cachevector/fuzzybunny/issues",
|
|
50
|
+
"Source Code": "https://github.com/cachevector/fuzzybunny",
|
|
51
|
+
},
|
|
52
|
+
packages=["fuzzybunny"],
|
|
53
|
+
package_dir={"": "src"},
|
|
47
54
|
ext_modules=ext_modules,
|
|
48
55
|
cmdclass={"build_ext": PybindBuildExt},
|
|
49
56
|
zip_safe=False,
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
namespace py = pybind11;
|
|
6
6
|
using namespace fuzzybunny;
|
|
7
7
|
|
|
8
|
-
PYBIND11_MODULE(fuzzybunny, m) {
|
|
8
|
+
PYBIND11_MODULE(_fuzzybunny, m) {
|
|
9
9
|
m.doc() = R"pbdoc(
|
|
10
10
|
FuzzyBunny: A fast fuzzy string matching library
|
|
11
11
|
------------------------------------------------
|
|
@@ -42,6 +42,7 @@ PYBIND11_MODULE(fuzzybunny, m) {
|
|
|
42
42
|
py::arg("process") = true,
|
|
43
43
|
py::arg("threshold") = 0.0,
|
|
44
44
|
py::arg("top_n") = -1,
|
|
45
|
+
py::arg("weights") = std::map<std::string, double>{},
|
|
45
46
|
"Rank candidates against a query string. Returns list of (string, score) tuples.");
|
|
46
47
|
|
|
47
48
|
m.def("batch_match", &batch_match,
|
|
@@ -52,6 +53,7 @@ PYBIND11_MODULE(fuzzybunny, m) {
|
|
|
52
53
|
py::arg("process") = true,
|
|
53
54
|
py::arg("threshold") = 0.0,
|
|
54
55
|
py::arg("top_n") = -1,
|
|
56
|
+
py::arg("weights") = std::map<std::string, double>{},
|
|
55
57
|
"Batch match multiple queries against candidates.");
|
|
56
58
|
|
|
57
59
|
#ifdef VERSION_INFO
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from . import _fuzzybunny
|
|
2
|
+
from ._fuzzybunny import (
|
|
3
|
+
levenshtein,
|
|
4
|
+
jaccard,
|
|
5
|
+
token_sort,
|
|
6
|
+
partial_ratio,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
from .benchmark import benchmark, benchmark_batch
|
|
10
|
+
|
|
11
|
+
def rank(query, candidates, scorer="levenshtein", mode="full", process=True, threshold=0.0, top_n=-1, weights=None):
|
|
12
|
+
"""
|
|
13
|
+
Enhanced rank function with support for Pandas Series and NumPy arrays.
|
|
14
|
+
"""
|
|
15
|
+
if weights is None:
|
|
16
|
+
weights = {}
|
|
17
|
+
|
|
18
|
+
# Check for pandas/numpy
|
|
19
|
+
if _is_pandas_series(candidates):
|
|
20
|
+
candidates = candidates.astype(str).tolist()
|
|
21
|
+
elif _is_numpy_array(candidates):
|
|
22
|
+
candidates = candidates.astype(str).tolist()
|
|
23
|
+
|
|
24
|
+
return _fuzzybunny.rank(query, candidates, scorer, mode, process, threshold, top_n, weights)
|
|
25
|
+
|
|
26
|
+
def batch_match(queries, candidates, scorer="levenshtein", mode="full", process=True, threshold=0.0, top_n=-1, weights=None):
|
|
27
|
+
"""
|
|
28
|
+
Enhanced batch_match function with support for Pandas/NumPy candidates.
|
|
29
|
+
"""
|
|
30
|
+
if weights is None:
|
|
31
|
+
weights = {}
|
|
32
|
+
|
|
33
|
+
if _is_pandas_series(candidates):
|
|
34
|
+
candidates = candidates.astype(str).tolist()
|
|
35
|
+
elif _is_numpy_array(candidates):
|
|
36
|
+
candidates = candidates.astype(str).tolist()
|
|
37
|
+
|
|
38
|
+
# queries can also be pandas/numpy
|
|
39
|
+
if _is_pandas_series(queries) or _is_numpy_array(queries):
|
|
40
|
+
import numpy as np
|
|
41
|
+
queries = np.array(queries).astype(str).tolist()
|
|
42
|
+
|
|
43
|
+
return _fuzzybunny.batch_match(queries, candidates, scorer, mode, process, threshold, top_n, weights)
|
|
44
|
+
|
|
45
|
+
def _is_pandas_series(obj):
|
|
46
|
+
try:
|
|
47
|
+
import pandas as pd
|
|
48
|
+
return isinstance(obj, pd.Series)
|
|
49
|
+
except ImportError:
|
|
50
|
+
return False
|
|
51
|
+
|
|
52
|
+
def _is_numpy_array(obj):
|
|
53
|
+
try:
|
|
54
|
+
import numpy as np
|
|
55
|
+
return isinstance(obj, np.ndarray)
|
|
56
|
+
except ImportError:
|
|
57
|
+
return False
|
|
58
|
+
|
|
59
|
+
def _register_pandas_accessor():
|
|
60
|
+
try:
|
|
61
|
+
import pandas as pd
|
|
62
|
+
|
|
63
|
+
@pd.api.extensions.register_series_accessor("fuzzy")
|
|
64
|
+
class FuzzyAccessor:
|
|
65
|
+
def __init__(self, pandas_obj):
|
|
66
|
+
self._obj = pandas_obj
|
|
67
|
+
|
|
68
|
+
def match(self, query, scorer="levenshtein", mode="full", process=True, threshold=0.0, top_n=-1, weights=None):
|
|
69
|
+
return rank(query, self._obj, scorer, mode, process, threshold, top_n, weights)
|
|
70
|
+
except (ImportError, AttributeError):
|
|
71
|
+
pass
|
|
72
|
+
|
|
73
|
+
_register_pandas_accessor()
|
|
74
|
+
|
|
75
|
+
__version__ = getattr(_fuzzybunny, "__version__", "dev")
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import statistics
|
|
3
|
+
|
|
4
|
+
def benchmark(query, candidates, scorers=None, n_runs=5):
|
|
5
|
+
"""
|
|
6
|
+
Benchmark different scorers on a given query and set of candidates.
|
|
7
|
+
Returns a dictionary with timing results.
|
|
8
|
+
"""
|
|
9
|
+
from . import rank
|
|
10
|
+
if scorers is None:
|
|
11
|
+
scorers = ["levenshtein", "jaccard", "token_sort"]
|
|
12
|
+
|
|
13
|
+
results = {}
|
|
14
|
+
|
|
15
|
+
for scorer in scorers:
|
|
16
|
+
times = []
|
|
17
|
+
for _ in range(n_runs):
|
|
18
|
+
start = time.perf_counter()
|
|
19
|
+
rank(query, candidates, scorer=scorer)
|
|
20
|
+
end = time.perf_counter()
|
|
21
|
+
times.append(end - start)
|
|
22
|
+
|
|
23
|
+
results[scorer] = {
|
|
24
|
+
"mean": statistics.mean(times),
|
|
25
|
+
"stddev": statistics.stdev(times) if len(times) > 1 else 0,
|
|
26
|
+
"min": min(times),
|
|
27
|
+
"max": max(times)
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
return results
|
|
31
|
+
|
|
32
|
+
def benchmark_batch(queries, candidates, scorer="levenshtein", n_runs=3):
|
|
33
|
+
"""
|
|
34
|
+
Benchmark batch_match performance.
|
|
35
|
+
"""
|
|
36
|
+
from . import batch_match
|
|
37
|
+
times = []
|
|
38
|
+
for _ in range(n_runs):
|
|
39
|
+
start = time.perf_counter()
|
|
40
|
+
batch_match(queries, candidates, scorer=scorer)
|
|
41
|
+
end = time.perf_counter()
|
|
42
|
+
times.append(end - start)
|
|
43
|
+
|
|
44
|
+
return {
|
|
45
|
+
"mean": statistics.mean(times),
|
|
46
|
+
"total_queries": len(queries),
|
|
47
|
+
"total_candidates": len(candidates),
|
|
48
|
+
"queries_per_second": len(queries) / statistics.mean(times)
|
|
49
|
+
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fuzzybunny
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: A fuzzy search tool for python written in C++
|
|
5
|
+
Home-page: https://github.com/cachevector/fuzzybunny
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/cachevector/fuzzybunny/issues
|
|
7
|
+
Project-URL: Source Code, https://github.com/cachevector/fuzzybunny
|
|
8
|
+
Requires-Python: >=3.8
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Dynamic: description
|
|
12
|
+
Dynamic: description-content-type
|
|
13
|
+
Dynamic: home-page
|
|
14
|
+
Dynamic: license-file
|
|
15
|
+
Dynamic: project-url
|
|
16
|
+
Dynamic: requires-python
|
|
17
|
+
Dynamic: summary
|
|
18
|
+
|
|
19
|
+
<p align="center">
|
|
20
|
+
<img src="https://raw.githubusercontent.com/cachevector/fuzzybunny/master/docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
21
|
+
</p>
|
|
22
|
+
|
|
23
|
+
<h1 align="center">FuzzyBunny</h1>
|
|
24
|
+
|
|
25
|
+
<p align="center">
|
|
26
|
+
<b> A high-performance, lightweight Python library for fuzzy string matching and ranking, implemented in C++ with Pybind11. </b>
|
|
27
|
+
</p>
|
|
28
|
+
|
|
29
|
+
<p align="center">
|
|
30
|
+
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
31
|
+
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
32
|
+
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
33
|
+
</p>
|
|
34
|
+
|
|
35
|
+
## Features
|
|
36
|
+
|
|
37
|
+
- **Blazing Fast**: C++ core for 2-5x speed improvement over pure Python alternatives.
|
|
38
|
+
- **Multiple Scorers**: Support for Levenshtein, Jaccard, and Token Sort ratios.
|
|
39
|
+
- **Partial Matching**: Find the best substring matches.
|
|
40
|
+
- **Hybrid Scoring**: Combine multiple scorers with custom weights.
|
|
41
|
+
- **Pandas & NumPy Integration**: Native support for Series and Arrays.
|
|
42
|
+
- **Batch Processing**: Parallelized matching for large datasets using OpenMP.
|
|
43
|
+
- **Unicode Support**: Handles international characters and normalization.
|
|
44
|
+
- **Benchmarking Tools**: Built-in utilities to measure performance.
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install fuzzybunny
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
import fuzzybunny
|
|
56
|
+
|
|
57
|
+
# Basic matching
|
|
58
|
+
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
59
|
+
print(f"Similarity: {score:.2f}")
|
|
60
|
+
|
|
61
|
+
# Ranking candidates
|
|
62
|
+
candidates = ["apple", "apricot", "banana", "cherry"]
|
|
63
|
+
results = fuzzybunny.rank("app", candidates, top_n=2)
|
|
64
|
+
# [('apple', 0.6), ('apricot', 0.42)]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Advanced Usage
|
|
68
|
+
|
|
69
|
+
### Hybrid Scorer
|
|
70
|
+
Combine different algorithms to get better results:
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
results = fuzzybunny.rank(
|
|
74
|
+
"apple banana",
|
|
75
|
+
["banana apple"],
|
|
76
|
+
scorer="hybrid",
|
|
77
|
+
weights={"levenshtein": 0.3, "token_sort": 0.7}
|
|
78
|
+
)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Pandas Integration
|
|
82
|
+
Use the specialized accessor for clean code:
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
import pandas as pd
|
|
86
|
+
import fuzzybunny
|
|
87
|
+
|
|
88
|
+
df = pd.DataFrame({"names": ["apple pie", "banana bread", "cherry tart"]})
|
|
89
|
+
results = df["names"].fuzzy.match("apple", mode="partial")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
### Benchmarking
|
|
93
|
+
Compare performance on your specific data:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
perf = fuzzybunny.benchmark("query", candidates)
|
|
97
|
+
print(f"Levenshtein mean time: {perf['levenshtein']['mean']:.6f}s")
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## License
|
|
101
|
+
MIT
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
src/bindings.cpp
|
|
6
|
+
src/scorers.cpp
|
|
7
|
+
src/fuzzybunny/__init__.py
|
|
8
|
+
src/fuzzybunny/benchmark.py
|
|
9
|
+
src/fuzzybunny.egg-info/PKG-INFO
|
|
10
|
+
src/fuzzybunny.egg-info/SOURCES.txt
|
|
11
|
+
src/fuzzybunny.egg-info/dependency_links.txt
|
|
12
|
+
src/fuzzybunny.egg-info/not-zip-safe
|
|
13
|
+
src/fuzzybunny.egg-info/top_level.txt
|
|
14
|
+
tests/test_basic.py
|
|
15
|
+
tests/test_benchmark.py
|
|
16
|
+
tests/test_hybrid.py
|
|
17
|
+
tests/test_pandas.py
|
|
@@ -17,13 +17,19 @@ std::u32string normalize(const std::u32string& s) {
|
|
|
17
17
|
std::u32string result;
|
|
18
18
|
result.reserve(s.size());
|
|
19
19
|
for (char32_t c : s) {
|
|
20
|
-
// Lowercase and remove punctuation
|
|
20
|
+
// Lowercase and remove punctuation
|
|
21
|
+
// Supporting basic Latin-1 range for better international support
|
|
21
22
|
if (c < 128) {
|
|
22
23
|
if (std::iswalnum(static_cast<wint_t>(c)) || std::iswspace(static_cast<wint_t>(c))) {
|
|
23
24
|
result.push_back(static_cast<char32_t>(std::towlower(static_cast<wint_t>(c))));
|
|
24
25
|
}
|
|
26
|
+
} else if (c < 256) {
|
|
27
|
+
// Latin-1 Supplement
|
|
28
|
+
if (std::iswalpha(static_cast<wint_t>(c)) || std::iswspace(static_cast<wint_t>(c))) {
|
|
29
|
+
result.push_back(static_cast<char32_t>(std::towlower(static_cast<wint_t>(c))));
|
|
30
|
+
}
|
|
25
31
|
} else {
|
|
26
|
-
// For non-ASCII, just pass through
|
|
32
|
+
// For other non-ASCII, just pass through
|
|
27
33
|
result.push_back(c);
|
|
28
34
|
}
|
|
29
35
|
}
|
|
@@ -205,7 +211,8 @@ std::vector<MatchResult> rank(
|
|
|
205
211
|
const std::string& mode,
|
|
206
212
|
bool process,
|
|
207
213
|
double threshold,
|
|
208
|
-
int top_n
|
|
214
|
+
int top_n,
|
|
215
|
+
const std::map<std::string, double>& weights
|
|
209
216
|
) {
|
|
210
217
|
if (query.empty() || candidates.empty()) return {};
|
|
211
218
|
|
|
@@ -233,8 +240,33 @@ std::vector<MatchResult> rank(
|
|
|
233
240
|
score = jaccard_similarity(uQuery, uCand);
|
|
234
241
|
} else if (scorer == "token_sort") {
|
|
235
242
|
score = token_sort_ratio(uQuery, uCand);
|
|
243
|
+
} else if (scorer == "hybrid") {
|
|
244
|
+
double weighted_sum = 0.0;
|
|
245
|
+
double total_weight = 0.0;
|
|
246
|
+
|
|
247
|
+
for (const auto& [name, weight] : weights) {
|
|
248
|
+
double sub_score = 0.0;
|
|
249
|
+
if (name == "levenshtein") {
|
|
250
|
+
if (mode == "partial") sub_score = partial_ratio(uQuery, uCand);
|
|
251
|
+
else sub_score = levenshtein_ratio(uQuery, uCand);
|
|
252
|
+
} else if (name == "jaccard") {
|
|
253
|
+
sub_score = jaccard_similarity(uQuery, uCand);
|
|
254
|
+
} else if (name == "token_sort") {
|
|
255
|
+
sub_score = token_sort_ratio(uQuery, uCand);
|
|
256
|
+
}
|
|
257
|
+
// Ignore unknown scorers in weights for now, or could throw.
|
|
258
|
+
|
|
259
|
+
weighted_sum += sub_score * weight;
|
|
260
|
+
total_weight += weight;
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
if (total_weight > 0.0) {
|
|
264
|
+
score = weighted_sum / total_weight;
|
|
265
|
+
} else {
|
|
266
|
+
score = 0.0;
|
|
267
|
+
}
|
|
236
268
|
} else {
|
|
237
|
-
|
|
269
|
+
throw std::invalid_argument("Unknown scorer: " + scorer);
|
|
238
270
|
}
|
|
239
271
|
|
|
240
272
|
if (score >= threshold) {
|
|
@@ -260,13 +292,14 @@ std::vector<std::vector<MatchResult>> batch_match(
|
|
|
260
292
|
const std::string& mode,
|
|
261
293
|
bool process,
|
|
262
294
|
double threshold,
|
|
263
|
-
int top_n
|
|
295
|
+
int top_n,
|
|
296
|
+
const std::map<std::string, double>& weights
|
|
264
297
|
) {
|
|
265
298
|
std::vector<std::vector<MatchResult>> batch_results(queries.size());
|
|
266
299
|
|
|
267
300
|
#pragma omp parallel for if(queries.size() > 10)
|
|
268
301
|
for (int i = 0; i < static_cast<int>(queries.size()); ++i) {
|
|
269
|
-
batch_results[i] = rank(queries[i], candidates, scorer, mode, process, threshold, top_n);
|
|
302
|
+
batch_results[i] = rank(queries[i], candidates, scorer, mode, process, threshold, top_n, weights);
|
|
270
303
|
}
|
|
271
304
|
return batch_results;
|
|
272
305
|
}
|
|
@@ -41,6 +41,10 @@ def test_unicode():
|
|
|
41
41
|
assert fuzzybunny.levenshtein(s1, s2) < 1.0
|
|
42
42
|
assert fuzzybunny.levenshtein(s1, s1) == 1.0
|
|
43
43
|
assert fuzzybunny.levenshtein("😊", "😊") == 1.0
|
|
44
|
+
|
|
45
|
+
# Test normalization of international characters
|
|
46
|
+
res = fuzzybunny.rank("CAFÉ", ["café"], process=True)
|
|
47
|
+
assert res[0][1] == 1.0
|
|
44
48
|
|
|
45
49
|
def test_rank():
|
|
46
50
|
candidates = ["apple", "apricot", "banana", "cherry"]
|
|
@@ -61,5 +65,5 @@ def test_batch_match():
|
|
|
61
65
|
assert results[1][0][0] == "banana bread"
|
|
62
66
|
|
|
63
67
|
def test_invalid_scorer_rank():
|
|
64
|
-
|
|
65
|
-
|
|
68
|
+
with pytest.raises(ValueError, match="Unknown scorer"):
|
|
69
|
+
fuzzybunny.rank("a", ["a"], scorer="unknown")
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import fuzzybunny
|
|
2
|
+
|
|
3
|
+
def test_benchmark():
|
|
4
|
+
query = "apple"
|
|
5
|
+
candidates = ["apple", "banana", "cherry"]
|
|
6
|
+
results = fuzzybunny.benchmark(query, candidates, n_runs=2)
|
|
7
|
+
|
|
8
|
+
assert "levenshtein" in results
|
|
9
|
+
assert "mean" in results["levenshtein"]
|
|
10
|
+
assert results["levenshtein"]["mean"] > 0
|
|
11
|
+
|
|
12
|
+
def test_benchmark_batch():
|
|
13
|
+
queries = ["apple", "banana"]
|
|
14
|
+
candidates = ["apple pie", "banana bread"]
|
|
15
|
+
results = fuzzybunny.benchmark_batch(queries, candidates, n_runs=2)
|
|
16
|
+
|
|
17
|
+
assert "queries_per_second" in results
|
|
18
|
+
assert results["total_queries"] == 2
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import fuzzybunny
|
|
3
|
+
|
|
4
|
+
def test_hybrid_scorer():
|
|
5
|
+
# Setup: "apple" is the query
|
|
6
|
+
# "apple pie" matches partially well
|
|
7
|
+
# "banana" matches poorly
|
|
8
|
+
|
|
9
|
+
candidates = ["apple pie", "banana"]
|
|
10
|
+
|
|
11
|
+
# Weight only levenshtein
|
|
12
|
+
res_lev = fuzzybunny.rank("apple", candidates, scorer="hybrid",
|
|
13
|
+
weights={"levenshtein": 1.0})
|
|
14
|
+
# Should be equivalent to standard levenshtein
|
|
15
|
+
expected_lev = fuzzybunny.rank("apple", candidates, scorer="levenshtein")
|
|
16
|
+
assert abs(res_lev[0][1] - expected_lev[0][1]) < 0.001
|
|
17
|
+
|
|
18
|
+
# Hybrid: 50% Levenshtein, 50% Token Sort
|
|
19
|
+
# "apple pie" vs "apple"
|
|
20
|
+
# Levenshtein: ratio is low (length diff)
|
|
21
|
+
# Token Sort: "apple" vs "apple", "pie" -> ratio slightly better or same?
|
|
22
|
+
|
|
23
|
+
# Let's test with a clearer case
|
|
24
|
+
# s1="apple banana", s2="banana apple"
|
|
25
|
+
# Levenshtein low (~0.5), Token Sort high (1.0)
|
|
26
|
+
|
|
27
|
+
q = "apple banana"
|
|
28
|
+
c = ["banana apple"]
|
|
29
|
+
|
|
30
|
+
score_lev = fuzzybunny.levenshtein(q, c[0])
|
|
31
|
+
score_ts = fuzzybunny.token_sort(q, c[0])
|
|
32
|
+
|
|
33
|
+
res_hybrid = fuzzybunny.rank(q, c, scorer="hybrid",
|
|
34
|
+
weights={"levenshtein": 0.5, "token_sort": 0.5})
|
|
35
|
+
|
|
36
|
+
expected_score = (score_lev * 0.5 + score_ts * 0.5)
|
|
37
|
+
assert abs(res_hybrid[0][1] - expected_score) < 0.001
|
|
38
|
+
|
|
39
|
+
def test_unknown_scorer_error():
|
|
40
|
+
with pytest.raises(ValueError, match="Unknown scorer"):
|
|
41
|
+
fuzzybunny.rank("a", ["b"], scorer="non_existent_scorer")
|
|
42
|
+
|
|
43
|
+
def test_hybrid_empty_weights():
|
|
44
|
+
# If weights are empty or total weight is 0, score should be 0
|
|
45
|
+
res = fuzzybunny.rank("apple", ["apple"], scorer="hybrid", weights={})
|
|
46
|
+
# threshold 0.0 (default), score 0.0. 0.0 >= 0.0 is True.
|
|
47
|
+
assert len(res) == 1
|
|
48
|
+
assert res[0][1] == 0.0
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import pytest
|
|
2
|
+
import fuzzybunny
|
|
3
|
+
try:
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import numpy as np
|
|
6
|
+
HAS_PANDAS = True
|
|
7
|
+
except ImportError:
|
|
8
|
+
HAS_PANDAS = False
|
|
9
|
+
|
|
10
|
+
@pytest.mark.skipif(not HAS_PANDAS, reason="pandas not installed")
|
|
11
|
+
def test_pandas_series_support():
|
|
12
|
+
s = pd.Series(["apple", "banana", "cherry"])
|
|
13
|
+
results = fuzzybunny.rank("app", s)
|
|
14
|
+
assert results[0][0] == "apple"
|
|
15
|
+
assert results[0][1] > 0.5
|
|
16
|
+
|
|
17
|
+
@pytest.mark.skipif(not HAS_PANDAS, reason="pandas not installed")
|
|
18
|
+
def test_pandas_accessor():
|
|
19
|
+
s = pd.Series(["apple pie", "banana bread", "cherry tart"])
|
|
20
|
+
# Test the accessor
|
|
21
|
+
results = s.fuzzy.match("apple", mode="partial")
|
|
22
|
+
assert results[0][0] == "apple pie"
|
|
23
|
+
assert results[0][1] == 1.0
|
|
24
|
+
|
|
25
|
+
@pytest.mark.skipif(not HAS_PANDAS, reason="numpy not installed")
|
|
26
|
+
def test_numpy_support():
|
|
27
|
+
arr = np.array(["apple", "banana", "cherry"])
|
|
28
|
+
results = fuzzybunny.rank("ban", arr)
|
|
29
|
+
assert results[0][0] == "banana"
|
fuzzybunny-0.1.2/PKG-INFO
DELETED
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: fuzzybunny
|
|
3
|
-
Version: 0.1.2
|
|
4
|
-
Summary: A fuzzy search tool for python written in C++
|
|
5
|
-
Requires-Python: >=3.8
|
|
6
|
-
Description-Content-Type: text/markdown
|
|
7
|
-
License-File: LICENSE
|
|
8
|
-
Dynamic: description
|
|
9
|
-
Dynamic: description-content-type
|
|
10
|
-
Dynamic: license-file
|
|
11
|
-
Dynamic: requires-python
|
|
12
|
-
Dynamic: summary
|
|
13
|
-
|
|
14
|
-
<p align="center">
|
|
15
|
-
<img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
16
|
-
</p>
|
|
17
|
-
|
|
18
|
-
<h1 align="center">FuzzyBunny</h1>
|
|
19
|
-
|
|
20
|
-
<p align="center">
|
|
21
|
-
<b> A fuzzy search tool written in C++ with Python bindings </b>
|
|
22
|
-
</p>
|
|
23
|
-
|
|
24
|
-
<p align="center">
|
|
25
|
-
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
26
|
-
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
27
|
-
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
28
|
-
</p>
|
|
29
|
-
|
|
30
|
-
## Overview
|
|
31
|
-
|
|
32
|
-
FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
|
|
33
|
-
|
|
34
|
-
## Features
|
|
35
|
-
|
|
36
|
-
- **Fast C++ Core**: Optimized string matching algorithms.
|
|
37
|
-
- **Multiple Scorers**:
|
|
38
|
-
- `levenshtein`: Standard edit distance ratio.
|
|
39
|
-
- `jaccard`: Set-based similarity.
|
|
40
|
-
- `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
|
|
41
|
-
- **Ranking**: Efficiently rank a list of candidates against a query.
|
|
42
|
-
- **Partial Matching**: Support for substring matching via `mode='partial'`.
|
|
43
|
-
- **Unicode Support**: Correctly handles UTF-8 input.
|
|
44
|
-
|
|
45
|
-
## Installation
|
|
46
|
-
|
|
47
|
-
### Prerequisites
|
|
48
|
-
- Python 3.8+
|
|
49
|
-
- C++17 compatible compiler (GCC, Clang, MSVC)
|
|
50
|
-
|
|
51
|
-
### Using uv (Recommended)
|
|
52
|
-
|
|
53
|
-
```bash
|
|
54
|
-
uv pip install .
|
|
55
|
-
```
|
|
56
|
-
|
|
57
|
-
### Using pip
|
|
58
|
-
|
|
59
|
-
```bash
|
|
60
|
-
pip install .
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
## Usage
|
|
64
|
-
|
|
65
|
-
```python
|
|
66
|
-
import fuzzybunny
|
|
67
|
-
|
|
68
|
-
# Basic Levenshtein Ratio
|
|
69
|
-
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
70
|
-
print(f"Score: {score}") # ~0.57
|
|
71
|
-
|
|
72
|
-
# Partial Matching
|
|
73
|
-
# "apple" is a perfect substring of "apple pie"
|
|
74
|
-
score = fuzzybunny.partial_ratio("apple", "apple pie")
|
|
75
|
-
print(f"Partial Score: {score}") # 1.0
|
|
76
|
-
|
|
77
|
-
# Ranking Candidates
|
|
78
|
-
candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
|
|
79
|
-
results = fuzzybunny.rank(
|
|
80
|
-
query="apple",
|
|
81
|
-
candidates=candidates,
|
|
82
|
-
scorer="levenshtein",
|
|
83
|
-
mode="partial",
|
|
84
|
-
top_n=2
|
|
85
|
-
)
|
|
86
|
-
|
|
87
|
-
for candidate, score in results:
|
|
88
|
-
print(f"{candidate}: {score}")
|
|
89
|
-
# Output:
|
|
90
|
-
# apple pie: 1.0
|
|
91
|
-
# apple crisp: 1.0
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
## Development
|
|
95
|
-
|
|
96
|
-
1. **Setup Environment**:
|
|
97
|
-
```bash
|
|
98
|
-
uv venv
|
|
99
|
-
source .venv/bin/activate
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
2. **Install in Editable Mode**:
|
|
103
|
-
```bash
|
|
104
|
-
uv pip install -e .
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
3. **Run Tests**:
|
|
108
|
-
```bash
|
|
109
|
-
pytest
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
## License
|
|
113
|
-
|
|
114
|
-
This project is licensed under the [MIT License](LICENSE).
|
fuzzybunny-0.1.2/README.md
DELETED
|
@@ -1,101 +0,0 @@
|
|
|
1
|
-
<p align="center">
|
|
2
|
-
<img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
3
|
-
</p>
|
|
4
|
-
|
|
5
|
-
<h1 align="center">FuzzyBunny</h1>
|
|
6
|
-
|
|
7
|
-
<p align="center">
|
|
8
|
-
<b> A fuzzy search tool written in C++ with Python bindings </b>
|
|
9
|
-
</p>
|
|
10
|
-
|
|
11
|
-
<p align="center">
|
|
12
|
-
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
13
|
-
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
14
|
-
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
15
|
-
</p>
|
|
16
|
-
|
|
17
|
-
## Overview
|
|
18
|
-
|
|
19
|
-
FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
|
|
20
|
-
|
|
21
|
-
## Features
|
|
22
|
-
|
|
23
|
-
- **Fast C++ Core**: Optimized string matching algorithms.
|
|
24
|
-
- **Multiple Scorers**:
|
|
25
|
-
- `levenshtein`: Standard edit distance ratio.
|
|
26
|
-
- `jaccard`: Set-based similarity.
|
|
27
|
-
- `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
|
|
28
|
-
- **Ranking**: Efficiently rank a list of candidates against a query.
|
|
29
|
-
- **Partial Matching**: Support for substring matching via `mode='partial'`.
|
|
30
|
-
- **Unicode Support**: Correctly handles UTF-8 input.
|
|
31
|
-
|
|
32
|
-
## Installation
|
|
33
|
-
|
|
34
|
-
### Prerequisites
|
|
35
|
-
- Python 3.8+
|
|
36
|
-
- C++17 compatible compiler (GCC, Clang, MSVC)
|
|
37
|
-
|
|
38
|
-
### Using uv (Recommended)
|
|
39
|
-
|
|
40
|
-
```bash
|
|
41
|
-
uv pip install .
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
### Using pip
|
|
45
|
-
|
|
46
|
-
```bash
|
|
47
|
-
pip install .
|
|
48
|
-
```
|
|
49
|
-
|
|
50
|
-
## Usage
|
|
51
|
-
|
|
52
|
-
```python
|
|
53
|
-
import fuzzybunny
|
|
54
|
-
|
|
55
|
-
# Basic Levenshtein Ratio
|
|
56
|
-
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
57
|
-
print(f"Score: {score}") # ~0.57
|
|
58
|
-
|
|
59
|
-
# Partial Matching
|
|
60
|
-
# "apple" is a perfect substring of "apple pie"
|
|
61
|
-
score = fuzzybunny.partial_ratio("apple", "apple pie")
|
|
62
|
-
print(f"Partial Score: {score}") # 1.0
|
|
63
|
-
|
|
64
|
-
# Ranking Candidates
|
|
65
|
-
candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
|
|
66
|
-
results = fuzzybunny.rank(
|
|
67
|
-
query="apple",
|
|
68
|
-
candidates=candidates,
|
|
69
|
-
scorer="levenshtein",
|
|
70
|
-
mode="partial",
|
|
71
|
-
top_n=2
|
|
72
|
-
)
|
|
73
|
-
|
|
74
|
-
for candidate, score in results:
|
|
75
|
-
print(f"{candidate}: {score}")
|
|
76
|
-
# Output:
|
|
77
|
-
# apple pie: 1.0
|
|
78
|
-
# apple crisp: 1.0
|
|
79
|
-
```
|
|
80
|
-
|
|
81
|
-
## Development
|
|
82
|
-
|
|
83
|
-
1. **Set Up Environment**:
|
|
84
|
-
```bash
|
|
85
|
-
uv venv
|
|
86
|
-
source .venv/bin/activate
|
|
87
|
-
```
|
|
88
|
-
|
|
89
|
-
2. **Install in Editable Mode**:
|
|
90
|
-
```bash
|
|
91
|
-
uv pip install -e .
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
3. **Run Tests**:
|
|
95
|
-
```bash
|
|
96
|
-
pytest
|
|
97
|
-
```
|
|
98
|
-
|
|
99
|
-
## License
|
|
100
|
-
|
|
101
|
-
This project is licensed under the [MIT License](LICENSE).
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: fuzzybunny
|
|
3
|
-
Version: 0.1.2
|
|
4
|
-
Summary: A fuzzy search tool for python written in C++
|
|
5
|
-
Requires-Python: >=3.8
|
|
6
|
-
Description-Content-Type: text/markdown
|
|
7
|
-
License-File: LICENSE
|
|
8
|
-
Dynamic: description
|
|
9
|
-
Dynamic: description-content-type
|
|
10
|
-
Dynamic: license-file
|
|
11
|
-
Dynamic: requires-python
|
|
12
|
-
Dynamic: summary
|
|
13
|
-
|
|
14
|
-
<p align="center">
|
|
15
|
-
<img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
16
|
-
</p>
|
|
17
|
-
|
|
18
|
-
<h1 align="center">FuzzyBunny</h1>
|
|
19
|
-
|
|
20
|
-
<p align="center">
|
|
21
|
-
<b> A fuzzy search tool written in C++ with Python bindings </b>
|
|
22
|
-
</p>
|
|
23
|
-
|
|
24
|
-
<p align="center">
|
|
25
|
-
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
26
|
-
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
27
|
-
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
28
|
-
</p>
|
|
29
|
-
|
|
30
|
-
## Overview
|
|
31
|
-
|
|
32
|
-
FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
|
|
33
|
-
|
|
34
|
-
## Features
|
|
35
|
-
|
|
36
|
-
- **Fast C++ Core**: Optimized string matching algorithms.
|
|
37
|
-
- **Multiple Scorers**:
|
|
38
|
-
- `levenshtein`: Standard edit distance ratio.
|
|
39
|
-
- `jaccard`: Set-based similarity.
|
|
40
|
-
- `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
|
|
41
|
-
- **Ranking**: Efficiently rank a list of candidates against a query.
|
|
42
|
-
- **Partial Matching**: Support for substring matching via `mode='partial'`.
|
|
43
|
-
- **Unicode Support**: Correctly handles UTF-8 input.
|
|
44
|
-
|
|
45
|
-
## Installation
|
|
46
|
-
|
|
47
|
-
### Prerequisites
|
|
48
|
-
- Python 3.8+
|
|
49
|
-
- C++17 compatible compiler (GCC, Clang, MSVC)
|
|
50
|
-
|
|
51
|
-
### Using uv (Recommended)
|
|
52
|
-
|
|
53
|
-
```bash
|
|
54
|
-
uv pip install .
|
|
55
|
-
```
|
|
56
|
-
|
|
57
|
-
### Using pip
|
|
58
|
-
|
|
59
|
-
```bash
|
|
60
|
-
pip install .
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
## Usage
|
|
64
|
-
|
|
65
|
-
```python
|
|
66
|
-
import fuzzybunny
|
|
67
|
-
|
|
68
|
-
# Basic Levenshtein Ratio
|
|
69
|
-
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
70
|
-
print(f"Score: {score}") # ~0.57
|
|
71
|
-
|
|
72
|
-
# Partial Matching
|
|
73
|
-
# "apple" is a perfect substring of "apple pie"
|
|
74
|
-
score = fuzzybunny.partial_ratio("apple", "apple pie")
|
|
75
|
-
print(f"Partial Score: {score}") # 1.0
|
|
76
|
-
|
|
77
|
-
# Ranking Candidates
|
|
78
|
-
candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
|
|
79
|
-
results = fuzzybunny.rank(
|
|
80
|
-
query="apple",
|
|
81
|
-
candidates=candidates,
|
|
82
|
-
scorer="levenshtein",
|
|
83
|
-
mode="partial",
|
|
84
|
-
top_n=2
|
|
85
|
-
)
|
|
86
|
-
|
|
87
|
-
for candidate, score in results:
|
|
88
|
-
print(f"{candidate}: {score}")
|
|
89
|
-
# Output:
|
|
90
|
-
# apple pie: 1.0
|
|
91
|
-
# apple crisp: 1.0
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
## Development
|
|
95
|
-
|
|
96
|
-
1. **Set Up Environment**:
|
|
97
|
-
```bash
|
|
98
|
-
uv venv
|
|
99
|
-
source .venv/bin/activate
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
2. **Install in Editable Mode**:
|
|
103
|
-
```bash
|
|
104
|
-
uv pip install -e .
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
3. **Run Tests**:
|
|
108
|
-
```bash
|
|
109
|
-
pytest
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
## License
|
|
113
|
-
|
|
114
|
-
This project is licensed under the [MIT License](LICENSE).
|
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
LICENSE
|
|
2
|
-
README.md
|
|
3
|
-
pyproject.toml
|
|
4
|
-
setup.py
|
|
5
|
-
fuzzybunny.egg-info/PKG-INFO
|
|
6
|
-
fuzzybunny.egg-info/SOURCES.txt
|
|
7
|
-
fuzzybunny.egg-info/dependency_links.txt
|
|
8
|
-
fuzzybunny.egg-info/not-zip-safe
|
|
9
|
-
fuzzybunny.egg-info/top_level.txt
|
|
10
|
-
src/bindings.cpp
|
|
11
|
-
src/scorers.cpp
|
|
12
|
-
tests/test_basic.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|