fuzzybunny 0.1.2__cp310-cp310-win32.whl → 0.2.0__cp310-cp310-win32.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fuzzybunny/__init__.py +75 -0
- fuzzybunny/_fuzzybunny.cp310-win32.pyd +0 -0
- fuzzybunny/benchmark.py +49 -0
- fuzzybunny-0.2.0.dist-info/METADATA +96 -0
- fuzzybunny-0.2.0.dist-info/RECORD +8 -0
- {fuzzybunny-0.1.2.dist-info → fuzzybunny-0.2.0.dist-info}/WHEEL +1 -1
- fuzzybunny-0.1.2.dist-info/METADATA +0 -114
- fuzzybunny-0.1.2.dist-info/RECORD +0 -6
- fuzzybunny.cp310-win32.pyd +0 -0
- {fuzzybunny-0.1.2.dist-info → fuzzybunny-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {fuzzybunny-0.1.2.dist-info → fuzzybunny-0.2.0.dist-info}/top_level.txt +0 -0
fuzzybunny/__init__.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from . import _fuzzybunny
|
|
2
|
+
from ._fuzzybunny import (
|
|
3
|
+
levenshtein,
|
|
4
|
+
jaccard,
|
|
5
|
+
token_sort,
|
|
6
|
+
partial_ratio,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
from .benchmark import benchmark, benchmark_batch
|
|
10
|
+
|
|
11
|
+
def rank(query, candidates, scorer="levenshtein", mode="full", process=True, threshold=0.0, top_n=-1, weights=None):
    """
    Delegate to the compiled ``_fuzzybunny.rank`` after normalising ``candidates``.

    A pandas Series or NumPy array passed as ``candidates`` is converted to
    a plain list of strings via ``astype(str).tolist()``; any other value is
    forwarded unchanged. ``weights`` falls back to an empty dict when it is
    ``None``. All remaining arguments are passed through positionally.
    """
    scorer_weights = {} if weights is None else weights

    # Series and ndarray both provide ``astype``/``tolist``, so a single
    # branch handles either container.
    if _is_pandas_series(candidates) or _is_numpy_array(candidates):
        candidates = candidates.astype(str).tolist()

    return _fuzzybunny.rank(
        query, candidates, scorer, mode, process, threshold, top_n, scorer_weights
    )
|
|
25
|
+
|
|
26
|
+
def batch_match(queries, candidates, scorer="levenshtein", mode="full", process=True, threshold=0.0, top_n=-1, weights=None):
    """
    Delegate to the compiled ``_fuzzybunny.batch_match`` after normalising inputs.

    A pandas Series or NumPy array passed as ``candidates`` is converted with
    ``astype(str).tolist()``. A Series or array passed as ``queries`` is first
    wrapped in ``np.array`` and then converted the same way. Other values are
    forwarded unchanged, and ``weights`` falls back to an empty dict when it
    is ``None``.
    """
    scorer_weights = {} if weights is None else weights

    if _is_pandas_series(candidates) or _is_numpy_array(candidates):
        candidates = candidates.astype(str).tolist()

    # Queries go through ``np.array`` so a Series and an ndarray share one
    # conversion path; NumPy is only imported when such an input is seen.
    if _is_pandas_series(queries) or _is_numpy_array(queries):
        import numpy as np

        queries = np.array(queries).astype(str).tolist()

    return _fuzzybunny.batch_match(
        queries, candidates, scorer, mode, process, threshold, top_n, scorer_weights
    )
|
|
44
|
+
|
|
45
|
+
def _is_pandas_series(obj):
|
|
46
|
+
try:
|
|
47
|
+
import pandas as pd
|
|
48
|
+
return isinstance(obj, pd.Series)
|
|
49
|
+
except ImportError:
|
|
50
|
+
return False
|
|
51
|
+
|
|
52
|
+
def _is_numpy_array(obj):
|
|
53
|
+
try:
|
|
54
|
+
import numpy as np
|
|
55
|
+
return isinstance(obj, np.ndarray)
|
|
56
|
+
except ImportError:
|
|
57
|
+
return False
|
|
58
|
+
|
|
59
|
+
def _register_pandas_accessor():
|
|
60
|
+
try:
|
|
61
|
+
import pandas as pd
|
|
62
|
+
|
|
63
|
+
@pd.api.extensions.register_series_accessor("fuzzy")
|
|
64
|
+
class FuzzyAccessor:
|
|
65
|
+
def __init__(self, pandas_obj):
|
|
66
|
+
self._obj = pandas_obj
|
|
67
|
+
|
|
68
|
+
def match(self, query, scorer="levenshtein", mode="full", process=True, threshold=0.0, top_n=-1, weights=None):
|
|
69
|
+
return rank(query, self._obj, scorer, mode, process, threshold, top_n, weights)
|
|
70
|
+
except (ImportError, AttributeError):
|
|
71
|
+
pass
|
|
72
|
+
|
|
73
|
+
_register_pandas_accessor()
|
|
74
|
+
|
|
75
|
+
__version__ = getattr(_fuzzybunny, "__version__", "dev")
|
|
Binary file
|
fuzzybunny/benchmark.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import statistics
|
|
3
|
+
|
|
4
|
+
def benchmark(query, candidates, scorers=None, n_runs=5):
    """
    Time ``rank`` for each scorer on one query and one set of candidates.

    Args:
        query: Query passed unchanged to ``rank``.
        candidates: Candidates passed unchanged to ``rank``.
        scorers: Iterable of scorer names to time. Defaults to
            ``["levenshtein", "jaccard", "token_sort"]``.
        n_runs: Number of timed ``rank`` calls per scorer; must be >= 1.

    Returns:
        A dict mapping each scorer name to a dict with the keys ``"mean"``,
        ``"stddev"``, ``"min"`` and ``"max"``, all in seconds as measured
        with ``time.perf_counter``. ``"stddev"`` is 0 when only one run
        was made.

    Raises:
        ValueError: If ``n_runs`` is less than 1.
    """
    # With no samples statistics.mean() fails with an error that never
    # mentions n_runs, so reject the bad argument up front.
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs!r}")

    # Imported here rather than at module level: the package __init__
    # imports this module before it defines ``rank``.
    from . import rank

    if scorers is None:
        scorers = ["levenshtein", "jaccard", "token_sort"]

    results = {}
    for scorer in scorers:
        times = []
        for _ in range(n_runs):
            start = time.perf_counter()
            rank(query, candidates, scorer=scorer)
            end = time.perf_counter()
            times.append(end - start)

        results[scorer] = {
            "mean": statistics.mean(times),
            # stdev() needs at least two samples.
            "stddev": statistics.stdev(times) if len(times) > 1 else 0,
            "min": min(times),
            "max": max(times),
        }

    return results
|
|
31
|
+
|
|
32
|
+
def benchmark_batch(queries, candidates, scorer="levenshtein", n_runs=3):
    """
    Time ``batch_match`` over repeated runs and report throughput.

    Args:
        queries: Queries passed unchanged to ``batch_match``; must support
            ``len()``.
        candidates: Candidates passed unchanged to ``batch_match``; must
            support ``len()``.
        scorer: Scorer name forwarded to ``batch_match``.
        n_runs: Number of timed ``batch_match`` calls; must be >= 1.

    Returns:
        A dict with the keys ``"mean"`` (mean seconds per ``batch_match``
        call, measured with ``time.perf_counter``), ``"total_queries"``
        (``len(queries)``), ``"total_candidates"`` (``len(candidates)``) and
        ``"queries_per_second"`` (``len(queries)`` divided by the mean). If
        the measured mean is zero, ``"queries_per_second"`` is ``inf`` when
        there are queries and ``0.0`` when there are none.

    Raises:
        ValueError: If ``n_runs`` is less than 1.
    """
    # With no samples statistics.mean() fails with an error that never
    # mentions n_runs, so reject the bad argument up front.
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs!r}")

    # Imported here rather than at module level: the package __init__
    # imports this module before it defines ``batch_match``.
    from . import batch_match

    # Sizes are taken before timing so an unsized input fails immediately
    # instead of after all the timed runs have completed.
    total_queries = len(queries)
    total_candidates = len(candidates)

    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        batch_match(queries, candidates, scorer=scorer)
        end = time.perf_counter()
        times.append(end - start)

    mean_elapsed = statistics.mean(times)
    if mean_elapsed > 0:
        queries_per_second = total_queries / mean_elapsed
    else:
        # A zero reading can happen when the work is below timer resolution;
        # avoid ZeroDivisionError and report an unbounded rate instead.
        queries_per_second = float("inf") if total_queries else 0.0

    return {
        "mean": mean_elapsed,
        "total_queries": total_queries,
        "total_candidates": total_candidates,
        "queries_per_second": queries_per_second,
    }
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fuzzybunny
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: A fuzzy search tool for python written in C++
|
|
5
|
+
Requires-Python: >=3.8
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Dynamic: description
|
|
9
|
+
Dynamic: description-content-type
|
|
10
|
+
Dynamic: license-file
|
|
11
|
+
Dynamic: requires-python
|
|
12
|
+
Dynamic: summary
|
|
13
|
+
|
|
14
|
+
<p align="center">
|
|
15
|
+
<img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
16
|
+
</p>
|
|
17
|
+
|
|
18
|
+
<h1 align="center">FuzzyBunny</h1>
|
|
19
|
+
|
|
20
|
+
<p align="center">
|
|
21
|
+
<b> A high-performance, lightweight Python library for fuzzy string matching and ranking, implemented in C++ with Pybind11. </b>
|
|
22
|
+
</p>
|
|
23
|
+
|
|
24
|
+
<p align="center">
|
|
25
|
+
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
26
|
+
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
27
|
+
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
28
|
+
</p>
|
|
29
|
+
|
|
30
|
+
## Features
|
|
31
|
+
|
|
32
|
+
- **Blazing Fast**: C++ core for 2-5x speed improvement over pure Python alternatives.
|
|
33
|
+
- **Multiple Scorers**: Support for Levenshtein, Jaccard, and Token Sort ratios.
|
|
34
|
+
- **Partial Matching**: Find the best substring matches.
|
|
35
|
+
- **Hybrid Scoring**: Combine multiple scorers with custom weights.
|
|
36
|
+
- **Pandas & NumPy Integration**: Native support for Series and Arrays.
|
|
37
|
+
- **Batch Processing**: Parallelized matching for large datasets using OpenMP.
|
|
38
|
+
- **Unicode Support**: Handles international characters and normalization.
|
|
39
|
+
- **Benchmarking Tools**: Built-in utilities to measure performance.
|
|
40
|
+
|
|
41
|
+
## Installation
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install fuzzybunny
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import fuzzybunny
|
|
51
|
+
|
|
52
|
+
# Basic matching
|
|
53
|
+
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
54
|
+
print(f"Similarity: {score:.2f}")
|
|
55
|
+
|
|
56
|
+
# Ranking candidates
|
|
57
|
+
candidates = ["apple", "apricot", "banana", "cherry"]
|
|
58
|
+
results = fuzzybunny.rank("app", candidates, top_n=2)
|
|
59
|
+
# [('apple', 0.6), ('apricot', 0.42)]
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Advanced Usage
|
|
63
|
+
|
|
64
|
+
### Hybrid Scorer
|
|
65
|
+
Combine different algorithms to get better results:
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
results = fuzzybunny.rank(
|
|
69
|
+
"apple banana",
|
|
70
|
+
["banana apple"],
|
|
71
|
+
scorer="hybrid",
|
|
72
|
+
weights={"levenshtein": 0.3, "token_sort": 0.7}
|
|
73
|
+
)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### Pandas Integration
|
|
77
|
+
Use the specialized accessor for clean code:
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
import pandas as pd
|
|
81
|
+
import fuzzybunny
|
|
82
|
+
|
|
83
|
+
df = pd.DataFrame({"names": ["apple pie", "banana bread", "cherry tart"]})
|
|
84
|
+
results = df["names"].fuzzy.match("apple", mode="partial")
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Benchmarking
|
|
88
|
+
Compare performance on your specific data:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
perf = fuzzybunny.benchmark("query", candidates)
|
|
92
|
+
print(f"Levenshtein mean time: {perf['levenshtein']['mean']:.6f}s")
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## License
|
|
96
|
+
MIT
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
fuzzybunny/__init__.py,sha256=P2QxF06r8iOSnRKnLiJ0Uq2HBm7BoFthZ7rWGs252k0,2470
|
|
2
|
+
fuzzybunny/_fuzzybunny.cp310-win32.pyd,sha256=WR643XnrP1VTPtvkpw-ibFmH7pPeqNQ-U7PUsn57-2A,160256
|
|
3
|
+
fuzzybunny/benchmark.py,sha256=Qmg4-CBnB3RXDBHBY4zI5648isWcmT99EbRcEeID9-4,1481
|
|
4
|
+
fuzzybunny-0.2.0.dist-info/licenses/LICENSE,sha256=hvCYI5G-uZ41pxcDhQFHZHyMLk6BxmI_a27L1656lpo,1093
|
|
5
|
+
fuzzybunny-0.2.0.dist-info/METADATA,sha256=TgKmua2FPHhng_ARrwoHP2hs1h1xqy1QeTTz5dt5Dk4,2624
|
|
6
|
+
fuzzybunny-0.2.0.dist-info/WHEEL,sha256=Vh7QCNXvj2TC-qJlYM9d2qTMV3xdKk7KO7QRmG3ME_Y,98
|
|
7
|
+
fuzzybunny-0.2.0.dist-info/top_level.txt,sha256=rBIGqLOrlMo_aY9-MDE05n-7FuenH7CnsVNydAlCa_E,11
|
|
8
|
+
fuzzybunny-0.2.0.dist-info/RECORD,,
|
|
@@ -1,114 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: fuzzybunny
|
|
3
|
-
Version: 0.1.2
|
|
4
|
-
Summary: A fuzzy search tool for python written in C++
|
|
5
|
-
Requires-Python: >=3.8
|
|
6
|
-
Description-Content-Type: text/markdown
|
|
7
|
-
License-File: LICENSE
|
|
8
|
-
Dynamic: description
|
|
9
|
-
Dynamic: description-content-type
|
|
10
|
-
Dynamic: license-file
|
|
11
|
-
Dynamic: requires-python
|
|
12
|
-
Dynamic: summary
|
|
13
|
-
|
|
14
|
-
<p align="center">
|
|
15
|
-
<img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
|
|
16
|
-
</p>
|
|
17
|
-
|
|
18
|
-
<h1 align="center">FuzzyBunny</h1>
|
|
19
|
-
|
|
20
|
-
<p align="center">
|
|
21
|
-
<b> A fuzzy search tool written in C++ with Python bindings </b>
|
|
22
|
-
</p>
|
|
23
|
-
|
|
24
|
-
<p align="center">
|
|
25
|
-
<img src="https://img.shields.io/badge/License-MIT-green" />
|
|
26
|
-
<img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
|
|
27
|
-
<img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
|
|
28
|
-
</p>
|
|
29
|
-
|
|
30
|
-
## Overview
|
|
31
|
-
|
|
32
|
-
FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
|
|
33
|
-
|
|
34
|
-
## Features
|
|
35
|
-
|
|
36
|
-
- **Fast C++ Core**: Optimized string matching algorithms.
|
|
37
|
-
- **Multiple Scorers**:
|
|
38
|
-
- `levenshtein`: Standard edit distance ratio.
|
|
39
|
-
- `jaccard`: Set-based similarity.
|
|
40
|
-
- `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
|
|
41
|
-
- **Ranking**: Efficiently rank a list of candidates against a query.
|
|
42
|
-
- **Partial Matching**: Support for substring matching via `mode='partial'`.
|
|
43
|
-
- **Unicode Support**: Correctly handles UTF-8 input.
|
|
44
|
-
|
|
45
|
-
## Installation
|
|
46
|
-
|
|
47
|
-
### Prerequisites
|
|
48
|
-
- Python 3.8+
|
|
49
|
-
- C++17 compatible compiler (GCC, Clang, MSVC)
|
|
50
|
-
|
|
51
|
-
### Using uv (Recommended)
|
|
52
|
-
|
|
53
|
-
```bash
|
|
54
|
-
uv pip install .
|
|
55
|
-
```
|
|
56
|
-
|
|
57
|
-
### Using pip
|
|
58
|
-
|
|
59
|
-
```bash
|
|
60
|
-
pip install .
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
## Usage
|
|
64
|
-
|
|
65
|
-
```python
|
|
66
|
-
import fuzzybunny
|
|
67
|
-
|
|
68
|
-
# Basic Levenshtein Ratio
|
|
69
|
-
score = fuzzybunny.levenshtein("kitten", "sitting")
|
|
70
|
-
print(f"Score: {score}") # ~0.57
|
|
71
|
-
|
|
72
|
-
# Partial Matching
|
|
73
|
-
# "apple" is a perfect substring of "apple pie"
|
|
74
|
-
score = fuzzybunny.partial_ratio("apple", "apple pie")
|
|
75
|
-
print(f"Partial Score: {score}") # 1.0
|
|
76
|
-
|
|
77
|
-
# Ranking Candidates
|
|
78
|
-
candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
|
|
79
|
-
results = fuzzybunny.rank(
|
|
80
|
-
query="apple",
|
|
81
|
-
candidates=candidates,
|
|
82
|
-
scorer="levenshtein",
|
|
83
|
-
mode="partial",
|
|
84
|
-
top_n=2
|
|
85
|
-
)
|
|
86
|
-
|
|
87
|
-
for candidate, score in results:
|
|
88
|
-
print(f"{candidate}: {score}")
|
|
89
|
-
# Output:
|
|
90
|
-
# apple pie: 1.0
|
|
91
|
-
# apple crisp: 1.0
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
## Development
|
|
95
|
-
|
|
96
|
-
1. **Setup Environment**:
|
|
97
|
-
```bash
|
|
98
|
-
uv venv
|
|
99
|
-
source .venv/bin/activate
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
2. **Install in Editable Mode**:
|
|
103
|
-
```bash
|
|
104
|
-
uv pip install -e .
|
|
105
|
-
```
|
|
106
|
-
|
|
107
|
-
3. **Run Tests**:
|
|
108
|
-
```bash
|
|
109
|
-
pytest
|
|
110
|
-
```
|
|
111
|
-
|
|
112
|
-
## License
|
|
113
|
-
|
|
114
|
-
This project is licensed under the [MIT License](LICENSE).
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
fuzzybunny.cp310-win32.pyd,sha256=GFOakSDZYcRh1xsoXoeIwyVhorSm3Ru-aqegEvJIigY,153088
|
|
2
|
-
fuzzybunny-0.1.2.dist-info/licenses/LICENSE,sha256=hvCYI5G-uZ41pxcDhQFHZHyMLk6BxmI_a27L1656lpo,1093
|
|
3
|
-
fuzzybunny-0.1.2.dist-info/METADATA,sha256=OQ0NXmpf02e-_BydnDa3skOYRQmAAOhvUov2kzSD5mA,2838
|
|
4
|
-
fuzzybunny-0.1.2.dist-info/WHEEL,sha256=GWZF0cboiU4MhsG0baPl8rrtCaXFSLW25384gp3vddM,97
|
|
5
|
-
fuzzybunny-0.1.2.dist-info/top_level.txt,sha256=rBIGqLOrlMo_aY9-MDE05n-7FuenH7CnsVNydAlCa_E,11
|
|
6
|
-
fuzzybunny-0.1.2.dist-info/RECORD,,
|
fuzzybunny.cp310-win32.pyd
DELETED
|
Binary file
|
|
File without changes
|
|
File without changes
|