fuzzybunny 0.1.2__cp310-cp310-win_amd64.whl → 0.2.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fuzzybunny/__init__.py ADDED
@@ -0,0 +1,75 @@
1
+ from . import _fuzzybunny
2
+ from ._fuzzybunny import (
3
+ levenshtein,
4
+ jaccard,
5
+ token_sort,
6
+ partial_ratio,
7
+ )
8
+
9
+ from .benchmark import benchmark, benchmark_batch
10
+
11
def rank(query, candidates, scorer="levenshtein", mode="full", process=True, threshold=0.0, top_n=-1, weights=None):
    """
    Rank ``candidates`` against ``query`` using the compiled ``_fuzzybunny.rank``.

    A pandas Series or NumPy array of candidates is converted to a plain list
    of strings with ``astype(str).tolist()`` before the call; any other input
    is forwarded untouched. ``weights=None`` is replaced by an empty dict.
    All arguments are handed to the extension positionally and its result is
    returned unchanged.
    """
    # Series and ndarray both provide astype()/tolist(), so one branch serves both.
    if _is_pandas_series(candidates) or _is_numpy_array(candidates):
        candidates = candidates.astype(str).tolist()

    scorer_weights = {} if weights is None else weights
    return _fuzzybunny.rank(query, candidates, scorer, mode, process, threshold, top_n, scorer_weights)
25
+
26
def batch_match(queries, candidates, scorer="levenshtein", mode="full", process=True, threshold=0.0, top_n=-1, weights=None):
    """
    Match every entry of ``queries`` against ``candidates`` using the
    compiled ``_fuzzybunny.batch_match``.

    Both ``queries`` and ``candidates`` may be a pandas Series or a NumPy
    array; such inputs are converted to plain lists of strings first. Other
    inputs are forwarded untouched. ``weights=None`` is replaced by an empty
    dict. The extension's result is returned unchanged.
    """
    # Series and ndarray both provide astype()/tolist(), so one branch serves both.
    if _is_pandas_series(candidates) or _is_numpy_array(candidates):
        candidates = candidates.astype(str).tolist()

    if _is_pandas_series(queries) or _is_numpy_array(queries):
        # Imported lazily: numpy is only needed once an array-like was detected.
        import numpy as np

        query_array = np.array(queries)
        queries = query_array.astype(str).tolist()

    scorer_weights = {} if weights is None else weights
    return _fuzzybunny.batch_match(queries, candidates, scorer, mode, process, threshold, top_n, scorer_weights)
44
+
45
+ def _is_pandas_series(obj):
46
+ try:
47
+ import pandas as pd
48
+ return isinstance(obj, pd.Series)
49
+ except ImportError:
50
+ return False
51
+
52
+ def _is_numpy_array(obj):
53
+ try:
54
+ import numpy as np
55
+ return isinstance(obj, np.ndarray)
56
+ except ImportError:
57
+ return False
58
+
59
+ def _register_pandas_accessor():
60
+ try:
61
+ import pandas as pd
62
+
63
+ @pd.api.extensions.register_series_accessor("fuzzy")
64
+ class FuzzyAccessor:
65
+ def __init__(self, pandas_obj):
66
+ self._obj = pandas_obj
67
+
68
+ def match(self, query, scorer="levenshtein", mode="full", process=True, threshold=0.0, top_n=-1, weights=None):
69
+ return rank(query, self._obj, scorer, mode, process, threshold, top_n, weights)
70
+ except (ImportError, AttributeError):
71
+ pass
72
+
73
+ _register_pandas_accessor()
74
+
75
+ __version__ = getattr(_fuzzybunny, "__version__", "dev")
@@ -0,0 +1,49 @@
1
+ import time
2
+ import statistics
3
+
4
def benchmark(query, candidates, scorers=None, n_runs=5):
    """
    Time ``rank`` for each scorer on the given query and candidates.

    Each scorer in ``scorers`` (default: "levenshtein", "jaccard",
    "token_sort") is run ``n_runs`` times and timed with
    ``time.perf_counter``. The result maps each scorer name to a dict with
    the "mean", "stddev", "min" and "max" of the per-run durations in
    seconds; "stddev" is 0 when only a single run was timed.
    """
    # Imported here rather than at module level because the package
    # __init__ itself imports this module.
    from . import rank

    scorer_names = ["levenshtein", "jaccard", "token_sort"] if scorers is None else scorers

    def _timed_run(scorer_name):
        # Duration of a single rank() call with the given scorer.
        started = time.perf_counter()
        rank(query, candidates, scorer=scorer_name)
        finished = time.perf_counter()
        return finished - started

    results = {}
    for scorer_name in scorer_names:
        durations = [_timed_run(scorer_name) for _ in range(n_runs)]

        # Sample standard deviation needs at least two data points.
        if len(durations) > 1:
            spread = statistics.stdev(durations)
        else:
            spread = 0

        results[scorer_name] = {
            "mean": statistics.mean(durations),
            "stddev": spread,
            "min": min(durations),
            "max": max(durations),
        }

    return results
31
+
32
def benchmark_batch(queries, candidates, scorer="levenshtein", n_runs=3):
    """
    Benchmark ``batch_match`` performance.

    ``batch_match(queries, candidates, scorer=scorer)`` is executed
    ``n_runs`` times and each run is timed with ``time.perf_counter``.

    Returns a dict with:
      - "mean": mean duration of one run, in seconds
      - "total_queries": ``len(queries)``
      - "total_candidates": ``len(candidates)``
      - "queries_per_second": ``len(queries)`` divided by the mean duration,
        or ``float("inf")`` when the measured mean is zero

    Raises ``statistics.StatisticsError`` when ``n_runs`` is less than 1,
    because no timings are collected.
    """
    # Imported here rather than at module level because the package
    # __init__ itself imports this module.
    from . import batch_match

    durations = []
    for _ in range(n_runs):
        start = time.perf_counter()
        batch_match(queries, candidates, scorer=scorer)
        durations.append(time.perf_counter() - start)

    # Compute the mean once; it feeds both the report and the throughput.
    mean_time = statistics.mean(durations)
    total_queries = len(queries)

    # A run faster than the timer resolution yields a zero mean; report
    # unbounded throughput instead of raising ZeroDivisionError.
    if mean_time > 0:
        queries_per_second = total_queries / mean_time
    else:
        queries_per_second = float("inf")

    return {
        "mean": mean_time,
        "total_queries": total_queries,
        "total_candidates": len(candidates),
        "queries_per_second": queries_per_second,
    }
@@ -0,0 +1,96 @@
1
+ Metadata-Version: 2.4
2
+ Name: fuzzybunny
3
+ Version: 0.2.0
4
+ Summary: A fuzzy search tool for python written in C++
5
+ Requires-Python: >=3.8
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Dynamic: description
9
+ Dynamic: description-content-type
10
+ Dynamic: license-file
11
+ Dynamic: requires-python
12
+ Dynamic: summary
13
+
14
+ <p align="center">
15
+ <img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
16
+ </p>
17
+
18
+ <h1 align="center">FuzzyBunny</h1>
19
+
20
+ <p align="center">
21
+ <b> A high-performance, lightweight Python library for fuzzy string matching and ranking, implemented in C++ with Pybind11. </b>
22
+ </p>
23
+
24
+ <p align="center">
25
+ <img src="https://img.shields.io/badge/License-MIT-green" />
26
+ <img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
27
+ <img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
28
+ </p>
29
+
30
+ ## Features
31
+
32
+ - **Blazing Fast**: C++ core for 2-5x speed improvement over pure Python alternatives.
33
+ - **Multiple Scorers**: Support for Levenshtein, Jaccard, and Token Sort ratios.
34
+ - **Partial Matching**: Find the best substring matches.
35
+ - **Hybrid Scoring**: Combine multiple scorers with custom weights.
36
+ - **Pandas & NumPy Integration**: Native support for Series and Arrays.
37
+ - **Batch Processing**: Parallelized matching for large datasets using OpenMP.
38
+ - **Unicode Support**: Handles international characters and normalization.
39
+ - **Benchmarking Tools**: Built-in utilities to measure performance.
40
+
41
+ ## Installation
42
+
43
+ ```bash
44
+ pip install fuzzybunny
45
+ ```
46
+
47
+ ## Quick Start
48
+
49
+ ```python
50
+ import fuzzybunny
51
+
52
+ # Basic matching
53
+ score = fuzzybunny.levenshtein("kitten", "sitting")
54
+ print(f"Similarity: {score:.2f}")
55
+
56
+ # Ranking candidates
57
+ candidates = ["apple", "apricot", "banana", "cherry"]
58
+ results = fuzzybunny.rank("app", candidates, top_n=2)
59
+ # [('apple', 0.6), ('apricot', 0.42)]
60
+ ```
61
+
62
+ ## Advanced Usage
63
+
64
+ ### Hybrid Scorer
65
+ Combine different algorithms to get better results:
66
+
67
+ ```python
68
+ results = fuzzybunny.rank(
69
+ "apple banana",
70
+ ["banana apple"],
71
+ scorer="hybrid",
72
+ weights={"levenshtein": 0.3, "token_sort": 0.7}
73
+ )
74
+ ```
75
+
76
+ ### Pandas Integration
77
+ Use the specialized accessor for clean code:
78
+
79
+ ```python
80
+ import pandas as pd
81
+ import fuzzybunny
82
+
83
+ df = pd.DataFrame({"names": ["apple pie", "banana bread", "cherry tart"]})
84
+ results = df["names"].fuzzy.match("apple", mode="partial")
85
+ ```
86
+
87
+ ### Benchmarking
88
+ Compare performance on your specific data:
89
+
90
+ ```python
91
+ perf = fuzzybunny.benchmark("query", candidates)
92
+ print(f"Levenshtein mean time: {perf['levenshtein']['mean']:.6f}s")
93
+ ```
94
+
95
+ ## License
96
+ MIT
@@ -0,0 +1,8 @@
1
+ fuzzybunny/__init__.py,sha256=P2QxF06r8iOSnRKnLiJ0Uq2HBm7BoFthZ7rWGs252k0,2470
2
+ fuzzybunny/_fuzzybunny.cp310-win_amd64.pyd,sha256=kxJbOqJWIsBX0r_kyEHyd3uW27IHH_CTW8T898vTHGA,187392
3
+ fuzzybunny/benchmark.py,sha256=Qmg4-CBnB3RXDBHBY4zI5648isWcmT99EbRcEeID9-4,1481
4
+ fuzzybunny-0.2.0.dist-info/licenses/LICENSE,sha256=hvCYI5G-uZ41pxcDhQFHZHyMLk6BxmI_a27L1656lpo,1093
5
+ fuzzybunny-0.2.0.dist-info/METADATA,sha256=TgKmua2FPHhng_ARrwoHP2hs1h1xqy1QeTTz5dt5Dk4,2624
6
+ fuzzybunny-0.2.0.dist-info/WHEEL,sha256=lVtJYX4SZwMxwg8oP4kB_UdF4VQRXLlqu7hUy_2nnAE,102
7
+ fuzzybunny-0.2.0.dist-info/top_level.txt,sha256=rBIGqLOrlMo_aY9-MDE05n-7FuenH7CnsVNydAlCa_E,11
8
+ fuzzybunny-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.9.0)
2
+ Generator: setuptools (80.10.2)
3
3
  Root-Is-Purelib: false
4
4
  Tag: cp310-cp310-win_amd64
5
5
 
@@ -1,114 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: fuzzybunny
3
- Version: 0.1.2
4
- Summary: A fuzzy search tool for python written in C++
5
- Requires-Python: >=3.8
6
- Description-Content-Type: text/markdown
7
- License-File: LICENSE
8
- Dynamic: description
9
- Dynamic: description-content-type
10
- Dynamic: license-file
11
- Dynamic: requires-python
12
- Dynamic: summary
13
-
14
- <p align="center">
15
- <img src="./docs/assets/fuzzybunny.png" alt="FuzzyBunny Logo" width="150" />
16
- </p>
17
-
18
- <h1 align="center">FuzzyBunny</h1>
19
-
20
- <p align="center">
21
- <b> A fuzzy search tool written in C++ with Python bindings </b>
22
- </p>
23
-
24
- <p align="center">
25
- <img src="https://img.shields.io/badge/License-MIT-green" />
26
- <img src="https://img.shields.io/badge/Language-C%2B%2B-00599C" />
27
- <img src="https://img.shields.io/badge/Bindings-Pybind11-blue" />
28
- </p>
29
-
30
- ## Overview
31
-
32
- FuzzyBunny is a lightweight, high-performance Python library for fuzzy string matching and ranking. It is implemented in C++ for speed and exposes a Pythonic API via Pybind11. It supports various scoring algorithms including Levenshtein, Jaccard, and Token Sort, along with partial matching capabilities.
33
-
34
- ## Features
35
-
36
- - **Fast C++ Core**: Optimized string matching algorithms.
37
- - **Multiple Scorers**:
38
- - `levenshtein`: Standard edit distance ratio.
39
- - `jaccard`: Set-based similarity.
40
- - `token_sort`: Sorts tokens before comparing (good for "Apple Banana" vs "Banana Apple").
41
- - **Ranking**: Efficiently rank a list of candidates against a query.
42
- - **Partial Matching**: Support for substring matching via `mode='partial'`.
43
- - **Unicode Support**: Correctly handles UTF-8 input.
44
-
45
- ## Installation
46
-
47
- ### Prerequisites
48
- - Python 3.8+
49
- - C++17 compatible compiler (GCC, Clang, MSVC)
50
-
51
- ### Using uv (Recommended)
52
-
53
- ```bash
54
- uv pip install .
55
- ```
56
-
57
- ### Using pip
58
-
59
- ```bash
60
- pip install .
61
- ```
62
-
63
- ## Usage
64
-
65
- ```python
66
- import fuzzybunny
67
-
68
- # Basic Levenshtein Ratio
69
- score = fuzzybunny.levenshtein("kitten", "sitting")
70
- print(f"Score: {score}") # ~0.57
71
-
72
- # Partial Matching
73
- # "apple" is a perfect substring of "apple pie"
74
- score = fuzzybunny.partial_ratio("apple", "apple pie")
75
- print(f"Partial Score: {score}") # 1.0
76
-
77
- # Ranking Candidates
78
- candidates = ["apple pie", "banana bread", "cherry tart", "apple crisp"]
79
- results = fuzzybunny.rank(
80
- query="apple",
81
- candidates=candidates,
82
- scorer="levenshtein",
83
- mode="partial",
84
- top_n=2
85
- )
86
-
87
- for candidate, score in results:
88
- print(f"{candidate}: {score}")
89
- # Output:
90
- # apple pie: 1.0
91
- # apple crisp: 1.0
92
- ```
93
-
94
- ## Development
95
-
96
- 1. **Setup Environment**:
97
- ```bash
98
- uv venv
99
- source .venv/bin/activate
100
- ```
101
-
102
- 2. **Install in Editable Mode**:
103
- ```bash
104
- uv pip install -e .
105
- ```
106
-
107
- 3. **Run Tests**:
108
- ```bash
109
- pytest
110
- ```
111
-
112
- ## License
113
-
114
- This project is licensed under the [MIT License](LICENSE).
@@ -1,6 +0,0 @@
1
- fuzzybunny.cp310-win_amd64.pyd,sha256=B1gdvhlxQENgAd_8gn2FEfMgx4g8649zVg8EH8JjSmQ,180224
2
- fuzzybunny-0.1.2.dist-info/licenses/LICENSE,sha256=hvCYI5G-uZ41pxcDhQFHZHyMLk6BxmI_a27L1656lpo,1093
3
- fuzzybunny-0.1.2.dist-info/METADATA,sha256=OQ0NXmpf02e-_BydnDa3skOYRQmAAOhvUov2kzSD5mA,2838
4
- fuzzybunny-0.1.2.dist-info/WHEEL,sha256=KUuBC6lxAbHCKilKua8R9W_TM71_-9Sg5uEP3uDWcoU,101
5
- fuzzybunny-0.1.2.dist-info/top_level.txt,sha256=rBIGqLOrlMo_aY9-MDE05n-7FuenH7CnsVNydAlCa_E,11
6
- fuzzybunny-0.1.2.dist-info/RECORD,,
Binary file