hybrid-fuzzy-matcher 0.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ from .matcher import HybridFuzzyMatcher
@@ -0,0 +1,317 @@
+ import re
+ from collections import defaultdict
+ import numpy as np
+ from thefuzz import fuzz
+ from thefuzz import process
+ from sentence_transformers import SentenceTransformer, util
+ import torch # Required by sentence_transformers for tensor operations
+
+ class HybridFuzzyMatcher:
+     """
+     A robust fuzzy string matching library that combines syntactic and semantic approaches.
+
+     This class provides functionality to:
+     1. Add a corpus of strings for matching.
+     2. Find the most similar strings to a given query using a hybrid approach.
+     3. Identify duplicate or highly similar pairs within the added corpus.
+
+     The hybrid approach involves:
+     - Preprocessing: Standardizing text (lowercasing, punctuation removal, optional abbreviation mapping).
+     - Blocking: Reducing the search space using fast, simple key generation.
+     - Syntactic Matching: Using fuzz.WRatio and fuzz.token_sort_ratio for character-level similarity.
+     - Semantic Matching: Using Sentence-BERT embeddings for contextual and meaning-based similarity.
+     - Score Combination: A weighted average of syntactic and semantic scores.
+     """
+
+     def __init__(self,
+                  semantic_model_name: str = 'all-MiniLM-L6-v2',
+                  syntactic_weight: float = 0.3,
+                  semantic_weight: float = 0.7,
+                  syntactic_threshold: int = 60,
+                  semantic_threshold: float = 0.5,
+                  combined_threshold: float = 0.75,
+                  abbreviation_map: dict = None):
+         """
+         Initializes the HybridFuzzyMatcher.
+
+         Args:
+             semantic_model_name (str): Name of the Sentence-BERT model to use.
+                 'all-MiniLM-L6-v2' is a good default.
+                 See https://www.sbert.net/docs/pretrained_models.html
+             syntactic_weight (float): Weight for the syntactic similarity score (0.0 to 1.0).
+             semantic_weight (float): Weight for the semantic similarity score (0.0 to 1.0).
+                 (syntactic_weight + semantic_weight should ideally be 1.0)
+             syntactic_threshold (int): Minimum syntactic score (0-100) for a candidate to be considered.
+             semantic_threshold (float): Minimum semantic score (0.0-1.0) for a candidate to be considered.
+             combined_threshold (float): Minimum combined score (0.0-1.0) for a result to be returned.
+             abbreviation_map (dict, optional): A dictionary for custom abbreviation/synonym mapping.
+                 e.g., {"dr.": "doctor", "st.": "street"}. Defaults to None.
+         """
+         if not (0 <= syntactic_weight <= 1 and 0 <= semantic_weight <= 1 and
+                 abs(syntactic_weight + semantic_weight - 1.0) < 1e-6):
+             raise ValueError("Syntactic and semantic weights must be between 0 and 1 and sum to 1.")
+
+         self.semantic_model_name = semantic_model_name
+         self.syntactic_weight = syntactic_weight
+         self.semantic_weight = semantic_weight
+         self.syntactic_threshold = syntactic_threshold
+         self.semantic_threshold = semantic_threshold
+         self.combined_threshold = combined_threshold
+         self.abbreviation_map = abbreviation_map if abbreviation_map is not None else {}
+
+         self.corpus = [] # Stores original strings
+         self.preprocessed_corpus = [] # Stores preprocessed strings
+         self.corpus_embeddings = None # Stores semantic embeddings (torch.Tensor)
+         self.blocking_map = defaultdict(list) # Maps blocking keys to list of corpus indices
+
+         self.model = self._load_semantic_model()
+         print(f"HybridFuzzyMatcher initialized with model: {self.semantic_model_name}")
+
+     def _load_semantic_model(self):
+         """Loads the pre-trained Sentence-BERT model."""
+         try:
+             return SentenceTransformer(self.semantic_model_name)
+         except Exception as e:
+             print(f"Error loading semantic model {self.semantic_model_name}: {e}")
+             print("Please ensure you have an active internet connection or the model is cached.")
+             raise
+
+     def _preprocess_text(self, text: str) -> str:
+         """
+         Applies standard text preprocessing steps.
+
+         Args:
+             text (str): The input string.
+
+         Returns:
+             str: The preprocessed string.
+         """
+         text = text.lower()
+         # Replace common abbreviations/synonyms based on the map
+         for abbr, full in self.abbreviation_map.items():
+             text = text.replace(abbr, full)
+
+         # Remove punctuation (keeping spaces)
+         text = re.sub(r'[^\w\s]', '', text)
+         # Normalize whitespace
+         text = re.sub(r'\s+', ' ', text).strip()
+         return text
+
+     def _generate_blocking_keys(self, text: str) -> list[str]:
+         """
+         Generates simple blocking keys for a given text.
+         This helps in reducing the number of comparisons.
+
+         Args:
+             text (str): The preprocessed text.
+
+         Returns:
+             list[str]: A list of blocking keys.
+         """
+         keys = []
+         words = text.split()
+
+         if len(words) > 0:
+             # Key 1: First 3 characters of the first word
+             if len(words[0]) >= 3:
+                 keys.append(words[0][:3])
+             else:
+                 keys.append(words[0]) # Use the whole word if shorter
+
+             # Key 2: First 3 characters of the last word (if different from first)
+             if len(words) > 1 and words[0] != words[-1]:
+                 if len(words[-1]) >= 3:
+                     keys.append(words[-1][:3])
+                 else:
+                     keys.append(words[-1])
+
+             # Key 3: A sorted token key (useful for reordered words)
+             sorted_tokens = " ".join(sorted(words))
+             if len(sorted_tokens) > 0:
+                 keys.append(sorted_tokens)
+
+         # Add the full preprocessed text as a blocking key for exact match potential
+         keys.append(text)
+
+         return list(set(keys)) # Return unique keys
+
+     def add_data(self, data_list: list[str]):
+         """
+         Adds a list of strings to the matcher's internal corpus.
+         This also preprocesses the data, generates blocking keys, and computes embeddings.
+
+         Args:
+             data_list (list[str]): A list of strings to add to the corpus.
+         """
+         new_preprocessed_texts = []
+         start_idx = len(self.corpus)
+
+         for i, item in enumerate(data_list):
+             self.corpus.append(item)
+             preprocessed_item = self._preprocess_text(item)
+             self.preprocessed_corpus.append(preprocessed_item)
+
+             # Generate blocking keys for the current item and add to map
+             index = start_idx + i
+             for key in self._generate_blocking_keys(preprocessed_item):
+                 self.blocking_map[key].append(index)
+
+             new_preprocessed_texts.append(preprocessed_item)
+
+         # Generate embeddings for the new data
+         if new_preprocessed_texts:
+             print(f"Generating embeddings for {len(new_preprocessed_texts)} new items...")
+             new_embeddings = self.model.encode(new_preprocessed_texts, convert_to_tensor=True)
+             if self.corpus_embeddings is None:
+                 self.corpus_embeddings = new_embeddings
+             else:
+                 self.corpus_embeddings = torch.cat((self.corpus_embeddings, new_embeddings), dim=0)
+             print("Embeddings generation complete.")
+         else:
+             print("No new data to add.")
+
+     def find_matches(self, query_string: str, top_n: int = 5) -> list[dict]:
+         """
+         Finds the top_n most similar strings to the query string from the corpus.
+
+         Args:
+             query_string (str): The string to find matches for.
+             top_n (int): The number of top matches to return.
+
+         Returns:
+             list[dict]: A list of dictionaries, each containing 'original_text',
+                 'preprocessed_text', and 'combined_score'.
+         """
+         if not self.corpus:
+             print("Corpus is empty. Add data using add_data() first.")
+             return []
+
+         preprocessed_query = self._preprocess_text(query_string)
+         query_embedding = self.model.encode(preprocessed_query, convert_to_tensor=True)
+
+         # Stage 2: Blocking - Get potential candidates
+         candidate_indices = set()
+         for key in self._generate_blocking_keys(preprocessed_query):
+             candidate_indices.update(self.blocking_map[key])
+
+         if not candidate_indices:
+             print(f"No candidates found via blocking for query: '{query_string}'")
+             return []
+
+         results = []
+         for idx in candidate_indices:
+             candidate_original = self.corpus[idx]
+             candidate_preprocessed = self.preprocessed_corpus[idx]
+             candidate_embedding = self.corpus_embeddings[idx]
+
+             # Stage 3: Syntactic Scoring
+             syntactic_score_wratio = fuzz.WRatio(preprocessed_query, candidate_preprocessed)
+             syntactic_score_token_sort = fuzz.token_sort_ratio(preprocessed_query, candidate_preprocessed)
+             syntactic_score = max(syntactic_score_wratio, syntactic_score_token_sort)
+
+             if syntactic_score < self.syntactic_threshold:
+                 continue # Skip if syntactic similarity is too low
+
+             # Stage 4: Semantic Scoring
+             semantic_score = util.cos_sim(query_embedding, candidate_embedding).item()
+
+             if semantic_score < self.semantic_threshold:
+                 continue # Skip if semantic similarity is too low
+
+             # Stage 5: Combine Scores
+             combined_score = (self.syntactic_weight * (syntactic_score / 100.0)) + \
+                              (self.semantic_weight * semantic_score)
+
+             if combined_score >= self.combined_threshold:
+                 results.append({
+                     "original_text": candidate_original,
+                     "preprocessed_text": candidate_preprocessed,
+                     "syntactic_score": syntactic_score,
+                     "semantic_score": semantic_score,
+                     "combined_score": combined_score
+                 })
+
+         # Sort results by combined score in descending order
+         results.sort(key=lambda x: x["combined_score"], reverse=True)
+
+         return results[:top_n]
+
+     def find_duplicates(self, min_combined_score: float = None) -> list[dict]:
+         """
+         Identifies duplicate or highly similar pairs within the added corpus.
+         This method is computationally intensive for very large corpora without strong blocking.
+
+         Args:
+             min_combined_score (float, optional): The minimum combined score for a pair to be considered a duplicate.
+                 Defaults to the object's combined_threshold.
+
+         Returns:
+             list[dict]: A list of dictionaries, each representing a duplicate pair,
+                 containing 'text1', 'text2', and 'combined_score'.
+         """
+         if not self.corpus or self.corpus_embeddings is None:
+             print("Corpus is empty or embeddings not generated. Add data using add_data() first.")
+             return []
+
+         if min_combined_score is None:
+             min_combined_score = self.combined_threshold
+
+         duplicate_pairs = []
+         processed_pairs = set() # To avoid duplicate (A, B) and (B, A) and self-comparison
+
+         print("Starting duplicate detection. This may take a while for large datasets...")
+
+         # Iterate through unique blocking keys to get candidate groups
+         for key, indices in self.blocking_map.items():
+             if len(indices) < 2: # No pairs to compare in this block
+                 continue
+
+             # Compare all pairs within this block
+             for i in range(len(indices)):
+                 idx1 = indices[i]
+                 for j in range(i + 1, len(indices)): # Avoid self-comparison and duplicate pairs
+                     idx2 = indices[j]
+
+                     # Ensure the pair hasn't been processed from another block
+                     pair_key = tuple(sorted((idx1, idx2)))
+                     if pair_key in processed_pairs:
+                         continue
+                     processed_pairs.add(pair_key)
+
+                     text1_original = self.corpus[idx1]
+                     text2_original = self.corpus[idx2]
+                     text1_preprocessed = self.preprocessed_corpus[idx1]
+                     text2_preprocessed = self.preprocessed_corpus[idx2]
+                     embedding1 = self.corpus_embeddings[idx1]
+                     embedding2 = self.corpus_embeddings[idx2]
+
+                     # Stage 3: Syntactic Scoring
+                     syntactic_score_wratio = fuzz.WRatio(text1_preprocessed, text2_preprocessed)
+                     syntactic_score_token_sort = fuzz.token_sort_ratio(text1_preprocessed, text2_preprocessed)
+                     syntactic_score = max(syntactic_score_wratio, syntactic_score_token_sort)
+
+                     if syntactic_score < self.syntactic_threshold:
+                         continue
+
+                     # Stage 4: Semantic Scoring
+                     semantic_score = util.cos_sim(embedding1, embedding2).item()
+
+                     if semantic_score < self.semantic_threshold:
+                         continue
+
+                     # Stage 5: Combine Scores
+                     combined_score = (self.syntactic_weight * (syntactic_score / 100.0)) + \
+                                      (self.semantic_weight * semantic_score)
+
+                     if combined_score >= min_combined_score:
+                         duplicate_pairs.append({
+                             "text1": text1_original,
+                             "text2": text2_original,
+                             "syntactic_score": syntactic_score,
+                             "semantic_score": semantic_score,
+                             "combined_score": combined_score
+                         })
+
+         duplicate_pairs.sort(key=lambda x: x["combined_score"], reverse=True)
+         print(f"Duplicate detection complete. Found {len(duplicate_pairs)} pairs.")
+         return duplicate_pairs
@@ -0,0 +1,146 @@
+ Metadata-Version: 2.4
+ Name: hybrid-fuzzy-matcher
+ Version: 0.1.0
+ Summary: A robust fuzzy string matching library that combines syntactic and semantic approaches.
+ Author-email: Manoj Kumar <manojkumar.du.or.21@gmail.com>
+ Project-URL: Homepage, https://github.com/pypa/sampleproject
+ Project-URL: Bug Tracker, https://github.com/pypa/sampleproject/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: thefuzz
+ Requires-Dist: sentence-transformers
+ Requires-Dist: torch
+ Requires-Dist: numpy
+ Requires-Dist: scikit-learn
+ Dynamic: license-file
+
+ # Hybrid Fuzzy Matcher
+
+ A robust fuzzy string matching library that combines syntactic (character-based) and semantic (meaning-based) approaches to provide more accurate and context-aware matching.
+
+ This package is ideal for tasks like data cleaning, record linkage, and duplicate detection where simple fuzzy matching isn't enough. It leverages `thefuzz` for syntactic analysis and `sentence-transformers` for semantic similarity.
+
+ ## Key Features
+
+ - **Hybrid Scoring:** Combines `fuzz.WRatio` and `fuzz.token_sort_ratio` with cosine similarity from sentence embeddings.
+ - **Configurable:** Easily tune weights and thresholds for syntactic and semantic scores to fit your specific data.
+ - **Preprocessing:** Includes text normalization, punctuation removal, and a customizable abbreviation/synonym map.
+ - **Efficient Blocking:** Uses a blocking strategy to reduce the search space and improve performance on larger datasets (see the sketch after this list).
+ - **Easy to Use:** A simple, intuitive API for adding data, finding matches, and detecting duplicates.
+
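The blocking idea can be pictured with a small, self-contained sketch. This is a simplified illustration written for this description, not the package's public API (the real key generation lives in `_generate_blocking_keys` inside `matcher.py`):

```python
from collections import defaultdict

def blocking_keys(text: str) -> set[str]:
    # Cheap keys: the full (preprocessed) string, its sorted-token form,
    # and short prefixes of the first and last words.
    words = text.split()
    keys = {text, " ".join(sorted(words))}
    if words:
        keys.add(words[0][:3])
        keys.add(words[-1][:3])
    return keys

buckets = defaultdict(list)
corpus = ["doctor john smith phd", "doctor john smith", "samsung galaxy s22 ultra"]
for i, s in enumerate(corpus):
    for key in blocking_keys(s):
        buckets[key].append(i)

# The two "doctor ..." strings share the "doc" prefix key, so only that pair is
# scored in detail; the Samsung entry is never compared against them.
```

Only strings that land in at least one common bucket are passed on to the more expensive syntactic and semantic scoring.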
+ ## Installation
+
+ You can install the package via pip:
+
+ ```bash
+ pip install hybrid-fuzzy-matcher
+ ```
+
+ ## How to Use
+
+ Here is a complete example of how to use the `HybridFuzzyMatcher`.
+
+ ### 1. Initialize the Matcher
+
+ First, create an instance of the `HybridFuzzyMatcher`. You can optionally provide a custom abbreviation map and adjust the weights and thresholds.
+
+ ```python
+ from hybrid_fuzzy_matcher import HybridFuzzyMatcher
+
+ # Custom abbreviation map (optional)
+ custom_abbr_map = {
+     "dr.": "doctor",
+     "st.": "street",
+     "co.": "company",
+     "inc.": "incorporated",
+     "ny": "new york",
+     "usa": "united states of america",
+ }
+
+ # Initialize the matcher
+ matcher = HybridFuzzyMatcher(
+     syntactic_weight=0.4,
+     semantic_weight=0.6,
+     syntactic_threshold=70,
+     semantic_threshold=0.6,
+     combined_threshold=0.75,
+     abbreviation_map=custom_abbr_map
+ )
+ ```
+
+ ### 2. Add Data
+
+ Add the list of strings you want to match against. The matcher will automatically preprocess the text and generate the necessary embeddings.
+
+ ```python
+ data_corpus = [
+     "Apple iPhone 13 Pro Max, 256GB, Sierra Blue",
+     "iPhone 13 Pro Max 256 GB, Blue, Apple Brand",
+     "Samsung Galaxy S22 Ultra 512GB Phantom Black",
+     "Apple iPhone 12 Mini, 64GB, Red",
+     "Dr. John Smith, PhD",
+     "Doctor John Smith",
+     "New York City Department of Parks and Recreation",
+     "NYC Dept. of Parks & Rec",
+ ]
+
+ # Add data to the matcher
+ matcher.add_data(data_corpus)
+ ```
+
+ ### 3. Find Matches for a Query
+
+ Use the `find_matches` method to find the most similar strings in the corpus for a given query.
+
+ ```python
+ query = "iPhone 13 Pro Max, 256GB, Blue"
+ matches = matcher.find_matches(query, top_n=3)
+
+ print(f"Query: '{query}'")
+ for match in matches:
+     print(f" Match: '{match['original_text']}'")
+     print(f" Scores: Syntactic={match['syntactic_score']:.2f}, "
+           f"Semantic={match['semantic_score']:.4f}, Combined={match['combined_score']:.4f}")
+
+ # Query: 'iPhone 13 Pro Max, 256GB, Blue'
+ # Match: 'Apple iPhone 13 Pro Max, 256GB, Sierra Blue'
+ # Scores: Syntactic=95.00, Semantic=0.9801, Combined=0.9681
+ # Match: 'iPhone 13 Pro Max 256 GB, Blue, Apple Brand'
+ # Scores: Syntactic=95.00, Semantic=0.9734, Combined=0.9640
+ ```
+
+ ### 4. Find Duplicates in the Corpus
+
+ Use the `find_duplicates` method to identify highly similar pairs within the entire corpus.
+
+ ```python
+ duplicates = matcher.find_duplicates(min_combined_score=0.8)
+
+ print("\n--- Finding Duplicate Pairs ---")
+ for pair in duplicates:
+     print(f"\nDuplicate Pair (Score: {pair['combined_score']:.4f}):")
+     print(f" Text 1: '{pair['text1']}'")
+     print(f" Text 2: '{pair['text2']}'")
+
+ # --- Finding Duplicate Pairs ---
+ #
+ # Duplicate Pair (Score: 0.9933):
+ # Text 1: 'Dr. John Smith, PhD'
+ # Text 2: 'Doctor John Smith'
+ ```
+
+ ## How It Works
+
+ The matching process follows these steps:
+
+ 1. **Preprocessing:** Text is lowercased, punctuation is removed, and custom abbreviations are expanded.
+ 2. **Blocking:** To avoid comparing every string to every other string, candidate pairs are generated based on simple keys (like the first few letters of words). This dramatically speeds up the process.
+ 3. **Syntactic Scoring:** Candidates are scored using `thefuzz`'s `WRatio` and `token_sort_ratio`. This catches character-level similarities and typos.
+ 4. **Semantic Scoring:** The pre-trained `sentence-transformers` model (`all-MiniLM-L6-v2` by default) converts strings into vector embeddings. The cosine similarity between these embeddings measures how close they are in meaning.
+ 5. **Score Combination:** The final score is a weighted average of the syntactic and semantic scores. This hybrid score provides a more holistic measure of similarity (see the sketch below).
+
+ This approach ensures that the matcher can identify similarities even when the wording is different but the meaning is the same.
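To make steps 3-5 concrete, here is a minimal sketch of how the two signals are combined. It mirrors the scoring logic in `matcher.py` but is illustrative only: it hard-codes the constructor's default weights (0.3 syntactic, 0.7 semantic) and skips preprocessing, blocking, and the threshold checks:

```python
from thefuzz import fuzz
from sentence_transformers import SentenceTransformer, util

a, b = "doctor john smith phd", "doctor john smith"  # assume already preprocessed

# Step 3: character/token-level similarity on a 0-100 scale
syntactic = max(fuzz.WRatio(a, b), fuzz.token_sort_ratio(a, b))

# Step 4: meaning-level similarity as cosine similarity of sentence embeddings
model = SentenceTransformer("all-MiniLM-L6-v2")
emb_a, emb_b = model.encode([a, b], convert_to_tensor=True)
semantic = util.cos_sim(emb_a, emb_b).item()

# Step 5: weighted average on a common 0.0-1.0 scale
combined = 0.3 * (syntactic / 100.0) + 0.7 * semantic
print(f"syntactic={syntactic}, semantic={semantic:.4f}, combined={combined:.4f}")
```

Pairs whose combined score clears `combined_threshold` are what `find_matches` and `find_duplicates` return.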
@@ -0,0 +1,7 @@
+ hybrid_fuzzy_matcher/__init__.py,sha256=vZRlB9wV25Kxkm5YogPyz1VpfOds90fYWXrCcMpRkqg,40
+ hybrid_fuzzy_matcher/matcher.py,sha256=m7FJ5Sj3KrNuqqhBz_HWqxsb7hxqysSxKH6LdBYL9T8,14234
+ hybrid_fuzzy_matcher-0.1.0.dist-info/licenses/LICENSE,sha256=YeOlUyX_IlUKumAY-5eZPg-js_8tUm18JgSkZuebAbA,1068
+ hybrid_fuzzy_matcher-0.1.0.dist-info/METADATA,sha256=HLXxNP99M7f9x1SYSMkF0AcF9ICVpKCH14QdI4Btg8s,5766
+ hybrid_fuzzy_matcher-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ hybrid_fuzzy_matcher-0.1.0.dist-info/top_level.txt,sha256=S56outaLuf8-6m_SD9lc2PyrjqdPoeJBWpkc7X7qc28,21
+ hybrid_fuzzy_matcher-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Manoj Kumar
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1 @@
+ hybrid_fuzzy_matcher