NameComparator 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1588 @@
1
+ from unidecode import unidecode
2
+ from functools import lru_cache
3
+ from fuzzywuzzy import fuzz
4
+ import re
5
+ import itertools
6
+ import json
7
+ import os
8
+
9
+ class NameComparator():
10
+ """The class used for fuzzy comparing two names.
11
+ """
12
def __init__(self) -> None:
    """Loads the bundled data files and prepares the lookup tables used by the comparisons.

    The IPA dictionaries, the top surnames, the IPA rules, the spelling rules and the
    nickname sets are read through ``_getDataFromJson``. Placeholder strings inside the
    rule lists (for example "consonant" or "vowel") are replaced in place by the list of
    characters they stand for, and an index from every nickname to the positions of the
    nickname sets that contain it is built.
    """
    def expandPlaceholders(rules:list, expansions:dict) -> None:
        # Swap every placeholder string found in a rule for its character list.
        # Only string entries are considered; anything else is left untouched.
        for rule in rules:
            for position, entry in enumerate(rule):
                if isinstance(entry, str) and entry in expansions:
                    rule[position] = expansions[entry]

    # Pronunciation data and surname frequency data
    self.namesToIpa:dict = self._getDataFromJson('_ipa_all_names.json')
    self.syllableToIpa:dict = self._getDataFromJson('_ipa_common_word_parts.json')
    surnameRows = self._getDataFromJson('_top_surnames.json')
    self.topSurnames:set = {str(row[0]) for row in surnameRows}

    # IPA rules, with their placeholders expanded
    self.ipaRules:list = self._getDataFromJson('_rules_ipa.json')
    ipaConsonants = ["l", "d", "z", "b", "t", "k", "n", "s", "w", "v", "ð", "ʒ", "ʧ", "θ", "h", "g", "ʤ", "ŋ", "p", "m", "ʃ", "f", "j", "r"]
    expandPlaceholders(self.ipaRules, {
        "consonant_or_break": ["-", *ipaConsonants],
        "consonant": ipaConsonants,
        "vowel": ["ɑ", "a", "æ", "ɪ", "i", "ɛ", "e", "ə", "ɔ", "ʊ", "u", "o"],
    })

    # Spelling rules, with their placeholders expanded
    self.spellingRules:list = self._getDataFromJson('_rules_spelling.json')
    alphabet = list("abcdefghijklmnopqrstuvwxyz")
    expandPlaceholders(self.spellingRules, {
        "consonant": list("bcdfghjklmnpqrstvwxyz"),
        "vowel": list("aeiouy"),
        "letter": alphabet,
        "letter_or_break": ["-", *alphabet],
    })

    # Nickname sets, plus the reverse index nickname -> positions of its sets
    self.nicknameSets = self._getDataFromJson('_nickname_sets.json')
    self.nicknameToNicknameSetNum = {}
    for setNumber, nicknameSet in enumerate(self.nicknameSets):
        for nickname in nicknameSet:
            self.nicknameToNicknameSetNum.setdefault(nickname, []).append(setNumber)
53
+
54
def _getDataFromJson(self, fileName:str) -> dict|list:
    """Reads one of the json files stored in the 'jsonData' folder next to this module.

    Args:
        fileName (str): the name of the json file to read

    Returns:
        dict|list: the parsed json content
    """
    jsonPath = os.path.join(os.path.dirname(__file__), 'jsonData', fileName)
    with open(jsonPath, encoding="utf-8") as handle:
        content = json.load(handle)
    return content
67
+
68
def compareTwoNames(self, name0:str, name1:str) -> dict:
    """Decides whether two names are a fuzzy match.

    The names are cleaned, flagged as too short / too generic, stripped of nicknames
    and then run through up to four comparison attempts; the first successful attempt
    ends the comparison.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        dict: the outcome under the keys 'match', 'tooGeneric' and 'tooShort', plus the
            debugging tuples 'attempt1' to 'attempt4' (None for attempts that never ran)
    """
    report = {
        'match': False,
        'tooGeneric': False,
        'tooShort': False,
        'attempt1': None,
        'attempt2': None,
        'attempt3': None,
        'attempt4': None
    }

    # Clean each name on its own, then against the other one
    name0, name1 = self._cleanNamesTogether(
        self._cleanNameByItself(name0),
        self._cleanNameByItself(name1),
    )

    # Flag names that are too short (one word or less) or too generic
    report['tooShort'] = self._eitherNameTooShort(name0, name1)
    report['tooGeneric'] = self._eitherNameTooGeneric(name0, name1)

    # Swap nicknames for their official form
    name0, name1 = self._removeNicknames(name0, name1)

    # Attempt 1: plain spelling comparison
    matched, pairing = self._spellingComparison(name0, name1)
    report['attempt1'] = (name0, name1, pairing)
    if matched:
        report['match'] = True
        return report

    # Give up early when the names are not worth any further attempt
    if self._isWorthContinuing(name0, name1) is False:
        return report

    # Attempt 2: spelling comparison of the names rewritten by the spelling rules
    respelled0, respelled1 = self._modifyNamesTogether(name0, name1)
    matched, pairing = self._spellingComparison(respelled0, respelled1)
    report['attempt2'] = (respelled0, respelled1, pairing)
    if matched:
        report['match'] = True
        return report

    # Attempt 3: pronunciation comparison of the rewritten names
    matched, pairing, ipaRespelled0, ipaRespelled1 = self._pronunciationComparison(respelled0, respelled1)
    report['attempt3'] = (ipaRespelled0, ipaRespelled1, pairing)
    if matched:
        report['match'] = True
        return report

    # Attempt 4: pronunciation comparison of the original (cleaned) names
    matched, pairing, ipa0, ipa1 = self._pronunciationComparison(name0, name1)
    report['attempt4'] = (ipa0, ipa1, pairing)
    if matched:
        report['match'] = True

    # Whatever the last attempt decided is the final answer
    return report
139
+
140
+ def _cleanNameByItself(self, name:str) -> str:
141
+ """Cleans a singular name to get rid of extra or unhelpful data, or to standardize surnames.
142
+
143
+ Args:
144
+ name (str): the name being cleaned
145
+
146
+ Returns:
147
+ str: the cleaned name
148
+ """
149
+ # Deal with blank names
150
+ if (name == "") or (not isinstance(name, str)):
151
+ return "_"
152
+
153
+ # Deal with whitespace
154
+ name = re.sub(r'[^\S ]', ' ', name)
155
+ name = re.sub(r" +", " ", name)
156
+ name = name.strip()
157
+
158
+ # Standardize name into ascii
159
+ name = unidecode(name)
160
+ name = name.lower()
161
+
162
+ # Deal with blank names again
163
+ if name == "":
164
+ return "_"
165
+
166
+ # Remove Punctiation
167
+ name = re.sub(r"[.,?;\"*()]", "", name)
168
+
169
+ # Remove spaces after apostrophe
170
+ name = re.sub("' +", "'", name)
171
+
172
+ # Remove jr and sr
173
+ name = re.sub(r"\bjr\b", "", name).replace(r"\bjunior\b", "")
174
+ name = re.sub(r"\bsr\b", "", name).replace(r"\bsenior\b", "")
175
+
176
+ # Remove titles
177
+ name = re.sub(r"\bprof\b", "", name).replace(r"\bprofessor\b", "")
178
+ name = re.sub(r"\bmr\b", "", name).replace(r"\bmister\b", "")
179
+ name = re.sub(r"\bmrs\b", "", name).replace(r"\bmissus\b", "")
180
+ name = re.sub(r"\bms\b", "", name).replace(r"\bmiss\b", "")
181
+ name = re.sub(r"\bdr\b", "", name).replace(r"\bdoctor\b", "")
182
+ name = re.sub(r"\bstudent\b", "", name)
183
+ name = re.sub(r"\brev\b", "", name)
184
+ name = name.replace("reverend", "")
185
+
186
+ # Remove family relations
187
+ name = re.sub(r"\bsister\b", "", name)
188
+ name = re.sub(r"\bbrother\b", "", name)
189
+ name = re.sub(r"\bmother\b", "", name)
190
+ name = re.sub(r"\bfather\b", "", name)
191
+ name = re.sub(r" in law", " ", name)
192
+
193
+ # Removes "head of household"
194
+ name = name.replace("head of household", "")
195
+
196
+ # Remove more than one space again
197
+ name = re.sub(r" +", " ", name)
198
+
199
+ # Remove stuff like 'the 3rd'
200
+ name = re.sub(r"[1-9][a-z]2,6", "", name).replace(" the ", "")
201
+
202
+ # Remove Roman numerals
203
+ name = ' '.join(re.sub(r'\b(ii|iii|iv)\b', '', word) for word in name.split())
204
+ name = re.sub(r" +", " ", name)
205
+ name = name.strip()
206
+
207
+ # Remove 'no suffix'
208
+ name = name.replace("no suffix", "")
209
+
210
+ # Deal with Dutch names
211
+ name = re.sub(r"\bvan de", "vande", name)
212
+ name = re.sub(r"\bvan den", "vanden", name)
213
+ name = re.sub(r"\bvan der", "vander", name)
214
+
215
+ # Deal with whitespace one last time, then return
216
+ name = re.sub(r" +", " ", name)
217
+ name = name.strip()
218
+ return name
219
+
220
+ def _cleanNamesTogether(self, name0:str, name1:str) -> tuple[str, str]:
221
+ """Cleans names by comparing them to one another, fixing common errors to standardize.
222
+
223
+ Args:
224
+ name0 (str): a name
225
+ name1 (str): a name
226
+
227
+ Returns:
228
+ tuple[str, str]: the two cleaned names
229
+ """
230
+ # Returns if either name is blank
231
+ if (name0 == "" or name1 == ""):
232
+ return name0, name1
233
+
234
+ # Deal with dashes
235
+ name0, name1 = self._dealWithDashes(name0, name1)
236
+
237
+ # Deal with Scottish and Irish names
238
+ name0, name1 = self._fixRelatedPrefixes(name0, name1, 'mac', 'mc')
239
+ name0, name1 = self._fixMcMac(name0, name1)
240
+
241
+ # Deal with just Irish names
242
+ oNames = [
243
+ 'beirne', 'berry', 'boyle', 'bryant', 'brian', 'brien', 'bryan', 'ceallaigh', 'conner',
244
+ 'connor', 'conor', 'daniel', 'day', 'dean', 'dea', 'doherty', 'donnell', 'donnel', 'donoghue',
245
+ 'donohue', 'donovan', 'dowd', 'driscoll', 'fallon', 'farrell', 'flaherty', 'flanagan', 'flynn',
246
+ 'gara', 'gorman', 'grady', 'guinn', 'guin', 'hagan', 'haire', 'hair', 'halloran', 'hanlon',
247
+ 'hara', 'hare', 'harra', 'harrow', 'haver', 'hearn', 'hern', 'herron', 'higgins', 'hora',
248
+ 'kane', 'keefe', 'keeffe', 'kelley', 'kelly', 'laughlin', 'leary', 'loughlin', 'mahoney',
249
+ 'mahony', 'maley', 'malley', 'mara', 'mary', 'meara', 'melia', 'moore', 'more', 'muir',
250
+ 'murchu', 'mure', 'murphy', 'neall', 'neal', 'neill', 'neil', 'ney', 'niall', 'quinn', 'regan',
251
+ 'reilly', 'riley', 'riordan', 'roark', 'rorke', 'rourke', 'ryan', 'shaughnessy', 'shea',
252
+ 'shields', 'sullivan', 'toole', 'tool',
253
+ ]
254
+ for surname in oNames:
255
+ name0, name1 = self._removeIrishO(name0, name1, surname)
256
+
257
+ # Deal with prefixes and optional intros that make the match worse
258
+ name0, name1 = self._fixRelatedPrefixes(name0, name1, 'de', 'di')
259
+ name0, name1 = self._fixRelatedPrefixes(name0, name1, 'del', 'dil')
260
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "d'")
261
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "de")
262
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "fi")
263
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "santa")
264
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "san")
265
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "de la")
266
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "de los")
267
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "del")
268
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "la")
269
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "le")
270
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "du")
271
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "dela")
272
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "los")
273
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "der")
274
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "den")
275
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "vanden")
276
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "vander")
277
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "vande")
278
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "van")
279
+ name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "von")
280
+ name0, name1 = self._combinePrefixWithSurnameifInBoth(name0, name1, "de")
281
+ name0, name1 = self._combinePrefixWithSurnameifInBoth(name0, name1, "van")
282
+
283
+ # Combine words that are one word in the other name
284
+ while True:
285
+ combined, name0, name1 = self._combineSplitWords(name0, name1)
286
+ if not combined:
287
+ break
288
+ while True:
289
+ combined, name1, name0 = self._combineSplitWords(name1, name0)
290
+ if not combined:
291
+ break
292
+
293
+ # Remove extra spaces
294
+ name0 = re.sub(r'\s+', ' ', name0)
295
+ name1 = re.sub(r'\s+', ' ', name1)
296
+ name0 = name0.strip()
297
+ name1 = name1.strip()
298
+
299
+ # Return the cleaned names
300
+ return name0, name1
301
+
302
+ def _dealWithDashes(self, name0:str, name1:str) -> tuple[str, str]:
303
+ """Cleans both names in order to deal with dashes in names.
304
+
305
+ Args:
306
+ name0 (str): a name
307
+ name1 (str): a name
308
+
309
+ Returns:
310
+ tuple[str, str]: the cleaned names
311
+ """
312
+ # Return old if no dash in either
313
+ if ('-' not in name0) and ('-' not in name1):
314
+ return name0, name1
315
+
316
+ # Return old if dash in both
317
+ if ('-' in name0) and ('-' in name1):
318
+ return name0, name1
319
+
320
+ # Try replacing the dash with a space, and combine words if necessary
321
+ name0Edited = name0.replace('-', ' ')
322
+ name1Edited = name1.replace('-', ' ')
323
+ _, name0Edited, name1Edited = self._combineSplitWords(name0Edited, name1Edited)
324
+
325
+ # Return old if the score did not improve
326
+ diff, _, _ = self._calculateEditImprovement(name0, name1, name0Edited, name1Edited)
327
+ if diff <= 0:
328
+ return name0, name1
329
+
330
+ # Return the edited names
331
+ return name0Edited, name1Edited
332
+
333
def _calculateEditImprovement(self, name0:str, name1:str, name0Edited:str, name1Edited:str) -> tuple[float, tuple, tuple]:
    """Calculates how much editing a name or both names improved the score in comparison to the original names.

    Args:
        name0 (str): the original first name
        name1 (str): the original second name
        name0Edited (str): the edited first name
        name1Edited (str): the edited second name

    Returns:
        tuple[float, tuple, tuple]: the score of how much the edits improved the comparison (can be negative),
            the word combo of the original, the word combo of the edited version
    """
    def averageScore(wordCombo) -> float:
        # A name without any word yields an empty combo; score it 0 rather than
        # dividing by zero.
        if not wordCombo:
            return 0.0
        return sum(tup[2] for tup in wordCombo) / len(wordCombo)

    ogWordCombo = self._findWhichWordsMatchAndHowWell(name0, name1)
    editedWordCombo = self._findWhichWordsMatchAndHowWell(name0Edited, name1Edited)
    diff = averageScore(editedWordCombo) - averageScore(ogWordCombo)
    return diff, ogWordCombo, editedWordCombo
352
+
353
def _getPairIndicesAndWords(self, name0:str, name1:str) -> list[tuple[int, int, str, str]]:
    """Lists the paired words of the two names together with their positions.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        list[tuple[int, int, str, str]]: one tuple per paired word: the index of the word in name0,
            the index of the word in name1, the word in name0, the word in name1
    """
    pairing = self._findWhichWordsMatchAndHowWell(name0, name1)
    tokens0, tokens1 = name0.split(), name1.split()
    pairs = []
    for entry in pairing:
        # The pairing stores the word positions as numeric strings
        position0, position1 = int(entry[0]), int(entry[1])
        pairs.append((position0, position1, tokens0[position0], tokens1[position1]))
    return pairs
369
+
370
class NameEditor():
    """Helper that holds both names as word lists so that single words can be replaced by position.
    TODO implement more where possible.
    """
    def __init__(self, name0:str, name1:str) -> None:
        """Stores the whitespace-separated words of both names.

        Args:
            name0 (str): a name
            name1 (str): a name
        """
        self.words0, self.words1 = name0.split(), name1.split()

    def updateName0(self, index:int, updatedWord:str) -> None:
        """Overwrites one word of name0.

        Args:
            index (int): the position of the word to overwrite
            updatedWord (str): the text to put there
        """
        self.words0[index] = updatedWord

    def updateName1(self, index:int, updatedWord:str) -> None:
        """Overwrites one word of name1.

        Args:
            index (int): the position of the word to overwrite
            updatedWord (str): the text to put there
        """
        self.words1[index] = updatedWord

    def getModifiedNames(self) -> tuple[str, str]:
        """Joins the stored words back into names, separated by single spaces.

        Returns:
            tuple[str, str]: the modified names
        """
        rebuilt0 = ' '.join(self.words0)
        rebuilt1 = ' '.join(self.words1)
        return rebuilt0, rebuilt1
408
+
409
+ def _fixRelatedPrefixes(self, name0:str, name1:str, prefixA:str, prefixB:str) -> tuple[str, str]:
410
+ """Cleans names to deal with prefixes that are different by spelling, but functionally the same.
411
+
412
+ Args:
413
+ name0 (str): a name
414
+ name1 (str): a name
415
+ prefixA (str): the first related prefix
416
+ prefixB (str): the second related prefix
417
+
418
+ Returns:
419
+ tuple[str, str]: the two modified names
420
+ """
421
+ # Return if prefix1 in neither or prefix2 in neither
422
+ if (f' {prefixA}' not in name0) and (f' {prefixA}' not in name1):
423
+ return name0, name1
424
+ if (f' {prefixB}' not in name0) and (f' {prefixB}' not in name1):
425
+ return name0, name1
426
+
427
+ # Return if prefix1 or prefix2 is found in both
428
+ if (f' {prefixA}' in name0) and (f' {prefixA}' in name1):
429
+ return name0, name1
430
+ if (f' {prefixB}' in name0) and (f' {prefixB}' in name1):
431
+ return name0, name1
432
+
433
+ # Replace prefix2 with prefix1
434
+ if f' {prefixB}' in name0:
435
+ name0 = name0.replace(f' {prefixB}', f' {prefixA}')
436
+ else:
437
+ name1 = name1.replace(f' {prefixB}', f' {prefixA}')
438
+ return name0, name1
439
+
440
def _fixMcMac(self, name0:str, name1:str) -> tuple[str, str]:
    """Modifies names to fix problems where mc or mac is in either name and they don't match when they should.

    For every paired word where exactly one side starts with 'mc' (then 'mac'), the
    prefix is dropped from that side when doing so leaves two words that are a good
    fuzzy match.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        tuple[str, str]: the two modified names
    """
    # Return for most names
    if ("mc" not in name0) and ("mac" not in name0) and ("mc" not in name1) and ("mac" not in name1):
        return name0, name1

    # Combine split words (if any)
    _, name0, name1 = self._combineSplitWords(name0, name1)

    # Edit the names, if necessary
    ne = self.NameEditor(name0, name1)
    for prefix in ['mc', 'mac']:
        # The pairs are always taken from name0/name1 as they were before any edit
        # made through the name editor
        for index0, index1, word0, word1 in self._getPairIndicesAndWords(name0, name1):
            # Skip pair if the prefix is in both words
            if (word0.startswith(prefix)) and (word1.startswith(prefix)):
                continue

            # Skip pair if the prefix is not in either of them
            if (not word0.startswith(prefix)) and (not word1.startswith(prefix)):
                continue

            # Skip pair if either word is a firstname (the first word of its name)
            if (index0 < 1) or (index1 < 1):
                continue

            # Skip pair if the shortest word has fewer than 3 letters
            if min(len(word0), len(word1)) < 3:
                continue

            # Skip pair if they are already a solid match
            if fuzz.ratio(word0, word1) > 80:
                continue

            # Skip pair if the prefix is removed and not a good fuzzy match
            if word0.startswith(prefix):
                updatedWord0 = word0.replace(prefix, "", 1)
                updatedWord1 = word1
            else:
                updatedWord0 = word0
                updatedWord1 = word1.replace(prefix, "", 1)
            if fuzz.ratio(updatedWord0, updatedWord1) < 75:
                continue

            # Update the words
            ne.updateName0(index0, updatedWord0)
            ne.updateName1(index1, updatedWord1)

    # Return the edited (or not) names
    return ne.getModifiedNames()
497
+
498
def _removeIrishO(self, name0:str, name1:str, surname:str) -> tuple[str, str]:
    """Removes the Irish O if needed for easier name comparison.

    When the last word of a name is a close fuzzy match to the given surname, that
    last word (together with a separate leading 'o ' if there is one) is replaced by
    the surname itself.

    Args:
        name0 (str): a name
        name1 (str): a name
        surname (str): one of the irish surnames that often starts with O'

    Returns:
        tuple[str, str]: the modified names
    """
    # Skip non applicable names: neither name has a word starting with 'o'
    # after a space (the ' o ' checks are already covered by the " o" checks)
    if (' o ' not in name0) and (" o" not in name0) and (" o" not in name1) and (' o ' not in name1):
        return name0, name1
    # Skip when the surname does not appear in either name
    if (surname not in name0) and (surname not in name1):
        return name0, name1
    # Edit the names
    lastname0 = name0.split()[-1]
    if fuzz.ratio(lastname0, surname) > 75:
        if lastname0[0] == 'o':
            # The O is attached to the last word: replace that word by the surname
            name0 = name0.replace(f'{lastname0}', surname)
        else:
            # The O is a word of its own: replace 'o <last word>' by the surname
            name0 = name0.replace(f'o {lastname0}', surname)
    lastname1 = name1.split()[-1]
    if fuzz.ratio(lastname1, surname) > 75:
        if lastname1[0] == 'o':
            name1 = name1.replace(f'{lastname1}', surname)
        else:
            name1 = name1.replace(f'o {lastname1}', surname)
    return name0, name1
528
+
529
+ def _removeUnnecessaryPrefixes(self, name0:str, name1:str, prefix:str) -> tuple[str,str]:
530
+ """Removes an unnecessary prefix from either or both of the names.
531
+
532
+ Args:
533
+ name0 (str): a name
534
+ name1 (str): a name
535
+ prefix (str): the prefix to (probably) remove
536
+
537
+ Returns:
538
+ tuple[str,str]: the modified names
539
+ """
540
+ # If the prefix is not in either names, return the names
541
+ name0 = re.sub(r"\s+", " ", name0)
542
+ name1 = re.sub(r"\s+", " ", name1)
543
+ if (f" {prefix}" not in name0) and (f" {prefix}" not in name1):
544
+ return name0, name1
545
+
546
+ # Setup
547
+ name0Edited = name0
548
+ name1Edited = name1
549
+ spPrefixSp = f" {prefix} "
550
+ spacePrefix = f" {prefix}"
551
+
552
+ # Make the edited names different
553
+ if (spPrefixSp in name0) and (spPrefixSp in name1):
554
+ pass
555
+ elif (spPrefixSp in name0) and (spacePrefix in name1):
556
+ name0Edited = name0Edited.replace(spPrefixSp, spacePrefix)
557
+ elif (spacePrefix in name0) and (spPrefixSp in name1):
558
+ name1Edited = name1Edited.replace(spPrefixSp, spacePrefix)
559
+ name0Edited = name0Edited.replace(spPrefixSp, " ")
560
+ name1Edited = name1Edited.replace(spPrefixSp, " ")
561
+ name0Edited = re.sub(r"\s+", " ", name0Edited)
562
+ name1Edited = re.sub(r"\s+", " ", name1Edited)
563
+
564
+ # If no edits were made, try removing spacePrefix if in just one name and it's a long word
565
+ if (name0 == name0Edited) and (name1 == name1Edited):
566
+ pattern = r'\b{}\w*\b'.format(spacePrefix)
567
+ if (spacePrefix in name0) and (spacePrefix not in name1):
568
+ match = re.search(pattern, name0)
569
+ matchedWord = match.group()
570
+ length = len(matchedWord)
571
+ if length > len(prefix) + 4:
572
+ name0Edited = name0.replace(spacePrefix, " ")
573
+ elif(spacePrefix in name1) and (spacePrefix not in name0):
574
+ match = re.search(pattern, name1)
575
+ matchedWord = match.group()
576
+ length = len(matchedWord)
577
+ if length > len(prefix) + 4:
578
+ name1Edited = name1.replace(spacePrefix, " ")
579
+
580
+ # If the edits were significantly beneficial (or pass spell), return the edited versions
581
+ improvement, _, _= self._calculateEditImprovement(name0, name1, name0Edited, name1Edited)
582
+ if (improvement >= 10) or (self._spellingComparison(name0Edited, name1Edited)[0] and not self._spellingComparison(name0, name1)[0]):
583
+ return name0Edited, name1Edited
584
+
585
+ # Finally, if the words are identical other than the prefix, remove the prefix
586
+ ne = self.NameEditor(name0, name1)
587
+ for index0, index1, word0, word1 in self._getPairIndicesAndWords(name0, name1):
588
+ if (word0.startswith(prefix)) and (word0[len(prefix):] == word1) and (len(word1) > 2):
589
+ ne.updateName0(index0, word0[len(prefix):])
590
+ elif (word1.startswith(prefix)) and (word1[len(prefix):] == word0) and (len(word0) > 2):
591
+ ne.updateName1(index1, word1[len(prefix):])
592
+ name0, name1 = ne.getModifiedNames()
593
+
594
+ # Otherwise, return originals
595
+ return name0, name1
596
+
597
+ def _combinePrefixWithSurnameifInBoth(self, name0:str, name1:str, prefix:str) -> tuple[str, str]:
598
+ """Combines the prefix with the surname in both of the names if the prefix exists in both.
599
+
600
+ Args:
601
+ name0 (str): a name
602
+ name1 (str): a name
603
+ prefix (str): the prefix to combine with the surname
604
+
605
+ Returns:
606
+ tuple[str, str]: the modified names
607
+ """
608
+ # Return if ' prefix ' in neither
609
+ if (not re.search(f' {prefix} .', name0)) or (not re.search(f' {prefix} .', name1)):
610
+ return name0, name1
611
+
612
+ # Get the letter after ' prefix '
613
+ letter0 = name0[name0.index(f' {prefix} ') + 4]
614
+ letter1 = name1[name1.index(f' {prefix} ') + 4]
615
+
616
+ # If the letter after matches, replace ' prefix ' with ' prefix'
617
+ if letter0 == letter1:
618
+ name0 = name0.replace(f' {prefix} ', f' {prefix}')
619
+ name1 = name1.replace(f' {prefix} ', f' {prefix}')
620
+ return name0, name1
621
+
622
def _combineSplitWords(self, name0:str, name1:str) -> tuple[bool, str, str]:
    """Combines two neighboring words of name0 if that combination is one word in name1.

    At most one pair of words is combined per call, and only words of name0 are ever
    combined; name1 is returned unchanged.

    Args:
        name0 (str): the name whose words may be combined
        name1 (str): the name the combination is checked against

    Returns:
        tuple[bool, str, str]: whether a combination was made, name0 (edited or not), name1
    """
    words0 = name0.split()

    # Do not combine anything when name0 has fewer than three words
    if len(words0) < 3:
        return False, name0, name1

    # Do not combine words that are already a good spelling match
    if self._spellingComparison(name0, name1)[0]:
        return False, name0, name1

    for index0, _, word0, word1 in self._getPairIndicesAndWords(name0, name1):
        # Skip if word0 and word1 are not a good match
        if (fuzz.partial_ratio(word0, word1) < 75):
            continue

        # Skip if either word is only an initial
        if (len(word0) == 1) or (len(word1) == 1):
            continue

        # Find the left and right neighbors
        leftNeighbor = words0[index0 - 1] if index0 - 1 >= 0 else ''
        rightNeighbor = words0[index0 + 1] if index0 + 1 < len(words0) else ''

        # Skip neighbors if they are initials
        leftNeighbor = leftNeighbor if len(leftNeighbor) > 1 else ''
        rightNeighbor = rightNeighbor if len(rightNeighbor) > 1 else ''
        # NOTE(review): this ends the whole search instead of moving on to the
        # next pair like the other checks do - confirm that this is intended
        if (not leftNeighbor) and (not rightNeighbor):
            return False, name0, name1

        # Choose the neighbor that best matches word0's match (word1);
        # the right neighbor wins ties
        if not leftNeighbor:
            leftWasChosen = False
        elif not rightNeighbor:
            leftWasChosen = True
        else:
            leftScore = fuzz.partial_ratio(leftNeighbor, word1)
            rightScore = fuzz.partial_ratio(rightNeighbor, word1)
            if leftScore > rightScore:
                leftWasChosen = True
            else:
                leftWasChosen = False

        # Initialize the chosen neighbor, compound, and neighbor index
        if leftWasChosen:
            chosenNeighbor = leftNeighbor
            compound = f'{leftNeighbor}{word0}'
            indexN = index0 - 1
        else:
            chosenNeighbor = rightNeighbor
            compound = f'{word0}{rightNeighbor}'
            indexN = index0 + 1

        # Skip if the neighbor is a bad partial match to word0's match
        if fuzz.partial_ratio(chosenNeighbor, word1) < 65:
            continue

        # Check if the compound is significantly better than the original
        ogScore = fuzz.ratio(word0, word1)
        compoundScore = fuzz.ratio(compound, word1)
        if compoundScore < ogScore + 20:
            continue
        # Skip if the compound's length is further from word1's length than word0's was
        diffLength0 = abs(len(word1) - len(word0))
        diffLengthCompound = abs(len(word1) - len(compound))
        if diffLength0 < diffLengthCompound:
            continue

        # If the compound was a better match, use a name editor to create an edited name0 where the words are combined
        # (the neighbor's slot is blanked, which leaves a double space in the edited name)
        ne = self.NameEditor(name0, name1)
        ne.updateName0(index0, compound)
        ne.updateName0(indexN, '')
        name0Edited, _ = ne.getModifiedNames()

        # If the edited name0 is better (or only slightly worse), go with the edited version
        improvement = self._calculateEditImprovement(name0, name1, name0Edited, name1)[0]
        if improvement > -1:
            return True, name0Edited, name1

    # If no edits were beneficial, just return the original words
    return False, name0, name1
711
+
712
def _eitherNameTooShort(self, name0:str, name1:str) -> bool:
    """Tells whether at least one of the names has fewer than two words.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        bool: whether either was too short
    """
    # The pairing holds one entry per word of the shorter name
    pairing = self._findWhichWordsMatchAndHowWell(name0, name1)
    return len(pairing) < 2
728
+
729
+ def _findWhichWordsMatchAndHowWell(self, name0:str, name1:str) -> list[tuple[str, str, int]]:
730
+ """Identifies which words in either name are a match, and how well they match.
731
+
732
+ Args:
733
+ name0 (str): a name
734
+ name1 (str): a name
735
+
736
+ Returns:
737
+ list[tuple[str, str, int]]: a list of tuples idenifying the index of the word in the first name,
738
+ the index of the word in the second name, and the score of how well they match
739
+ """
740
+ # Split strings into lists of words
741
+ words0 = name0.split()
742
+ words1 = name1.split()
743
+
744
+ # Initialize empty list to store scores
745
+ scores = []
746
+
747
+ # Loops through each word in words1 and compare to each word in words2
748
+ for i, word0 in enumerate(words0):
749
+ for j, word1 in enumerate(words1):
750
+ # Gets the score for how well the words match by using fuzzywuzzy
751
+ rScore = fuzz.ratio(word0, word1)
752
+ prScore = fuzz.partial_ratio(word0, word1)
753
+ score = max(rScore, prScore)
754
+ if word0[0] != word1[0]:
755
+ score = rScore
756
+
757
+ # Unless word1 or word2 is only an initial,
758
+ if (len(word0) == 1) or (len(word1) == 1):
759
+ # If the initial matches the first letter of the other word, give it a near perfect score
760
+ if (word0[0] == word1[0]):
761
+ score = 100
762
+ # Otherwise the score is 0
763
+ else:
764
+ score = 0
765
+
766
+ # Add the score to scores
767
+ scores.append((f"{i} words1", f"{j} words2", score))
768
+
769
+ # Gets the length of the shortest word
770
+ minLength = min(len(words0), len(words1))
771
+
772
+ # Generate all combinations of tuples with length equal to the number of words in string2
773
+ combinations = itertools.combinations(scores, minLength)
774
+
775
+ # Filter the combinations to include only valid combinations
776
+ validCombinations = [c for c in combinations if len(set(x[0] for x in c)) == len(c) and len(set(x[1] for x in c)) == len(c)]
777
+
778
+ # Cleans the valid combinations
779
+ cleanedValidCombinations = []
780
+ for validCombo in validCombinations:
781
+ cleanedValidCombo = []
782
+ for tup in validCombo:
783
+ cleanedTup = tuple([s.replace(' words1', '').replace(' words2', '') for s in tup[:2]] + [tup[2]])
784
+ cleanedValidCombo.append(cleanedTup)
785
+ cleanedValidCombinations.append(cleanedValidCombo)
786
+
787
+ # Find the combination(s) with the maximum sum
788
+ maxSum = sum(y[2] for y in max(cleanedValidCombinations, key=lambda x: sum(y[2] for y in x)))
789
+ maxCombinations = []
790
+ for combo in cleanedValidCombinations:
791
+ if (sum(y[2] for y in combo)) == maxSum:
792
+ maxCombinations.append(combo)
793
+
794
+ # Assigns the max score combination with the most letters to be best_combo
795
+ bestCombo = []
796
+ maxLetterCount = 0
797
+ for combo in maxCombinations:
798
+ letterCount = 0
799
+ for tup in combo:
800
+ x, y, _ = map(int, (tup[0], tup[1], tup[2]))
801
+ letterCount += len(words0[x]) + len(words1[y])
802
+ if letterCount > maxLetterCount:
803
+ maxLetterCount = letterCount
804
+ bestCombo = combo
805
+
806
+ # Returns the combination of word matches that are the closest match
807
+ return bestCombo
808
+
809
def _eitherNameTooGeneric(self, name0:str, name1:str) -> bool:
    """Decides whether the pair of names carries too little information to trust a match.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        bool: True when the names are too generic, False otherwise
    """
    # The best word pairing has one entry per word of the shorter name
    pairedWords = self._findWhichWordsMatchAndHowWell(name0, name1)

    # Two uncommon surnames are distinctive enough on their own
    if self._hasRareSurname(name0) and self._hasRareSurname(name1):
        return False

    # Count the pairings in which at least one side is a bare initial
    tokens0 = name0.split()
    tokens1 = name1.split()
    pairsWithInitial = 0
    for position0, position1, _ in pairedWords:
        token0 = tokens0[int(position0)]
        token1 = tokens1[int(position1)]
        if 1 in (len(token0), len(token1)):
            pairsWithInitial += 1

    # Too generic when at most one pairing is made of two full words
    return len(pairedWords) <= pairsWithInitial + 1
844
+
845
+ def _hasRareSurname(self, name:str) -> bool:
846
+ """Identifies if a name has a rare surname.
847
+
848
+ Args:
849
+ name (str): a name
850
+
851
+ Returns:
852
+ bool: whether the name's surname is rare
853
+ """
854
+ # Isolates the last name
855
+ name = name.lower()
856
+ surname = name.split()[-1]
857
+
858
+ # If the last name is not in the list of surnames, returns true
859
+ if surname not in self.topSurnames:
860
+ return True
861
+ else:
862
+ return False
863
+
864
+ def _removeNicknames(self, name0:str, name1:str) -> tuple[str, str]:
865
+ """Replaces the nickname in one name for the official name found in the other.
866
+
867
+ Args:
868
+ name0 (str): a name
869
+ name1 (str): a name
870
+
871
+ Returns:
872
+ tuple[str, str]: the modified names
873
+ """
874
+ words0 = name0.split()
875
+ words1 = name1.split()
876
+ for word0 in words0:
877
+ # Skip if the word is also a word in name1
878
+ if word0 in words1:
879
+ continue
880
+
881
+ # Skip if the word does not have an nickname
882
+ if self.nicknameToNicknameSetNum.get(word0) is None:
883
+ continue
884
+
885
+ # Replace the word with a nickname if the nickname is in name1
886
+ # (and nickname not also in name0)
887
+ setNumbers = self.nicknameToNicknameSetNum[word0]
888
+ breaking = False
889
+ for num in setNumbers:
890
+ nicknames = set(self.nicknameSets[num])
891
+ nicknames.remove(word0)
892
+ for nickname in nicknames:
893
+ if (nickname in words0) and (nickname in words1):
894
+ continue
895
+ if nickname in words1:
896
+ name0 = re.sub(rf"\b{word0}\b", nickname, name0, flags=re.IGNORECASE)
897
+ breaking = True
898
+ break
899
+ if breaking:
900
+ break
901
+
902
+ return name0, name1
903
+
904
def _spellingComparison(self, name0:str, name1:str) -> tuple[bool, list]:
    """Judges whether two names match when only their spelling is considered.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        tuple[bool, list]: whether the names are a match, and the word pairing that was scored
    """
    # Best pairing of words between the names, with a score per pair
    wordCombo = self._findWhichWordsMatchAndHowWell(name0, name1)

    # Number of pairs whose score is above 80
    strongPairs = len([pair for pair in wordCombo if pair[2] > 80])

    # A match needs three strong pairs, or a strong pair for every word of the shorter name
    fewestWords = min(len(name0.split()), len(name1.split()))
    if strongPairs >= 3 or strongPairs == fewestWords:
        return True, wordCombo

    # Otherwise the consonant comparison has the final say
    consonantsAgree = bool(self._consonantComparison(name0, name1))
    return consonantsAgree, wordCombo
932
+
933
def _consonantComparison(self, name0:str, name1:str) -> bool:
    """Identifies if two names are a match according to consonant comparison.

    Each paired word is reduced to a consonant skeleton (vowels collapsed to
    ``*``, repeated characters merged) and the skeletons are compared with
    ``fuzz.ratio``. The names match when every pair passes, or when at least
    three pairs pass.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        bool: whether the two names are a match according to consonant comparison
    """
    # Setup
    wordCombo = self._findWhichWordsMatchAndHowWell(name0, name1)
    minRequiredMatches = len(wordCombo)
    words0 = name0.split()
    words1 = name1.split()

    # Nothing was paired, so there is nothing to agree on
    if minRequiredMatches == 0:
        return False

    def reduceToSimpleConsonants(string:str) -> str:
        """Reduces a string to its simple consonant components.

        Args:
            string (str): a string

        Returns:
            str: the string with vowels (and y) replaced by ``*`` and runs of a repeated character merged
        """
        string = re.sub("a|e|i|o|u|y", "*", string)
        string = string.replace("**", "*")
        string = re.sub(r'(.)\1+', r'\1', string)
        return string

    # Loop through every word match in the combo
    numWordConsonantMatches = 0
    for tup in wordCombo:
        # Get the matching word data
        word0:str = words0[int(tup[0])]
        word1:str = words1[int(tup[1])]
        originalScoreForWords:int = int(tup[2])

        # Get the words as consonants
        consonantsName0 = reduceToSimpleConsonants(word0)
        consonantsName1 = reduceToSimpleConsonants(word1)
        consonantsRatio = fuzz.ratio(consonantsName0, consonantsName1)

        # Continue if the spelling score is already poor
        if originalScoreForWords <= 30:
            continue

        # Full words need at least two vowel groups each; initials are exempt
        if (len(word0) != 1) and (len(word1) != 1):
            lowestSyllableCount = min(consonantsName0.count("*"), consonantsName1.count("*"))
            if lowestSyllableCount < 2:
                continue

        # Require a strong skeleton match backed by a fair spelling score,
        # unless the skeletons are identical
        if (consonantsRatio <= 80 or originalScoreForWords <= 60) and consonantsRatio != 100:
            continue

        # If not rejected, increment the number of matches
        numWordConsonantMatches += 1

    # The count can never exceed the number of pairs, so "enough" must be
    # inclusive: every pair passed, or at least three did.
    return (numWordConsonantMatches >= minRequiredMatches) or (numWordConsonantMatches >= 3)
991
+
992
def _isWorthContinuing(self, name0:str, name1:str) -> bool:
    """Identifies whether a name comparison is worth pursuing further.

    The comparison is abandoned when the best word pairing has three or
    fewer pairs and at least one of them is an initial that scored 0
    against its partner.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        bool: whether the names are worth working on further
    """
    wordCombo = self._findWhichWordsMatchAndHowWell(name0, name1)

    # The combo stores word positions, so look them up in the word lists
    # (indexing the raw strings would yield single characters instead).
    words0 = name0.split()
    words1 = name1.split()

    oneLetterMatchFailCount = 0
    for match in wordCombo:
        word0 = words0[int(match[0])]
        word1 = words1[int(match[1])]
        score = match[2]
        if (score == 0) and ((len(word0) == 1) or (len(word1) == 1)):
            oneLetterMatchFailCount += 1

    return not ((oneLetterMatchFailCount >= 1) and (len(wordCombo) <= 3))
1016
+
1017
def _modifyNamesTogether(self, name0:str, name1:str) -> tuple[str,str]:
    """Normalizes both names against each other, more aggressively than plain cleaning.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        tuple[str,str]: the modified names
    """
    # Word-final 'ie' becomes 'y'
    ieEnding = r'ie\b'
    name0 = re.sub(ieEnding, 'y', name0)
    name1 = re.sub(ieEnding, 'y', name1)

    # Pairwise repairs, applied in this fixed order:
    #   1. drop an indexer's " or " alternative
    #   2. reconcile single-vowel differences
    #   3. reconcile swapped letters
    #   4. reconcile a wrong first letter
    pairwiseRepairs = (
        self._removeOrInNames,
        self._fixVowelMistakes,
        self._fixSwappedChars,
        self._dealWithWrongFirstChar,
    )
    for repair in pairwiseRepairs:
        name0, name1 = repair(name0, name1)

    # Apply every spelling rule to both names
    for rule in self.spellingRules:
        meatA, meatB = rule[0], rule[1]
        bottomBreads, topBreads = rule[2], rule[3]
        minLetters = rule[4]
        name0, name1 = self._replaceSubstringSandwichMeatIfMatchingBread(
            name0, name1, meatA, meatB, bottomBreads, topBreads, minLetters
        )

    # Collapse whitespace runs and trim the ends
    name0 = re.sub(r'\s+', ' ', name0).strip()
    name1 = re.sub(r'\s+', ' ', name1).strip()
    return name0, name1
1055
+
1056
+ def _removeOrInNames(self, name0:str, name1:str) -> tuple[str, str]:
1057
+ """Removes the word 'or' from a name (assuming that the name could have been
1058
+ poorly indexed so that the indexer's guesses for a specific word of the name is still within the string).
1059
+
1060
+ Args:
1061
+ name0 (str): a name
1062
+ name1 (str): a name
1063
+
1064
+ Returns:
1065
+ tuple[str, str]: the modified names
1066
+ """
1067
+ name0, name1 = name0.lower(), name1.lower()
1068
+ # if or in neither
1069
+ if (not " or " in name0) and (not " or " in name1):
1070
+ return name0, name1
1071
+ # if or in both
1072
+ elif (" or " in name0) and (" or " in name1):
1073
+ return name0, name1
1074
+ # if or in 1, not 2
1075
+ elif " or " in name0:
1076
+ # Gets the score for if the word before 'or' is removed
1077
+ name0EditedA = re.sub("[a-z]+ or ", " ", name0)
1078
+ wordComboA = self._findWhichWordsMatchAndHowWell(name0EditedA, name1)
1079
+ averageScoreA = sum(tup[2] for tup in wordComboA) / len(wordComboA)
1080
+
1081
+ # Gets the score for if the word after 'or' is removed
1082
+ name0EditedB = re.sub(" or [a-z]+", " ", name0)
1083
+ wordComboB = self._findWhichWordsMatchAndHowWell(name0EditedB, name1)
1084
+ averageScoreB = sum(tup[2] for tup in wordComboB) / len(wordComboB)
1085
+
1086
+ # If the before score is greater, returns A
1087
+ if averageScoreA >= averageScoreB:
1088
+ return name0EditedA, name1
1089
+ # Otherwise returns B
1090
+ else:
1091
+ return name0EditedB, name1
1092
+ # if or in 2, not 1
1093
+ elif " or " in name1:
1094
+ name1EditedA = re.sub("[a-z]+ or ", " ", name1)
1095
+ wordComboA = self._findWhichWordsMatchAndHowWell(name1EditedA, name0)
1096
+ averageScoreA = sum(tup[2] for tup in wordComboA) / len(wordComboA)
1097
+
1098
+ # Gets the score for if the word after 'or' is removed
1099
+ name1EditedB = re.sub(" or [a-z]+", " ", name1)
1100
+ wordComboB = self._findWhichWordsMatchAndHowWell(name1EditedB, name0)
1101
+ averageScoreB = sum(tup[2] for tup in wordComboB) / len(wordComboB)
1102
+
1103
+ # If the before score is greater, returns A
1104
+ if averageScoreA >= averageScoreB:
1105
+ return name0, name1EditedA
1106
+ # Otherwise returns B
1107
+ else:
1108
+ return name0, name1EditedB
1109
+
1110
def _fixVowelMistakes(self, name0:str, name1:str) -> tuple[str, str]:
    """Makes two paired words identical when they differ by exactly one
    interchangeable vowel.

    Only pairs of equal length with at least 5 letters are considered, and
    the single differing position must hold one of the pairs a/o, e/a or
    i/y (in either order). The word in name0 is overwritten with the word
    from name1.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        tuple[str, str]: the two modified names
    """
    ne = self.NameEditor(name0, name1)
    cooresponding = ('ao', 'ea', 'iy')
    for index0, _, word0, word1 in self._getPairIndicesAndWords(name0, name1):
        # Continue unless both words have the same length of 5 or more
        if len(word0) < 5 or len(word1) < 5 or len(word0) != len(word1):
            continue

        # Collect every differing position. A list (rather than a single
        # "last index" variable tested for truthiness) keeps a difference at
        # position 0 from being mistaken for "no difference yet".
        mismatchedIndices = [i for i, (char0, char1) in enumerate(zip(word0, word1)) if char0 != char1]

        # Continue if there was not exactly one difference
        if len(mismatchedIndices) != 1:
            continue

        # Replace the word if the differing letters are an interchangeable pair
        mismatchedIndex = mismatchedIndices[0]
        charWord0 = word0[mismatchedIndex]
        charWord1 = word1[mismatchedIndex]
        if (f'{charWord0}{charWord1}' in cooresponding) or (f'{charWord1}{charWord0}' in cooresponding):
            ne.updateName0(index0, word1)

    # Return the modified (or not) names
    return ne.getModifiedNames()
1164
+
1165
def _fixSwappedChars(self, name0:str, name1:str) -> tuple[str, str]:
    """Makes two paired five-letter words identical when they differ only by
    two adjacent letters that have been swapped (a typo).

    The word in name0 is overwritten with the word from name1.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        tuple[str, str]: the modified names
    """
    ne = self.NameEditor(name0, name1)
    for index0, _, word0, word1 in self._getPairIndicesAndWords(name0, name1):
        # Skip unless both words are exactly 5 long
        if len(word0) != 5 or len(word1) != 5:
            continue

        # Skip if already a match, or too different for a single swap to
        # explain (the check expects a ratio of exactly 80)
        if fuzz.ratio(word1, word0) != 80:
            continue

        # Find where the words differ
        diffPositions = [i for i in range(len(word0)) if word0[i] != word1[i]]

        # Skip if there are not exactly two differences
        if len(diffPositions) != 2:
            continue

        # Skip if the differences are not next to each other
        posI, posJ = diffPositions
        if posJ - posI != 1:
            continue

        # Skip unless the two letters are genuinely exchanged; both
        # directions have to be checked
        if (word0[posI] != word1[posJ]) or (word0[posJ] != word1[posI]):
            continue

        # Update the word
        ne.updateName0(index0, word1)

    # Return the modified (or not) names
    return ne.getModifiedNames()
1215
+
1216
def _dealWithWrongFirstChar(self, name0:str, name1:str) -> tuple[str, str]:
    """Makes two paired words identical when they are longer than 4 letters
    and differ only in their first letter.

    The word in name0 is overwritten with the word from name1.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        tuple[str, str]: the modified names
    """
    editor = self.NameEditor(name0, name1)
    for position0, _, leftWord, rightWord in self._getPairIndicesAndWords(name0, name1):
        # Identical words need no repair
        if leftWord == rightWord:
            continue

        sameAfterFirstLetter = leftWord[1:] == rightWord[1:]
        bothLongEnough = len(leftWord) > 4 and len(rightWord) > 4
        if sameAfterFirstLetter and bothLongEnough:
            editor.updateName0(position0, rightWord)

    fixed0, fixed1 = editor.getModifiedNames()
    return fixed0, fixed1
1234
+
1235
+ def _replaceSubstringSandwichMeatIfMatchingBread(self, name0:str, name1:str, meatOption1:str, meatOption2:str, bottomBreadOptions:list[str], topBreadOptions:list[str], minRequiredLetters:int) -> tuple[str,str]:
1236
+ """For any given matching word pair, replaces a specific substring in one of the words, with a similar substring found in the other word.
1237
+
1238
+ Args:
1239
+ name0 (str): a name
1240
+ name1 (str): a name
1241
+ meatOption1 (str): the first possible middle of the substring
1242
+ meatOption2 (str): the second possible middle of the substring
1243
+ bottomBreadOptions (list[str]): a list of possible beginnings to the substring. Whichever beginning is found in the one must be found in the other in order for the replacement to work
1244
+ topBreadOptions (list[str]): a list of possible endings to the substring. Whichever ending is found in the one must be found in the other in order for the replacement to work
1245
+ minRequiredLetters (int): the minimum required letters to be found in both words in order for the replacement to work
1246
+
1247
+ Returns:
1248
+ tuple[str,str]: the modified names
1249
+ """
1250
+ # Setup
1251
+ def overwriteWithSubstring(string:str, replacement:str, startIndex:int, endIndex:int) -> str:
1252
+ """Overwrites a specific index range of a string with the replacement string.
1253
+
1254
+ Args:
1255
+ string (str): the string to replace
1256
+ replacement (str): the replacement string
1257
+ startIndex (int): the start index for the replacement
1258
+ endIndex (int): the end index for the replacement
1259
+
1260
+ Returns:
1261
+ _type_: _description_
1262
+ """
1263
+ stringList = list(string)
1264
+ stringList[startIndex:endIndex] = replacement
1265
+ newString = ''.join(stringList)
1266
+ return newString
1267
+
1268
+ # Return if both middles not in different words
1269
+ if (meatOption1 not in name0 and meatOption2 not in name0) or (meatOption1 not in name1 and meatOption2 not in name1):
1270
+ return name0, name1
1271
+
1272
+ # Initialize the name editor
1273
+ ne = self.NameEditor(name0, name1)
1274
+
1275
+ # Iterate through the word matches
1276
+ for index0, index1, word0, word1 in self._getPairIndicesAndWords(name0, name1):
1277
+ # Skip words that are not long enough for the given rule
1278
+ if len(word0) < minRequiredLetters or len(word1) < minRequiredLetters:
1279
+ continue
1280
+
1281
+ # Add clear word breaks
1282
+ word0 = f"-{word0}-"
1283
+ word1 = f"-{word1}-"
1284
+
1285
+ # For every bread1
1286
+ for bottomBread in bottomBreadOptions:
1287
+ # Skip the bread if it doesn't appear in both matching words
1288
+ if bottomBread not in word0 or bottomBread not in word1:
1289
+ continue
1290
+
1291
+ # For every bread2
1292
+ for topBread in topBreadOptions:
1293
+ # Skip the bread if it doesn't appear in both matching words
1294
+ if topBread not in word0 or topBread not in word1:
1295
+ continue
1296
+
1297
+ # Skip the bread if the pattern is not found in both
1298
+ pattern = f"{bottomBread}({meatOption1}|{meatOption2}){topBread}"
1299
+ results1 = re.search(pattern, word0)
1300
+ results2 = re.search(pattern, word1)
1301
+ if not results1 or not results2:
1302
+ continue
1303
+
1304
+ # Skip the bread if the two patterns are identical (middles are the same)
1305
+ if results1.group(0) == results2.group(0):
1306
+ continue
1307
+
1308
+ # Skip the bread if the two patterns are too far apart
1309
+ spanA1, spanB1 = results1.span()
1310
+ spanA2, spanB2 = results2.span()
1311
+ if not (abs(spanA1 - spanA2) <= 2 and abs(spanB1 - spanB2) <= 2):
1312
+ continue
1313
+
1314
+ # Update the words by replacing matching (different) middles with the meat option 2
1315
+ startIndexString1, endIndexString1 = results1.span()
1316
+ startIndexString2, endIndexString2 = results2.span()
1317
+ middleCoordsString1 = startIndexString1 + len(bottomBread), endIndexString1 - len(topBread)
1318
+ middleCoordsString2 = startIndexString2 + len(bottomBread), endIndexString2 - len(topBread)
1319
+ word0 = overwriteWithSubstring(word0, meatOption2, middleCoordsString1[0], middleCoordsString1[1])
1320
+ word1 = overwriteWithSubstring(word1, meatOption2, middleCoordsString2[0], middleCoordsString2[1])
1321
+
1322
+ # Update the words for that match (though a change may not have occured)
1323
+ word0 = word0.replace("-", "")
1324
+ word1 = word1.replace("-", "")
1325
+ ne.updateName0(index0, word0)
1326
+ ne.updateName1(index1, word1)
1327
+
1328
+ # concatonates the two lists together back into strings
1329
+ name0, name1 = ne.getModifiedNames()
1330
+ return name0, name1
1331
+
1332
def _pronunciationComparison(self, name0:str, name1:str) -> tuple[bool, list, str, str]:
    """Judges whether two names match when compared by pronunciation (IPA).

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        tuple[bool, list, str, str]: whether the names match, the best pairing of
            IPA words as (index in name0, index in name1, score) tuples with the
            indices given as strings, the ipa of name0, the ipa of name1
    """
    # Spelling-based pairing; its 0 and 100 scores override the IPA scores below
    spellingCombo = self._findWhichWordsMatchAndHowWell(name0, name1)

    # Build, clean and mutually normalize the IPA of both names
    rawIpa0 = self._getPronunciation(name0)
    rawIpa1 = self._getPronunciation(name1)
    tidyIpa0 = self._cleanIpa(rawIpa0)
    tidyIpa1 = self._cleanIpa(rawIpa1)
    ipaOfName0, ipaOfName1 = self._modifyIpasTogether(tidyIpa0, tidyIpa1)

    ipaWords0 = ipaOfName0.split()
    ipaWords1 = ipaOfName1.split()

    # Score every IPA word of name0 against every IPA word of name1
    pairScores = []
    for pos0, sound0 in enumerate(ipaWords0):
        for pos1, sound1 in enumerate(ipaWords1):
            pairScore = fuzz.ratio(sound0, sound1)

            # Take over the spelling score for this pair when it is exactly 100 or 0
            for spelledPos0, spelledPos1, spelledScore in spellingCombo:
                samePair = pos0 == int(spelledPos0) and pos1 == int(spelledPos1)
                if samePair and (spelledScore == 100 or spelledScore == 0):
                    pairScore = spelledScore

            pairScores.append((str(pos0), str(pos1), pairScore))

    # Number of words in the shorter name
    minLength = min(len(ipaWords0), len(ipaWords1))

    def usesEachWordOnce(candidate) -> bool:
        """Whether no word of either name occurs in more than one pair of `candidate`."""
        distinct0 = {pair[0] for pair in candidate}
        distinct1 = {pair[1] for pair in candidate}
        return len(distinct0) == len(candidate) and len(distinct1) == len(candidate)

    # Among all valid pairings of that size, keep the one with the highest total score
    validPairings = [
        candidate
        for candidate in itertools.combinations(pairScores, minLength)
        if usesEachWordOnce(candidate)
    ]
    bestPairing = max(validPairings, key=lambda pairing: sum(pair[2] for pair in pairing))
    cleanedMaxCombination = list(bestPairing)

    # The weakest pair decides the outcome
    lowestScore = min(pair[2] for pair in cleanedMaxCombination)

    # Short names (one or two words) need at least 80; longer ones need more than 75
    if minLength <= 2:
        isMatch = lowestScore >= 80
    else:
        isMatch = lowestScore > 75
    return isMatch, cleanedMaxCombination, ipaOfName0, ipaOfName1
1419
+
1420
+ def _getPronunciation(self, name:str) -> str:
1421
+ """Gets the pronunciation of the name.
1422
+
1423
+ Args:
1424
+ name (str): a name
1425
+
1426
+ Returns:
1427
+ str: the ipa of the name
1428
+ """
1429
+ pList = []
1430
+ for word in name.split():
1431
+ pList.append(self._getIpaOfOneWord(word))
1432
+ pronunciationOfName = " ".join(pList)
1433
+ return pronunciationOfName
1434
+
1435
@lru_cache(maxsize=1000)
def _getIpaOfOneWord(self, word:str) -> str:
    """Gets the pronunciation of one word.

    The whole word is looked up first. If that fails, the word is consumed
    greedily: on every pass the longest remaining substring with a usable
    pronunciation is recorded at its start position and blanked out, until
    nothing is left to convert.

    Args:
        word (str): a word

    Returns:
        str: the ipa of the word
    """
    # NOTE(review): lru_cache on an instance method includes `self` in the
    # cache key and keeps the instance referenced by the cache.

    # Setup
    word = word.replace(" ", "")
    word = unidecode(word)
    word = word.lower()
    pronunciationList = [""] * len(word)

    # Fallback pronunciation for single letters
    letterToPronunciation = {
        "a": "æ", "b": "b", "c": "k", "d": "d", "e": "ɛ", "f": "f", "g": "g", "h": "h", "i": "ɪ",
        "j": "ʤ", "k": "k", "l": "l", "m": "m", "n": "n", "o": "o", "p": "p", "q": "k", "r": "r",
        "s": "s", "t": "t", "u": "u", "v": "v", "w": "w", "x": "ks", "y": "j", "z": "z"
    }

    def substringSplitsTh(substring:str, word:str, i:int, j:int) -> bool:
        """Tells whether taking word[i:j] would cut a 'th' in half.

        Args:
            substring (str): the candidate substring
            word (str): the full word
            i (int): the start index of the substring
            j (int): the end index (exclusive) of the substring

        Returns:
            bool: True if the substring starts with the 'h' of a 'th' or ends with its 't'
        """
        if i == j:
            return False
        # Only look at the preceding character when there is one; at i == 0
        # the index i - 1 would wrap around to the end of the word.
        if i > 0 and substring[0] == 'h' and word[i - 1] == 't':
            return True
        if j <= len(word) - 1 and substring[-1] == 't' and word[j] == 'h':
            return True
        return False

    # Tries to get the ipa from the plain word
    firstAttempt = self._pronunciationHailMary(word)
    if "*" not in firstAttempt:
        return firstAttempt

    # While there are still letters in the word
    substringAdded = True
    while substringAdded:
        # Initialize variables to store the largest matching substring and its length
        substringAdded = False
        largestSubstring = ""
        pronunciationOfLargestSubstring = ""
        largestSubstringLen = 0
        beginningIndexOfSubstring = 0
        endIndexOfSubstring = 0

        # Iterate over every possible substring
        for i in range(len(word)):
            for j in range(i + 1, len(word) + 1):
                substring = word[i:j]

                # Only strictly longer candidates can replace the current best
                if len(substring) <= largestSubstringLen:
                    continue
                # Blanks mark letters that were already converted
                if " " in substring:
                    continue
                if len(substring) > 1:
                    substringIpa = self._convert(substring)
                    # Reject unknown substrings, suspiciously long conversions
                    # and substrings that would split a 'th'
                    if ("*" in substringIpa) or (len(substringIpa) >= len(substring) * 2) or (substringSplitsTh(substring, word, i, j)):
                        continue
                    pronunciationOfLargestSubstring = substringIpa
                elif len(substring) == 1:
                    pronunciationOfLargestSubstring = letterToPronunciation.get(substring, largestSubstring)

                largestSubstring = substring
                substringAdded = True
                largestSubstringLen = len(substring)
                beginningIndexOfSubstring = i
                endIndexOfSubstring = j

        # Records the substring's ipa at its start position and blanks it out of the word
        if substringAdded:
            pronunciationList[beginningIndexOfSubstring] = pronunciationOfLargestSubstring
            spaces = " " * largestSubstringLen
            word = word.rstrip()
            word = word[:beginningIndexOfSubstring] + spaces + word[endIndexOfSubstring:]

    # Concatenates the list together at the end to get the pronunciation
    pronunciation = "".join(pronunciationList)
    return pronunciation
1525
+
1526
+ def _pronunciationHailMary(self, word:str) -> str:
1527
+ """Tries to get the pronunciation from the predefined ipa dictionary.
1528
+
1529
+ Args:
1530
+ word (str): the regular word
1531
+
1532
+ Returns:
1533
+ str: the ipa of the word, or just the word with an asterix if no pronunciation found
1534
+ """
1535
+ namePronuncation = self.namesToIpa.get(word)
1536
+ if namePronuncation != None:
1537
+ return namePronuncation
1538
+ return word + "*"
1539
+
1540
+ def _convert(self, word:str) -> str:
1541
+ """Helper function of _getIpaOfOneWord.
1542
+ Gets the ipa of a word (with more than one letter).
1543
+
1544
+ Args:
1545
+ word (str): a word (with more than one letter)
1546
+
1547
+ Returns:
1548
+ str: the ipa of that word
1549
+ """
1550
+ ipaPronunciation = self.syllableToIpa.get(word)
1551
+ if ipaPronunciation != None:
1552
+ return ipaPronunciation
1553
+ return word + "*"
1554
+
1555
+ def _cleanIpa(self, ipa:str) -> str:
1556
+ """cleans ipa to get rid of double ipa-consonants and other mistakes.
1557
+
1558
+ Args:
1559
+ ipa (str): the ipa of a word
1560
+
1561
+ Returns:
1562
+ str: the cleaned ipa
1563
+ """
1564
+ allIpaConsonants = ['l', 'd', 'z', 'b', 't', 'k', 'n', 's', 'w', 'v', 'ð', 'ʒ', 'ʧ', 'θ', 'h', 'g', 'ʤ', 'ŋ', 'p', 'm', 'ʃ', 'f', 'j', 'r']
1565
+ for consonant in allIpaConsonants:
1566
+ doubleConsonant = consonant + consonant
1567
+ if doubleConsonant in ipa:
1568
+ ipa = ipa.replace(doubleConsonant, consonant)
1569
+ ipa = ipa.replace("ɛɛ", "i")
1570
+ ipa = ipa.replace("ɪɪ", "ɪ")
1571
+ ipa = ipa.replace("iɪ", "i")
1572
+ ipa = ipa.replace("ŋg", "ŋ")
1573
+ ipa = ipa.replace(",", "")
1574
+ return ipa
1575
+
1576
+ def _modifyIpasTogether(self, ipa0:str, ipa1:str) -> tuple[str,str]:
1577
+ """Modifies two ipas by comparing each to one another.
1578
+
1579
+ Args:
1580
+ ipa1 (str): the first ipa name
1581
+ ipa2 (str): the second ipa name
1582
+
1583
+ Returns:
1584
+ tuple[str,str]: the two modified names
1585
+ """
1586
+ for rule in self.ipaRules:
1587
+ ipa0, ipa1 = self._replaceSubstringSandwichMeatIfMatchingBread(ipa0, ipa1, rule[0], rule[1], rule[2], rule[3], rule[4])
1588
+ return ipa0, ipa1