NameComparator 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- NameComparator/NameComparator.py +1588 -0
- NameComparator/jsonData/_ipa_all_names.json +52952 -0
- NameComparator/jsonData/_ipa_common_word_parts.json +8678 -0
- NameComparator/jsonData/_nickname_sets.json +414 -0
- NameComparator/jsonData/_rules_ipa.json +79 -0
- NameComparator/jsonData/_rules_spelling.json +78 -0
- NameComparator/jsonData/_top_surnames.json +4026 -0
- NameComparator/requirements.txt +3 -0
- NameComparator-1.0.0.dist-info/LICENSE +21 -0
- NameComparator-1.0.0.dist-info/METADATA +57 -0
- NameComparator-1.0.0.dist-info/RECORD +13 -0
- NameComparator-1.0.0.dist-info/WHEEL +5 -0
- NameComparator-1.0.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,1588 @@
|
|
|
1
|
+
from unidecode import unidecode
|
|
2
|
+
from functools import lru_cache
|
|
3
|
+
from fuzzywuzzy import fuzz
|
|
4
|
+
import re
|
|
5
|
+
import itertools
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
|
|
9
|
+
class NameComparator():
|
|
10
|
+
"""The class used for fuzzy comparing two names.
|
|
11
|
+
"""
|
|
12
|
+
def __init__(self) -> None:
    """Loads the bundled JSON data and prepares the lookup structures used by the comparisons.

    Placeholder strings inside the IPA and spelling rules (such as "vowel" or
    "consonant") are swapped in place for the list of characters they stand for,
    and an index from each nickname to the nickname sets containing it is built.
    """
    def expandPlaceholders(rules:list, table:dict) -> None:
        # Swap each placeholder string found in a rule for its character list.
        # Non-string items are skipped because they cannot be looked up in the table.
        for rule in rules:
            for position, element in enumerate(rule):
                if isinstance(element, str) and element in table:
                    rule[position] = table[element]

    # Pronunciation data and the set of top surnames
    self.namesToIpa:dict = self._getDataFromJson('_ipa_all_names.json')
    self.syllableToIpa:dict = self._getDataFromJson('_ipa_common_word_parts.json')
    self.topSurnames:set = {str(row[0]) for row in self._getDataFromJson('_top_surnames.json')}

    # IPA rules, with their placeholders expanded
    self.ipaRules:list = self._getDataFromJson('_rules_ipa.json')
    ipaConsonants = ["l", "d", "z", "b", "t", "k", "n", "s", "w", "v", "ð", "ʒ", "ʧ", "θ", "h", "g", "ʤ", "ŋ", "p", "m", "ʃ", "f", "j", "r"]
    ipaPlaceholders = {
        "consonant_or_break": ["-", *ipaConsonants],
        "consonant": ipaConsonants,
        "vowel": ["ɑ", "a", "æ", "ɪ", "i", "ɛ", "e", "ə", "ɔ", "ʊ", "u", "o"],
    }
    expandPlaceholders(self.ipaRules, ipaPlaceholders)

    # Spelling rules, with their placeholders expanded
    self.spellingRules:list = self._getDataFromJson('_rules_spelling.json')
    alphabet = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"]
    spellingPlaceholders = {
        "consonant": ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "y", "z"],
        "vowel": ["a", "e", "i", "o", "u", "y"],
        "letter": alphabet,
        "letter_or_break": ["-", *alphabet],
    }
    expandPlaceholders(self.spellingRules, spellingPlaceholders)

    # Nickname sets, plus a reverse index: nickname -> indices of the sets containing it
    self.nicknameSets = self._getDataFromJson('_nickname_sets.json')
    self.nicknameToNicknameSetNum = {}
    for setNumber, nicknameSet in enumerate(self.nicknameSets):
        for nickname in nicknameSet:
            self.nicknameToNicknameSetNum.setdefault(nickname, []).append(setNumber)
|
|
53
|
+
|
|
54
|
+
def _getDataFromJson(self, fileName:str) -> dict|list:
    """Reads one of the JSON files stored in the 'jsonData' folder next to this module.

    Args:
        fileName (str): the name of the json file inside the 'jsonData' folder

    Returns:
        dict|list: the parsed json content
    """
    # The data folder is resolved relative to this source file, not the working directory
    jsonPath = os.path.join(os.path.dirname(__file__), 'jsonData', fileName)
    with open(jsonPath, encoding="utf-8") as handle:
        contents = json.load(handle)
    return contents
|
|
67
|
+
|
|
68
|
+
def compareTwoNames(self, name0:str, name1:str) -> dict:
    """Runs the full fuzzy comparison of two names, stopping at the first attempt that matches.

    Args:
        name0 (str): a name
        name1 (str): a name

    Returns:
        dict: the comparison results. Keys: 'match', 'tooGeneric', 'tooShort' (booleans) and
            'attempt1' through 'attempt4' (a tuple of the two compared strings and the word combo
            for every attempt that was run, None for attempts that were never reached)
    """
    # Setup
    result = {'match': False, 'tooGeneric': False, 'tooShort': False}
    result.update(dict.fromkeys(('attempt1', 'attempt2', 'attempt3', 'attempt4')))

    # Clean each name alone, then against each other
    name0, name1 = self._cleanNamesTogether(
        self._cleanNameByItself(name0),
        self._cleanNameByItself(name1),
    )

    # Flag names that are too short or too generic (recorded, but does not stop the comparison)
    result['tooShort'] = self._eitherNameTooShort(name0, name1)
    result['tooGeneric'] = self._eitherNameTooGeneric(name0, name1)

    # Clean away nicknames
    name0, name1 = self._removeNicknames(name0, name1)

    # 1st attempt: spelling comparison of the cleaned names
    isMatch, combo = self._spellingComparison(name0, name1)
    result['attempt1'] = (name0, name1, combo)
    if isMatch:
        result['match'] = True
        return result

    # Give up early when the names are not worth further attempts
    if self._isWorthContinuing(name0, name1) is False:
        return result

    # 2nd attempt: spelling comparison after modifying the names via the spelling rules
    altered0, altered1 = self._modifyNamesTogether(name0, name1)
    isMatch, combo = self._spellingComparison(altered0, altered1)
    result['attempt2'] = (altered0, altered1, combo)
    if isMatch:
        result['match'] = True
        return result

    # 3rd attempt (modified names) and 4th attempt (original names): pronunciation comparison
    pronunciationAttempts = (
        ('attempt3', altered0, altered1),
        ('attempt4', name0, name1),
    )
    for key, first, second in pronunciationAttempts:
        isMatch, combo, ipaFirst, ipaSecond = self._pronunciationComparison(first, second)
        result[key] = (ipaFirst, ipaSecond, combo)
        if isMatch:
            result['match'] = True
            break

    # Either a pronunciation attempt matched, or 'match' is still False
    return result
|
|
139
|
+
|
|
140
|
+
def _cleanNameByItself(self, name:str) -> str:
|
|
141
|
+
"""Cleans a singular name to get rid of extra or unhelpful data, or to standardize surnames.
|
|
142
|
+
|
|
143
|
+
Args:
|
|
144
|
+
name (str): the name being cleaned
|
|
145
|
+
|
|
146
|
+
Returns:
|
|
147
|
+
str: the cleaned name
|
|
148
|
+
"""
|
|
149
|
+
# Deal with blank names
|
|
150
|
+
if (name == "") or (not isinstance(name, str)):
|
|
151
|
+
return "_"
|
|
152
|
+
|
|
153
|
+
# Deal with whitespace
|
|
154
|
+
name = re.sub(r'[^\S ]', ' ', name)
|
|
155
|
+
name = re.sub(r" +", " ", name)
|
|
156
|
+
name = name.strip()
|
|
157
|
+
|
|
158
|
+
# Standardize name into ascii
|
|
159
|
+
name = unidecode(name)
|
|
160
|
+
name = name.lower()
|
|
161
|
+
|
|
162
|
+
# Deal with blank names again
|
|
163
|
+
if name == "":
|
|
164
|
+
return "_"
|
|
165
|
+
|
|
166
|
+
# Remove Punctiation
|
|
167
|
+
name = re.sub(r"[.,?;\"*()]", "", name)
|
|
168
|
+
|
|
169
|
+
# Remove spaces after apostrophe
|
|
170
|
+
name = re.sub("' +", "'", name)
|
|
171
|
+
|
|
172
|
+
# Remove jr and sr
|
|
173
|
+
name = re.sub(r"\bjr\b", "", name).replace(r"\bjunior\b", "")
|
|
174
|
+
name = re.sub(r"\bsr\b", "", name).replace(r"\bsenior\b", "")
|
|
175
|
+
|
|
176
|
+
# Remove titles
|
|
177
|
+
name = re.sub(r"\bprof\b", "", name).replace(r"\bprofessor\b", "")
|
|
178
|
+
name = re.sub(r"\bmr\b", "", name).replace(r"\bmister\b", "")
|
|
179
|
+
name = re.sub(r"\bmrs\b", "", name).replace(r"\bmissus\b", "")
|
|
180
|
+
name = re.sub(r"\bms\b", "", name).replace(r"\bmiss\b", "")
|
|
181
|
+
name = re.sub(r"\bdr\b", "", name).replace(r"\bdoctor\b", "")
|
|
182
|
+
name = re.sub(r"\bstudent\b", "", name)
|
|
183
|
+
name = re.sub(r"\brev\b", "", name)
|
|
184
|
+
name = name.replace("reverend", "")
|
|
185
|
+
|
|
186
|
+
# Remove family relations
|
|
187
|
+
name = re.sub(r"\bsister\b", "", name)
|
|
188
|
+
name = re.sub(r"\bbrother\b", "", name)
|
|
189
|
+
name = re.sub(r"\bmother\b", "", name)
|
|
190
|
+
name = re.sub(r"\bfather\b", "", name)
|
|
191
|
+
name = re.sub(r" in law", " ", name)
|
|
192
|
+
|
|
193
|
+
# Removes "head of household"
|
|
194
|
+
name = name.replace("head of household", "")
|
|
195
|
+
|
|
196
|
+
# Remove more than one space again
|
|
197
|
+
name = re.sub(r" +", " ", name)
|
|
198
|
+
|
|
199
|
+
# Remove stuff like 'the 3rd'
|
|
200
|
+
name = re.sub(r"[1-9][a-z]2,6", "", name).replace(" the ", "")
|
|
201
|
+
|
|
202
|
+
# Remove Roman numerals
|
|
203
|
+
name = ' '.join(re.sub(r'\b(ii|iii|iv)\b', '', word) for word in name.split())
|
|
204
|
+
name = re.sub(r" +", " ", name)
|
|
205
|
+
name = name.strip()
|
|
206
|
+
|
|
207
|
+
# Remove 'no suffix'
|
|
208
|
+
name = name.replace("no suffix", "")
|
|
209
|
+
|
|
210
|
+
# Deal with Dutch names
|
|
211
|
+
name = re.sub(r"\bvan de", "vande", name)
|
|
212
|
+
name = re.sub(r"\bvan den", "vanden", name)
|
|
213
|
+
name = re.sub(r"\bvan der", "vander", name)
|
|
214
|
+
|
|
215
|
+
# Deal with whitespace one last time, then return
|
|
216
|
+
name = re.sub(r" +", " ", name)
|
|
217
|
+
name = name.strip()
|
|
218
|
+
return name
|
|
219
|
+
|
|
220
|
+
def _cleanNamesTogether(self, name0:str, name1:str) -> tuple[str, str]:
|
|
221
|
+
"""Cleans names by comparing them to one another, fixing common errors to standardize.
|
|
222
|
+
|
|
223
|
+
Args:
|
|
224
|
+
name0 (str): a name
|
|
225
|
+
name1 (str): a name
|
|
226
|
+
|
|
227
|
+
Returns:
|
|
228
|
+
tuple[str, str]: the two cleaned names
|
|
229
|
+
"""
|
|
230
|
+
# Returns if either name is blank
|
|
231
|
+
if (name0 == "" or name1 == ""):
|
|
232
|
+
return name0, name1
|
|
233
|
+
|
|
234
|
+
# Deal with dashes
|
|
235
|
+
name0, name1 = self._dealWithDashes(name0, name1)
|
|
236
|
+
|
|
237
|
+
# Deal with Scottish and Irish names
|
|
238
|
+
name0, name1 = self._fixRelatedPrefixes(name0, name1, 'mac', 'mc')
|
|
239
|
+
name0, name1 = self._fixMcMac(name0, name1)
|
|
240
|
+
|
|
241
|
+
# Deal with just Irish names
|
|
242
|
+
oNames = [
|
|
243
|
+
'beirne', 'berry', 'boyle', 'bryant', 'brian', 'brien', 'bryan', 'ceallaigh', 'conner',
|
|
244
|
+
'connor', 'conor', 'daniel', 'day', 'dean', 'dea', 'doherty', 'donnell', 'donnel', 'donoghue',
|
|
245
|
+
'donohue', 'donovan', 'dowd', 'driscoll', 'fallon', 'farrell', 'flaherty', 'flanagan', 'flynn',
|
|
246
|
+
'gara', 'gorman', 'grady', 'guinn', 'guin', 'hagan', 'haire', 'hair', 'halloran', 'hanlon',
|
|
247
|
+
'hara', 'hare', 'harra', 'harrow', 'haver', 'hearn', 'hern', 'herron', 'higgins', 'hora',
|
|
248
|
+
'kane', 'keefe', 'keeffe', 'kelley', 'kelly', 'laughlin', 'leary', 'loughlin', 'mahoney',
|
|
249
|
+
'mahony', 'maley', 'malley', 'mara', 'mary', 'meara', 'melia', 'moore', 'more', 'muir',
|
|
250
|
+
'murchu', 'mure', 'murphy', 'neall', 'neal', 'neill', 'neil', 'ney', 'niall', 'quinn', 'regan',
|
|
251
|
+
'reilly', 'riley', 'riordan', 'roark', 'rorke', 'rourke', 'ryan', 'shaughnessy', 'shea',
|
|
252
|
+
'shields', 'sullivan', 'toole', 'tool',
|
|
253
|
+
]
|
|
254
|
+
for surname in oNames:
|
|
255
|
+
name0, name1 = self._removeIrishO(name0, name1, surname)
|
|
256
|
+
|
|
257
|
+
# Deal with prefixes and optional intros that make the match worse
|
|
258
|
+
name0, name1 = self._fixRelatedPrefixes(name0, name1, 'de', 'di')
|
|
259
|
+
name0, name1 = self._fixRelatedPrefixes(name0, name1, 'del', 'dil')
|
|
260
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "d'")
|
|
261
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "de")
|
|
262
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "fi")
|
|
263
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "santa")
|
|
264
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "san")
|
|
265
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "de la")
|
|
266
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "de los")
|
|
267
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "del")
|
|
268
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "la")
|
|
269
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "le")
|
|
270
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "du")
|
|
271
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "dela")
|
|
272
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "los")
|
|
273
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "der")
|
|
274
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "den")
|
|
275
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "vanden")
|
|
276
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "vander")
|
|
277
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "vande")
|
|
278
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "van")
|
|
279
|
+
name0, name1 = self._removeUnnecessaryPrefixes(name0, name1, "von")
|
|
280
|
+
name0, name1 = self._combinePrefixWithSurnameifInBoth(name0, name1, "de")
|
|
281
|
+
name0, name1 = self._combinePrefixWithSurnameifInBoth(name0, name1, "van")
|
|
282
|
+
|
|
283
|
+
# Combine words that are one word in the other name
|
|
284
|
+
while True:
|
|
285
|
+
combined, name0, name1 = self._combineSplitWords(name0, name1)
|
|
286
|
+
if not combined:
|
|
287
|
+
break
|
|
288
|
+
while True:
|
|
289
|
+
combined, name1, name0 = self._combineSplitWords(name1, name0)
|
|
290
|
+
if not combined:
|
|
291
|
+
break
|
|
292
|
+
|
|
293
|
+
# Remove extra spaces
|
|
294
|
+
name0 = re.sub(r'\s+', ' ', name0)
|
|
295
|
+
name1 = re.sub(r'\s+', ' ', name1)
|
|
296
|
+
name0 = name0.strip()
|
|
297
|
+
name1 = name1.strip()
|
|
298
|
+
|
|
299
|
+
# Return the cleaned names
|
|
300
|
+
return name0, name1
|
|
301
|
+
|
|
302
|
+
def _dealWithDashes(self, name0:str, name1:str) -> tuple[str, str]:
|
|
303
|
+
"""Cleans both names in order to deal with dashes in names.
|
|
304
|
+
|
|
305
|
+
Args:
|
|
306
|
+
name0 (str): a name
|
|
307
|
+
name1 (str): a name
|
|
308
|
+
|
|
309
|
+
Returns:
|
|
310
|
+
tuple[str, str]: the cleaned names
|
|
311
|
+
"""
|
|
312
|
+
# Return old if no dash in either
|
|
313
|
+
if ('-' not in name0) and ('-' not in name1):
|
|
314
|
+
return name0, name1
|
|
315
|
+
|
|
316
|
+
# Return old if dash in both
|
|
317
|
+
if ('-' in name0) and ('-' in name1):
|
|
318
|
+
return name0, name1
|
|
319
|
+
|
|
320
|
+
# Try replacing the dash with a space, and combine words if necessary
|
|
321
|
+
name0Edited = name0.replace('-', ' ')
|
|
322
|
+
name1Edited = name1.replace('-', ' ')
|
|
323
|
+
_, name0Edited, name1Edited = self._combineSplitWords(name0Edited, name1Edited)
|
|
324
|
+
|
|
325
|
+
# Return old if the score did not improve
|
|
326
|
+
diff, _, _ = self._calculateEditImprovement(name0, name1, name0Edited, name1Edited)
|
|
327
|
+
if diff <= 0:
|
|
328
|
+
return name0, name1
|
|
329
|
+
|
|
330
|
+
# Return the edited names
|
|
331
|
+
return name0Edited, name1Edited
|
|
332
|
+
|
|
333
|
+
def _calculateEditImprovement(self, name0:str, name1:str, name0Edited:str, name1Edited:str) -> tuple[float, tuple, tuple]:
|
|
334
|
+
"""Calculates how much editing a name or both names improved the score in comparison to the original names.
|
|
335
|
+
|
|
336
|
+
Args:
|
|
337
|
+
name0 (str): the original first name
|
|
338
|
+
name1 (str): the original second name
|
|
339
|
+
name0Edited (str): the edited first name
|
|
340
|
+
name1Edited (str): the edited second name
|
|
341
|
+
|
|
342
|
+
Returns:
|
|
343
|
+
tuple[float, tuple, tuple]: the score of how much the edits improved the comparison (can be negative),
|
|
344
|
+
the word combo of the original, the word combo of the edited verison
|
|
345
|
+
"""
|
|
346
|
+
ogWordCombo = self._findWhichWordsMatchAndHowWell(name0, name1)
|
|
347
|
+
editedWordCombo = self._findWhichWordsMatchAndHowWell(name0Edited, name1Edited)
|
|
348
|
+
ogAverageScore = sum(tup[2] for tup in ogWordCombo) / len(ogWordCombo)
|
|
349
|
+
editedAverageScore = sum(tup[2] for tup in editedWordCombo) / len(editedWordCombo)
|
|
350
|
+
diff = editedAverageScore - ogAverageScore
|
|
351
|
+
return diff, ogWordCombo, editedWordCombo
|
|
352
|
+
|
|
353
|
+
def _getPairIndicesAndWords(self, name0:str, name1:str) -> list[tuple[int, int, str, str]]:
|
|
354
|
+
"""Identifies which words in the names match.
|
|
355
|
+
|
|
356
|
+
Args:
|
|
357
|
+
name0 (str): a name
|
|
358
|
+
name1 (str): a name
|
|
359
|
+
|
|
360
|
+
Returns:
|
|
361
|
+
list[tuple[int, int, str, str]]: the list of which words match. Tuples of: the index of word in name0, the index of word in name1, in word in name0, the word in name1
|
|
362
|
+
"""
|
|
363
|
+
combo = self._findWhichWordsMatchAndHowWell(name0, name1)
|
|
364
|
+
words0 = name0.split()
|
|
365
|
+
words1 = name1.split()
|
|
366
|
+
matchIndices = [(int(tup[0]), int(tup[1])) for tup in combo]
|
|
367
|
+
matchIndicesWithWords = [(tup[0], tup[1], words0[tup[0]], words1[tup[1]]) for tup in matchIndices]
|
|
368
|
+
return matchIndicesWithWords
|
|
369
|
+
|
|
370
|
+
class NameEditor():
    """Holds two names as lists of words so that individual words can be replaced by index.
    TODO implement more where possible.
    """
    def __init__(self, name0:str, name1:str) -> None:
        """Stores each name as a list of its whitespace-separated words.

        Args:
            name0 (str): a name
            name1 (str): a name
        """
        self.words0, self.words1 = name0.split(), name1.split()

    def updateName0(self, index:int, updatedWord:str) -> None:
        """Overwrites the word of name0 found at the given index.

        Args:
            index (int): the position of the word to overwrite
            updatedWord (str): the replacement string
        """
        self.words0[index] = updatedWord

    def updateName1(self, index:int, updatedWord:str) -> None:
        """Overwrites the word of name1 found at the given index.

        Args:
            index (int): the position of the word to overwrite
            updatedWord (str): the replacement string
        """
        self.words1[index] = updatedWord

    def getModifiedNames(self) -> tuple[str, str]:
        """Joins the stored words of each name back together with single spaces.

        Returns:
            tuple[str, str]: the modified names
        """
        rebuilt0 = ' '.join(self.words0)
        rebuilt1 = ' '.join(self.words1)
        return rebuilt0, rebuilt1
|
|
408
|
+
|
|
409
|
+
def _fixRelatedPrefixes(self, name0:str, name1:str, prefixA:str, prefixB:str) -> tuple[str, str]:
|
|
410
|
+
"""Cleans names to deal with prefixes that are different by spelling, but functionally the same.
|
|
411
|
+
|
|
412
|
+
Args:
|
|
413
|
+
name0 (str): a name
|
|
414
|
+
name1 (str): a name
|
|
415
|
+
prefixA (str): the first related prefix
|
|
416
|
+
prefixB (str): the second related prefix
|
|
417
|
+
|
|
418
|
+
Returns:
|
|
419
|
+
tuple[str, str]: the two modified names
|
|
420
|
+
"""
|
|
421
|
+
# Return if prefix1 in neither or prefix2 in neither
|
|
422
|
+
if (f' {prefixA}' not in name0) and (f' {prefixA}' not in name1):
|
|
423
|
+
return name0, name1
|
|
424
|
+
if (f' {prefixB}' not in name0) and (f' {prefixB}' not in name1):
|
|
425
|
+
return name0, name1
|
|
426
|
+
|
|
427
|
+
# Return if prefix1 or prefix2 is found in both
|
|
428
|
+
if (f' {prefixA}' in name0) and (f' {prefixA}' in name1):
|
|
429
|
+
return name0, name1
|
|
430
|
+
if (f' {prefixB}' in name0) and (f' {prefixB}' in name1):
|
|
431
|
+
return name0, name1
|
|
432
|
+
|
|
433
|
+
# Replace prefix2 with prefix1
|
|
434
|
+
if f' {prefixB}' in name0:
|
|
435
|
+
name0 = name0.replace(f' {prefixB}', f' {prefixA}')
|
|
436
|
+
else:
|
|
437
|
+
name1 = name1.replace(f' {prefixB}', f' {prefixA}')
|
|
438
|
+
return name0, name1
|
|
439
|
+
|
|
440
|
+
def _fixMcMac(self, name0:str, name1:str) -> tuple[str, str]:
|
|
441
|
+
"""Modified names to fix problems where mc or mac are in either names and don't match when they should.
|
|
442
|
+
|
|
443
|
+
Args:
|
|
444
|
+
name0 (str): a name
|
|
445
|
+
name1 (str): a name
|
|
446
|
+
|
|
447
|
+
Returns:
|
|
448
|
+
tuple[str, str]: the two modified names
|
|
449
|
+
"""
|
|
450
|
+
# Return for most names
|
|
451
|
+
if ("mc" not in name0) and ("mac" not in name0) and ("mc" not in name1) and ("mac" not in name1):
|
|
452
|
+
return name0, name1
|
|
453
|
+
|
|
454
|
+
# Combine split words (if any)
|
|
455
|
+
_, name0, name1 = self._combineSplitWords(name0, name1)
|
|
456
|
+
|
|
457
|
+
# Edit the names, if necessary
|
|
458
|
+
ne = self.NameEditor(name0, name1)
|
|
459
|
+
for prefix in ['mc', 'mac']:
|
|
460
|
+
for index0, index1, word0, word1 in self._getPairIndicesAndWords(name0, name1):
|
|
461
|
+
# Skip pair if the prefix is in both words
|
|
462
|
+
if (word0.startswith(prefix)) and (word1.startswith(prefix)):
|
|
463
|
+
continue
|
|
464
|
+
|
|
465
|
+
# Skip pair if the prefix is not in either of them
|
|
466
|
+
if (not word0.startswith(prefix)) and (not word1.startswith(prefix)):
|
|
467
|
+
continue
|
|
468
|
+
|
|
469
|
+
# Skip pair if either word is a firstname
|
|
470
|
+
if (index0 < 1) or (index1 < 1):
|
|
471
|
+
continue
|
|
472
|
+
|
|
473
|
+
# Skip pair if the shortest word is only 4 long
|
|
474
|
+
if min(len(word0), len(word1)) < 3:
|
|
475
|
+
continue
|
|
476
|
+
|
|
477
|
+
# Skip pair if they are already a solid match
|
|
478
|
+
if fuzz.ratio(word0, word1) > 80:
|
|
479
|
+
continue
|
|
480
|
+
|
|
481
|
+
# Skip pair if the prefix is removed and not a good fuzzy match
|
|
482
|
+
if word0.startswith(prefix):
|
|
483
|
+
updatedWord0 = word0.replace(prefix, "", 1)
|
|
484
|
+
updatedWord1 = word1
|
|
485
|
+
else:
|
|
486
|
+
updatedWord0 = word0
|
|
487
|
+
updatedWord1 = word1.replace(prefix, "", 1)
|
|
488
|
+
if fuzz.ratio(updatedWord0, updatedWord1) < 75:
|
|
489
|
+
continue
|
|
490
|
+
|
|
491
|
+
# Update the words
|
|
492
|
+
ne.updateName0(index0, updatedWord0)
|
|
493
|
+
ne.updateName1(index1, updatedWord1)
|
|
494
|
+
|
|
495
|
+
# Return the edited (or not) names
|
|
496
|
+
return ne.getModifiedNames()
|
|
497
|
+
|
|
498
|
+
def _removeIrishO(self, name0:str, name1:str, surname:str) -> tuple[str, str]:
|
|
499
|
+
"""Removes the irish O if needed for easier name comparison.
|
|
500
|
+
|
|
501
|
+
Args:
|
|
502
|
+
name0 (str): a name
|
|
503
|
+
name1 (str): a name
|
|
504
|
+
surname (str): one of the irish surnames that often starts with O'
|
|
505
|
+
|
|
506
|
+
Returns:
|
|
507
|
+
tuple[str, str]: the modified names
|
|
508
|
+
"""
|
|
509
|
+
# Skip non applicable names
|
|
510
|
+
if (' o ' not in name0) and (" o" not in name0) and (" o" not in name1) and (' o ' not in name1):
|
|
511
|
+
return name0, name1
|
|
512
|
+
if (surname not in name0) and (surname not in name1):
|
|
513
|
+
return name0, name1
|
|
514
|
+
# Edit the names
|
|
515
|
+
lastname0 = name0.split()[-1]
|
|
516
|
+
if fuzz.ratio(lastname0, surname) > 75:
|
|
517
|
+
if lastname0[0] == 'o':
|
|
518
|
+
name0 = name0.replace(f'{lastname0}', surname)
|
|
519
|
+
else:
|
|
520
|
+
name0 = name0.replace(f'o {lastname0}', surname)
|
|
521
|
+
lastname1 = name1.split()[-1]
|
|
522
|
+
if fuzz.ratio(lastname1, surname) > 75:
|
|
523
|
+
if lastname1[0] == 'o':
|
|
524
|
+
name1 = name1.replace(f'{lastname1}', surname)
|
|
525
|
+
else:
|
|
526
|
+
name1 = name1.replace(f'o {lastname1}', surname)
|
|
527
|
+
return name0, name1
|
|
528
|
+
|
|
529
|
+
def _removeUnnecessaryPrefixes(self, name0:str, name1:str, prefix:str) -> tuple[str,str]:
|
|
530
|
+
"""Removes an unnecessary prefix from either or both of the names.
|
|
531
|
+
|
|
532
|
+
Args:
|
|
533
|
+
name0 (str): a name
|
|
534
|
+
name1 (str): a name
|
|
535
|
+
prefix (str): the prefix to (probably) remove
|
|
536
|
+
|
|
537
|
+
Returns:
|
|
538
|
+
tuple[str,str]: the modified names
|
|
539
|
+
"""
|
|
540
|
+
# If the prefix is not in either names, return the names
|
|
541
|
+
name0 = re.sub(r"\s+", " ", name0)
|
|
542
|
+
name1 = re.sub(r"\s+", " ", name1)
|
|
543
|
+
if (f" {prefix}" not in name0) and (f" {prefix}" not in name1):
|
|
544
|
+
return name0, name1
|
|
545
|
+
|
|
546
|
+
# Setup
|
|
547
|
+
name0Edited = name0
|
|
548
|
+
name1Edited = name1
|
|
549
|
+
spPrefixSp = f" {prefix} "
|
|
550
|
+
spacePrefix = f" {prefix}"
|
|
551
|
+
|
|
552
|
+
# Make the edited names different
|
|
553
|
+
if (spPrefixSp in name0) and (spPrefixSp in name1):
|
|
554
|
+
pass
|
|
555
|
+
elif (spPrefixSp in name0) and (spacePrefix in name1):
|
|
556
|
+
name0Edited = name0Edited.replace(spPrefixSp, spacePrefix)
|
|
557
|
+
elif (spacePrefix in name0) and (spPrefixSp in name1):
|
|
558
|
+
name1Edited = name1Edited.replace(spPrefixSp, spacePrefix)
|
|
559
|
+
name0Edited = name0Edited.replace(spPrefixSp, " ")
|
|
560
|
+
name1Edited = name1Edited.replace(spPrefixSp, " ")
|
|
561
|
+
name0Edited = re.sub(r"\s+", " ", name0Edited)
|
|
562
|
+
name1Edited = re.sub(r"\s+", " ", name1Edited)
|
|
563
|
+
|
|
564
|
+
# If no edits were made, try removing spacePrefix if in just one name and it's a long word
|
|
565
|
+
if (name0 == name0Edited) and (name1 == name1Edited):
|
|
566
|
+
pattern = r'\b{}\w*\b'.format(spacePrefix)
|
|
567
|
+
if (spacePrefix in name0) and (spacePrefix not in name1):
|
|
568
|
+
match = re.search(pattern, name0)
|
|
569
|
+
matchedWord = match.group()
|
|
570
|
+
length = len(matchedWord)
|
|
571
|
+
if length > len(prefix) + 4:
|
|
572
|
+
name0Edited = name0.replace(spacePrefix, " ")
|
|
573
|
+
elif(spacePrefix in name1) and (spacePrefix not in name0):
|
|
574
|
+
match = re.search(pattern, name1)
|
|
575
|
+
matchedWord = match.group()
|
|
576
|
+
length = len(matchedWord)
|
|
577
|
+
if length > len(prefix) + 4:
|
|
578
|
+
name1Edited = name1.replace(spacePrefix, " ")
|
|
579
|
+
|
|
580
|
+
# If the edits were significantly beneficial (or pass spell), return the edited versions
|
|
581
|
+
improvement, _, _= self._calculateEditImprovement(name0, name1, name0Edited, name1Edited)
|
|
582
|
+
if (improvement >= 10) or (self._spellingComparison(name0Edited, name1Edited)[0] and not self._spellingComparison(name0, name1)[0]):
|
|
583
|
+
return name0Edited, name1Edited
|
|
584
|
+
|
|
585
|
+
# Finally, if the words are identical other than the prefix, remove the prefix
|
|
586
|
+
ne = self.NameEditor(name0, name1)
|
|
587
|
+
for index0, index1, word0, word1 in self._getPairIndicesAndWords(name0, name1):
|
|
588
|
+
if (word0.startswith(prefix)) and (word0[len(prefix):] == word1) and (len(word1) > 2):
|
|
589
|
+
ne.updateName0(index0, word0[len(prefix):])
|
|
590
|
+
elif (word1.startswith(prefix)) and (word1[len(prefix):] == word0) and (len(word0) > 2):
|
|
591
|
+
ne.updateName1(index1, word1[len(prefix):])
|
|
592
|
+
name0, name1 = ne.getModifiedNames()
|
|
593
|
+
|
|
594
|
+
# Otherwise, return originals
|
|
595
|
+
return name0, name1
|
|
596
|
+
|
|
597
|
+
def _combinePrefixWithSurnameifInBoth(self, name0:str, name1:str, prefix:str) -> tuple[str, str]:
|
|
598
|
+
"""Combines the prefix with the surname in both of the names if the prefix exists in both.
|
|
599
|
+
|
|
600
|
+
Args:
|
|
601
|
+
name0 (str): a name
|
|
602
|
+
name1 (str): a name
|
|
603
|
+
prefix (str): the prefix to combine with the surname
|
|
604
|
+
|
|
605
|
+
Returns:
|
|
606
|
+
tuple[str, str]: the modified names
|
|
607
|
+
"""
|
|
608
|
+
# Return if ' prefix ' in neither
|
|
609
|
+
if (not re.search(f' {prefix} .', name0)) or (not re.search(f' {prefix} .', name1)):
|
|
610
|
+
return name0, name1
|
|
611
|
+
|
|
612
|
+
# Get the letter after ' prefix '
|
|
613
|
+
letter0 = name0[name0.index(f' {prefix} ') + 4]
|
|
614
|
+
letter1 = name1[name1.index(f' {prefix} ') + 4]
|
|
615
|
+
|
|
616
|
+
# If the letter after matches, replace ' prefix ' with ' prefix'
|
|
617
|
+
if letter0 == letter1:
|
|
618
|
+
name0 = name0.replace(f' {prefix} ', f' {prefix}')
|
|
619
|
+
name1 = name1.replace(f' {prefix} ', f' {prefix}')
|
|
620
|
+
return name0, name1
|
|
621
|
+
|
|
622
|
+
def _combineSplitWords(self, name0:str, name1:str) -> tuple[bool, str, str]:
    """Combines two neighboring words within name0 if that combination is one word in name1.

    Only name0 is ever edited; callers swap the arguments to process the other name.
    At most one combination is made per call.

    Args:
        name0 (str): the name whose words may be combined
        name1 (str): the name used as the reference

    Returns:
        tuple[bool, str, str]: whether a combination was made, name0 (edited if a
            combination was made), and name1 (always unchanged)
    """
    words0 = name0.split()

    # Do not combine words when name0 has fewer than three words
    if len(words0) < 3:
        return False, name0, name1

    # Do not combine words that are already a good spelling match
    if self._spellingComparison(name0, name1)[0]:
        return False, name0, name1

    for index0, _, word0, word1 in self._getPairIndicesAndWords(name0, name1):
        # Skip if word0 and word1 are not a good match
        if (fuzz.partial_ratio(word0, word1) < 75):
            continue

        # Skip if either word is only an initial
        if (len(word0) == 1) or (len(word1) == 1):
            continue

        # Find the left and right neighbors of word0 within name0 ('' when there is none)
        leftNeighbor = words0[index0 - 1] if index0 - 1 >= 0 else ''
        rightNeighbor = words0[index0 + 1] if index0 + 1 < len(words0) else ''

        # Skip neighbors if they are initials
        leftNeighbor = leftNeighbor if len(leftNeighbor) > 1 else ''
        rightNeighbor = rightNeighbor if len(rightNeighbor) > 1 else ''
        # NOTE(review): this returns from the whole method rather than moving on to the
        # next pair, so later pairs are never tried — confirm this is intended
        if (not leftNeighbor) and (not rightNeighbor):
            return False, name0, name1

        # Choose the neighbor that best matches word0's match (word1); the right
        # neighbor wins a tie
        if not leftNeighbor:
            leftWasChosen = False
        elif not rightNeighbor:
            leftWasChosen = True
        else:
            leftScore = fuzz.partial_ratio(leftNeighbor, word1)
            rightScore = fuzz.partial_ratio(rightNeighbor, word1)
            if leftScore > rightScore:
                leftWasChosen = True
            else:
                leftWasChosen = False

        # Initialize the chosen neighbor, compound, and neighbor index
        if leftWasChosen:
            chosenNeighbor = leftNeighbor
            compound = f'{leftNeighbor}{word0}'
            indexN = index0 - 1
        else:
            chosenNeighbor = rightNeighbor
            compound = f'{word0}{rightNeighbor}'
            indexN = index0 + 1

        # Skip if the neighbor is a bad partial match to word0's match
        if fuzz.partial_ratio(chosenNeighbor, word1) < 65:
            continue

        # Check if the compound is significantly better than the original
        # (at least 20 points better, and no further from word1 in length than word0 is)
        ogScore = fuzz.ratio(word0, word1)
        compoundScore = fuzz.ratio(compound, word1)
        if compoundScore < ogScore + 20:
            continue
        diffLength0 = abs(len(word1) - len(word0))
        diffLengthCompound = abs(len(word1) - len(compound))
        if diffLength0 < diffLengthCompound:
            continue

        # If the compound was a better match, use a name editor to create an edited name0 where the words are combined.
        # The neighbor's slot is blanked rather than deleted, so the joined name keeps an extra space there.
        ne = self.NameEditor(name0, name1)
        ne.updateName0(index0, compound)
        ne.updateName0(indexN, '')
        name0Edited, _ = ne.getModifiedNames()

        # If the edited name0 is better (or only slightly worse), go with the edited version
        improvement = self._calculateEditImprovement(name0, name1, name0Edited, name1)[0]
        if improvement > -1:
            return True, name0Edited, name1

    # If no edits were beneficial, just return the original words
    return False, name0, name1
|
|
711
|
+
|
|
712
|
+
def _eitherNameTooShort(self, name0:str, name1:str) -> bool:
|
|
713
|
+
"""Identifies if either of the names is too short.
|
|
714
|
+
|
|
715
|
+
Args:
|
|
716
|
+
name0 (str): a name
|
|
717
|
+
name1 (str): a name
|
|
718
|
+
|
|
719
|
+
Returns:
|
|
720
|
+
bool: whether either was too short
|
|
721
|
+
"""
|
|
722
|
+
combo = self._findWhichWordsMatchAndHowWell(name0, name1)
|
|
723
|
+
shortestWordCount = len(combo)
|
|
724
|
+
if shortestWordCount < 2:
|
|
725
|
+
return True
|
|
726
|
+
else:
|
|
727
|
+
return False
|
|
728
|
+
|
|
729
|
+
def _findWhichWordsMatchAndHowWell(self, name0:str, name1:str) -> list[tuple[str, str, int]]:
|
|
730
|
+
"""Identifies which words in either name are a match, and how well they match.
|
|
731
|
+
|
|
732
|
+
Args:
|
|
733
|
+
name0 (str): a name
|
|
734
|
+
name1 (str): a name
|
|
735
|
+
|
|
736
|
+
Returns:
|
|
737
|
+
list[tuple[str, str, int]]: a list of tuples idenifying the index of the word in the first name,
|
|
738
|
+
the index of the word in the second name, and the score of how well they match
|
|
739
|
+
"""
|
|
740
|
+
# Split strings into lists of words
|
|
741
|
+
words0 = name0.split()
|
|
742
|
+
words1 = name1.split()
|
|
743
|
+
|
|
744
|
+
# Initialize empty list to store scores
|
|
745
|
+
scores = []
|
|
746
|
+
|
|
747
|
+
# Loops through each word in words1 and compare to each word in words2
|
|
748
|
+
for i, word0 in enumerate(words0):
|
|
749
|
+
for j, word1 in enumerate(words1):
|
|
750
|
+
# Gets the score for how well the words match by using fuzzywuzzy
|
|
751
|
+
rScore = fuzz.ratio(word0, word1)
|
|
752
|
+
prScore = fuzz.partial_ratio(word0, word1)
|
|
753
|
+
score = max(rScore, prScore)
|
|
754
|
+
if word0[0] != word1[0]:
|
|
755
|
+
score = rScore
|
|
756
|
+
|
|
757
|
+
# Unless word1 or word2 is only an initial,
|
|
758
|
+
if (len(word0) == 1) or (len(word1) == 1):
|
|
759
|
+
# If the initial matches the first letter of the other word, give it a near perfect score
|
|
760
|
+
if (word0[0] == word1[0]):
|
|
761
|
+
score = 100
|
|
762
|
+
# Otherwise the score is 0
|
|
763
|
+
else:
|
|
764
|
+
score = 0
|
|
765
|
+
|
|
766
|
+
# Add the score to scores
|
|
767
|
+
scores.append((f"{i} words1", f"{j} words2", score))
|
|
768
|
+
|
|
769
|
+
# Gets the length of the shortest word
|
|
770
|
+
minLength = min(len(words0), len(words1))
|
|
771
|
+
|
|
772
|
+
# Generate all combinations of tuples with length equal to the number of words in string2
|
|
773
|
+
combinations = itertools.combinations(scores, minLength)
|
|
774
|
+
|
|
775
|
+
# Filter the combinations to include only valid combinations
|
|
776
|
+
validCombinations = [c for c in combinations if len(set(x[0] for x in c)) == len(c) and len(set(x[1] for x in c)) == len(c)]
|
|
777
|
+
|
|
778
|
+
# Cleans the valid combinations
|
|
779
|
+
cleanedValidCombinations = []
|
|
780
|
+
for validCombo in validCombinations:
|
|
781
|
+
cleanedValidCombo = []
|
|
782
|
+
for tup in validCombo:
|
|
783
|
+
cleanedTup = tuple([s.replace(' words1', '').replace(' words2', '') for s in tup[:2]] + [tup[2]])
|
|
784
|
+
cleanedValidCombo.append(cleanedTup)
|
|
785
|
+
cleanedValidCombinations.append(cleanedValidCombo)
|
|
786
|
+
|
|
787
|
+
# Find the combination(s) with the maximum sum
|
|
788
|
+
maxSum = sum(y[2] for y in max(cleanedValidCombinations, key=lambda x: sum(y[2] for y in x)))
|
|
789
|
+
maxCombinations = []
|
|
790
|
+
for combo in cleanedValidCombinations:
|
|
791
|
+
if (sum(y[2] for y in combo)) == maxSum:
|
|
792
|
+
maxCombinations.append(combo)
|
|
793
|
+
|
|
794
|
+
# Assigns the max score combination with the most letters to be best_combo
|
|
795
|
+
bestCombo = []
|
|
796
|
+
maxLetterCount = 0
|
|
797
|
+
for combo in maxCombinations:
|
|
798
|
+
letterCount = 0
|
|
799
|
+
for tup in combo:
|
|
800
|
+
x, y, _ = map(int, (tup[0], tup[1], tup[2]))
|
|
801
|
+
letterCount += len(words0[x]) + len(words1[y])
|
|
802
|
+
if letterCount > maxLetterCount:
|
|
803
|
+
maxLetterCount = letterCount
|
|
804
|
+
bestCombo = combo
|
|
805
|
+
|
|
806
|
+
# Returns the combination of word matches that are the closest match
|
|
807
|
+
return bestCombo
|
|
808
|
+
|
|
809
|
+
def _eitherNameTooGeneric(self, name0:str, name1:str) -> bool:
|
|
810
|
+
"""Identifies if either name is too generic.
|
|
811
|
+
|
|
812
|
+
Args:
|
|
813
|
+
name0 (str): a name
|
|
814
|
+
name1 (str): a name
|
|
815
|
+
|
|
816
|
+
Returns:
|
|
817
|
+
bool: whether the name is too generic
|
|
818
|
+
"""
|
|
819
|
+
# Finds the length of the shortest of the two words
|
|
820
|
+
combo = self._findWhichWordsMatchAndHowWell(name0, name1)
|
|
821
|
+
shortestWordCount = len(combo)
|
|
822
|
+
|
|
823
|
+
# If both last names are very rare, returns False
|
|
824
|
+
if self._hasRareSurname(name0) and self._hasRareSurname(name1):
|
|
825
|
+
return False
|
|
826
|
+
|
|
827
|
+
# Checks if the initials between the two names make a match too uncertain
|
|
828
|
+
words0 = name0.split()
|
|
829
|
+
words2 = name1.split()
|
|
830
|
+
nonInitialMatchCount = 0
|
|
831
|
+
for match in combo:
|
|
832
|
+
index1, index2, _ = match
|
|
833
|
+
word0 = words0[int(index1)]
|
|
834
|
+
word1 = words2[int(index2)]
|
|
835
|
+
initialInWord1 = len(word0) == 1
|
|
836
|
+
initialInWord2 = len(word1) == 1
|
|
837
|
+
if initialInWord1 or initialInWord2:
|
|
838
|
+
nonInitialMatchCount += 1
|
|
839
|
+
|
|
840
|
+
if shortestWordCount <= nonInitialMatchCount + 1:
|
|
841
|
+
return True
|
|
842
|
+
else:
|
|
843
|
+
return False
|
|
844
|
+
|
|
845
|
+
def _hasRareSurname(self, name:str) -> bool:
|
|
846
|
+
"""Identifies if a name has a rare surname.
|
|
847
|
+
|
|
848
|
+
Args:
|
|
849
|
+
name (str): a name
|
|
850
|
+
|
|
851
|
+
Returns:
|
|
852
|
+
bool: whether the name's surname is rare
|
|
853
|
+
"""
|
|
854
|
+
# Isolates the last name
|
|
855
|
+
name = name.lower()
|
|
856
|
+
surname = name.split()[-1]
|
|
857
|
+
|
|
858
|
+
# If the last name is not in the list of surnames, returns true
|
|
859
|
+
if surname not in self.topSurnames:
|
|
860
|
+
return True
|
|
861
|
+
else:
|
|
862
|
+
return False
|
|
863
|
+
|
|
864
|
+
def _removeNicknames(self, name0:str, name1:str) -> tuple[str, str]:
|
|
865
|
+
"""Replaces the nickname in one name for the official name found in the other.
|
|
866
|
+
|
|
867
|
+
Args:
|
|
868
|
+
name0 (str): a name
|
|
869
|
+
name1 (str): a name
|
|
870
|
+
|
|
871
|
+
Returns:
|
|
872
|
+
tuple[str, str]: the modified names
|
|
873
|
+
"""
|
|
874
|
+
words0 = name0.split()
|
|
875
|
+
words1 = name1.split()
|
|
876
|
+
for word0 in words0:
|
|
877
|
+
# Skip if the word is also a word in name1
|
|
878
|
+
if word0 in words1:
|
|
879
|
+
continue
|
|
880
|
+
|
|
881
|
+
# Skip if the word does not have an nickname
|
|
882
|
+
if self.nicknameToNicknameSetNum.get(word0) is None:
|
|
883
|
+
continue
|
|
884
|
+
|
|
885
|
+
# Replace the word with a nickname if the nickname is in name1
|
|
886
|
+
# (and nickname not also in name0)
|
|
887
|
+
setNumbers = self.nicknameToNicknameSetNum[word0]
|
|
888
|
+
breaking = False
|
|
889
|
+
for num in setNumbers:
|
|
890
|
+
nicknames = set(self.nicknameSets[num])
|
|
891
|
+
nicknames.remove(word0)
|
|
892
|
+
for nickname in nicknames:
|
|
893
|
+
if (nickname in words0) and (nickname in words1):
|
|
894
|
+
continue
|
|
895
|
+
if nickname in words1:
|
|
896
|
+
name0 = re.sub(rf"\b{word0}\b", nickname, name0, flags=re.IGNORECASE)
|
|
897
|
+
breaking = True
|
|
898
|
+
break
|
|
899
|
+
if breaking:
|
|
900
|
+
break
|
|
901
|
+
|
|
902
|
+
return name0, name1
|
|
903
|
+
|
|
904
|
+
def _spellingComparison(self, name0:str, name1:str) -> tuple[bool, list]:
|
|
905
|
+
"""Identifies if two names are a match according to a comparison based soley on spelling.
|
|
906
|
+
|
|
907
|
+
Args:
|
|
908
|
+
name0 (str): a name
|
|
909
|
+
name1 (str): a name
|
|
910
|
+
|
|
911
|
+
Returns:
|
|
912
|
+
tuple[bool, list]: whether the names are a match, and the resulting word combo
|
|
913
|
+
"""
|
|
914
|
+
# Compares the combination of words that match the best, and to what extent
|
|
915
|
+
wordCombo = self._findWhichWordsMatchAndHowWell(name0, name1)
|
|
916
|
+
|
|
917
|
+
# Loops through the tuples and counts the number of times the score is greater than 80
|
|
918
|
+
count = sum(1 for tup in wordCombo if tup[2] > 80)
|
|
919
|
+
|
|
920
|
+
# If at least three of the scores are greater than 80, or,
|
|
921
|
+
# if the number of name matches is the same as the number of words in the shortest name, it's a match
|
|
922
|
+
minLength = min(len(name0.split()), len(name1.split()))
|
|
923
|
+
if (count >= 3) or (count == minLength):
|
|
924
|
+
return True, wordCombo
|
|
925
|
+
|
|
926
|
+
# If that didn't work, do a consonant check
|
|
927
|
+
if self._consonantComparison(name0, name1):
|
|
928
|
+
return True, wordCombo
|
|
929
|
+
|
|
930
|
+
# If that didn't work, spelling check returns false
|
|
931
|
+
return False, wordCombo
|
|
932
|
+
|
|
933
|
+
def _consonantComparison(self, name0:str, name1:str) -> bool:
|
|
934
|
+
"""Identifies if two names are a match according to consonant comparison.
|
|
935
|
+
|
|
936
|
+
Args:
|
|
937
|
+
name0 (str): a name
|
|
938
|
+
name1 (str): a name
|
|
939
|
+
|
|
940
|
+
Returns:
|
|
941
|
+
bool: whether the two names are a match according to consonant comparison
|
|
942
|
+
"""
|
|
943
|
+
# Setup
|
|
944
|
+
wordCombo = self._findWhichWordsMatchAndHowWell(name0, name1)
|
|
945
|
+
minRequiredMatches = len(wordCombo)
|
|
946
|
+
numWordConsonantMatches = 0
|
|
947
|
+
def reduceToSimpleConsonants(string:str) -> str:
|
|
948
|
+
"""Reduces a string to the simple consonant componants.
|
|
949
|
+
|
|
950
|
+
Args:
|
|
951
|
+
string (str): a string
|
|
952
|
+
|
|
953
|
+
Returns:
|
|
954
|
+
str: the consonant componants
|
|
955
|
+
"""
|
|
956
|
+
string = re.sub("a|e|i|o|u|y", "*", string)
|
|
957
|
+
string = string.replace("**", "*")
|
|
958
|
+
string = re.sub(r'(.)\1+', r'\1', string)
|
|
959
|
+
return string
|
|
960
|
+
|
|
961
|
+
# Loop through every word match in the combo
|
|
962
|
+
for tup in wordCombo:
|
|
963
|
+
# Get the matching word data
|
|
964
|
+
word0:str = name0.split()[int(tup[0])]
|
|
965
|
+
word1:str = name1.split()[int(tup[1])]
|
|
966
|
+
originalScoreForWords:int = int(tup[2])
|
|
967
|
+
|
|
968
|
+
# Get the words as consonants
|
|
969
|
+
consonantsName0 = reduceToSimpleConsonants(word0)
|
|
970
|
+
consonantsName1 = reduceToSimpleConsonants(word1)
|
|
971
|
+
consonantsRatio = fuzz.ratio(consonantsName0, consonantsName1)
|
|
972
|
+
|
|
973
|
+
# Continue if bad match
|
|
974
|
+
if originalScoreForWords <= 30:
|
|
975
|
+
continue
|
|
976
|
+
if (len(word0) != 1) and (len(word1) != 1): #if neither word is initial
|
|
977
|
+
lowestSyllableCount = min(consonantsName0.count("*"), consonantsName1.count("*"))
|
|
978
|
+
if lowestSyllableCount < 2:
|
|
979
|
+
continue
|
|
980
|
+
if (consonantsRatio <= 80 or originalScoreForWords <= 60) and consonantsRatio != 100:
|
|
981
|
+
continue
|
|
982
|
+
|
|
983
|
+
# If not rejected, increment the number of matches
|
|
984
|
+
numWordConsonantMatches += 1
|
|
985
|
+
|
|
986
|
+
# If enough matches, return true. Otherwise return false.
|
|
987
|
+
if (numWordConsonantMatches > minRequiredMatches) or (numWordConsonantMatches >= 3):
|
|
988
|
+
return True
|
|
989
|
+
else:
|
|
990
|
+
return False
|
|
991
|
+
|
|
992
|
+
def _isWorthContinuing(self, name0:str, name1:str) -> bool:
|
|
993
|
+
"""Identifies if a name comparison will always prove false.
|
|
994
|
+
|
|
995
|
+
Args:
|
|
996
|
+
name0 (str): _description_
|
|
997
|
+
name1 (str): _description_
|
|
998
|
+
|
|
999
|
+
Returns:
|
|
1000
|
+
bool: whether the names are worth working on further
|
|
1001
|
+
"""
|
|
1002
|
+
wordCombo = self._findWhichWordsMatchAndHowWell(name0, name1)
|
|
1003
|
+
oneLetterMatchFailCount = 0
|
|
1004
|
+
|
|
1005
|
+
for match in wordCombo:
|
|
1006
|
+
word0 = name0[int(match[0])]
|
|
1007
|
+
word1 = name1[int(match[1])]
|
|
1008
|
+
score = match[2]
|
|
1009
|
+
if (score == 0) and ((len(word0) == 1) or ((len(word1) == 1))):
|
|
1010
|
+
oneLetterMatchFailCount += 1
|
|
1011
|
+
|
|
1012
|
+
if (oneLetterMatchFailCount >= 1) and (len(wordCombo) <= 3):
|
|
1013
|
+
return False
|
|
1014
|
+
else:
|
|
1015
|
+
return True
|
|
1016
|
+
|
|
1017
|
+
def _modifyNamesTogether(self, name0:str, name1:str) -> tuple[str,str]:
|
|
1018
|
+
"""Modifies the name together (changing them in a way that is much more intense than simply cleaning together).
|
|
1019
|
+
|
|
1020
|
+
Args:
|
|
1021
|
+
name0 (str): a name
|
|
1022
|
+
name1 (str): a name
|
|
1023
|
+
|
|
1024
|
+
Returns:
|
|
1025
|
+
tuple[str,str]: the modified names
|
|
1026
|
+
"""
|
|
1027
|
+
# Replaces 'ie' endings with 'y' endings
|
|
1028
|
+
name0 = re.sub(r'ie\b', 'y', name0)
|
|
1029
|
+
name1 = re.sub(r'ie\b', 'y', name1)
|
|
1030
|
+
|
|
1031
|
+
# Fix when records are indexed with " or " in the name
|
|
1032
|
+
name0, name1 = self._removeOrInNames(name0, name1)
|
|
1033
|
+
|
|
1034
|
+
# Deal with names longer than 5 that are off by one specific char
|
|
1035
|
+
name0, name1 = self._fixVowelMistakes(name0, name1)
|
|
1036
|
+
|
|
1037
|
+
# Deal with names longer than 5 that have swapped letters
|
|
1038
|
+
name0, name1 = self._fixSwappedChars(name0, name1)
|
|
1039
|
+
|
|
1040
|
+
# Deal with longer names that are identical except for the first char
|
|
1041
|
+
name0, name1 = self._dealWithWrongFirstChar(name0, name1)
|
|
1042
|
+
|
|
1043
|
+
# Use spelling rules to modify both
|
|
1044
|
+
for rule in self.spellingRules:
|
|
1045
|
+
name0, name1 = self._replaceSubstringSandwichMeatIfMatchingBread(name0, name1, rule[0], rule[1], rule[2], rule[3], rule[4])
|
|
1046
|
+
|
|
1047
|
+
# Remove extra spaces
|
|
1048
|
+
name0 = re.sub(r'\s+', ' ', name0)
|
|
1049
|
+
name1 = re.sub(r'\s+', ' ', name1)
|
|
1050
|
+
name0 = name0.strip()
|
|
1051
|
+
name1 = name1.strip()
|
|
1052
|
+
|
|
1053
|
+
# Return modified names
|
|
1054
|
+
return name0, name1
|
|
1055
|
+
|
|
1056
|
+
def _removeOrInNames(self, name0:str, name1:str) -> tuple[str, str]:
|
|
1057
|
+
"""Removes the word 'or' from a name (assuming that the name could have been
|
|
1058
|
+
poorly indexed so that the indexer's guesses for a specific word of the name is still within the string).
|
|
1059
|
+
|
|
1060
|
+
Args:
|
|
1061
|
+
name0 (str): a name
|
|
1062
|
+
name1 (str): a name
|
|
1063
|
+
|
|
1064
|
+
Returns:
|
|
1065
|
+
tuple[str, str]: the modified names
|
|
1066
|
+
"""
|
|
1067
|
+
name0, name1 = name0.lower(), name1.lower()
|
|
1068
|
+
# if or in neither
|
|
1069
|
+
if (not " or " in name0) and (not " or " in name1):
|
|
1070
|
+
return name0, name1
|
|
1071
|
+
# if or in both
|
|
1072
|
+
elif (" or " in name0) and (" or " in name1):
|
|
1073
|
+
return name0, name1
|
|
1074
|
+
# if or in 1, not 2
|
|
1075
|
+
elif " or " in name0:
|
|
1076
|
+
# Gets the score for if the word before 'or' is removed
|
|
1077
|
+
name0EditedA = re.sub("[a-z]+ or ", " ", name0)
|
|
1078
|
+
wordComboA = self._findWhichWordsMatchAndHowWell(name0EditedA, name1)
|
|
1079
|
+
averageScoreA = sum(tup[2] for tup in wordComboA) / len(wordComboA)
|
|
1080
|
+
|
|
1081
|
+
# Gets the score for if the word after 'or' is removed
|
|
1082
|
+
name0EditedB = re.sub(" or [a-z]+", " ", name0)
|
|
1083
|
+
wordComboB = self._findWhichWordsMatchAndHowWell(name0EditedB, name1)
|
|
1084
|
+
averageScoreB = sum(tup[2] for tup in wordComboB) / len(wordComboB)
|
|
1085
|
+
|
|
1086
|
+
# If the before score is greater, returns A
|
|
1087
|
+
if averageScoreA >= averageScoreB:
|
|
1088
|
+
return name0EditedA, name1
|
|
1089
|
+
# Otherwise returns B
|
|
1090
|
+
else:
|
|
1091
|
+
return name0EditedB, name1
|
|
1092
|
+
# if or in 2, not 1
|
|
1093
|
+
elif " or " in name1:
|
|
1094
|
+
name1EditedA = re.sub("[a-z]+ or ", " ", name1)
|
|
1095
|
+
wordComboA = self._findWhichWordsMatchAndHowWell(name1EditedA, name0)
|
|
1096
|
+
averageScoreA = sum(tup[2] for tup in wordComboA) / len(wordComboA)
|
|
1097
|
+
|
|
1098
|
+
# Gets the score for if the word after 'or' is removed
|
|
1099
|
+
name1EditedB = re.sub(" or [a-z]+", " ", name1)
|
|
1100
|
+
wordComboB = self._findWhichWordsMatchAndHowWell(name1EditedB, name0)
|
|
1101
|
+
averageScoreB = sum(tup[2] for tup in wordComboB) / len(wordComboB)
|
|
1102
|
+
|
|
1103
|
+
# If the before score is greater, returns A
|
|
1104
|
+
if averageScoreA >= averageScoreB:
|
|
1105
|
+
return name0, name1EditedA
|
|
1106
|
+
# Otherwise returns B
|
|
1107
|
+
else:
|
|
1108
|
+
return name0, name1EditedB
|
|
1109
|
+
|
|
1110
|
+
def _fixVowelMistakes(self, name0:str, name1:str) -> tuple[str, str]:
|
|
1111
|
+
"""Modifies two matching words in a name so that they are the same if
|
|
1112
|
+
they are only different by one vowel and 5 letters or more.
|
|
1113
|
+
|
|
1114
|
+
Args:
|
|
1115
|
+
name0 (str): a name
|
|
1116
|
+
name1 (str): a name
|
|
1117
|
+
|
|
1118
|
+
Returns:
|
|
1119
|
+
tuple[str, str]: the two modified names
|
|
1120
|
+
"""
|
|
1121
|
+
ne = self.NameEditor(name0, name1)
|
|
1122
|
+
for index0, _, word0, word1 in self._getPairIndicesAndWords(name0, name1):
|
|
1123
|
+
# Continue if either word is less than 5 chars
|
|
1124
|
+
len0 = len(word0)
|
|
1125
|
+
if len0 < 5:
|
|
1126
|
+
continue
|
|
1127
|
+
len1 = len(word1)
|
|
1128
|
+
if len1 < 5:
|
|
1129
|
+
continue
|
|
1130
|
+
|
|
1131
|
+
# Continue if they are not the same length
|
|
1132
|
+
if len0 != len1:
|
|
1133
|
+
continue
|
|
1134
|
+
|
|
1135
|
+
# Check if there is only one difference
|
|
1136
|
+
mismatchedIndex = None
|
|
1137
|
+
tooManyDiffs = False
|
|
1138
|
+
for i in range(len0):
|
|
1139
|
+
# Skip matching chars
|
|
1140
|
+
if word0[i] == word1[i]:
|
|
1141
|
+
continue
|
|
1142
|
+
|
|
1143
|
+
# Check if there are already too many differences
|
|
1144
|
+
if mismatchedIndex:
|
|
1145
|
+
tooManyDiffs = True
|
|
1146
|
+
break
|
|
1147
|
+
|
|
1148
|
+
# Append the index if the chars were not the same
|
|
1149
|
+
mismatchedIndex = i
|
|
1150
|
+
|
|
1151
|
+
# Continue if there was not exactly one difference
|
|
1152
|
+
if (tooManyDiffs) or (mismatchedIndex is None):
|
|
1153
|
+
continue
|
|
1154
|
+
|
|
1155
|
+
# Replace one of the letters to be the other if they are cooresponding
|
|
1156
|
+
charWord0 = word0[mismatchedIndex]
|
|
1157
|
+
charWord1 = word1[mismatchedIndex]
|
|
1158
|
+
cooresponding = ['ao', 'ea', 'iy']
|
|
1159
|
+
if (f'{charWord0}{charWord1}' in cooresponding) or (f'{charWord1}{charWord0}' in cooresponding):
|
|
1160
|
+
ne.updateName0(index0, word1)
|
|
1161
|
+
|
|
1162
|
+
# Return the modified (or not) names
|
|
1163
|
+
return ne.getModifiedNames()
|
|
1164
|
+
|
|
1165
|
+
def _fixSwappedChars(self, name0:str, name1:str) -> tuple[str, str]:
|
|
1166
|
+
"""If two matching words (of 5 letters of more) for the two names are the same barring swapped letters (typo), makes the words the same.
|
|
1167
|
+
|
|
1168
|
+
Args:
|
|
1169
|
+
name0 (str): a name
|
|
1170
|
+
name1 (str): a name
|
|
1171
|
+
|
|
1172
|
+
Returns:
|
|
1173
|
+
tuple[str, str]: the modified names
|
|
1174
|
+
"""
|
|
1175
|
+
ne = self.NameEditor(name0, name1)
|
|
1176
|
+
for index0, _, word0, word1 in self._getPairIndicesAndWords(name0, name1):
|
|
1177
|
+
# Skip if the words are not 5 long
|
|
1178
|
+
if len(word0) != 5:
|
|
1179
|
+
continue
|
|
1180
|
+
|
|
1181
|
+
# Skip if lengths are different
|
|
1182
|
+
if len(word0) != len(word1):
|
|
1183
|
+
continue
|
|
1184
|
+
|
|
1185
|
+
# Skip if already a match, or could not be fixed by the swap
|
|
1186
|
+
if fuzz.ratio(word1, word0) != 80:
|
|
1187
|
+
continue
|
|
1188
|
+
|
|
1189
|
+
# Find how many differences and where
|
|
1190
|
+
diffCount = 0
|
|
1191
|
+
diffPositions = []
|
|
1192
|
+
for i in range(len(word0)):
|
|
1193
|
+
if word0[i] != word1[i]:
|
|
1194
|
+
diffCount += 1
|
|
1195
|
+
diffPositions.append(i)
|
|
1196
|
+
|
|
1197
|
+
# Skip if there are not two differences
|
|
1198
|
+
if diffCount != 2:
|
|
1199
|
+
continue
|
|
1200
|
+
|
|
1201
|
+
# Skip if the differences are not sequential
|
|
1202
|
+
posI, posJ = diffPositions
|
|
1203
|
+
if abs(posI - posJ) != 1:
|
|
1204
|
+
continue
|
|
1205
|
+
|
|
1206
|
+
# Skip if the differences are not swappable
|
|
1207
|
+
if (word0[posI] != word1[posJ]) or (word0[posI] != word1[posJ]):
|
|
1208
|
+
continue
|
|
1209
|
+
|
|
1210
|
+
# Update the word
|
|
1211
|
+
ne.updateName0(index0, word1)
|
|
1212
|
+
|
|
1213
|
+
# Return the modified (or not) names
|
|
1214
|
+
return ne.getModifiedNames()
|
|
1215
|
+
|
|
1216
|
+
def _dealWithWrongFirstChar(self, name0:str, name1:str) -> tuple[str, str]:
|
|
1217
|
+
"""If two matching words (of 5 letters or more) are the same barring the first letter, makes the same.
|
|
1218
|
+
|
|
1219
|
+
Args:
|
|
1220
|
+
name0 (str): a name
|
|
1221
|
+
name1 (str): a name
|
|
1222
|
+
|
|
1223
|
+
Returns:
|
|
1224
|
+
tuple[str, str]: the modified names
|
|
1225
|
+
"""
|
|
1226
|
+
ne = self.NameEditor(name0, name1)
|
|
1227
|
+
for index1, _, word1, word2 in self._getPairIndicesAndWords(name0, name1):
|
|
1228
|
+
if word1 == word2:
|
|
1229
|
+
continue
|
|
1230
|
+
if (word1[1:] == word2[1:]) and (len(word1) > 4) and (len(word2) > 4):
|
|
1231
|
+
ne.updateName0(index1, word2)
|
|
1232
|
+
name0, name1 = ne.getModifiedNames()
|
|
1233
|
+
return name0, name1
|
|
1234
|
+
|
|
1235
|
+
def _replaceSubstringSandwichMeatIfMatchingBread(self, name0:str, name1:str, meatOption1:str, meatOption2:str, bottomBreadOptions:list[str], topBreadOptions:list[str], minRequiredLetters:int) -> tuple[str,str]:
|
|
1236
|
+
"""For any given matching word pair, replaces a specific substring in one of the words, with a similar substring found in the other word.
|
|
1237
|
+
|
|
1238
|
+
Args:
|
|
1239
|
+
name0 (str): a name
|
|
1240
|
+
name1 (str): a name
|
|
1241
|
+
meatOption1 (str): the first possible middle of the substring
|
|
1242
|
+
meatOption2 (str): the second possible middle of the substring
|
|
1243
|
+
bottomBreadOptions (list[str]): a list of possible beginnings to the substring. Whichever beginning is found in the one must be found in the other in order for the replacement to work
|
|
1244
|
+
topBreadOptions (list[str]): a list of possible endings to the substring. Whichever ending is found in the one must be found in the other in order for the replacement to work
|
|
1245
|
+
minRequiredLetters (int): the minimum required letters to be found in both words in order for the replacement to work
|
|
1246
|
+
|
|
1247
|
+
Returns:
|
|
1248
|
+
tuple[str,str]: the modified names
|
|
1249
|
+
"""
|
|
1250
|
+
# Setup
|
|
1251
|
+
def overwriteWithSubstring(string:str, replacement:str, startIndex:int, endIndex:int) -> str:
|
|
1252
|
+
"""Overwrites a specific index range of a string with the replacement string.
|
|
1253
|
+
|
|
1254
|
+
Args:
|
|
1255
|
+
string (str): the string to replace
|
|
1256
|
+
replacement (str): the replacement string
|
|
1257
|
+
startIndex (int): the start index for the replacement
|
|
1258
|
+
endIndex (int): the end index for the replacement
|
|
1259
|
+
|
|
1260
|
+
Returns:
|
|
1261
|
+
_type_: _description_
|
|
1262
|
+
"""
|
|
1263
|
+
stringList = list(string)
|
|
1264
|
+
stringList[startIndex:endIndex] = replacement
|
|
1265
|
+
newString = ''.join(stringList)
|
|
1266
|
+
return newString
|
|
1267
|
+
|
|
1268
|
+
# Return if both middles not in different words
|
|
1269
|
+
if (meatOption1 not in name0 and meatOption2 not in name0) or (meatOption1 not in name1 and meatOption2 not in name1):
|
|
1270
|
+
return name0, name1
|
|
1271
|
+
|
|
1272
|
+
# Initialize the name editor
|
|
1273
|
+
ne = self.NameEditor(name0, name1)
|
|
1274
|
+
|
|
1275
|
+
# Iterate through the word matches
|
|
1276
|
+
for index0, index1, word0, word1 in self._getPairIndicesAndWords(name0, name1):
|
|
1277
|
+
# Skip words that are not long enough for the given rule
|
|
1278
|
+
if len(word0) < minRequiredLetters or len(word1) < minRequiredLetters:
|
|
1279
|
+
continue
|
|
1280
|
+
|
|
1281
|
+
# Add clear word breaks
|
|
1282
|
+
word0 = f"-{word0}-"
|
|
1283
|
+
word1 = f"-{word1}-"
|
|
1284
|
+
|
|
1285
|
+
# For every bread1
|
|
1286
|
+
for bottomBread in bottomBreadOptions:
|
|
1287
|
+
# Skip the bread if it doesn't appear in both matching words
|
|
1288
|
+
if bottomBread not in word0 or bottomBread not in word1:
|
|
1289
|
+
continue
|
|
1290
|
+
|
|
1291
|
+
# For every bread2
|
|
1292
|
+
for topBread in topBreadOptions:
|
|
1293
|
+
# Skip the bread if it doesn't appear in both matching words
|
|
1294
|
+
if topBread not in word0 or topBread not in word1:
|
|
1295
|
+
continue
|
|
1296
|
+
|
|
1297
|
+
# Skip the bread if the pattern is not found in both
|
|
1298
|
+
pattern = f"{bottomBread}({meatOption1}|{meatOption2}){topBread}"
|
|
1299
|
+
results1 = re.search(pattern, word0)
|
|
1300
|
+
results2 = re.search(pattern, word1)
|
|
1301
|
+
if not results1 or not results2:
|
|
1302
|
+
continue
|
|
1303
|
+
|
|
1304
|
+
# Skip the bread if the two patterns are identical (middles are the same)
|
|
1305
|
+
if results1.group(0) == results2.group(0):
|
|
1306
|
+
continue
|
|
1307
|
+
|
|
1308
|
+
# Skip the bread if the two patterns are too far apart
|
|
1309
|
+
spanA1, spanB1 = results1.span()
|
|
1310
|
+
spanA2, spanB2 = results2.span()
|
|
1311
|
+
if not (abs(spanA1 - spanA2) <= 2 and abs(spanB1 - spanB2) <= 2):
|
|
1312
|
+
continue
|
|
1313
|
+
|
|
1314
|
+
# Update the words by replacing matching (different) middles with the meat option 2
|
|
1315
|
+
startIndexString1, endIndexString1 = results1.span()
|
|
1316
|
+
startIndexString2, endIndexString2 = results2.span()
|
|
1317
|
+
middleCoordsString1 = startIndexString1 + len(bottomBread), endIndexString1 - len(topBread)
|
|
1318
|
+
middleCoordsString2 = startIndexString2 + len(bottomBread), endIndexString2 - len(topBread)
|
|
1319
|
+
word0 = overwriteWithSubstring(word0, meatOption2, middleCoordsString1[0], middleCoordsString1[1])
|
|
1320
|
+
word1 = overwriteWithSubstring(word1, meatOption2, middleCoordsString2[0], middleCoordsString2[1])
|
|
1321
|
+
|
|
1322
|
+
# Update the words for that match (though a change may not have occured)
|
|
1323
|
+
word0 = word0.replace("-", "")
|
|
1324
|
+
word1 = word1.replace("-", "")
|
|
1325
|
+
ne.updateName0(index0, word0)
|
|
1326
|
+
ne.updateName1(index1, word1)
|
|
1327
|
+
|
|
1328
|
+
# concatonates the two lists together back into strings
|
|
1329
|
+
name0, name1 = ne.getModifiedNames()
|
|
1330
|
+
return name0, name1
|
|
1331
|
+
|
|
1332
|
+
def _pronunciationComparison(self, name0:str, name1:str) -> tuple[bool, list, str, str]:
|
|
1333
|
+
"""Identifies whether two names are a match according to a pronunciation comparison.
|
|
1334
|
+
|
|
1335
|
+
Args:
|
|
1336
|
+
name0 (str): a name
|
|
1337
|
+
name1 (str): a name
|
|
1338
|
+
|
|
1339
|
+
Returns:
|
|
1340
|
+
tuple[bool, list, str, str]: whether the name was a match, the word combo, the ipa of name0, the ipa of name1
|
|
1341
|
+
"""
|
|
1342
|
+
# Necessary for pronunciation comparison
|
|
1343
|
+
wordCombo = self._findWhichWordsMatchAndHowWell(name0, name1)
|
|
1344
|
+
|
|
1345
|
+
# Gets Ipas
|
|
1346
|
+
ipaOfName0 = self._getPronunciation(name0)
|
|
1347
|
+
ipaOfName1 = self._getPronunciation(name1)
|
|
1348
|
+
|
|
1349
|
+
# Cleans Ipas
|
|
1350
|
+
ipaOfName0 = self._cleanIpa(ipaOfName0)
|
|
1351
|
+
ipaOfName1 = self._cleanIpa(ipaOfName1)
|
|
1352
|
+
ipaOfName0, ipaOfName1 = self._modifyIpasTogether(ipaOfName0, ipaOfName1)
|
|
1353
|
+
|
|
1354
|
+
# Matches the ipa words within the two names
|
|
1355
|
+
# Splits strings into lists of words
|
|
1356
|
+
ipaWords0 = ipaOfName0.split()
|
|
1357
|
+
ipaWords1 = ipaOfName1.split()
|
|
1358
|
+
|
|
1359
|
+
# Initializes an empty list to store scores
|
|
1360
|
+
scores = []
|
|
1361
|
+
|
|
1362
|
+
# Loop through each word in words1 and compare to each word in words2
|
|
1363
|
+
for i in range(len(ipaWords0)):
|
|
1364
|
+
ipaWord0 = ipaWords0[i]
|
|
1365
|
+
for j in range(len(ipaWords1)):
|
|
1366
|
+
ipaWord1 = ipaWords1[j]
|
|
1367
|
+
|
|
1368
|
+
# Use fuzz.ratio to compare the words and store the score
|
|
1369
|
+
score = fuzz.ratio(ipaWord0, ipaWord1)
|
|
1370
|
+
|
|
1371
|
+
# Updates the score if one of the words was an initial
|
|
1372
|
+
for k in range(len(wordCombo)):
|
|
1373
|
+
index1, index2, initialScore = wordCombo[k]
|
|
1374
|
+
if i == int(index1) and j == int(index2) and (initialScore == 100 or initialScore == 0):
|
|
1375
|
+
score = initialScore
|
|
1376
|
+
|
|
1377
|
+
# Add the score to scores
|
|
1378
|
+
scores.append([f"{i} ipaWords0", f"{j} ipaWords1", score])
|
|
1379
|
+
|
|
1380
|
+
# Gets the length of the shortest word
|
|
1381
|
+
minLength = min(len(ipaWords0), len(ipaWords1))
|
|
1382
|
+
|
|
1383
|
+
# Generates all combinations of tuples with length equal to the number of words in string2
|
|
1384
|
+
combinations = itertools.combinations(scores, minLength)
|
|
1385
|
+
|
|
1386
|
+
# Filters the combinations to include only valid combinations
|
|
1387
|
+
validCombinations = [c for c in combinations if len(set(x[0] for x in c)) == len(c) and len(set(x[1] for x in c)) == len(c)]
|
|
1388
|
+
|
|
1389
|
+
# Finds the word combination with the maximum sum
|
|
1390
|
+
maxCombination = max(validCombinations, key=lambda x: sum(y[2] for y in x))
|
|
1391
|
+
|
|
1392
|
+
# Cleans the max combo
|
|
1393
|
+
cleanedMaxCombination = []
|
|
1394
|
+
|
|
1395
|
+
# Cleans the max combo
|
|
1396
|
+
cleanedMaxCombination = [(
|
|
1397
|
+
tup[0].replace(" ipaWords0", "").replace(" ipaWords1", ""),
|
|
1398
|
+
tup[1].replace(" ipaWords0", "").replace(" ipaWords1", ""),
|
|
1399
|
+
tup[2]) for tup in maxCombination]
|
|
1400
|
+
|
|
1401
|
+
# Gets the smallest score in the max combination
|
|
1402
|
+
lowestScore = min(cleanedMaxCombination, key=lambda tuple: tuple[2])[2]
|
|
1403
|
+
|
|
1404
|
+
# If the shortest name is two words in length
|
|
1405
|
+
if minLength <= 2:
|
|
1406
|
+
# If the lowest score match is greater than or equal to 80, it's a good pronunciation match
|
|
1407
|
+
if lowestScore >= 80:
|
|
1408
|
+
return True, cleanedMaxCombination, ipaOfName0, ipaOfName1
|
|
1409
|
+
# Otherwise, it's probably not a match
|
|
1410
|
+
return False, cleanedMaxCombination, ipaOfName0, ipaOfName1
|
|
1411
|
+
|
|
1412
|
+
# If the shortest name is more than two words
|
|
1413
|
+
if minLength > 2:
|
|
1414
|
+
# If the lowest score match is greater than 75, it's a good pronunciation match
|
|
1415
|
+
if lowestScore > 75:
|
|
1416
|
+
return True, cleanedMaxCombination, ipaOfName0, ipaOfName1
|
|
1417
|
+
# Otherwise, it's probably not a match
|
|
1418
|
+
return False, cleanedMaxCombination, ipaOfName0, ipaOfName1
|
|
1419
|
+
|
|
1420
|
+
def _getPronunciation(self, name:str) -> str:
|
|
1421
|
+
"""Gets the pronunciation of the name.
|
|
1422
|
+
|
|
1423
|
+
Args:
|
|
1424
|
+
name (str): a name
|
|
1425
|
+
|
|
1426
|
+
Returns:
|
|
1427
|
+
str: the ipa of the name
|
|
1428
|
+
"""
|
|
1429
|
+
pList = []
|
|
1430
|
+
for word in name.split():
|
|
1431
|
+
pList.append(self._getIpaOfOneWord(word))
|
|
1432
|
+
pronunciationOfName = " ".join(pList)
|
|
1433
|
+
return pronunciationOfName
|
|
1434
|
+
|
|
1435
|
+
@lru_cache(maxsize=1000)
def _getIpaOfOneWord(self, word:str) -> str:
    """Gets the pronunciation of one word.

    The whole word is looked up first. If that fails, the word is taken apart
    greedily: on each pass the longest remaining substring with an acceptable
    pronunciation is consumed (falling back to single letters), until nothing
    is left to consume.

    Args:
        word (str): a word

    Returns:
        str: the ipa of the word
    """
    # NOTE(review): lru_cache on a method makes self part of the cache key, so the
    # cache keeps a reference to the instance for as long as its entries live.

    # Setup
    word = word.replace(" ", "")
    word = unidecode(word)
    word = word.lower()
    # One slot per letter; a consumed substring stores its ipa at its start index
    pronunciationList = [""] * len(word)

    # Fallback pronunciation for single letters (built once, not once per letter)
    letterToPronunciation = {
        "a": "æ", "b": "b", "c": "k", "d": "d", "e": "ɛ", "f": "f", "g": "g", "h": "h", "i": "ɪ",
        "j": "ʤ", "k": "k", "l": "l", "m": "m", "n": "n", "o": "o", "p": "p", "q": "k", "r": "r",
        "s": "s", "t": "t", "u": "u", "v": "v", "w": "w", "x": "ks", "y": "j", "z": "z"
    }

    def substringSplitsTh(substring:str, word:str, i:int, j:int) -> bool:
        """Helps to identify poor substring choices for words for ipa.

        Args:
            substring (str): the ipa dissection
            word (str): the full word
            i (int): the start index of the substring
            j (int): the end index (exclusive) of the substring

        Returns:
            bool: True if the substring cuts a 'th' in half (a poor choice), otherwise False
        """
        if i == j:
            return False
        # i > 0 rather than i >= 0: at the start of the word there is no preceding
        # letter, and word[-1] would wrap around to the last letter of the word
        if i > 0 and substring[0] == 'h' and word[i - 1] == 't':
            return True
        if j < len(word) and substring[-1] == 't' and word[j] == 'h':
            return True
        return False

    # Tries to get the ipa from the plain word
    firstAttempt = self._pronunciationHailMary(word)
    if "*" not in firstAttempt:
        return firstAttempt

    # While there are still letters in the word
    substringAdded = True
    while substringAdded:
        # Initialize variables to store the largest matching substring and its length
        substringAdded = False
        pronunciationOfLargestSubstring = ""
        largestSubstringLen = 0
        beginningIndexOfSubstring = 0
        endIndexOfSubstring = 0

        # Iterate over every possible substring
        for i in range(len(word)):
            for j in range(i + 1, len(word) + 1):
                substring = word[i:j]

                # Only a strictly longer candidate can replace the current best
                if len(substring) <= largestSubstringLen:
                    continue
                # Spaces mark letters that were already consumed
                if " " in substring:
                    continue
                if len(substring) > 1:
                    substringIpa = self._convert(substring)
                    # Reject unknown parts, suspiciously long ipa, and parts that split a 'th'
                    if ("*" in substringIpa) or (len(substringIpa) >= len(substring) * 2) or (substringSplitsTh(substring, word, i, j)):
                        continue
                    pronunciationOfLargestSubstring = substringIpa
                else:
                    # Characters without a mapping (digits, punctuation) contribute nothing
                    pronunciationOfLargestSubstring = letterToPronunciation.get(substring, "")

                substringAdded = True
                largestSubstringLen = len(substring)
                beginningIndexOfSubstring = i
                endIndexOfSubstring = j

        # Adds the substring to the list
        if substringAdded:
            pronunciationList[beginningIndexOfSubstring] = pronunciationOfLargestSubstring
            spaces = " " * largestSubstringLen
            # Trailing consumed letters are dropped; the chosen substring always ends
            # before them, so the indices below stay valid
            word = word.rstrip()
            word = word[:beginningIndexOfSubstring] + spaces + word[endIndexOfSubstring:]

    # Concatenates the list together at the end to get the pronunciation
    pronunciation = "".join(pronunciationList)
    return pronunciation
|
|
1525
|
+
|
|
1526
|
+
def _pronunciationHailMary(self, word:str) -> str:
|
|
1527
|
+
"""Tries to get the pronunciation from the predefined ipa dictionary.
|
|
1528
|
+
|
|
1529
|
+
Args:
|
|
1530
|
+
word (str): the regular word
|
|
1531
|
+
|
|
1532
|
+
Returns:
|
|
1533
|
+
str: the ipa of the word, or just the word with an asterix if no pronunciation found
|
|
1534
|
+
"""
|
|
1535
|
+
namePronuncation = self.namesToIpa.get(word)
|
|
1536
|
+
if namePronuncation != None:
|
|
1537
|
+
return namePronuncation
|
|
1538
|
+
return word + "*"
|
|
1539
|
+
|
|
1540
|
+
def _convert(self, word:str) -> str:
|
|
1541
|
+
"""Helper function of _getIpaOfOneWord.
|
|
1542
|
+
Gets the ipa of a word (with more than one letter).
|
|
1543
|
+
|
|
1544
|
+
Args:
|
|
1545
|
+
word (str): a word (with more than one letter)
|
|
1546
|
+
|
|
1547
|
+
Returns:
|
|
1548
|
+
str: the ipa of that word
|
|
1549
|
+
"""
|
|
1550
|
+
ipaPronunciation = self.syllableToIpa.get(word)
|
|
1551
|
+
if ipaPronunciation != None:
|
|
1552
|
+
return ipaPronunciation
|
|
1553
|
+
return word + "*"
|
|
1554
|
+
|
|
1555
|
+
def _cleanIpa(self, ipa:str) -> str:
|
|
1556
|
+
"""cleans ipa to get rid of double ipa-consonants and other mistakes.
|
|
1557
|
+
|
|
1558
|
+
Args:
|
|
1559
|
+
ipa (str): the ipa of a word
|
|
1560
|
+
|
|
1561
|
+
Returns:
|
|
1562
|
+
str: the cleaned ipa
|
|
1563
|
+
"""
|
|
1564
|
+
allIpaConsonants = ['l', 'd', 'z', 'b', 't', 'k', 'n', 's', 'w', 'v', 'ð', 'ʒ', 'ʧ', 'θ', 'h', 'g', 'ʤ', 'ŋ', 'p', 'm', 'ʃ', 'f', 'j', 'r']
|
|
1565
|
+
for consonant in allIpaConsonants:
|
|
1566
|
+
doubleConsonant = consonant + consonant
|
|
1567
|
+
if doubleConsonant in ipa:
|
|
1568
|
+
ipa = ipa.replace(doubleConsonant, consonant)
|
|
1569
|
+
ipa = ipa.replace("ɛɛ", "i")
|
|
1570
|
+
ipa = ipa.replace("ɪɪ", "ɪ")
|
|
1571
|
+
ipa = ipa.replace("iɪ", "i")
|
|
1572
|
+
ipa = ipa.replace("ŋg", "ŋ")
|
|
1573
|
+
ipa = ipa.replace(",", "")
|
|
1574
|
+
return ipa
|
|
1575
|
+
|
|
1576
|
+
def _modifyIpasTogether(self, ipa0:str, ipa1:str) -> tuple[str,str]:
|
|
1577
|
+
"""Modifies two ipas by comparing each to one another.
|
|
1578
|
+
|
|
1579
|
+
Args:
|
|
1580
|
+
ipa1 (str): the first ipa name
|
|
1581
|
+
ipa2 (str): the second ipa name
|
|
1582
|
+
|
|
1583
|
+
Returns:
|
|
1584
|
+
tuple[str,str]: the two modified names
|
|
1585
|
+
"""
|
|
1586
|
+
for rule in self.ipaRules:
|
|
1587
|
+
ipa0, ipa1 = self._replaceSubstringSandwichMeatIfMatchingBread(ipa0, ipa1, rule[0], rule[1], rule[2], rule[3], rule[4])
|
|
1588
|
+
return ipa0, ipa1
|