py-stringmatching 0.1.0 (zip)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_stringmatching-0.1.0/AUTHORS.rst +6 -0
- py_stringmatching-0.1.0/CHANGES.txt +6 -0
- py_stringmatching-0.1.0/LICENSE +27 -0
- py_stringmatching-0.1.0/LICENSES/NUMPY_LICENSE +30 -0
- py_stringmatching-0.1.0/LICENSES/SIX_LICENSE +18 -0
- py_stringmatching-0.1.0/MANIFEST.in +6 -0
- py_stringmatching-0.1.0/PKG-INFO +57 -0
- py_stringmatching-0.1.0/README.rst +27 -0
- py_stringmatching-0.1.0/py_stringmatching/__init__.py +25 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/affine.py +155 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cosine.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/cython_levenshtein.c +21363 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py +85 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py +84 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hybrid_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py +79 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py +110 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py +106 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py +75 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py +100 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/needleman_wunsch.py +121 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/overlap_coefficient.py +86 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/sequence_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/similarity_measure.py +4 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/smith_waterman.py +115 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/soft_tfidf.py +198 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/tfidf.py +193 -0
- py_stringmatching-0.1.0/py_stringmatching/similarity_measure/token_similarity_measure.py +7 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_simfunctions.py +1249 -0
- py_stringmatching-0.1.0/py_stringmatching/tests/test_tokenizers.py +305 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/__init__.py +0 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphabetic_tokenizer.py +51 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/alphanumeric_tokenizer.py +54 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/definition_tokenizer.py +18 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/delimiter_tokenizer.py +99 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/qgram_tokenizer.py +90 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/tokenizer.py +30 -0
- py_stringmatching-0.1.0/py_stringmatching/tokenizer/whitespace_tokenizer.py +57 -0
- py_stringmatching-0.1.0/py_stringmatching/utils.py +70 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/PKG-INFO +57 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/SOURCES.txt +48 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/dependency_links.txt +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/not-zip-safe +1 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/requires.txt +2 -0
- py_stringmatching-0.1.0/py_stringmatching.egg-info/top_level.txt +1 -0
- py_stringmatching-0.1.0/requirements.txt +2 -0
- py_stringmatching-0.1.0/setup.cfg +5 -0
- py_stringmatching-0.1.0/setup.py +107 -0
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/dice.py
@@ -0,0 +1,85 @@
+from py_stringmatching import utils
+from py_stringmatching.similarity_measure.token_similarity_measure import \
+    TokenSimilarityMeasure
+
+
+class Dice(TokenSimilarityMeasure):
+    """Returns the Dice score between two strings.
+
+    The Dice similarity score is defined as twice the shared information (intersection) divided by sum of cardinalities.
+    For two sets X and Y, the Dice similarity score is:
+
+    :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`
+    """
+
+    def __init__(self):
+        super(Dice, self).__init__()
+
+    def get_raw_score(self, set1, set2):
+        """Computes the raw Dice score between two sets. This score is already in [0,1].
+
+        Args:
+            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
+
+        Returns:
+            Dice similarity score (float).
+
+        Raises:
+            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
+
+        Examples:
+            >>> dice = Dice()
+            >>> dice.get_raw_score(['data', 'science'], ['data'])
+            0.6666666666666666
+            >>> dice.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
+            0.5454545454545454
+            >>> dice.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
+            0.5
+
+        References:
+            * Wikipedia article : https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
+            * SimMetrics library.
+        """
+
+        # input validations
+        utils.sim_check_for_none(set1, set2)
+        utils.sim_check_for_list_or_set_inputs(set1, set2)
+
+        # if exact match return 1.0
+        if utils.sim_check_for_exact_match(set1, set2):
+            return 1.0
+
+        # if one of the strings is empty return 0
+        if utils.sim_check_for_empty(set1, set2):
+            return 0
+
+        if not isinstance(set1, set):
+            set1 = set(set1)
+        if not isinstance(set2, set):
+            set2 = set(set2)
+
+        return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
+
+    def get_sim_score(self, set1, set2):
+        """Computes the normalized dice similarity score between two sets. Simply call get_raw_score.
+
+        Args:
+            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
+
+        Returns:
+            Normalized dice similarity (float).
+
+        Raises:
+            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
+
+        Examples:
+            >>> dice = Dice()
+            >>> dice.get_sim_score(['data', 'science'], ['data'])
+            0.6666666666666666
+            >>> dice.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
+            0.5454545454545454
+            >>> dice.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
+            0.5
+
+        """
+        return self.get_raw_score(set1, set2)
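For reference, the Dice formula documented above can be checked with a short standalone sketch; dice_score below is a hypothetical helper used only to reproduce the doctest values, not part of the package.

    def dice_score(tokens1, tokens2):
        # 2 * |X ∩ Y| / (|X| + |Y|), matching the :math: formula in the docstring
        s1, s2 = set(tokens1), set(tokens2)
        return 2.0 * len(s1 & s2) / (len(s1) + len(s2))

    print(dice_score(['data', 'science'], ['data']))        # 0.666..., as in the doctest
    print(dice_score({1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 8}))  # 6/11 = 0.5454...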
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/hamming_distance.py
@@ -0,0 +1,84 @@
+from __future__ import division
+
+from py_stringmatching import utils
+from py_stringmatching.similarity_measure.sequence_similarity_measure import \
+    SequenceSimilarityMeasure
+
+
+class HammingDistance(SequenceSimilarityMeasure):
+    """Computes Hamming distance.
+
+    The Hamming distance between two strings of equal length is the number of positions at which the corresponding
+    symbols are different. Thus, it measures the minimum number of substitutions required to change
+    one string into the other, or the minimum number of errors that could have transformed one string into the other.
+    """
+
+    def __init__(self):
+        super(HammingDistance, self).__init__()
+
+    def get_raw_score(self, string1, string2):
+        """Computes the raw hamming distance between two strings.
+
+        Args:
+            string1,string2 (str): Input strings.
+
+        Returns:
+            Hamming distance (int).
+
+        Raises:
+            TypeError : If the inputs are not strings or if one of the inputs is None.
+            ValueError : If the input strings are not of same length.
+
+        Examples:
+            >>> hd = HammingDistance()
+            >>> hd.get_raw_score('', '')
+            0
+            >>> hd.get_raw_score('alex', 'john')
+            4
+            >>> hd.get_raw_score(' ', 'a')
+            1
+            >>> hd.get_raw_score('JOHN', 'john')
+            4
+        """
+
+        # input validations
+        utils.sim_check_for_none(string1, string2)
+        utils.tok_check_for_string_input(string1, string2)
+
+        # for Hamming Distance string length should be same
+        utils.sim_check_for_same_len(string1, string2)
+
+        # sum all the mismatch characters at the corresponding index of
+        # input strings
+        return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2))
+
+    def get_sim_score(self, string1, string2):
+        """Computes the normalized Hamming similarity score between two strings.
+
+        Args:
+            string1,string2 (str): Input strings.
+
+        Returns:
+            Normalized Hamming similarity score (float).
+
+        Raises:
+            TypeError : If the inputs are not strings or if one of the inputs is None.
+            ValueError : If the input strings are not of same length.
+
+        Examples:
+            >>> hd = HammingDistance()
+            >>> hd.get_sim_score('', '')
+            1.0
+            >>> hd.get_sim_score('alex', 'john')
+            0.0
+            >>> hd.get_sim_score(' ', 'a')
+            0.0
+            >>> hd.get_sim_score('JOHN', 'john')
+            0.0
+        """
+
+        raw_score = self.get_raw_score(string1, string2)
+        common_len = len(string1)
+        if common_len == 0:
+            return 1.0
+        return 1 - (raw_score / common_len)
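get_sim_score above rescales the raw distance by the shared string length. A minimal standalone sketch of that normalization, using hypothetical helper names rather than the package's classes:

    def hamming_distance(s1, s2):
        # number of positions where the characters differ; lengths must match
        if len(s1) != len(s2):
            raise ValueError('strings must be of equal length')
        return sum(c1 != c2 for c1, c2 in zip(s1, s2))

    def hamming_similarity(s1, s2):
        # 1 - distance / length, with two empty strings treated as identical
        if len(s1) == 0:
            return 1.0
        return 1.0 - hamming_distance(s1, s2) / len(s1)

    print(hamming_distance('alex', 'john'))    # 4
    print(hamming_similarity('alex', 'john'))  # 0.0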
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaccard.py
@@ -0,0 +1,79 @@
+from py_stringmatching import utils
+from py_stringmatching.similarity_measure.token_similarity_measure import \
+    TokenSimilarityMeasure
+
+
+class Jaccard(TokenSimilarityMeasure):
+    """Computes Jaccard measure.
+
+    For two sets X and Y, the Jaccard similarity score is:
+
+    :math:`jaccard(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}`
+    """
+
+    def __init__(self):
+        super(Jaccard, self).__init__()
+
+    def get_raw_score(self, set1, set2):
+        """Computes the raw Jaccard score between two sets.
+
+        Args:
+            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
+
+        Returns:
+            Jaccard similarity score (float).
+
+        Raises:
+            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
+
+        Examples:
+            >>> jac = Jaccard()
+            >>> jac.get_raw_score(['data', 'science'], ['data'])
+            0.5
+            >>> jac.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
+            0.375
+            >>> jac.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
+            0.3333333333333333
+        """
+
+        # input validations
+        utils.sim_check_for_none(set1, set2)
+        utils.sim_check_for_list_or_set_inputs(set1, set2)
+
+        # if exact match return 1.0
+        if utils.sim_check_for_exact_match(set1, set2):
+            return 1.0
+
+        # if one of the strings is empty return 0
+        if utils.sim_check_for_empty(set1, set2):
+            return 0
+
+        if not isinstance(set1, set):
+            set1 = set(set1)
+        if not isinstance(set2, set):
+            set2 = set(set2)
+
+        return float(len(set1 & set2)) / float(len(set1 | set2))
+
+    def get_sim_score(self, set1, set2):
+        """Computes the normalized Jaccard similarity between two sets. Simply call get_raw_score.
+
+        Args:
+            set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
+
+        Returns:
+            Normalized Jaccard similarity (float).
+
+        Raises:
+            TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
+
+        Examples:
+            >>> jac = Jaccard()
+            >>> jac.get_sim_score(['data', 'science'], ['data'])
+            0.5
+            >>> jac.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
+            0.375
+            >>> jac.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
+            0.3333333333333333
+        """
+        return self.get_raw_score(set1, set2)
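The Jaccard formula in the docstring above can likewise be checked standalone; jaccard_score is a hypothetical helper, not the package API.

    def jaccard_score(tokens1, tokens2):
        # |X ∩ Y| / |X ∪ Y|
        s1, s2 = set(tokens1), set(tokens2)
        return len(s1 & s2) / len(s1 | s2)

    print(jaccard_score(['data', 'science'], ['data']))        # 0.5
    print(jaccard_score({1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 8}))  # 3/8 = 0.375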
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro.py
@@ -0,0 +1,110 @@
+from py_stringmatching import utils
+from six.moves import xrange
+from py_stringmatching.similarity_measure.sequence_similarity_measure import \
+    SequenceSimilarityMeasure
+
+
+class Jaro(SequenceSimilarityMeasure):
+    """Computes Jaro measure.
+
+    The Jaro measure is a type of edit distance, developed mainly to compare short strings,
+    such as first and last names.
+    """
+
+    def __init__(self):
+        super(Jaro, self).__init__()
+
+    def get_raw_score(self, string1, string2):
+        """Computes the raw Jaro score between two strings.
+
+        Args:
+            string1,string2 (str): Input strings.
+
+        Returns:
+            Jaro similarity score (float).
+
+        Raises:
+            TypeError : If the inputs are not strings or if one of the inputs is None.
+
+        Examples:
+            >>> jaro = Jaro()
+            >>> jaro.get_raw_score('MARTHA', 'MARHTA')
+            0.9444444444444445
+            >>> jaro.get_raw_score('DWAYNE', 'DUANE')
+            0.8222222222222223
+            >>> jaro.get_raw_score('DIXON', 'DICKSONX')
+            0.7666666666666666
+
+        """
+
+        # input validations
+        utils.sim_check_for_none(string1, string2)
+        utils.tok_check_for_string_input(string1, string2)
+
+        # if one of the strings is empty return 0
+        if utils.sim_check_for_empty(string1, string2):
+            return 0
+
+        len_s1 = len(string1)
+        len_s2 = len(string2)
+
+        max_len = max(len_s1, len_s2)
+        search_range = (max_len // 2) - 1
+        if search_range < 0:
+            search_range = 0
+
+        flags_s1 = [False] * len_s1
+        flags_s2 = [False] * len_s2
+
+        common_chars = 0
+        for i, ch_s1 in enumerate(string1):
+            low = i - search_range if i > search_range else 0
+            high = i + search_range if i + search_range < len_s2 else len_s2 - 1
+            for j in xrange(low, high + 1):
+                if not flags_s2[j] and string2[j] == ch_s1:
+                    flags_s1[i] = flags_s2[j] = True
+                    common_chars += 1
+                    break
+
+        if not common_chars:
+            return 0
+
+        k = trans_count = 0
+        for i, f_s1 in enumerate(flags_s1):
+            if f_s1:
+                for j in xrange(k, len_s2):
+                    if flags_s2[j]:
+                        k = j + 1
+                        break
+                if string1[i] != string2[j]:
+                    trans_count += 1
+
+        trans_count /= 2
+        common_chars = float(common_chars)
+        weight = ((common_chars / len_s1 + common_chars / len_s2 +
+                   (common_chars - trans_count) / common_chars)) / 3
+        return weight
+
+    def get_sim_score(self, string1, string2):
+        """Computes the normalized Jaro similarity score between two strings. Simply call get_raw_score.
+
+        Args:
+            string1,string2 (str): Input strings.
+
+        Returns:
+            Normalized Jaro similarity score (float).
+
+        Raises:
+            TypeError : If the inputs are not strings or if one of the inputs is None.
+
+        Examples:
+            >>> jaro = Jaro()
+            >>> jaro.get_sim_score('MARTHA', 'MARHTA')
+            0.9444444444444445
+            >>> jaro.get_sim_score('DWAYNE', 'DUANE')
+            0.8222222222222223
+            >>> jaro.get_sim_score('DIXON', 'DICKSONX')
+            0.7666666666666666
+
+        """
+        return self.get_raw_score(string1, string2)
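The weight computed at the end of get_raw_score averages three ratios: matches over each string length and non-transposed matches over matches. A worked sketch for the 'MARTHA' / 'MARHTA' doctest, with the intermediate counts read off the algorithm above:

    m = 6.0      # common characters found within the search window
    t = 1        # two out-of-order matches ('TH' vs 'HT') give 2 / 2 = 1 transposition
    len_s1 = len_s2 = 6

    weight = (m / len_s1 + m / len_s2 + (m - t) / m) / 3
    print(weight)  # 0.9444444444444445, as in the doctest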
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/jaro_winkler.py
@@ -0,0 +1,106 @@
+from py_stringmatching import utils
+from py_stringmatching.similarity_measure.jaro import Jaro
+from py_stringmatching.similarity_measure.sequence_similarity_measure import \
+    SequenceSimilarityMeasure
+
+
+class JaroWinkler(SequenceSimilarityMeasure):
+    """Computes Jaro-Winkler measure.
+
+    The Jaro-Winkler measure is designed to capture cases where two strings have a low Jaro score, but share a prefix and thus are likely to match.
+
+    Args:
+        prefix_weight (float): Weight to give to the prefix (defaults to 0.1).
+
+    Attributes:
+        prefix_weight (float): An attribute to store the prefix weight.
+    """
+
+    def __init__(self, prefix_weight=0.1):
+        self.prefix_weight = prefix_weight
+        super(JaroWinkler, self).__init__()
+
+    def get_raw_score(self, string1, string2):
+        """Computes the raw Jaro-Winkler score between two strings.
+
+        Args:
+            string1,string2 (str): Input strings.
+
+        Returns:
+            Jaro-Winkler similarity score (float).
+
+        Raises:
+            TypeError : If the inputs are not strings or if one of the inputs is None.
+
+        Examples:
+            >>> jw = JaroWinkler()
+            >>> jw.get_raw_score('MARTHA', 'MARHTA')
+            0.9611111111111111
+            >>> jw.get_raw_score('DWAYNE', 'DUANE')
+            0.84
+            >>> jw.get_raw_score('DIXON', 'DICKSONX')
+            0.8133333333333332
+
+        """
+
+        # input validations
+        utils.sim_check_for_none(string1, string2)
+        utils.tok_check_for_string_input(string1, string2)
+
+        # if one of the strings is empty return 0
+        if utils.sim_check_for_empty(string1, string2):
+            return 0
+
+        jw_score = Jaro().get_raw_score(string1, string2)
+        min_len = min(len(string1), len(string2))
+
+        # prefix length can be at max 4
+        j = min(min_len, 4)
+        i = 0
+        while i < j and string1[i] == string2[i] and string1[i]:
+            i += 1
+
+        if i:
+            jw_score += i * self.prefix_weight * (1 - jw_score)
+
+        return jw_score
+
+    def get_sim_score(self, string1, string2):
+        """Computes the normalized Jaro-Winkler similarity score between two strings. Simply call get_raw_score.
+
+        Args:
+            string1,string2 (str): Input strings.
+
+        Returns:
+            Normalized Jaro-Winkler similarity (float).
+
+        Raises:
+            TypeError : If the inputs are not strings or if one of the inputs is None.
+
+        Examples:
+            >>> jw = JaroWinkler()
+            >>> jw.get_sim_score('MARTHA', 'MARHTA')
+            0.9611111111111111
+            >>> jw.get_sim_score('DWAYNE', 'DUANE')
+            0.84
+            >>> jw.get_sim_score('DIXON', 'DICKSONX')
+            0.8133333333333332
+        """
+        return self.get_raw_score(string1, string2)
+
+    def get_prefix_weight(self):
+        """Get prefix weight.
+
+        Returns:
+            prefix weight (float).
+        """
+        return self.prefix_weight
+
+    def set_prefix_weight(self, prefix_weight):
+        """Set prefix weight.
+
+        Args:
+            prefix_weight (float): Weight to give to the prefix.
+        """
+        self.prefix_weight = prefix_weight
+        return True
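get_raw_score above boosts the Jaro score by the length of the shared prefix, capped at 4 characters. A worked sketch of that boost for the 'MARTHA' / 'MARHTA' doctest:

    jaro_score = 0.9444444444444445   # Jaro('MARTHA', 'MARHTA') from the previous file
    prefix_len = 3                    # shared prefix 'MAR' (cap is 4)
    prefix_weight = 0.1               # package default

    jw_score = jaro_score + prefix_len * prefix_weight * (1 - jaro_score)
    print(jw_score)  # 0.9611111111111111, as in the doctest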
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/levenshtein.py
@@ -0,0 +1,75 @@
+from __future__ import division
+
+from py_stringmatching import utils
+from py_stringmatching.similarity_measure.cython_levenshtein import levenshtein
+from py_stringmatching.similarity_measure.sequence_similarity_measure import \
+    SequenceSimilarityMeasure
+
+
+class Levenshtein(SequenceSimilarityMeasure):
+    """Computes Levenshtein measure (also known as edit distance).
+
+    Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string
+    is carried out using a sequence of the following operators: delete a character, insert a character, and
+    substitute one character for another.
+    """
+
+    def __init__(self):
+        super(Levenshtein, self).__init__()
+
+    def get_raw_score(self, string1, string2):
+        """Computes the raw Levenshtein distance between two strings.
+
+        Args:
+            string1,string2 (str): Input strings.
+
+        Returns:
+            Levenshtein distance (int).
+
+        Raises:
+            TypeError : If the inputs are not strings.
+
+        Examples:
+            >>> lev = Levenshtein()
+            >>> lev.get_raw_score('a', '')
+            1
+            >>> lev.get_raw_score('example', 'samples')
+            3
+            >>> lev.get_raw_score('levenshtein', 'frankenstein')
+            6
+        """
+
+        # input validations
+        utils.sim_check_for_none(string1, string2)
+        utils.sim_check_for_string_inputs(string1, string2)
+        if utils.sim_check_for_exact_match(string1, string2):
+            return 0.0
+        return levenshtein(string1, string2)
+
+    def get_sim_score(self, string1, string2):
+        """Computes the normalized Levenshtein similarity score between two strings.
+
+        Args:
+            string1,string2 (str): Input strings.
+
+        Returns:
+            Normalized Levenshtein similarity (float).
+
+        Raises:
+            TypeError : If the inputs are not strings.
+
+        Examples:
+            >>> lev = Levenshtein()
+            >>> lev.get_sim_score('a', '')
+            0.0
+            >>> lev.get_sim_score('example', 'samples')
+            0.5714285714285714
+            >>> lev.get_sim_score('levenshtein', 'frankenstein')
+            0.5
+
+        """
+        raw_score = self.get_raw_score(string1, string2)
+        max_len = max(len(string1), len(string2))
+        if max_len == 0:
+            return 1.0
+        return 1 - (raw_score / max_len)
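The raw distance here comes from the bundled Cython extension (cython_levenshtein); get_sim_score then rescales it as 1 - distance / max(len1, len2). A standalone sketch of that rescaling, with a hypothetical helper name:

    def normalized_levenshtein(dist, s1, s2):
        # 1 - distance / length of the longer string; empty-vs-empty counts as identical
        max_len = max(len(s1), len(s2))
        return 1.0 if max_len == 0 else 1.0 - dist / max_len

    print(normalized_levenshtein(3, 'example', 'samples'))           # 0.5714285714285714
    print(normalized_levenshtein(6, 'levenshtein', 'frankenstein'))  # 0.5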
py_stringmatching-0.1.0/py_stringmatching/similarity_measure/monge_elkan.py
@@ -0,0 +1,100 @@
+from py_stringmatching import utils
+from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
+from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
+    HybridSimilarityMeasure
+
+
+class MongeElkan(HybridSimilarityMeasure):
+    """Computes Monge-Elkan measure.
+
+    The Monge-Elkan similarity measure is a type of hybrid similarity measure that combines the benefits of
+    sequence-based and set-based methods. This can be effective for domains in which more control is needed
+    over the similarity measure. It implicitly uses a secondary similarity measure, such as Levenshtein to compute
+    over all similarity score. See the string matching chapter in the DI book (Principles of Data Integration).
+
+    Args:
+        sim_func (function): Secondary similarity function. This is expected to be a sequence-based
+            similarity measure (defaults to Jaro-Winkler similarity measure).
+
+    Attributes:
+        sim_func (function): An attribute to store the secondary similarity function.
+    """
+
+    def __init__(self, sim_func=JaroWinkler().get_raw_score):
+        self.sim_func = sim_func
+        super(MongeElkan, self).__init__()
+
+    def get_raw_score(self, bag1, bag2):
+        """Computes the raw Monge-Elkan score between two bags (lists).
+
+        Args:
+            bag1,bag2 (list): Input lists.
+
+        Returns:
+            Monge-Elkan similarity score (float).
+
+        Raises:
+            TypeError : If the inputs are not lists or if one of the inputs is None.
+
+        Examples:
+            >>> me = MongeElkan()
+            >>> me.get_raw_score(['Niall'], ['Neal'])
+            0.8049999999999999
+            >>> me.get_raw_score(['Niall'], ['Nigel'])
+            0.7866666666666667
+            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
+            0.8677218614718616
+            >>> me.get_raw_score([''], ['a'])
+            0.0
+            >>> me = MongeElkan(sim_func=NeedlemanWunsch().get_raw_score)
+            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
+            2.0
+            >>> me = MongeElkan(sim_func=Affine().get_raw_score)
+            >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
+            2.25
+
+        References:
+            * Principles of Data Integration book
+        """
+
+        # input validations
+        utils.sim_check_for_none(bag1, bag2)
+        utils.sim_check_for_list_or_set_inputs(bag1, bag2)
+
+        # if exact match return 1.0
+        if utils.sim_check_for_exact_match(bag1, bag2):
+            return 1.0
+
+        # if one of the strings is empty return 0
+        if utils.sim_check_for_empty(bag1, bag2):
+            return 0
+
+        # aggregated sum of all the max sim score of all the elements in bag1
+        # with elements in bag2
+        sum_of_maxes = 0
+        for el1 in bag1:
+            max_sim = float('-inf')
+            for el2 in bag2:
+                max_sim = max(max_sim, self.sim_func(el1, el2))
+            sum_of_maxes += max_sim
+
+        sim = float(sum_of_maxes) / float(len(bag1))
+
+        return sim
+
+    def get_sim_func(self):
+        """Get the secondary similarity function.
+
+        Returns:
+            secondary similarity function (function).
+        """
+        return self.sim_func
+
+    def set_sim_func(self, sim_func):
+        """Set the secondary similarity function.
+
+        Args:
+            sim_func (function): Secondary similarity function.
+        """
+        self.sim_func = sim_func
+        return True
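The aggregation loop in get_raw_score above averages, over each token of bag1, the best secondary-similarity match found in bag2. A standalone sketch of that aggregation; monge_elkan and the exact-match sim_func below are hypothetical and used only for illustration:

    def monge_elkan(bag1, bag2, sim_func):
        # mean over bag1 of the maximum sim_func score against any token of bag2
        return sum(max(sim_func(t1, t2) for t2 in bag2) for t1 in bag1) / len(bag1)

    exact = lambda a, b: 1.0 if a == b else 0.0
    print(monge_elkan(['San', 'Diego'], ['San', 'Francisco'], exact))  # (1.0 + 0.0) / 2 = 0.5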